I am trying to map users to each other and assign a common identifier for website visitors.
I have rows (call the table a) with the columns a.UUID, a.seen_time, a.ip_address, a.user_id, and a.subdomain, and I'm trying to compute a.matched_id, where rows from the same IP address that are each within +/- 4 hours of the previous one (i.e. a continuous chain) are all assigned the same matched_id.
Please note that, for my purposes, the same IP seen on two different subdomains does NOT count as a match unless the rows also share the same user ID.
Here is the basic process that I would execute in a regular programming language (however I need to build SQL):
Amazon Redshift, , Postgres, ( , . ): Postgres/ANSI SQL.
?
SQL?
- -
, :
- , .
discovery_time seen_time, , mydata a, a b- MD5
MIN(uuid), , - , . - : +/- 4 ' '
:
-- Assign a matched_id to the rows that fall in the most recent 4-hour window
-- of each (ip, subdomain) pair.
--
-- latest_per_key: one row per (ip, subdomain) holding its latest
--   discovery_time and an already-known id: the MD5 of the smallest user_id
--   when any row has one, otherwise the smallest existing matched_id
--   (NULL when neither exists).
-- resolved: every row whose discovery_time is no more than 4 hours before
--   that latest time, paired with the known id or, failing that, an MD5
--   derived from ip, subdomain and the latest discovery_time.
--
-- The join compares ip and subdomain column by column. Joining on
-- CONCAT(ip, subdomain) lets distinct pairs collide (for example
-- '10.0.0.1' + '1a' and '10.0.0.11' + 'a'), which hands one uuid several
-- candidate ids and leaves the UPDATE to pick one arbitrarily.
-- Rows with a NULL ip or NULL subdomain are not matched by this statement.
WITH latest_per_key AS (
    SELECT
        m.ip,
        m.subdomain,
        MAX(m.discovery_time) AS max_discovery_time,
        COALESCE(MD5(MIN(m.user_id)), MIN(m.matched_id)) AS known_matched_id
    FROM mydata AS m
    GROUP BY m.ip, m.subdomain
),
resolved AS (
    SELECT
        m.uuid,
        COALESCE(
            k.known_matched_id,
            MD5(CONCAT(k.ip, k.subdomain, k.max_discovery_time))
        ) AS matched_id
    FROM mydata AS m
    -- An inner join is sufficient: the WHERE clause below filters on
    -- m.discovery_time, so key rows without a matching mydata row never survive.
    INNER JOIN latest_per_key AS k
        ON k.ip = m.ip
        AND k.subdomain = m.subdomain
    WHERE m.discovery_time >= k.max_discovery_time - INTERVAL '4 hours'
)
UPDATE mydata AS m
SET matched_id = r.matched_id
FROM resolved AS r
WHERE r.uuid = m.uuid;
-- Inspect the ids assigned to the sample IP, ordered by discovery_time ascending.
SELECT
    v.discovery_time,
    v.ip,
    v.matched_id,
    v.uuid
FROM mydata AS v
WHERE v.ip = '12.34.56.78'
ORDER BY
    v.ip,
    v.discovery_time;
To reproduce, here is the setup script:
-- Visitor sightings, one row per uuid (the primary key). ip, subdomain,
-- user_id and discovery_time describe the sighting and are all nullable;
-- matched_id is the common identifier that the UPDATE statements fill in.
CREATE TABLE mydata (
    ip             varchar(255),
    subdomain      varchar(255),
    matched_id     varchar(255),
    user_id        varchar(255),
    uuid           varchar(255) NOT NULL,
    discovery_time timestamp,  -- stored without time zone
    CONSTRAINT pk_mydata PRIMARY KEY (uuid)
);
-- Sample sightings for a single ip/subdomain with no user_id and no
-- matched_id: one isolated visit on 2014-02-14 and five visits between
-- 2014-02-16 22:22 and 2014-02-17 03:18.
INSERT INTO mydata (ip, subdomain, matched_id, user_id, uuid, discovery_time)
VALUES
    ('12.34.56.78', 'sub1', NULL, NULL, '222b5991-9780-11e3-9304-127b2ab15ea7', '2014-02-14 00:03:26'),
    ('12.34.56.78', 'sub1', NULL, NULL, '333b5991-9780-11e3-9304-127b2ab15ea7', '2014-02-16 22:22:26'),
    ('12.34.56.78', 'sub1', NULL, NULL, '379b641b-9782-11e3-9304-127b2ab15ea7', '2014-02-17 03:18:48'),
    ('12.34.56.78', 'sub1', NULL, NULL, 'ac0f6416-977e-11e3-9304-127b2ab15ea7', '2014-02-17 02:53:25'),
    ('12.34.56.78', 'sub1', NULL, NULL, '11fb5991-9780-11e3-9304-127b2ab15ea7', '2014-02-17 03:03:26'),
    ('12.34.56.78', 'sub1', NULL, NULL, '849d8d61-9781-11e3-9304-127b2ab15ea7', '2014-02-17 03:13:48');
, matched_id, ( INSERT), , 4 ( user_id ).
- 2 -
- . , , , ,
min_time max_time min max 4-
:
-- Rows that carry a user_id but have no matched_id yet receive the smallest
-- uuid recorded for that user_id, so all rows of one user share one id.
WITH first_uuid_per_user AS (
    SELECT
        src.user_id,
        MIN(src.uuid) AS new_matched_id
    FROM mydata AS src
    WHERE src.user_id IS NOT NULL
    GROUP BY src.user_id
)
UPDATE mydata AS m
SET matched_id = f.new_matched_id
FROM first_uuid_per_user AS f
WHERE f.user_id = m.user_id
    AND m.user_id IS NOT NULL
    AND m.matched_id IS NULL;
-- For each row that has no window yet (min_time IS NULL), look at all rows
-- sharing its ip and subdomain and, when the HAVING test passes, store:
--   matched_id = smallest matched_id among those rows (NULL if none is set)
--   min_time   = latest COALESCE(min_time, discovery_time) among them, minus 4 hours
--   max_time   = latest COALESCE(max_time, discovery_time) among them, plus 4 hours
-- The HAVING test keeps a row only when its own discovery_time lies within
-- 4 hours of the latest COALESCE(a.min_time, b.discovery_time) of its group.
--
-- a.discovery_time is listed in GROUP BY so the statement does not depend on
-- primary-key functional dependency (PostgreSQL 9.1+ only); uuid is the
-- primary key, so the extra grouping column does not change the groups.
--
-- NOTE(review): the HAVING clause reads a.min_time while the other
-- expressions read b.min_time / b.max_time -- confirm this is intended.
-- NOTE(review): min_time and max_time are not declared in the CREATE TABLE
-- shown in this file; presumably they are added separately -- verify.
UPDATE mydata AS my
SET
    min_time = matching.min_dist,
    max_time = matching.max_dist,
    matched_id = matching.new_matched_id
FROM (
    SELECT
        a.uuid,
        MIN(b.matched_id) AS new_matched_id,
        MAX(COALESCE(b.min_time, b.discovery_time)) - INTERVAL '4 hour' AS min_dist,
        MAX(COALESCE(b.max_time, b.discovery_time)) + INTERVAL '4 hour' AS max_dist
    FROM mydata AS a
    INNER JOIN mydata AS b
        ON a.ip = b.ip
        AND a.subdomain = b.subdomain
    GROUP BY a.uuid, a.discovery_time
    HAVING ABS(
        EXTRACT(EPOCH FROM MAX(COALESCE(a.min_time, b.discovery_time)) - a.discovery_time) / 3600
    ) <= 4
) AS matching
WHERE matching.uuid = my.uuid
    AND my.min_time IS NULL;
-- For each row, collect the rows with the same ip and subdomain whose
-- [min_time, max_time] window contains this row's discovery_time, then store:
--   matched_id = the row's existing matched_id, or the smallest uuid among
--                the window rows when it has none
--   min_time   = latest min_time among the window rows
--   max_time   = latest max_time among the window rows
-- Rows with no containing window are left untouched.
--
-- a.matched_id is wrapped in MIN() so the statement does not depend on
-- primary-key functional dependency (PostgreSQL 9.1+ only); every group holds
-- exactly one uuid, so MIN(a.matched_id) is that row's own matched_id.
--
-- NOTE(review): min_time and max_time are not declared in the CREATE TABLE
-- shown in this file; presumably they are added separately -- verify.
UPDATE mydata AS m
SET
    matched_id = matching.new_matched_id,
    min_time = matching.min_time,
    max_time = matching.max_time
FROM (
    SELECT
        a.uuid,
        MAX(b.min_time) AS min_time,
        MAX(b.max_time) AS max_time,
        COALESCE(MIN(a.matched_id), MIN(b.uuid)) AS new_matched_id
    FROM mydata AS a
    INNER JOIN mydata AS b
        ON a.ip = b.ip
        AND a.subdomain = b.subdomain
    WHERE a.discovery_time >= b.min_time
        AND a.discovery_time <= b.max_time
    GROUP BY a.uuid
) AS matching
WHERE matching.uuid = m.uuid;