User correlation with each other in SQL query

Question

User correlation with each other in SQL query

I am trying to map users to each other and assign a common identifier for website visitors.

I have rows (call it table a) with columns a.UUID, a.seen_time, a.ip_address, a.user_id, a.subdomain, and I'm trying to compute a.matched_id, where rows from the same IP address that are each within +/- 4 hours of the previous one (i.e. contiguous) are all assigned the same matched_id.

Please note that, for my purposes, the same IP on two different subdomains does NOT necessarily count as a match unless the rows have the same user ID.

Here is the basic process that I would execute in a regular programming language (however I need to build SQL):

Get the required rows of table a
For each row, if any other row has the same user_id (the subdomain does not matter), assign them the same matched_id (all else being equal, use MIN(uuid)).
Division into subdomains.
For each of these subdomain sections:
- Now divide into buckets per IP address, where each row is < 4 hours from the row seen before (/ after) it (i.e., chained step by step)
  For each of these sections of IP addresses:
  - If any one row already has a matched_id, assign it to every row in the bucket. Otherwise, assign a new matched_id to every row (using MIN(uuid)). Continue.

Amazon Redshift, , Postgres, ( , . ): Postgres/ANSI SQL.

?

SQL?

- -

, :

, .
discovery_time seen_time, , mydata a, a b
MD5 MIN(uuid), , - , .
: +/- 4 ' '

:

--UPDATE mydata m SET matched_id = NULL; --for testing

-- Assign one matched_id per "island" of sightings: rows that share ip and
-- subdomain and whose consecutive discovery_time values are at most 4 hours
-- apart. The gap is measured row-to-row (chained), not against the latest
-- sighting of the whole group, and it is computed with window functions only,
-- so no recursive CTE or user-defined function is needed.
WITH ordered AS (
    -- Pair every sighting with the previous sighting of the same ip + subdomain.
    -- uuid is the tie-breaker so the ordering is deterministic.
    -- Rows without a discovery_time cannot be placed on the timeline and are skipped.
    SELECT
        m.uuid,
        m.ip,
        m.subdomain,
        m.user_id,
        m.matched_id,
        m.discovery_time,
        LAG(m.discovery_time) OVER (
            PARTITION BY m.ip, m.subdomain
            ORDER BY m.discovery_time, m.uuid
        ) AS prev_discovery_time
    FROM mydata AS m
    WHERE m.discovery_time IS NOT NULL
),
islands AS (
    -- A row opens a new island when it has no predecessor or when the gap to
    -- its predecessor exceeds 4 hours. The running count of island starts
    -- numbers the islands within each ip + subdomain.
    SELECT
        o.uuid,
        o.ip,
        o.subdomain,
        o.user_id,
        o.matched_id,
        SUM(
            CASE
                WHEN o.prev_discovery_time IS NULL THEN 1
                WHEN o.discovery_time > o.prev_discovery_time + INTERVAL '4 hours' THEN 1
                ELSE 0
            END
        ) OVER (
            PARTITION BY o.ip, o.subdomain
            ORDER BY o.discovery_time, o.uuid
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS island_no
    FROM ordered AS o
),
island_keys AS (
    -- Candidate identifiers shared by every row of an island.
    SELECT
        i.uuid,
        MIN(i.user_id) OVER (PARTITION BY i.ip, i.subdomain, i.island_no) AS island_user_id,
        MIN(i.matched_id) OVER (PARTITION BY i.ip, i.subdomain, i.island_no) AS island_matched_id,
        MIN(i.uuid) OVER (PARTITION BY i.ip, i.subdomain, i.island_no) AS island_min_uuid
    FROM islands AS i
)
UPDATE mydata m
SET matched_id = CASE
        -- Same priority as before: a known user_id wins, then an existing
        -- matched_id, otherwise a fresh id derived from the island's smallest uuid.
        WHEN k.island_user_id IS NOT NULL THEN MD5(k.island_user_id)
        WHEN k.island_matched_id IS NOT NULL THEN k.island_matched_id
        ELSE MD5(k.island_min_uuid)
    END
FROM island_keys k
WHERE k.uuid = m.uuid;

-- Inspect the matched_id assigned to each sighting of one sample IP,
-- ordered by discovery time.
SELECT
    m.discovery_time,
    m.ip,
    m.matched_id,
    m.uuid
FROM mydata AS m
WHERE m.ip = '12.34.56.78'
ORDER BY
    m.ip,
    m.discovery_time;

, script:

-- Sample table of sightings used by the queries in this post.
-- One row per sighting, keyed by uuid.
CREATE TABLE mydata
(
    ip             VARCHAR(255),
    subdomain      VARCHAR(255),
    matched_id     VARCHAR(255),           -- filled in by the matching queries
    user_id        VARCHAR(255),
    uuid           VARCHAR(255) NOT NULL,
    discovery_time TIMESTAMP,              -- timestamp without time zone
    CONSTRAINT pk_mydata PRIMARY KEY (uuid)
);

-- Sample sightings for one IP on one subdomain.
-- Expected: every row except the first ends up with the same matched_id.
-- NOTE(review): the 2014-02-16 22:22:26 row is about 4h31m before the next
-- sighting (2014-02-17 02:53:25) -- confirm it is really meant to join the
-- later rows under a 4-hour limit.
INSERT INTO mydata (ip, subdomain, matched_id, user_id, uuid, discovery_time)
VALUES
    ('12.34.56.78', 'sub1', NULL, NULL, '222b5991-9780-11e3-9304-127b2ab15ea7', '2014-02-14 00:03:26'),
    ('12.34.56.78', 'sub1', NULL, NULL, '333b5991-9780-11e3-9304-127b2ab15ea7', '2014-02-16 22:22:26'),
    ('12.34.56.78', 'sub1', NULL, NULL, '379b641b-9782-11e3-9304-127b2ab15ea7', '2014-02-17 03:18:48'),
    ('12.34.56.78', 'sub1', NULL, NULL, 'ac0f6416-977e-11e3-9304-127b2ab15ea7', '2014-02-17 02:53:25'),
    ('12.34.56.78', 'sub1', NULL, NULL, '11fb5991-9780-11e3-9304-127b2ab15ea7', '2014-02-17 03:03:26'),
    ('12.34.56.78', 'sub1', NULL, NULL, '849d8d61-9781-11e3-9304-127b2ab15ea7', '2014-02-17 03:13:48');

, matched_id, ( INSERT), , 4 ( user_id ).

- 2 -

. , , , ,
min_time max_time min max 4-

:

-- Give rows that share a user_id a common matched_id: the smallest uuid among
-- the rows of that user. Only rows that have a user_id and no matched_id yet
-- are updated.
WITH user_groups AS (
    SELECT
        src.user_id,
        MIN(src.uuid) AS new_matched_id
    FROM mydata AS src
    WHERE src.user_id IS NOT NULL
    GROUP BY src.user_id
)
UPDATE mydata AS m
SET matched_id = user_groups.new_matched_id
FROM user_groups
WHERE user_groups.user_id = m.user_id
  AND m.user_id IS NOT NULL
  AND m.matched_id IS NULL;


-- Find rows +/- 4hrs of each other
-- 1. Set min and max times for a 4hr set --
-- For every row (a), aggregates over all rows (b) that share its ip and
-- subdomain, then writes min_time, max_time and matched_id onto the rows whose
-- min_time is still NULL.
-- Assumes mydata has min_time and max_time columns; they are not part of the
-- CREATE TABLE script shown earlier in this post.
UPDATE mydata my SET min_time = matching.min_dist, max_time = matching.max_dist, matched_id = new_matched_id
FROM (
    -- mintime is approx
    -- min_dist: latest COALESCE(b.min_time, b.discovery_time) in the group, minus 4 hours.
    -- max_dist: latest COALESCE(b.max_time, b.discovery_time) in the group, plus 4 hours.
    -- new_matched_id: smallest non-NULL matched_id in the group (NULL if none has one).
    SELECT a.uuid, MIN(b.matched_id) AS new_matched_id, max(COALESCE(b.min_time, b.discovery_time)) - interval '4 hour' AS min_dist, max(COALESCE(b.max_time, b.discovery_time)) + interval '4 hour' AS max_dist
    FROM mydata a
    JOIN mydata b
    ON (a.ip = b.ip AND a.subdomain = b.subdomain)
    GROUP BY a.uuid
    -- Keep a row only if its discovery_time is within 4 hours of the aggregated
    -- maximum; EXTRACT(EPOCH ...) yields seconds, /3600 converts them to hours.
    -- NOTE(review): this mixes a.min_time with b.discovery_time, whereas the
    -- SELECT list uses b.min_time -- confirm which one is intended.
    -- NOTE(review): a.min_time and a.discovery_time are neither grouped nor
    -- aggregated; presumably this relies on a.uuid being the primary key --
    -- verify that the target engine accepts it.
    HAVING ABS(EXTRACT(EPOCH FROM max(COALESCE(a.min_time, b.discovery_time)) - a.discovery_time)/3600) <= 4
) matching
WHERE matching.uuid = my.uuid
-- Only rows that have not been given a window yet.
AND min_time IS NULL;

-- 2. Set the matched id of all the +/- 4hr records --
-- For each row, look at the rows on the same ip + subdomain whose
-- [min_time, max_time] window contains this row's discovery_time. Copy the
-- largest such window bounds onto the row, and keep the row's own matched_id
-- if it has one, otherwise use the smallest uuid among those rows.
UPDATE mydata m
SET
    matched_id = matching.new_matched_id,
    min_time = matching.min_time,
    max_time = matching.max_time
FROM (
    SELECT
        cur.uuid,
        MAX(peer.min_time) AS min_time,
        MAX(peer.max_time) AS max_time,
        COALESCE(cur.matched_id, MIN(peer.uuid)) AS new_matched_id
    FROM mydata cur
    INNER JOIN mydata peer
        ON cur.ip = peer.ip
        AND cur.subdomain = peer.subdomain
        AND cur.discovery_time >= peer.min_time
        AND cur.discovery_time <= peer.max_time
    GROUP BY cur.uuid
) matching
WHERE matching.uuid = m.uuid;

+3

sql postgresql amazon-redshift gaps-and-islands

Chris Riddell 19 . '14 8:13

2

James K. Lowden · Answer 1 · 2014-02-28T04:05:22+0000

, , , , , IP- +/- 4 , "" IP- ( IP + UUID, ).

-- Latest sighting per IP address in table a.
select ip_address, max(seen_time)
from a
group by ip_address

, . .

Postgres, , .

-- Rows of a seen less than 4 hours before the latest sighting that shares
-- their ip_address and uuid.
-- The inner table gets its own alias (peer): unquoted identifiers are
-- case-insensitive, so an outer alias "A" would be shadowed by the inner
-- table "a" and the correlation would compare the inner row with itself.
-- Elapsed time is compared as an interval rather than by subtracting
-- hour-of-day values, which breaks across midnight.
-- NOTE(review): if uuid is unique per row, the uuid correlation makes every
-- row its own "latest" sighting -- confirm whether it should be dropped.
select *
from a as cur
where exists (
    select 1
    from a as peer
    where peer.ip_address = cur.ip_address
    and   peer.uuid = cur.uuid
    group by peer.ip_address, peer.uuid
    having max(peer.seen_time) - cur.seen_time < interval '4 hours'
)

.

Rory · Answer 2 · 2014-03-03T10:51:33+0000

:

Add these columns to table a: id_1, id_2, min_time, max_time

Set id_1 to the min(uuid) of the rows that share a user_id, something like this:

 -- match any records with a user_id: id_1 becomes the smallest uuid among
 -- the rows that share that user_id.
 -- The target table is not repeated in FROM; PostgreSQL/Redshift correlate
 -- the derived table with the target in the WHERE clause instead.
 update a
 set id_1 = x.uuid
 from (
        select min(src.uuid) as uuid, src.user_id
        from a as src
        where src.user_id is not null
        group by src.user_id
      ) as x
 where a.user_id = x.user_id

Set min_time and max_time to seen_time minus / plus 4 hours for every row.

-- Store each row's +/- 4 hour window around its own seen_time.
-- There is no WHERE clause, so every row of a is updated.
update a
set
    min_time = seen_time - interval '4 hour',
    max_time = seen_time + interval '4 hour'

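Then, for rows with the same ip and subdomain, match each row to the other rows whose window contains its a.seen_time, i.e. within 4 hours of each other: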

-- For every row not already matched on user_id (id_1 is null), set id_2 to the
-- smallest uuid among the *other* rows on the same ip_address + subdomain
-- whose (min_time, max_time) window strictly contains this row's seen_time.
update a
set id_2 = matching.other_uuid
from (
    -- pair each row (cur) with every other row (peer) on the same
    -- ip_address and subdomain whose window covers cur.seen_time
    select
        cur.uuid,
        min(peer.uuid) as other_uuid
    from a as cur
    inner join a as peer
        on  peer.ip_address = cur.ip_address
        and peer.subdomain = cur.subdomain
        and peer.uuid <> cur.uuid
        and peer.min_time < cur.seen_time
        and peer.max_time > cur.seen_time
    group by cur.uuid
) as matching
-- rows already matched on user_id are left alone
where a.id_1 is null
  and a.uuid = matching.uuid

id_1 id_2 - , .

User correlation with each other in SQL query

More articles: