How to condense a data frame based on the closest matches in R

Question

How to condense a data frame based on the closest matches in R

I have a data frame that currently contains two "time" columns in the format HH:MM:SS. I would like to condense this data frame so that I only have one row for each unique id value. For each unique id value, I would like to keep the row in which the time1 value is the closest match to the time2 value. However, time1 must be greater than time2.

Here is a simple example:

> dput(df)
structure(list(id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L), count = c(23L, 23L, 23L, 23L, 45L, 45L, 
45L, 45L, 67L, 67L, 67L, 67L, 88L, 88L, 88L, 88L), time1 = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L), .Label = c("00:13:00", 
"01:13:00", "07:18:00", "18:14:00"), class = "factor"), time2 = structure(c(4L, 
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L), .Label = c("00:00:00", 
"06:00:00", "12:00:00", "18:00:00"), class = "factor"), afn = c(3.36, 
0.63, 1.77, 3.89, 3.36, 0.63, 1.77, 3.89, 3.36, 0.63, 1.77, 3.89, 
3.36, 0.63, 1.77, 3.89), dfn = c(201.67, 157.27, 103.55, 191.41, 
201.67, 157.27, 103.55, 191.41, 201.67, 157.27, 103.55, 191.41, 
201.67, 157.27, 103.55, 191.41)), .Names = c("id", "count", "time1", 
"time2", "afn", "dfn"), class = "data.frame", row.names = c(NA, 
-16L))

> df
   id count    time1    time2  afn    dfn
1   1    23 00:13:00 18:00:00 3.36 201.67
2   1    23 00:13:00 00:00:00 0.63 157.27
3   1    23 00:13:00 06:00:00 1.77 103.55
4   1    23 00:13:00 12:00:00 3.89 191.41
5   2    45 01:13:00 18:00:00 3.36 201.67
6   2    45 01:13:00 00:00:00 0.63 157.27
7   2    45 01:13:00 06:00:00 1.77 103.55
8   2    45 01:13:00 12:00:00 3.89 191.41
9   3    67 18:14:00 18:00:00 3.36 201.67
10  3    67 18:14:00 00:00:00 0.63 157.27
11  3    67 18:14:00 06:00:00 1.77 103.55
12  3    67 18:14:00 12:00:00 3.89 191.41
13  4    88 07:18:00 18:00:00 3.36 201.67
14  4    88 07:18:00 00:00:00 0.63 157.27
15  4    88 07:18:00 06:00:00 1.77 103.55
16  4    88 07:18:00 12:00:00 3.89 191.41

For the example above, I would like to get this data frame:

id  count   time1       time2       afn     dfn
1   23      00:13:00    00:00:00    0.63    157.27
2   45      01:13:00    00:00:00    0.63    157.27
3   67      18:14:00    18:00:00    3.36    201.67
4   88      07:18:00    06:00:00    1.77    103.55

ddply() , . ( , , ), , , . . !

+3

time r match datetime-format

Emily 23 . '14 17:46

3

dplyr:

library(dplyr)

# For each id, keep the row with the smallest strictly positive
# time1 - time2 difference, i.e. the latest time2 that is still before time1.
#
# - `%>%` replaces `%.%`, which was deprecated and later removed from dplyr.
# - difftime(units = "secs") pins the unit. Plain subtraction chooses the unit
#   automatically (secs/mins/hours/days), and truncating that with
#   as.integer() can collapse distinct differences into ties.
# - tz = "UTC": strptime() fills in today's date, so parsing in a local time
#   zone can shift differences on a daylight-saving transition day.
# - `> 0` (not `>= 0`): time1 has to be greater than time2.
df %>%
  mutate(
    timeDiff = as.numeric(difftime(
      strptime(time1, "%H:%M:%S", tz = "UTC"),
      strptime(time2, "%H:%M:%S", tz = "UTC"),
      units = "secs"
    ))
  ) %>%
  filter(timeDiff > 0) %>%
  group_by(id) %>%
  filter(timeDiff == min(timeDiff)) %>%
  ungroup() %>%
  select(-timeDiff)  # drop the helper column, leaving the original columns

#   id count    time1    time2  afn    dfn
# 1  1    23 00:13:00 00:00:00 0.63 157.27
# 2  2    45 01:13:00 00:00:00 0.63 157.27
# 3  3    67 18:14:00 18:00:00 3.36 201.67
# 4  4    88 07:18:00 06:00:00 1.77 103.55

+2

Sven Hohenstein 23 . '14 18:23

An approach using `ddply` and `merge`. (Assuming the "closest matches" are the minimum absolute values of the time differences.)

library(plyr)  # provides ddply(), .() and summarize() used below

# Parse both clock-time columns. strptime() fills in today's date, so UTC is
# used to keep the differences free of daylight-saving shifts.
t1 <- strptime(df$time1, "%H:%M:%S", tz = "UTC")
t2 <- strptime(df$time2, "%H:%M:%S", tz = "UTC")

# Absolute difference in minutes between time1 and time2 for every row.
# NOTE: abs() also treats time2 values later than time1 as candidates; this
# follows the assumption stated for this answer (minimum absolute difference).
df$min.diff <- abs(as.numeric(difftime(t1, t2, units = "mins")))

# One row per id holding that id's smallest difference; merging it back onto
# df by (id, min.diff) selects the matching full rows.
d1 <- ddply(df, .(id), summarize, min.diff = min(min.diff))

> merge(df, d1, by = c("id", "min.diff"))
  id min.diff count    time1    time2  afn    dfn
1  1       13    23 00:13:00 00:00:00 0.63 157.27
2  2       73    45 01:13:00 00:00:00 0.63 157.27
3  3       14    67 18:14:00 18:00:00 3.36 201.67
4  4       78    88 07:18:00 06:00:00 1.77 103.55

+1

Julien navarre Feb 23 '14 at 18:40

source share

G. Grothendieck · Accepted Answer · 2014-02-23T18:35:09+0000

.

1) ave. This uses the chron `times` class together with `subset` and `ave` from base R:

library(chron)

# time1 - time2 for every row of df, as a plain numeric vector (fraction of a
# day). as.character() makes the factor-to-text conversion explicit.
delta <- as.vector(times(as.character(df$time1)) - times(as.character(df$time2)))

# Keep only rows where time1 is strictly later than time2. The same mask is
# applied to delta so that it stays aligned with the rows of df2; passing the
# full-length delta to ave() together with df2$id would silently recycle the
# shorter grouping vector and pick the wrong rows.
keep <- !is.na(delta) & delta > 0
df2 <- df[keep, ]
delta_pos <- delta[keep]

# Within each id, flag the row(s) whose difference equals the group minimum.
# ave() returns the logical flags as numbers, hence the comparison with 1.
df2[ave(delta_pos, df2$id, FUN = function(d) d == min(d)) == 1, ]

2) dplyr. This uses the chron `times` class together with dplyr:

library(chron)
library(dplyr) 

# For each id, keep the row with the smallest positive time1 - time2.
# `%>%` replaces `%.%`, which was deprecated and later removed from dplyr.
df %>%
  # delta: time1 - time2 as a plain number (fraction of a day)
  mutate(
    delta = as.vector(times(as.character(time1)) - times(as.character(time2)))
  ) %>%
  filter(delta > 0) %>%            # time1 must be later than time2
  group_by(id) %>%
  filter(delta == min(delta)) %>%  # closest match within each id
  ungroup() %>%                    # do not leak the grouping to later steps
  select(-delta)                   # drop the helper column

3) sqldf

library(sqldf)

# Inner query: keep only the rows of df where time1 is later than time2,
# comparing both as epoch seconds via strftime('%s', ...).
# Outer query: group by id and compute the smallest time1 - time2 difference
# as an extra trailing column named delta.
# NOTE(review): the non-aggregated columns are expected to come from the row
# that attains min() within each group (SQLite's bare-column behaviour) --
# confirm against the SQLite documentation.
# [seq_along(df)] keeps only the first length(df) columns, i.e. drops delta.
sqldf("select *, min(strftime('%s', time1) - strftime('%s', time2)) delta
  from (select * from df where strftime('%s', time1) > strftime('%s', time2))
  group by id")[seq_along(df)]

Alternatively, delta can be calculated in R first and then used from sqldf:

library(sqldf)
library(chron)

# Compute time1 - time2 in R (fraction of a day) and store it as column delta.
# as.character() makes the factor-to-text conversion explicit.
df2 <- transform(
  df,
  delta = as.vector(times(as.character(time1)) - times(as.character(time2)))
)

# Inner query: keep rows where time1 is later than time2.
# Outer query: one row per id, taken at the group's minimum delta.
# The aggregate is aliased min_delta so the result does not contain two columns
# both named delta, and [names(df)] returns exactly the original columns.
# (Dropping a column by position, as in [-ncol(df2)], removes only one of the
# two delta columns and leaves the other in the output.)
sqldf("select *, min(delta) min_delta
  from (select * from df2 where delta > 0)
  group by id")[names(df)]

4) data.table

library(data.table)
library(chron)

# Work on a data.table copy of df.
DT <- data.table(df)

# Add delta = time1 - time2 (chron times) to DT by reference.
DT[, delta := times(time1) - times(time2)]

# Rows where time1 is later than time2.
later <- DT[delta > 0]

# Within each id, the row(s) whose delta equals the group minimum.
closest <- later[, .SD[delta == min(delta)], by = id]

# Keep the first length(df) columns, which drops the trailing delta column.
closest[, seq_along(df), with = FALSE]

. library subset. .

How to condense a data frame based on the closest matches in R

More articles: