Counting people per day in R from overlapping start/stop date ranges

Maybe the answer should be obvious, but I'm a little stuck.

My data looks something like this:

> df <- data.frame(person = c("A", "B", "C"), start = c("2014-01-01", "2014-01-02", "2014-01-03"), stop = c("2014-01-05", "2014-01-06", "2014-01-04") )
> df
  person       start       stop
1      A  2014-01-01 2014-01-05
2      B  2014-01-02 2014-01-06
3      C  2014-01-03 2014-01-04

Ultimately, I want to calculate the total number of people performing activities on a specific day, but for now I would be happy just to count the number per day (i.e., count how many cases cover each date, given that the start and stop dates are known). For the data above, this is the answer I'm looking for:

      Date  Tally
2014-01-01  1
2014-01-02  2
2014-01-03  3
2014-01-04  3
2014-01-05  2
2014-01-06  1

One of the ways I tried was to use seq() to generate all the dates, but this doesn't seem to work when the start/stop vectors have length > 1:

seq(df$start, df$stop, length = "1 day") ## Does not work

Any help would be greatly appreciated.

+3
source share
2 answers

One way may be:

# For every row, list each day from start to stop as a string; then count how
# often each day occurs across all rows.
as.data.frame(table(unlist(apply(df[-1], 1, function(rng) {
  first_day <- as.Date(rng[1], "%Y-%m-%d")
  last_day <- as.Date(rng[2], "%Y-%m-%d")
  as.character(seq(first_day, last_day, "1 day"))
}))))
        Var1 Freq
1 2014-01-01    1
2 2014-01-02    2
3 2014-01-03    3
4 2014-01-04    3
5 2014-01-05    2
6 2014-01-06    1

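As noted in the comments, this can be made noticeably faster. First, there is no need to call as.Date inside apply: the dates can be converted once, up front. Because apply coerces the data frame to a matrix, the version above has to re-parse every row, and seq on dates is slow. Second, seq is faster on "numeric" values. Third, the tallied result can be converted back afterwards. Compare the version that keeps dates with the one that uses "numeric":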

# Tally how many rows cover each calendar day (date-based version).
#
# data: a data frame; its first column is dropped and the next two columns
#       are read as start and stop dates in "%Y-%m-%d" form. Defaults to the
#       `df` visible where f1 is defined, so zero-argument calls behave as
#       before.
# Returns a data frame with columns Var1 (the day, as a factor) and Freq
# (the number of rows whose start..stop range includes that day).
f1 <- function(data = df) {  # keeping dates
  # apply() hands each row over as a character vector, so both ends are
  # parsed with as.Date for every row; this is the date-based variant that
  # f2 is compared against.
  days_in_row <- function(x) {
    first_day <- as.Date(x[1], "%Y-%m-%d")
    last_day <- as.Date(x[2], "%Y-%m-%d")
    as.character(seq(first_day, last_day, by = "1 day"))
  }
  as.data.frame(table(unlist(apply(data[-1], 1, days_in_row))))
}
# Tally how many rows cover each calendar day (numeric version).
#
# data: a data frame with `start` and `stop` columns holding dates (Date
#       objects or "%Y-%m-%d" strings). Defaults to the `df` visible where
#       f2 is defined, so zero-argument calls behave as before.
# Returns a data frame with columns Var1 (the day, as a factor) and Freq
# (the number of rows whose start..stop range includes that day).
f2 <- function(data = df) {  # using numeric
  # Parse each column once, up front, and keep only the day numbers.
  starts <- as.numeric(as.Date(data$start, "%Y-%m-%d"))
  stops <- as.numeric(as.Date(data$stop, "%Y-%m-%d"))
  # Expand every (start, stop) pair into its run of day numbers and tally.
  res <- as.data.frame(table(unlist(Map(seq, starts, stops))))
  # The tallied labels are day numbers stored as text; turn them back into
  # dates so the result matches the date-based version.
  day_numbers <- as.numeric(as.character(res$Var1))
  res$Var1 <- factor(as.Date(day_numbers, origin = "1970-01-01"))
  res
}
# Call each version in turn; the printed result is shown in the comments.
f1()
#        Var1 Freq
#1 2014-01-01    1
#2 2014-01-02    2
#3 2014-01-03    3
#4 2014-01-04    3
#5 2014-01-05    2
#6 2014-01-06    1
f2()
#        Var1 Freq
#1 2014-01-01    1
#2 2014-01-02    2
#3 2014-01-03    3
#4 2014-01-04    3
#5 2014-01-05    2
#6 2014-01-06    1

And a benchmark on a larger data set:

# Larger random data set for timing: 1000 people, each with a start date
# sampled (with replacement) from today through ten days ahead and a stop
# date five days after the start.
df <- data.frame(
  person = paste0("ID", 1:1e3),
  start = as.Date(
    sample(Sys.Date():(Sys.Date() + 10), 1e3, replace = TRUE),
    origin = "1970-01-01"
  )
)
df$stop <- df$start + 5
head(df)
#  person      start       stop
#1    ID1 2014-03-07 2014-03-12
#2    ID2 2014-03-01 2014-03-06
#3    ID3 2014-03-04 2014-03-09
#4    ID4 2014-02-28 2014-03-05
#5    ID5 2014-02-27 2014-03-04
#6    ID6 2014-03-07 2014-03-12
# Confirm that both implementations return exactly the same data frame.
identical(f1(), f2())
#[1] TRUE
# Time both implementations, 10 runs each.
library(microbenchmark)
microbenchmark(f1(), f2(), times = 10)
#Unit: milliseconds
# expr       min        lq    median        uq       max neval
# f1() 366.90895 368.36777 379.78573 395.82724 410.17782    10
# f2()  31.66473  32.11122  33.04891  33.62642  35.75063    10
+3

Another way:

# Convert every column except the first (here start and stop) to Date.
df[, -1] <- lapply(df[-1], as.Date)

# For each row, list the days from start to stop as strings, then tabulate
# how many rows cover each day. seq_len() yields an empty index for a 0-row
# df, whereas 1:nrow(df) would yield c(1, 0).
data.frame(table(unlist(lapply(seq_len(nrow(df)), function(i) {
    as.character(seq.Date(df$start[i], df$stop[i], "day"))
}))))

##         Var1 Freq
## 1 2014-01-01    1
## 2 2014-01-02    2
## 3 2014-01-03    3
## 4 2014-01-04    3
## 5 2014-01-05    2
## 6 2014-01-06    1
+2

All Articles