R - need help speeding up the for loop

Question

R - need help speeding up the for loop

I have two data frames; one is 48 lines long and looks like this:

name = Z31

     Est.Date   Site    Cultivar   Planting
1   24/07/2011  Birchip Axe           1
2   08/08/2011  Birchip Bolac         1
3   24/07/2011  Birchip Derrimut      1
4   12/08/2011  Birchip Eaglehawk     1
5   29/07/2011  Birchip Gregory       1
6   29/07/2011  Birchip Lincoln       1
7   23/07/2011  Birchip Mace          1
8   29/07/2011  Birchip Scout         1
9   17/09/2011  Birchip Axe           2
10  19/09/2011  Birchip Bolac         2

Another → 23,000 lines and contains the output from the simulator. It looks like this:

name = pred

    Date        maxt    mint    Cultivar    Site    Planting    tt  cum_tt
1   5/05/2011   18       6.5    Axe        Birchip  1        12.25  0
2   6/05/2011   17.5     2.5    Axe        Birchip  1        10     0
3   7/05/2011   18       2.5    Axe        Birchip  1        10.25  0
4   8/05/2011   19.5       2    Axe        Birchip  1        10.75  0
5   9/05/2011   17       4.5    Axe        Birchip  1        10.75  0
6   10/05/2011  15.5    -0.5    Axe        Birchip  1        7.5    0
7   11/05/2011  14       5.5    Axe        Birchip  1        9.75   0
8   12/05/2011  19         8    Axe        Birchip  1        13.5   0
9   13/05/2011  18.5     7.5    Axe        Birchip  1        13     0
10  14/05/2011  16       3.5    Axe        Birchip  1        9.75   0

I want the cum_tt column to start adding the tt column of the current row to the cum_tt of the previous row (cumulative addition) ONLY if the date in pre-DF is equal to or greater than Z31 est.Date. I wrote the following loop:

for (i in 1:nrow(Z31)){
  for (j in 1:nrow(pred)){
    if (Z31[i,]$Site == pred[j,]$Site & Z31[i,]$Cultivar == pred[j,]$Cultivar & Z31[i,]$Planting == pred[j,]$Planting &
        pred[j,]$Date >= Z31[i,]$Est.Date)
    {
      pred[j,]$cum_tt <- pred[j,]$tt + pred[j-1,]$cum_tt
    }
  }
}

It works, but it is so slow that it takes about an hour to start the whole set. I know that loops are not strong, so can someone help me with vectorizing this operation?

Thanks in advance.

UPDATE

Here is the output from dput (Z31):

structure(list(Est.Date = structure(c(15179, 15194, 15179, 15198, 15184, 15184, 15178, 15184, 15234, 15236, 15230, 15238, 15229, 15236, 15229, 15231, 15155, 15170, 15160, 15168, 15165, 15159, 15170, 15170, 15191, 15205, 15198, 15203, 15202, 15195, 15203, 15206, 15193, 15193, 15195, 15200, 15193, 15205, 15200, 15205, 15226, 15245, 15231, 15259, 15241, 15241, 15241, 15241), class = "Date"), Site = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Birchip", "Gatton", "Tarlee"), class = "factor"), Cultivar = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("Axe", "Bolac", "Derrimut", "Eaglehawk", "Gregory", "Lincoln", "Mace", "Scout"), class = "factor"), Planting = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)), .Names = c("Est.Date", "Site", "Cultivar", "Planting"), row.names = c(NA, -48L), class = "data.frame")

Here before. Please note that there are several additional columns. I just included the relevant ones above for readability.

structure(list(Date = structure(c(15099, 15100, 15101, 15102, 
15103, 15104, 15105, 15106, 15107, 15108, 15109, 15110, 15111, 
15112, 15113, 15114, 15115, 15116, 15117, 15118), class = "Date"), 
    flowering_das = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Zadok = c(9, 9, 
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 11.032, 11.085, 
    11.157), stagename = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 
    8L, 8L, 9L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 1L, 3L, 3L, 3L), .Label = c("emergence", 
    "end_grain_fill", "end_of_juvenil", "floral_initiat", "flowering", 
    "germination", "maturity", "out", "sowing", "start_grain_fi"
    ), class = "factor"), node_no = c(0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 2, 2.032, 2.085, 2.157), maxt = c(18, 
    17.5, 18, 19.5, 17, 15.5, 14, 19, 18.5, 16, 16, 15, 16.5, 
    16.5, 20.5, 23, 25.5, 16.5, 16.5, 15), mint = c(6.5, 2.5, 
    2.5, 2, 4.5, -0.5, 5.5, 8, 7.5, 3.5, 6, 1, 5.5, 2, 7, 7, 
    9, 13.5, 11.5, 8.5), Cultivar = c("Axe", "Axe", "Axe", "Axe", 
    "Axe", "Axe", "Axe", "Axe", "Axe", "Axe", "Axe", "Axe", "Axe", 
    "Axe", "Axe", "Axe", "Axe", "Axe", "Axe", "Axe"), Site = c("Birchip", 
    "Birchip", "Birchip", "Birchip", "Birchip", "Birchip", "Birchip", 
    "Birchip", "Birchip", "Birchip", "Birchip", "Birchip", "Birchip", 
    "Birchip", "Birchip", "Birchip", "Birchip", "Birchip", "Birchip", 
    "Birchip"), Planting = c("1", "1", "1", "1", "1", "1", "1", 
    "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
    "1"), `NA` = c("Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", 
    "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", 
    "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", 
    "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", 
    "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", 
    "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", 
    "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out", "Birchip TOS1 Axe.out"
    ), tt = c(12.25, 10, 10.25, 10.75, 10.75, 7.5, 9.75, 13.5, 
    13, 9.75, 11, 8, 11, 9.25, 13.75, 15, 17.25, 15, 14, 11.75
    ), cum_tt = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0)), .Names = c("Date", "flowering_das", "Zadok", 
"stagename", "node_no", "maxt", "mint", "Cultivar", "Site", "Planting", 
NA, "tt", "cum_tt"), row.names = c(NA, 20L), class = "data.frame")

UPDATE

. , . , Subs. , , . Z P.

: 59.77

Subs: 14.62

Subs : 11.12

+5

vectorization for-loop r dataframe data.table

Justin 06 . '12 2:15

3

:

 temp_cumtt=c(rep(0,nrow(pred)))

:

if (Z31[i,2] == pred[j,5] & Z31[i,3] == pred[j,4] & Z31[i,4] == pred[j,6] & pred[j,1] >= Z31[i,1]){
   temp_cumtt[j]=pred[j,7] + pred[j-1,8]
}

data.frame.

:

 pred$cum_tt = temp_cumtt

, j-1 j 1. .

EDIT:

, , .

1) Date, .

2) Z31 data_frame Date-vector: Z31=Z31[with(Z31, order(-Date)), ] ( , pred [, ] >= Z31 [, ]

3) 1- pred. pred → pred[i,1] , Z31 , . Date , temp_cumtt[i], .

( 48 Z31, .

+1

Subs 06 . '12 2:48

Use data.tablethat should speed things up.

Z31 <- data.table(Z31,key="Site,Cultivar,Planting")
pred <- data.table(pred)

## First, let create an extra column in `pred` to see the corresponding date from `Z31` 
## Note 1: The JT is necessary since both sets have the same column names
## Note 2: I needed to use as.integer on Planting to make it work

pred[,Z31Est.Date:={JT=J(Site,Cultivar,as.integer(Planting)); Z31[JT,Est.Date][[4]]}]

## Now we can see for each row whether the date in `pred` is higher than or equal to that from `Z31`.

pred[,DateTrue:=Date>=Z31Est.Date]

## Finally, we only have to add up `pred[i,tt]` and `pred[i-1,cum_tt]` for each row where `DateTrue` equals `TRUE`.

for (i in 1:nrow(pred)) set(pred,i,13L,if(pred[i,DateTrue]) pred[i-1,cum_tt]+pred[i,tt] else(0))

0

Dirk Jun 06 '12 at 13:14

source share

Tim P · Accepted Answer · 2012-06-06T03:12:46+0000

, ... !

## first make sure we have dates in a suitable format for comparison
## by using strptime, creating the columns estdate_tidy and date_tidy
## in Z31 and pred respectively

Z31$estdate_tidy = strptime(as.character(Z31$Est.Date), "%d/%m/%Y")
pred$date_tidy = strptime(as.character(pred$Date), "%d/%m/%Y")

## now map the estdate_tidy column over to pred using the match command -
## Z31_m and pred_m are dummy variables that hopefully make this clear

Z31_m = paste(Z31$Site, Z31$Cultivar, Z31$Planting)
pred_m = paste(pred$Site, pred$Cultivar, pred$Planting)
pred$estdate_tidy = Z31$estdate_tidy[match(pred_m, Z31_m)]

## then define a ttfilter column that copies tt, except for being 0 when
## estdate_tidy is after date_tidy (think this is what you described)

pred$ttfilter = ifelse(pred$date_tidy >= pred$estdate_tidy, pred$tt, 0)

## finally use cumsum function to sum this new column up (looks like you
## wanted the answer in cum_tt so I've put it there)

pred$cum_tt = cumsum(pred$ttfilter)

, :)

(7 ):

- , cumsum (Site/Cultivar/Planting) - :

Z31$Lookup=with(Z31,paste(Site,Cultivar,Planting,sep="~"))
Z31$LookupNum=match(Z31$Lookup,unique(Z31$Lookup))
pred$Lookup=with(pred,paste(Site,Cultivar,Planting,sep="~"))
pred$LookupNum=match(pred$Lookup,unique(pred$Lookup))

pred$Est.Date = Z31$Est.Date[match(pred$Lookup,Z31$Lookup)]
pred$ttValid = (pred$Date>=pred$Est.Date)
pred$ttFiltered = ifelse(pred$ttValid, pred$tt, 0)

### now fill in cumsum of ttFiltered separately for each LookupNum
pred$cum_tt_Z31 = as.vector(unlist(tapply(pred$ttFiltered,
                                          pred$LookupNum,cumsum)))

0,16 , pred$cum_tt_Z31 - :)

, , tapply 48 :

pred$cum_tt_Z31 = rep(NA, nrow(pred))
for (lookup in unique(pred$Lookup)) {
    subs = which(pred$Lookup==lookup)
    pred$cum_tt_Z31[subs] = cumsum(pred$ttFiltered[subs])
}

0,25 , , , , .

, !:)

(8 ):

, 0,16 . , , , 10 000, 2-5, , .

: - - , Z31 pred. , , , .

: Est.Date Z31 pred, "paste'n'match". (, ), , pred ( ). , , , . ~ ( , , ), , - . , - ( "AB", "C", "D" ) , ( "A", "BC", "D" ), - !

: , , (. pred $ttValid) (. pred $ttFiltered). , .

: , pred $cum_tt_Z31, , cumsum , pred $LookupNum, tapply, ( pred $LookupNum). pred $LookupNum - 1s, 2s .. , cumsum, , . , , , ( ).

: , , (, 48) , . " " UPDATE , cumsum-on-groups ( NA), 48 cumsum. , , , , , .

- - , , .

R - need help speeding up the for loop

More articles: