Performing operations on a subset using data.table

I have a survey data set in wide form. For a given question, the raw data contains a set of variables, one per month, reflecting the fact that the survey question was asked in that particular month.

I want to create a new set of variables with month-invariant names; the value of each new variable will be the value of the month-specific version of the question for the month observed.

See example / dummy dataset:

# library() stops with an error if the package is missing; require() would
# only return FALSE and let the script fail later at the first data.table call.
library(data.table)

# Toy survey in wide form: 15 rows (5 per observed month). Every question
# (q1, q2) has one column per month, named "<month>.<question>".
data <- data.table(month = rep(c('may', 'jun', 'jul'),  each = 5),
                   may.q1 = rep(c('yes', 'no', 'yes'),  each = 5),
                   jun.q1 = rep(c('breakfast', 'lunch', 'dinner'),  each = 5),
                   jul.q1 = rep(c('oranges', 'apples', 'oranges'),  each = 5),
                   may.q2 = rep(c('econ', 'math', 'science'), each = 5),
                   jun.q2 = rep(c('sunny', 'foggy', 'cloudy'), each = 5),
                   jul.q2 = rep(c('no rain', 'light mist', 'heavy rain'), each = 5))

There are really only two questions in this survey: "q1" and "q2". Each of these questions was asked repeatedly over several months. However, an observation contains a valid answer only when the month observed in the data matches the month of the survey question.

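For example: "may.q1" is observed as "yes" for every observation in "may". I would like a new variable, "Q1", to represent "may.q1", "jun.q1" and "jul.q1". "Q1" should take the value of "may.q1" when the month is "may", and "Q1" should take the value of "jun.q1" when the month is "jun", and so on.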

If I were to do this by hand with data.table, I would want something like the following:

# Hand-written version for a single month: keep May's rows together with
# May's question columns, then strip the leading "may." so the columns get
# month-invariant names (q1, q2).
mdata <- data[month == "may", c("month", "may.q1", "may.q2"), with = FALSE]
# Anchored sub(): only a leading "may." prefix is removed, never a match in
# the middle of a column name.
setnames(mdata, names(mdata), sub("^may\\.", "", names(mdata)))

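and I would want this repeated for every month, i.e. "by = month".

If I were to use the "plyr" package on a data frame, I would solve it with the following approach: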

# library() fails loudly if plyr is not installed (require() only returns FALSE).
library(plyr)
data <- data.frame(data)

# For each month, keep the month column plus that month's own question
# columns, then strip the "<month>." prefix so every piece has the same
# month-invariant names and ddply can stack the pieces.
mdata <- ddply(data, .(month), function(dfmo) {
    # Literal prefix such as "may."; matching with startsWith() avoids treating
    # the month value as a regular expression or matching it mid-name.
    prefix <- paste0(as.character(dfmo$month[1]), ".")
    own_cols <- names(dfmo)[startsWith(names(dfmo), prefix)]
    # Select the month column by name instead of assuming it is column 1.
    dfmo <- dfmo[, c("month", own_cols), drop = FALSE]
    names(dfmo) <- sub(prefix, "", names(dfmo), fixed = TRUE)
    dfmo
})

Any help doing this with a data.table approach would be greatly appreciated, as my data are large. Thank you.

+5
3

:

# Per month, pull that month's own q1/q2 columns out of .SD. The stacked
# result carries the column names of the first group (may.q1, may.q2).
data[, .SD[, paste0(month, ".q", 1:2), with = FALSE], by = month]

    month  may.q1     may.q2
 1:   may     yes       econ
 2:   may     yes       econ
 3:   may     yes       econ
 4:   may     yes       econ
 5:   may     yes       econ
 6:   jun   lunch      foggy
 7:   jun   lunch      foggy
 8:   jun   lunch      foggy
 9:   jun   lunch      foggy
10:   jun   lunch      foggy
11:   jul oranges heavy rain
12:   jul oranges heavy rain
13:   jul oranges heavy rain
14:   jul oranges heavy rain
15:   jul oranges heavy rain

, ( setnames). , . , , .

+5

: . @MatthewDowle .

Here is a data.table solution.

# 1. Melt to long form (columns: month, ind, values).
# 2. Keep only the rows whose column-name prefix equals the observed month.
# 3. Reduce the column name in `ind` to the bare question id (q1, q2).
# 4. Per month, spread the answers into one column per question.
# `ind` comes out of melt.dt as a factor; drop = TRUE keeps split() from
# returning empty pieces for factor levels that no longer occur (presumably
# the old "<month>.<question>" levels -- verify against your data.table
# version).
dd <- melt.dt(data, id.var = "month")[
    month == sub("\\..*$", "", ind)
][
    , ind := sub("^.*\\.", "", ind)
][
    , split(values, ind, drop = TRUE), by = month
]

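Here melt.dt is a small function I wrote (it could still be improved). It melts a data.table, much like the melt function used alongside plyr (copy/paste the function shown below before trying the code above).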

# Melt a data.table from wide to long form.
#
# DT:     a data.table (anything else stops with an error).
# id.var: character vector naming the identifier column(s) of DT; every
#         other column is treated as a measure column.
#
# Returns the result of DT[, list(<id columns>, ind = ..., values = ...)]:
# the id columns repeated once per measure column, `ind` a factor holding the
# name of the measure column each value came from, and `values` the measure
# columns concatenated with c().
melt.dt <- function(DT, id.var) {
    # Fail early with a clear message instead of an "object not found" error
    # from inside the evaluated j expression.
    stopifnot(inherits(DT, "data.table"),
              is.character(id.var),
              all(id.var %in% names(DT)))
    measure.var <- setdiff(names(DT), id.var)
    n.measure <- length(measure.var)
    # One label per stacked value: each measure name repeated nrow(DT) times,
    # in the same order in which c() concatenates the columns below.
    ind <- rep(measure.var, each = nrow(DT))
    # rep(<id>, times = n.measure): stretch every id column to the stacked
    # length explicitly rather than relying on recycling of shorter columns.
    id.calls <- lapply(id.var, function(v) {
        call("rep", as.name(v), times = n.measure)
    })
    names(id.calls) <- id.var
    # c(<measure 1>, <measure 2>, ...): stack the measure columns end to end.
    stack.call <- as.call(lapply(c("c", measure.var), as.name))
    quoted <- as.call(c(list(as.name("list")),
                        id.calls,
                        list(ind = quote(factor(ind)), values = stack.call)))
    # `quoted` is evaluated as the j expression, so column symbols resolve
    # inside DT; `ind` is the local vector built above (this assumes DT has no
    # column of its own called "ind").
    DT[, eval(quoted)]
}

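The idea: first melt the data.table with id.var = month. All the melted column names now have the form month.question. So, by stripping ".question" from the melted name and comparing what is left with the month column, we can drop all the unnecessary rows. Once that is done, we no longer need the "month." prefix in the melted column "ind", so we use gsub to remove "month." and keep just q1, q2, etc. After this we have to reshape (or cast) the result. This is done by grouping by month and splitting the values column by ind (which is either q1 or q2). So you get 2 columns for each month (which are then stacked together), giving the desired output.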

+3

-

# Same toy survey without the month column: every variable is an answer
# column named "<month>.<question>".
data <- data.table(
  may.q1 = rep(c("yes", "no", "yes"), each = 5),
  jun.q1 = rep(c("breakfast", "lunch", "dinner"), each = 5),
  jul.q1 = rep(c("oranges", "apples", "oranges"), each = 5),
  may.q2 = rep(c("econ", "math", "science"), each = 5),
  jun.q2 = rep(c("sunny", "foggy", "cloudy"), each = 5),
  jul.q2 = rep(c("no rain", "light mist", "heavy rain"), each = 5)
)


# Wide -> long with base reshape(): each column name is split at ".", which
# (per the str() output below) yields one value column per month and puts the
# question id (q1, q2) into the "question" column.
tmp <- reshape(
  data,
  direction = "long",
  varying = 1:6,
  sep = ".",
  timevar = "question"
)

str(tmp)
## Classes ‘data.table’ and 'data.frame':   30 obs. of  5 variables:
##  $ question: chr  "q1" "q1" "q1" "q1" ...
##  $ may     : chr  "yes" "yes" "yes" "yes" ...
##  $ jun     : chr  "breakfast" "breakfast" "breakfast" "breakfast" ...
##  $ jul     : chr  "oranges" "oranges" "oranges" "oranges" ...
##  $ id      : int  1 2 3 4 5 6 7 8 9 10 ...

,

# library() fails loudly if reshape2 is missing (require() only returns FALSE).
library(reshape2)
## remove the id column if you want; it is dropped by name here so the call
## does not depend on id being the last column
res <- melt(
  tmp[, setdiff(names(tmp), "id"), with = FALSE],
  measure.vars = c("may", "jun", "jul"),
  variable.name = "month",
  value.name = "response"
)

str(res)
## 'data.frame':    90 obs. of  3 variables:
##  $ question: chr  "q1" "q1" "q1" "q1" ...
##  $ month   : Factor w/ 3 levels "may","jun","jul": 1 1 1 1 1 1 1 1 1 1 ...
##  $ response: chr  "yes" "yes" "yes" "yes" ...
+1
source

All Articles