Recoding data in R

I have a huge dataset (1000 × 100000), similar to the following example, whose values I need to recode as numbers.

# Example data: five columns, each sampled from three two-letter codes plus NA.
# stringsAsFactors = TRUE keeps the columns as factors (the data.frame()
# default before R 4.0.0), so unique() reports the levels shown further down.
myd <- data.frame(
  v1 = sample(c("AA", "AB", "BB", NA), 10, replace = TRUE),
  v2 = sample(c("CC", "CG", "GG", NA), 10, replace = TRUE),
  v3 = sample(c("AA", "AT", "TT", NA), 10, replace = TRUE),
  v4 = sample(c("AA", "AT", "TT", NA), 10, replace = TRUE),
  v5 = sample(c("CC", "CA", "AA", NA), 10, replace = TRUE),
  stringsAsFactors = TRUE
)
myd
     v1   v2   v3   v4   v5
1    AB   CC <NA> <NA>   AA
2    AB   CG   TT   TT   AA
3    AA   GG   AT   AT   CA
4  <NA> <NA> <NA>   AT <NA>
5    AA <NA>   AA <NA>   CA
6    BB <NA>   TT   TT   CC
7    AA   GG   AA   AT   CA
8  <NA>   GG <NA>   AT   CA
9    AA <NA>   AT <NA>   CC
10   AA   GG   TT   AA   CC

Each variable has potentially four unique values.

unique(myd$v1)

[1] AB   AA   <NA> BB  
Levels: AA AB BB

unique(myd$v2)

[1] CC   CG   GG   <NA>
  Levels: CC CG GG

Such unique values can be any combination, but each variable is built from two letters (apart from NA). For example, "A" and "B" in the first case give the combinations "AA", "AB", "BB". The numerical codes for them will be 1, 0, -1, respectively. Similarly, for the second variable, "C" and "G" give "CC", "CG", "GG", so the numeric codes will again be 1, 0, -1, respectively. Thus, the above myd needs to be recoded to:

 myd
         v1   v2   v3    v4      v5
    1    0   1     <NA>  <NA>    1
    2    0   0     -1    -1      1
    3    1   -1     0    0       0
    4  <NA>  <NA>  <NA>   0     <NA>
    5    1  <NA>    1  <NA>       0
    6   -1  <NA>    -1    -1      -1
    7    1   -1    1      0        0
    8  <NA>   -1   <NA>   0        0
    9    1  <NA>    0    <NA>     -1
    10   1   -1    -1     1       -1
+5
source share
3

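You can take advantage of the fact that the columns are factors, which are stored as integer codes underneath.

For example: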

> as.numeric(myd$v1)
 [1]  2  2  1 NA  1  3  1 NA  1  1

These codes correspond to the positions in levels():

> levels(myd$v1)
[1] "AA" "AB" "BB"

So 1 == AA, 2 == AB, 3 == BB, and so on.

So, to get the requested coding, convert every column to its integer codes, subtract 2, and multiply by -1:

# data.matrix() returns each column's integer level codes (1, 2, 3) and, unlike
# as.numeric(), also copes with character columns by converting them to factors
# first. Subtracting 2 and flipping the sign maps 1, 2, 3 to 1, 0, -1.
(data.matrix(myd) - 2) * -1
#-----
      v1 v2 v3 v4 v5
 [1,]  0  1 NA NA  1
 [2,]  0  0 -1 -1  1
 [3,]  1 -1  0  0  0
 [4,] NA NA NA  0 NA
 [5,]  1 NA  1 NA  0
 [6,] -1 NA -1 -1 -1
 [7,]  1 -1  1  0  0
 [8,] NA -1 NA  0  0
 [9,]  1 NA  0 NA -1
[10,]  1 -1 -1  1 -1
+7

A few alternative approaches (skip to the data.table section for the by-reference version!)

To turn AA, AB, BB into 1, 0, -1 (and so on), use the integer codes of each factor to index a vector of the new codes. Simple!

# Recode a vector by using its integer level codes as positions in new_codes.
#
# .x         A factor (or a character vector, which is converted to a factor
#            with alphabetically sorted levels first).
# new_codes  Replacement values, one per level, in level order.
#
# Returns a vector the same length as .x; NA entries stay NA.
#
# Codes are assigned by level position, so a column in which one of the
# expected values never occurs has fewer levels and its codes shift.
simple_recode <- function(.x, new_codes) {
  if (is.character(.x)) {
    # as.numeric() on raw strings would yield NA for every element.
    .x <- factor(.x)
  }
  new_codes[as.numeric(.x)]
}

# Run simple_recode() over every column and reassemble the data frame.
as.data.frame(
  lapply(myd, function(column) simple_recode(column, new_codes = 1:-1))
)

factor

Alternatively, call factor() again on each column and supply the new codes as labels:

# Rebuild each column as a factor whose labels are the new codes.
as.data.frame(
  lapply(myd, function(column) factor(column, labels = 1:-1))
)

data.table

For data this large, data.table is worth considering.

library(data.table)
DT <- as.data.table(myd)
# Recode every column (.SD stands for the columns of DT) with simple_recode().
as.data.table(DT[, lapply(.SD, simple_recode, new_codes = 1:-1)])

or, by overwriting the levels attribute,

# Overwrite the 'levels' attribute of every column with the new codes using
# data.table's setattr().
# NOTE(review): setattr() works by reference, so this presumably also relabels
# the columns of DT itself, and the columns remain factors — confirm.
as.data.table(DT[, lapply(.SD, setattr, 'levels', 1:-1)])

or (without the as.data.table() call), column by column:

 # Walk over the column names of DT and overwrite each column's 'levels'
 # attribute with the new codes 1, 0, -1.
 for(name in names(DT)){
    setattr(DT[[name]],'levels',1:-1)
     }

setattr changes the attribute by reference, so nothing is copied.

data.table setattr

# some big data (100 columns, 1e6 rows)
big <- replicate(
  100,
  factor(sample(c("AA", "AB", "BB", NA), 1e6, replace = TRUE)),
  simplify = FALSE
)
bigDT <- as.data.table(big)

# Time the relabelling of every column. Iterate over bigDT rather than big:
# the list returned by replicate() is unnamed, so names(big) is NULL and a
# loop over it would never execute its body.
system.time({
  for (name in names(bigDT)) {
    setattr(bigDT[[name]], "levels", 1:-1)
  }
})

##  user  system elapsed 
##    0        0       0
+8

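Indexing on the LHS with myd[] keeps the data frame structure, so the whole table can be recoded in a single assignment: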

> # data.matrix() gives the level codes 1, 2, 3; indexing c(1, 0, -1) with them
> # maps the first level to 1, the second to 0 and the third to -1, as requested.
> myd[] <- c(1, 0, -1)[data.matrix(myd)]
> myd
   v1 v2 v3 v4 v5
1  NA  0  0  0 -1
2   1 -1  0  0  1
3   0 NA -1  0  0
4  NA  1  1  0  1
5   1  0 -1  1 NA
6   0 NA  0 -1 NA
7  NA  0 -1 NA  1
8   0  0  0  1 -1
9   1 NA -1  1 NA
10  0 -1 -1 NA NA
+4

All Articles