Find sequences of NA values ​​in data.frame rows

I have a huge one data.framewith several NAmeanings in it. It seems that I get problems if many values NAappear sequentially.

Is there an easy way to find those strings that contain values NA, for example. 20 times one after another, but not those where 20 NAvalues ​​stand out?

EDIT (added by agstudy)

In the decision apply, it is used , which is not very effective for the hudge matrix. Therefore, I am editing the solution (adding a tag Rcpp) to request a more efficient solution .

+3
source share
3 answers

anlagous complete.cases, rle:

cons.missings <- 
function(dat,n)
apply(is.na(dat),1,function(x){
  yy <- rle(x)
  any(yy$lengths[yy$values]>n)
})

, :

dat[!cons.missings(dat,20),]

4 :

dat <- as.matrix(t(data.frame(a= c(1,rep(NA,4),5),
           b= c(2,rep(NA,2),1,rep(NA,2)))))

 [,1] [,2] [,3] [,4] [,5] [,6]
a    1   NA   NA   NA   NA    5
b    2   NA   NA    1   NA   NA

dat[!cons.missings(dat,3),]
[1]  2 NA NA  1 NA NA
+3

"Rcpp", , .Call, :

library(inline)

ff = cfunction(sig = c(R_mat = "matrix", R_n = "numeric"), body = '
    SEXP mat, dims, ans;

    PROTECT(mat = coerceVector(R_mat, REALSXP)); //or `as.numeric(.)` in R
    PROTECT(dims = getAttrib(R_mat, R_DimSymbol));
    R_len_t rows = INTEGER(dims)[0], cols = INTEGER(dims)[1];
    R_len_t n = INTEGER(coerceVector(R_n, INTSXP))[0];

    R_len_t *buf = (int *) R_alloc(rows, sizeof(int)), b = 0; //dynamic allocation 
                                                             //of a pointer to store 
                                                             //the indices of rows
                                                          //that match the criterion.
                                                           //a classic usage of this
                                                         //is in do_which (summary.c)

    double *pmat = REAL(mat);  //pointer to the matrix input
    for(int ir = 0; ir < rows; ir++) {
       R_len_t COUNT_CONS_NAS = 0;
       for(int ic = 0; ic < cols; ic++) {
           if(ISNAN(pmat[ir + ic*rows])) { //if NA is found
               COUNT_CONS_NAS++;          //start counting NAs  
               if(COUNT_CONS_NAS == n) break;  //no need to search all columns
           }
           else {
               COUNT_CONS_NAS = 0; //if not NA, counter back to zero 
           }
       }
       if(COUNT_CONS_NAS == n) {  //if the specific row matched the criterion
           buf[b] = ir + 1;   //store its index
           b++;
       }
    }

    PROTECT(ans = allocVector(INTSXP, b));  //allocate a vector with 
                                           //length = No rows that matched criterion
    memcpy(INTEGER(ans), buf, sizeof(int)*b);  //copy rows indices to 
                                               //the pointer of ans

    UNPROTECT(3);

    return(ans);
')


set.seed(11);mat = matrix(sample(c(NA, 0:2), 30, T, prob = c(0.7, 0.1, 0.1, 0.1)), 6)
mat
#     [,1] [,2] [,3] [,4] [,5]
#[1,]   NA   NA    0   NA   NA
#[2,]   NA   NA    2   NA   NA
#[3,]   NA    2    1   NA   NA
#[4,]   NA   NA   NA   NA   NA
#[5,]   NA   NA   NA   NA   NA
#[6,]    0   NA   NA   NA   NA
ff(mat, 3)
#[1] 4 5 6    
mat[-ff(mat, 3),]      
#     [,1] [,2] [,3] [,4] [,5]
#[1,]   NA   NA    0   NA   NA
#[2,]   NA   NA    2   NA   NA
#[3,]   NA    2    1   NA   NA

:

#library(Rcpp) ; sourceCpp("~/ffcpp.cpp")
identical(dat[!cons.missings(dat,3), ], dat[cons_missings(is.na(dat),3), ])
#[1] TRUE
identical(dat[!cons.missings(dat,3), ], dat[-ff(dat, 4), ])
#[1] TRUE
library(microbenchmark)
microbenchmark(dat[!cons.missings(dat,3), ], 
               dat[cons_missings(is.na(dat),3), ],
               dat[-ff(dat, 4), ], times = 10)
#Unit: milliseconds
                                expr         min          lq      median         uq        max neval
       #dat[!cons.missings(dat, 3), ] 3628.960362 3674.531704 3777.270890 3840.79075 3885.58709    10
 #dat[cons_missings(is.na(dat), 3), ] 5256.550903 5267.137257 5325.497516 5365.13947 5554.88023    10
                  #dat[-ff(dat, 4), ]    6.444897    7.749669    8.971304   11.35649   58.94499    10

#the rows that each function will remove
resff <- ff(dat, 4)
rescons.mis <- which(cons.missings(dat,3)) 
rescons_mis <- seq_len(nrow(dat))[-cons_missings(is.na(dat),3)]

sum(resff != rescons.mis)
#[1] 0
sum(resff != rescons_mis)
#[1] 0
sum(rescons_mis != rescons.mis)
#[1] 0
length(resff)
#[1] 5671
length(rescons.mis)
#[1] 5671
length(rescons_mis)
#[1] 5671
+2

I am adding another answer using Rcpp, as the OP uses a large matrix. I am not an Rcpp specialist, so I cannot get a better solution even if I tried to implement efficient rle missings algorithms.

#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
bool maxMissingSequence(IntegerVector x,int n) {

  // Initialise first value
  int lmissings = 1;
  double prev = x[0];
  for(IntegerVector::iterator it = x.begin() + 1; it != x.end(); ++it) {
    if (prev == *it && prev==1)lmissings++;
    if(lmissings >n) break;   // we are OK
    if(*it==0) lmissings =1;  // reset counter
    prev = *it;
  }
  return lmissings >n;
}

// [[Rcpp::export]]
IntegerVector cons_missings(IntegerMatrix Im, int n ){
   IntegerVector res ; 
   int nrows = Im.nrow();
   for (int i = 0; i < nrows; i++)
      if(!maxMissingSequence(Im(i,_),n))
         res.push_back(i+1);
  return res;
}

Benchmarkings

set.seed(2)
N <- 3*1e5
dat <- matrix(sample(c(1,NA),N,replace=TRUE),ncol=5)

cons.missings <- 
function(dat,n)
apply(is.na(dat),1,function(x){
  yy <- rle(x)
  any(yy$lengths[yy$values]>n)
})


identical(dat[!cons.missings(dat,3),],dat[cons_missings(is.na(dat),3),])
[1] TRUE

system.time(dat[!cons.missings(dat,3),])
   user  system elapsed 
   4.24    0.02    4.35 

> system.time(dat[cons_missings(is.na(dat),3),])
   user  system elapsed 
   6.34    0.00    6.48 
0
source

All Articles