Winsorize dataframe

I want to do winsorization in a dataframe like this:

event_date  beta_before     beta_after
2000-05-05  1.2911707054    1.3215648954
1999-03-30  0.5089734305    0.4269575657
2000-05-05  0.5414700258    0.5326762272
2000-02-09  1.5491034852    1.2839988507
1999-03-30  1.9380674599    1.6169735009
1999-03-30  1.3109909155    1.4468207148
2000-05-05  1.2576420753    1.3659492507
1999-03-30  1.4393018341    0.7417777965
2000-05-05  0.2624037804    0.3860641307
2000-05-05  0.5532216441    0.2618245169
2000-02-08  2.6642931822    2.3815576738
2000-02-09  2.3007578964    2.2626960407
2001-08-14  3.2681270302    2.1611010935
2000-02-08  2.2509121123    2.9481325199
2000-09-20  0.6624503316    0.947935581
2006-09-26  0.6431111805    0.8745333151

By winsorization I mean finding the max and min of, for example, beta_before. That value should be replaced by the second-highest or second-lowest value in the same column, without losing the rest of the details in the observation. For instance, in this case the max of beta_before is 3.2681270302, and it should be replaced by 2.6642931822. The same process applies to the min, and then to the beta_after variable. Therefore only 2 values per column will change, the highest and the lowest; the rest will remain unchanged.

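Any advice? I tried several approaches with plyr, but none did what I wanted. I would like to create 2 new variables, for instance beta_before_winsorized and beta_after_winsorized.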

+3
5

I would write a function to do this, for example:

#' Winsorize the single lowest and single highest value of a vector
#'
#' Replaces the first occurrence of the minimum with the second-smallest
#' observed value, and the first occurrence of the maximum with the
#' second-largest observed value. Every other element is returned untouched.
#'
#' Missing values are ignored when ranking and are left in place. Inputs with
#' fewer than three observed (non-`NA`) values are returned unchanged, because
#' there is no interior value to pull the extremes towards.
#'
#' @param x A numeric vector.
#' @return A vector of the same length as `x`.
winsorize <- function(x) {
  # sort() drops NAs, so positions in `ranked` refer to observed values only.
  ranked <- sort(x)
  n_obs <- length(ranked)
  if (n_obs < 3L) {
    return(x)
  }
  # which.min()/which.max() skip NAs and report only the first extreme.
  # Both positions are located before anything is overwritten.
  lowest <- which.min(x)
  highest <- which.max(x)
  x[lowest] <- ranked[2L]
  x[highest] <- ranked[n_obs - 1L]
  x
}

Then, with your data in dat, we can winsorize the data as you want via:

# Work on a copy so the original `dat` is left intact.
dat2 <- dat
# Winsorize every column except the first. List-style indexing (`dat[-1]`)
# always yields a data frame, even when only one column remains, and lapply()
# keeps each column's own type instead of coercing everything through a
# matrix the way sapply() does.
dat2[-1] <- lapply(dat[-1], winsorize)

Which gives:

R> dat2
   event_date beta_before beta_after
1  2000-05-05   1.2911707  1.3215649
2  1999-03-30   0.5089734  0.4269576
3  2000-05-05   0.5414700  0.5326762
4  2000-02-09   1.5491035  1.2839989
5  1999-03-30   1.9380675  1.6169735
6  1999-03-30   1.3109909  1.4468207
7  2000-05-05   1.2576421  1.3659493
8  1999-03-30   1.4393018  0.7417778
9  2000-05-05   0.5089734  0.3860641
10 2000-05-05   0.5532216  0.3860641
11 2000-02-08   2.6642932  2.3815577
12 2000-02-09   2.3007579  2.2626960
13 2001-08-14   2.6642932  2.1611011
14 2000-02-08   2.2509121  2.3815577
15 2000-09-20   0.6624503  0.9479356
16 2006-09-26   0.6431112  0.8745333

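Note that I was not sure which value you expected to replace the max of beta_before; in the data supplied the second-highest value is 2.6642932, so that is what is used here, which differs from the value stated in the question.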

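Also, be careful about ties: which.min() and which.max() return only the first position of the extreme value. If ties matter, something like this handles them: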

#' Winsorize all tied minima and maxima of a vector
#'
#' Every element equal to the minimum is replaced by the next-larger observed
#' value, and every element equal to the maximum is replaced by the
#' next-smaller observed value.
#'
#' Missing values are ignored when ranking and are left in place. A vector
#' whose observed values are all identical (or that has no observed values)
#' is returned unchanged.
#'
#' @param x A numeric vector.
#' @return A vector of the same length as `x`.
winsorize2 <- function(x) {
  # Operate on the observed values only; NAs would otherwise make min()/max()
  # NA and turn the whole function into a silent no-op.
  present <- !is.na(x)
  vals <- x[present]
  n_obs <- length(vals)
  if (n_obs == 0L) {
    return(x)
  }
  at_min <- which(vals == min(vals))
  # Constant input: there is no next-larger value to substitute.
  if (length(at_min) == n_obs) {
    return(x)
  }
  at_max <- which(vals == max(vals))
  rank_idx <- order(vals)
  vals[at_min] <- vals[rank_idx][length(at_min) + 1L]
  # The upper replacement is read after the lower one has been written, so
  # when only two distinct values exist every element ends up at the larger
  # value.
  vals[at_max] <- vals[rank_idx][n_obs - length(at_max)]
  x[present] <- vals
  x
}

( ).

+5

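I thought winsorizing usually finds the value x% (typically 10%, 15% or 20%) from the bottom of the ordered list and replaces all values below it with that value, and does the same at the top. Here you are only replacing the single top and bottom value, whereas winsorizing usually involves specifying a percentage of values at each end to replace.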

+7

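Just a heads-up: the "winsorization" described in the question is not the usual definition (which works on percentiles rather than on single values). In R there is the winsor function in the psych package. For example: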

# Winsorize both beta columns in place with psych::winsor, using the same
# trim fraction (0.0625) for each.
dat[["beta_before"]] <- psych::winsor(dat[["beta_before"]], trim = 0.0625)
dat[["beta_after"]] <- psych::winsor(dat[["beta_after"]], trim = 0.0625)

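I chose trim = 0.0625 (the 6.25th and 93.75th percentiles) because there are only 16 data points and you want to "rein in" just the top and bottom one: 1/16 = 0.0625.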

, , , : n- .

+2

The statar package provides a winsorize function. From its readme:

# Example quoted from the statar package readme.
# NOTE(review): `winsorize` here is presumably statar's function (the calls
# below pass `replace`, `probs` and `cutpoints`), not the one-argument helper
# of the same name defined earlier on this page. Confirm statar is attached.
# winsorize (default based on 5 x interquartile range)
v <- c(1:4, 99)
# Default call with no extra arguments.
winsorize(v)
# Same input with `replace = NA`; presumably marks outliers as NA instead of
# clamping them. TODO confirm against the statar documentation.
winsorize(v, replace = NA)
# Cut-offs given as probabilities; presumably the 1st and 99th percentiles.
# TODO confirm.
winsorize(v, probs = c(0.01, 0.99))
# Cut-offs given directly as values (1 and 50).
winsorize(v, cutpoints = c(1, 50))

https://github.com/matthieugomez/statar

+1
source

Following up on my previous point about actually replacing the values to be trimmed with the value at the trim position:

#' Winsorize a sample by a trim fraction
#'
#' Mirrors the argument handling of `mean(x, trim = ...)`, but instead of
#' dropping the trimmed observations it replaces them with the value found at
#' the trim position. The result keeps the original element order.
#'
#' @param x A numeric or logical vector. Complex input is only accepted when
#'   `trim` is 0 (or `x` is empty), in which case `x` is returned as is.
#' @param trim Fraction (0 to just under 1) of observations to winsorize at
#'   each end. Values of 0 or less return `x` unchanged.
#' @param na.rm If `TRUE`, `NA`s are removed before winsorizing (the result
#'   is then shorter than the input). If `FALSE` and `x` contains `NA` while
#'   `trim > 0`, `NA_real_` is returned.
#' @param ... Ignored; accepted for call compatibility.
#' @return The winsorized vector, or `NA_real_` (with a warning) when `x` is
#'   not numeric, complex or logical.
winsorized.sample <- function(x, trim = 0, na.rm = FALSE, ...) {
  if (!is.numeric(x) && !is.complex(x) && !is.logical(x)) {
    warning("argument is not numeric or logical: returning NA")
    return(NA_real_)
  }
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  if (!is.numeric(trim) || length(trim) != 1L) {
    stop("'trim' must be numeric of length one")
  }
  n <- length(x)
  if (trim > 0 && n > 0L) {
    if (is.complex(x)) {
      stop("trimmed sample is not defined for complex data")
    }
    if (anyNA(x)) {
      return(NA_real_)
    }
    # lo would exceed n for trim >= 1, so fail with a clear message instead
    # of an obscure indexing error.
    if (trim >= 1) {
      stop("'trim' must be less than 1")
    }
    if (trim >= 0.5) {
      warning("trim >= 0.5 is odd...trying it anyway")
    }
    lo <- floor(n * trim) + 1
    hi <- n + 1 - lo
    # Rank the values, overwrite both tails with the value sitting at the
    # trim position of the *sorted* sample, then scatter the result back to
    # the original positions. order() is stable, so ties keep input order.
    ord <- order(x)
    ranked <- x[ord]
    ranked[seq_len(lo)] <- ranked[lo]
    ranked[hi:n] <- ranked[hi]
    x[ord] <- ranked
  }
  x
}
#test it
# Small sample with repeated low values and two large values (20 and 40).
mydist<-c(1,20,1,5,2,40,5,2,6,1,5)
# Winsorize with trim = 0.2 and keep the result next to the input.
mydist2<-winsorized.sample(mydist, trim=.2)
# Auto-print both vectors for a visual comparison.
mydist
mydist2
# NOTE(review): descStat() is not defined anywhere in this file; presumably a
# descriptive-statistics helper from an attached package. Confirm.
descStat(mydist)
descStat(mydist2)
0
source

All Articles