Masked Numpy arrays are much slower than regular numpy arrays

I have a function that calculates pair correlation between all pairs of rows in a numpy array. It worked fine, but then I remembered that every time I often have to deal with missing data. I used a masked array to try to solve this problem, but it slows down the calculation a bit. Any ideas around using masked functions. The real bottleneck that I think will be in functionnp.ma.dot. but I added some profiling, I quickly mocked iPython. I have to say that 5000 is at the lower end of the spectrum in terms of the number of rows these arrays will have. Some may have more than 300,000 people. Code with masked arrays looks about 20 times slower than code without, but obviously with missing data, code without masked arrays just produces NaN so often.

First a quick and dirty way to generate some sample data

genotypes = np.empty((5000,200))
for i in xrange(5000):
    genotypes[i] = np.random.binomial(3,.333, size=200)

pValues = np.random.uniform(0,1,5000)

Then check functions

 def testMask(pValsArray, genotypeArray):
    nSNPs = len(pValsArray)-1
    genotypeArray = np.ma.masked_array(genotypeArray, np.isnan(genotypeArray))
    chisq = np.sum(-2 * np.log(pValsArray))
    ms = genotypeArray.mean(axis=1)[(slice(None,None,None),None)]
    datam = genotypeArray - ms
    datass = np.ma.sqrt(np.ma.sum(datam**2, axis=1))
    runningSum = 0
    for i in xrange(nSNPs):
        temp = np.ma.dot(datam[i:],datam[i].T)
        d = (datass[i:]*datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += 3.25 * np.sum(rs) +  .75 * np.dot(rs, rs)
    sigmaSq = 4*nSNPs+2*runningSum
    E = 2*nSNPs
    df = (2*(E*E))/sigmaSq
    runningSum = sigmaSq/(2*E)
    d = chisq/runningSum
    brownsP = stats.chi2.sf(d, df)
    return brownsP

def testNotMask(pValsArray, genotypeArray):
    nSNPs = len(pValsArray)-1
    chisq = np.sum(-2 * np.log(pValsArray))
    ms = genotypeArray.mean(axis=1)[(slice(None,None,None),None)]
    datam = genotypeArray - ms
    datass = np.sqrt(stats.ss(datam, axis=1))
    runningSum = 0
    for i in xrange(nSNPs):
        temp = np.dot(datam[i:],datam[i].T)
        d = (datass[i:]*datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += 3.25 * np.sum(rs) +  .75 * np.dot(rs, rs)
    sigmaSq = 4*nSNPs+2*runningSum
    E = 2*nSNPs
    df = (2*(E*E))/sigmaSq
    runningSum = sigmaSq/(2*E)
    d = chisq/runningSum
    brownsP = stats.chi2.sf(d, df)
    return brownsP

And some time

%timeit testMask(pValues, genotypes)
1 loops, best of 3: 14.3 s per loop

%timeit testNotMask(pValues, genotypes)
1 loops, best of 3: 678 ms per loop

add some missing data and run again:

randis = np.random.randint(0,5000, 10)
randjs = np.random.randint(0,200, 10)

for i,j in zip(randis, randjs):
    genotypes[i,j] = None



%timeit testMask(pValues, genotypes)
1 loops, best of 3: 14.2 s per loop

%timeit testNotMask(pValues, genotypes)
1 loops, best of 3: 654 ms per loop

and some profiling:

%prun

       2559677 function calls in 15.045 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     9791    5.259    0.001    5.259    0.001 {method 'copy' of 'numpy.ndarray' objects}
     4999    2.877    0.001   11.888    0.002 extras.py:949(dot)
     9794    1.566    0.000    1.566    0.000 {numpy.core.multiarray.copyto}
    14997    1.497    0.000    1.564    0.000 {numpy.core._dotblas.dot}
    30007    0.559    0.000    0.559    0.000 {method 'reduce' of 'numpy.ufunc' objects}
    94996    0.450    0.000    0.875    0.000 core.py:2751(_update_from)
   864970    0.347    0.000    0.347    0.000 {getattr}
     5000    0.346    0.000    0.802    0.000 core.py:1065(__call__)
        1    0.240    0.240   15.045   15.045 <ipython-input-115-2aab1c8ea4c5>:1(testMask)
     5000    0.196    0.000    0.196    0.000 core.py:771(__call__)
     5001    0.147    0.000    0.551    0.000 core.py:917(__call__)
    24996    0.143    0.000    0.609    0.000 core.py:2930(__getitem__)
    54998    0.140    0.000    0.775    0.000 core.py:2776(__array_finalize__)
   419985    0.126    0.000    0.126    0.000 {method 'update' of 'dict' objects}
        1    0.093    0.093    0.111    0.111 core.py:5990(power)
   339994    0.077    0.000    0.077    0.000 {isinstance}
    50015    0.072    0.000    0.072    0.000 {numpy.core.multiarray.array}
    60002    0.060    0.000    0.568    0.000 {method 'view' of 'numpy.ndarray' objects}
     5000    0.060    0.000    0.199    0.000 core.py:2626(__new__)
    14999    0.058    0.000    7.412    0.000 core.py:3341(filled)
    25005    0.055    0.000    0.092    0.000 core.py:609(getdata)

EDIT:

I tried perimosocordiae answers, but I still get nans. It seems that the function mean, stats.ssand np.sqrtall care about values nan.

def fastNotMask(pValsArray, genotypeArray):
    nSNPs = len(pValsArray)-1
    chisq = np.sum(-2 * np.log(pValsArray))
    ms = genotypeArray.mean(axis=1)[(slice(None,None,None),None)]
    print ms
    datam = genotypeArray - ms
    print datam
    datass = np.sqrt(stats.ss(datam, axis=1))
    print datass
    runningSum = 0
    for i in xrange(nSNPs):
        temp = np.dot(datam[i:],datam[i].T)
        d = (datass[i:]*datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += 3.25 * np.nansum(rs) +  .75 * np.nansum(rs * rs)
    print runningSum
    sigmaSq = 4*nSNPs+2*runningSum
    E = 2*nSNPs
    df = (2*(E*E))/sigmaSq
    runningSum = sigmaSq/(2*E)
    d = chisq/runningSum
    brownsP = stats.chi2.sf(d, df)
    return brownsP

, nan .

pValues = np.random.uniform(0,1,10)

genotypes = np.empty((10,10))
for i in xrange(10):
    genotypes[i] = np.random.binomial(2,.5, size=10)

randis = np.random.randint(0,10, 2)
randjs = np.random.randint(0,10, 2)

for i,j in zip(randis, randjs):
    genotypes[i,j] = None

print testfastMask(pValues, genotypes)



[[ 1.5]
 [ 1.2]
 [ 0.9]
 [ 1.2]
 [ 1.1]
 [ 0.6]
 [ nan]
 [ 1.1]
 [ nan]
 [ 0.8]]
[[-0.5 -0.5  0.5  0.5 -0.5 -0.5  0.5  0.5 -0.5  0.5]
 [-0.2  0.8 -0.2 -0.2 -0.2 -0.2 -0.2  0.8 -0.2 -0.2]
 [-0.9  0.1 -0.9  1.1  0.1 -0.9  1.1  0.1  0.1  0.1]
 [-0.2 -0.2 -0.2 -0.2  0.8 -0.2 -0.2 -1.2  0.8  0.8]
 [-0.1 -0.1 -0.1 -0.1 -0.1 -0.1  0.9 -0.1  0.9 -1.1]
 [-0.6  1.4  0.4 -0.6 -0.6  0.4 -0.6 -0.6  0.4  0.4]
 [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan]
 [-0.1  0.9 -1.1 -1.1  0.9 -0.1  0.9 -1.1  0.9 -0.1]
 [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan]
 [ 1.2 -0.8 -0.8  0.2 -0.8  0.2  1.2  0.2  0.2 -0.8]]
[ 1.58113883  1.26491106  2.21359436  1.8973666   1.70293864  2.0976177
         nan  2.62678511         nan  2.36643191]
nan
nan

- ? . python 2.7 numpy 1.7.1?

.

+3
1

. numpy <= 1.8, np.nansum([NaN, NaN]) == 0.0 ( FutureWarning). :

tmp = 3.25 * np.nansum(rs) +  .75 * np.nansum(rs * rs)
if not np.isnan(tmp):
  runningSum += tmp

/ tmp np.nansum.


:

, , testNotMask:

runningSum += 3.25 * np.sum(rs) +  .75 * np.dot(rs, rs)

:

runningSum += 3.25 * np.nansum(rs) +  .75 * np.nansum(rs * rs)

NaN, , .

. fastNotMask - testNotMask .

In [63]: genotypes = np.random.binomial(3, .333, size=(5000, 200)).astype(float)

In [64]: pValues = np.random.uniform(0,1,5000)

In [65]: %timeit testMask(pValues, genotypes)
1 loops, best of 3: 11.3 s per loop

In [66]: %timeit testNotMask(pValues, genotypes)
1 loops, best of 3: 3.53 s per loop

In [67]: %timeit fastNotMask(pValues, genotypes)
1 loops, best of 3: 3.96 s per loop

In [68]: randjs = np.random.randint(0,200, 10)

In [69]: randis = np.random.randint(0,5000,10)

In [70]: genotypes[randis,randjs] = None

In [71]: %timeit testMask(pValues, genotypes)
1 loops, best of 3: 33 s per loop

In [72]: %timeit testNotMask(pValues, genotypes)
1 loops, best of 3: 3.6 s per loop

In [73]: %timeit fastNotMask(pValues, genotypes)
1 loops, best of 3: 3.98 s per loop

In [74]: testMask(pValues, genotypes)
Out[74]: 0.47606794747438386

In [75]: testNotMask(pValues, genotypes)
Out[75]: nan

In [76]: fastNotMask(pValues, genotypes)
Out[76]: 0.47613597091679449

, testMask fastNotMask. , , , .

+3

All Articles