SSE code for sum bytes. Where is the mistake?

I wrote SSE code to sum byte values. (VS2005.)

As simple as that, it works quite well (and fast). Just crashing with some array sizes. And it crashes only in release mode - never in debugging. Maybe someone sees an “obvious” error? Any help was appreciated.

__int64 Sum (const unsigned char* pData, const unsigned int& nLength)
{
    __int64 nSum (0);

    __m128i* pp = (__m128i*)pData;

    ATLASSERT( ( (DWORD)pp & 15 ) == 0 ); // pointer must point to address multiple of 16 (cache line)

    __m128i zero = _mm_setzero_si128(),
        a, b, c, d, tmp;

    unsigned int i (0);

    for ( ; i < nLength; i+=64) // 4-fach loop-unroll (x 16)
    {
        a = _mm_sad_epu8( *(pp++), zero);           
        b = _mm_sad_epu8( *(pp++), zero);  // It crashes here.
        c = _mm_sad_epu8( *(pp++), zero);
        d = _mm_sad_epu8( *(pp++), zero);

        // commenting the following line prevents the crash (???)
        tmp = _mm_add_epi64( _mm_add_epi64( _mm_add_epi64( a, b ), c ), d);

        a = _mm_srli_si128 ( tmp, 8 );

        nSum += _mm_cvtsi128_si32( a ) + _mm_cvtsi128_si32( tmp );
    }

    // ... the rest
    if (nLength % 64)
        for (i -= 64; i < nLength; i++)
            nSum += pData [i];

    return nSum;
}

The function is called as follows:

unsigned int nLength = 3571653;  // One of the values that causes crash
unsigned char *pData = (unsigned char*) _aligned_malloc(nLength, 16);
Sum (pData,  nLength);
+5
source share
3 answers

As requested, this is what I had in mind when I said "4 batteries in a cycle and summed them after the cycle."

__int64 Sum (const unsigned char* pData, const int& nLength)
{
    __int64 nSum (0);

    __m128i* pp = (__m128i*)pData;


    __m128i zero = _mm_setzero_si128(),
        a = _mm_setzero_si128(),
        b = _mm_setzero_si128(),
        c = _mm_setzero_si128(),
        d = _mm_setzero_si128(), tmp;

    int i (0);

    for ( ; i < (nLength - 63); i+=64)
    {
        a = _mm_add_epi64( _mm_sad_epu8( *(pp++), zero ), a );
        b = _mm_add_epi64( _mm_sad_epu8( *(pp++), zero ), b );
        c = _mm_add_epi64( _mm_sad_epu8( *(pp++), zero ), c );
        d = _mm_add_epi64( _mm_sad_epu8( *(pp++), zero ), d );
    }

    tmp = _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ));
    tmp = _mm_add_epi64( _mm_srli_si128( tmp, 8 ), tmp );
    nSum = (_mm_cvtsi128_si32( tmp ) & 0xFFFFFFFFULL) + 
               (((__int64)_mm_cvtsi128_si32( _mm_srli_si128( tmp, 4 ) )) << 32);

    // ... the rest
    for (; i < nLength; i++)
        nSum += pData [i];

    return nSum;
}
+4
source

Your for loop should be defined as follows:

for ( ; i < (nLength - 63); i+=64)

, nLength 120. . 64. < 120, . , 128 undefined. (0xC0000005), .

nLength = 128, . = 64. 65, . 128 . , == nLength. :)

+6

, harold answer - , . , , , , , , ?

0
source

All Articles