I want to change my code from CPP to CUDA, any idea?

I have a problem that I cannot solve.

The problem is as follows.

CPP Code

const int dataSize = 65535;
const int category = 10;
float data[dataSize][category];
const float threshold = 0.5f;

int cnt = 0;

// data array contains any values

for(int i=0;i<dataSize;i++)
{
    if( data[i][9] > threshold )
    {
        data[cnt][0] = data[i][0];
        data[cnt][1] = data[i][1];
        data[cnt][2] = data[i][2];
        data[cnt][3] = data[i][3];
        data[cnt][4] = data[i][4];
        data[cnt][5] = data[i][5];
        data[cnt][6] = data[i][6];
        data[cnt][7] = data[i][7];
        data[cnt][8] = data[i][8];
        data[cnt][9] = data[i][9];
        cnt++;
    }
}

Using this code, I expect the data array element "data" to be collected by a threshold value. (An element that does not exceed the threshold value is not important to me. Only exceeding the threshold value is important.)

I want code that works with the same result in CUDA.

So, I tried to do so.

CUDA Code

__global__ void checkOverThreshold(float *data, float threshold, int *nCount)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if( data[idx*10+9] > threshold )
    {
        data[nCount+0] = data[idx*10+0];
        data[nCount+1] = data[idx*10+1];
        data[nCount+2] = data[idx*10+2];
        data[nCount+3] = data[idx*10+3];
        data[nCount+4] = data[idx*10+4];
        data[nCount+5] = data[idx*10+5];
        data[nCount+6] = data[idx*10+6];
        data[nCount+7] = data[idx*10+7];
        data[nCount+8] = data[idx*10+8];
        data[nCount+9] = data[idx*10+9];
        atomicAdd( nCount, 1);
    }
}

....

// kernel function call
checkOverThreshold<<< dataSize / 128, 128 >>>(d_data, treshold, d_count);

But the result of the CUDA code is not the one I expected.

It contains many values ​​for garbage and even the result does not match the CPP result.

I think the nCount variable synchronization problem makes this situation.

But I have no idea to solve this problem.

Please help my code. Thank you in advance.

+2
3

:

    data[nCount+0] = data[idx*10+0];
    data[nCount+1] = data[idx*10+1];
    data[nCount+2] = data[idx*10+2];
    data[nCount+3] = data[idx*10+3];
    data[nCount+4] = data[idx*10+4];
    data[nCount+5] = data[idx*10+5];
    data[nCount+6] = data[idx*10+6];
    data[nCount+7] = data[idx*10+7];
    data[nCount+8] = data[idx*10+8];
    data[nCount+9] = data[idx*10+9];
    atomicAdd( nCount, 1);

nCount , .

    int d = atomicAdd(nCount, 1);
    data[d+0] = data[idx*10+0];
    data[d+1] = data[idx*10+1];
    data[d+2] = data[idx*10+2];
    data[d+3] = data[idx*10+3];
    data[d+4] = data[idx*10+4];
    data[d+5] = data[idx*10+5];
    data[d+6] = data[idx*10+6];
    data[d+7] = data[idx*10+7];
    data[d+8] = data[idx*10+8];
    data[d+9] = data[idx*10+9];
+3

Stream Compaction Thrust.

,

#include <thrust/copy.h>
// ...
const int dataSize = 65535;
struct Datum {
  float f0, f1, f2, ..., f9;
};
Datum data[dataSize];
const float threshold = 0.5f;

struct below_threshold
{
  __host__ __device__
  bool operator()(const Datum &d)
  {
    return d.f9 <= threshold;
  }
};

// data array is contains any values
Datum *new_end = thrust::remove_if(data, data + N, below_threshold());
int cnt = new_end - data;
// first cnt elements have the f9 term > threshold
// other elements are undefined

:

Thrust, . STL, Boost. , Thrust , , . , , , , . thrust:: device_vector ( , std::vector), Thrust GPU, thrust:: remove_if() . , Thrust, .

+3

As suggested by SchighSchagh, traction will be one way. ArrayFire provides a more mathematical representation of what is happening.

const int dataSize = 65535;
const int category = 10;
float data[dataSize][category];
const float threshold = 0.5f;

int cnt = 0;

// populate data

// Transfer to device
array Data(data, category, dataSize); // Column major
array idx = where(Data(9, span) > threshold);
Data = Data(span, idx);
+1
source

All Articles