CUDA: the correctness of the loop index

This core does the right thing, giving me the right result. My problem is the correctness of the while loop if I want to improve performance. I tried several configurations of blocks and threads, but if I am going to change them, the while loop will not give me the correct result. The results obtained by changing the kernel configuration are that firstArray and secondArray will not be completely filled (they will have 0 inside the cells). Both arrays must be filled with curValue obtained from the if loop.

Any tips are welcome :)

Thank you in advance

#define N 65536

__global__ void whileLoop(int* firstArray_device, int* secondArray_device)
{   
    int curValue = 0;
    int curIndex = 1;

    int i = (threadIdx.x)+2;

    while(i < N) {
        if (i % curIndex == 0) {
            curValue = curValue + curIndex;
            curIndex *= 2;
        }
        firstArray_device[i] = curValue;
        secondArray_device[i] = curValue;
        i += blockDim.x * gridDim.x;
    }
}

int main(){

  firstArray_host[0] = 0;
  firstArray_host[1] = 1;

  secondArray_host[0] = 0;
  secondArray_host[1] = 1;


  // memory allocation + copy on GPU

  // definition number of blocks and threads
  dim3 dimBlock(1, 1);
  dim3 dimGrid(1, 1);

  whileLoop<<<dimGrid, dimBlock>>>(firstArray_device, secondArray_device);

  // copy back to CPU + free memory
}
+3
source share
3 answers

, - . curValue curIndex while . , , .

, , while , . , , threadIdx, blockDim, gridDim...

. . .

+3

:

  • , . .
  • . , , ( , ), , , - , .
  • , , , ... . ( SMP (. )...

:

  • int i = (threadIdx.x)+2;... 2 ; 2 3 .. , , (0, 1), . (, 0 C.)

  • , (, 2 ), (, 2 b x 1 t → b1t1: 2, b1t2: 2), , . - int i = threadIdx.x + blockDim.x * blockIdx.x; , .

  • i += blockDim.x * gridDim.x; , , # , , .

  • GPU ? , , .

1 2, , , , - , .

+1

, , . , , , , . . , , . .

:

:

__global__ void serial(int* array)
{
  int j(0);
  for (int i(0); i < 1024; ++i) {
    array[i] = j;
    j += 5;
}

int main() {
  dim3 dimBlock(1);
  dim3 dimGrid(1);
  serial<<<dimGrid, dimBlock>>>(array);
}

Parallel:

__global__ void parallel(int* array)
{
  int i(threadIdx.x + blockDim.x * blockIdx.x);
  int j(i * 5);
  array[i] = j;
}

int main(){
  dim3 dimBlock(256);
  dim3 dimGrid(1024 / 256);
  parallel<<<dimGrid, dimBlock>>>(array);
}
+1

All Articles