CUDA - memcpy2d - wrong step

I just started CUDA programming and tried to execute the code shown below. The idea is to copy a 2-dimensional array to the device, calculate the sum of all the elements and get the sum subsequently (I know that this algorithm is not parallelized. In fact, it does more work and then is needed. As a practice for memcopy).

#include<stdio.h>
#include<cuda.h>
#include <iostream>
#include <cutil_inline.h>

#define height 50
#define width 50

using namespace std;

// Device code
__global__ void kernel(float* devPtr, int pitch,int* sum)
{
int tempsum = 0;    
for (int r = 0; r < height; ++r) {
        int* row = (int*)((char*)devPtr + r * pitch);
        for (int c = 0; c < width; ++c) {
             int element = row[c];
             tempsum = tempsum + element;
        }
    }
*sum = tempsum;
}

//Host Code
int main()
{

int testarray[2][8] = {{4,4,4,4,4,4,4,4},{4,4,4,4,4,4,4,4}};
int* sum =0;
int* sumhost = 0;
sumhost = (int*)malloc(sizeof(int));

cout << *sumhost << endl;

float* devPtr;
size_t pitch;
cudaMallocPitch((void**)&devPtr, &pitch, width * sizeof(int), height);
cudaMemcpy2D(devPtr,pitch,testarray,0,8* sizeof(int),4,cudaMemcpyHostToDevice);

cudaMalloc((void**)&sum, sizeof(int));
kernel<<<1, 4>>>(devPtr, pitch, sum);
cutilCheckMsg("kernel launch failure");
cudaMemcpy(sumhost, sum, sizeof(int), cudaMemcpyDeviceToHost);

cout << *sumhost << endl;

return 0;
}

This code compiles just fine (in a candidate for release 4.0 sdk). However, as soon as I try to execute, I get

0
cpexample.cu(43) : cutilCheckMsg() CUTIL CUDA error : kernel launch failure : invalid pitch argument.

Unfortunately, since I have no idea how to fix this ;-( As far as I know, the step is an offset in memory, which allows you to copy data faster. However, this step is used only in the memory device and not in the host memory, is it ? Therefore, the step of my host memory should be 0, right?

, :

  • int * sumhost (. ), ? cudaMalloc ?
  • cutilCheckMsg . , ?
+3
1

:

cudaMemcpy2D(devPtr,pitch,testarray,0,8* sizeof(int),4,cudaMemcpyHostToDevice);

, testarray 0, , T* elem = (T*)((char*)base_address + row * pitch) + column? 0 , (x, y) . , pitch = width + padding. , 0, 0, . , . , pitch >= width . , , testarray, 8*sizeof(int). , 2D- 2 , 4.

, , malloc(), , . , , . , , cudaMalloc(), , . , , , . , ​​ , , , . CUDA , , / (- -). , int* malloc(), ( , free() ) cudaMalloc(), , , . , , , .

+3

All Articles