Initializing a device array in CUDA

How to initialize a device array that is allocated with cudaMalloc()?

I tried cudaMemset, but it fails to initialize the array to any value other than 0. The cudaMemset call looks like the code below, where value is set to 5.

cudaMemset(devPtr,value,number_bytes)
+5
source share
2 answers

As you are discovering, cudaMemset works like the standard C library memset. Quoting from the documentation:

cudaError_t cudaMemset  (   void *      devPtr,
                            int         value,
                            size_t      count    
                        )           

Fills the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

So, value is a byte value. If you do something like:

// Illustration of the pitfall: cudaMemset fills BYTES, not ints.
int *devPtr;
cudaMalloc((void **)&devPtr,number_bytes);
const int value = 5;
// Every byte in the buffer is set to 0x05, so each 4-byte int reads back as
// 0x05050505 (84215045), not 5.
cudaMemset(devPtr,value,number_bytes);

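then what you are asking for is that each byte of devPtr be set to 5. If devPtr is an array of integers, the result is that each integer word has the value 84215045 (0x05050505). This is probably not what you had in mind.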

With the runtime API, what you can do is write your own generic kernel to do this. It could be as simple as:

// Fills the first nwords elements of devPtr with val.
//
// Each thread starts at its global 1-D index and advances by the total number
// of launched threads, so any 1-D grid/block configuration (including a
// single thread) covers all nwords elements.
//
//   devPtr  destination array, dereferenced on the device; must hold at
//           least nwords elements of type T
//   val     value copied into every element
//   nwords  number of ELEMENTS of type T to write (not a byte count)
template<typename T>
__global__ void initKernel(T * devPtr, const T val, const size_t nwords)
{
    // Indices are size_t so that they match the type of nwords: int counters
    // would overflow (undefined behaviour) once nwords exceeds INT_MAX, and
    // the 32-bit product blockDim.x * blockIdx.x could wrap.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    size_t tidx = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;

    for(; tidx < nwords; tidx += stride)
        devPtr[tidx] = val;
}

(Standard disclaimer: written in the browser, never compiled, never tested, use at your own risk.)

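Just instantiate the template for the types you need and launch it with a suitable grid and block size, keeping in mind that the last argument is now a word count, not a byte count as in cudaMemset. This is not really any different from what cudaMemset does anyway: that API call results in a kernel launch that is not too different from the one posted above.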

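Alternatively, if you can use the driver API, there are cuMemsetD16 and cuMemsetD32, which do the same thing for 16-bit and 32-bit word types. If you need to set 64-bit or larger types (doubles or vector types, for example), your best option is to use your own kernel.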

+10

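I also needed a solution to this problem, and I did not really understand the other proposed solution. In particular, I did not understand why it iterates over the grid with for(; tidx < nwords; tidx += stride), nor, for that matter, the kernel invocation and the use of word counts.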

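So I wrote a simpler, monolithic kernel and added a stride parameter (incx), so that it can, for example, set the rows or columns of a matrix to any value: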

// Writes `value` to a[0], a[incx], a[2*incx], ... for every such index below n.
//
// One thread handles at most one element: the thread with global 1-D index
// `tid` writes a[tid*incx] when that index is in range. The launch must
// therefore supply at least ceil(n / incx) threads for full coverage.
//
//   a      destination array, dereferenced on the device
//   value  value stored at each selected position
//   n      exclusive upper bound on the element index that may be written
//   incx   distance, in elements, between consecutive written positions
template <typename T>
__global__ void kernelInitializeArray(T* __restrict__ a, const T value, 
   const size_t n, const size_t incx) {
      // Global index is formed in size_t: a 32-bit unsigned product narrowed
      // to int would go negative or wrap for very large launches.
      const size_t tid = static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
      const size_t idx = tid * incx;
      if (idx < n) {
           a[idx] = value;
       }
}

It can be invoked like this:

// Host-side launcher for kernelInitializeArray: sets a[0], a[incx],
// a[2*incx], ... (all such indices below n) to `value`.
//
//   a      destination array passed straight through to the kernel
//   value  value to store
//   n      exclusive upper bound on the element index that may be written
//   incx   distance, in elements, between consecutive written positions
//
// The launch uses BLOCK_SIZE threads per block on the default stream and is
// not synchronised here; launch errors are not reported by this function, so
// callers that need them should query cudaGetLastError() afterwards.
template <typename T>
void deviceInitializeArray(T* a, const T value, const size_t n, const size_t incx) {
      // An empty range needs no launch (a zero-sized grid is an invalid
      // configuration), and incx == 0 would divide by zero below.
      if (n == 0 || incx == 0) {
          return;
      }

      // Number of positions to write is ceil(n / incx), not floor: with
      // n = 513 and incx = 2, index 512 must still get a thread. Written as
      // (n - 1) / incx + 1 so the rounding cannot overflow.
      const size_t element_count = (n - 1) / incx + 1;
      const size_t threads_per_block = static_cast<size_t>(BLOCK_SIZE);
      const size_t number_of_blocks = (element_count - 1) / threads_per_block + 1;

      // Named grid/block rather than gridDim/blockDim to avoid confusion with
      // the device built-ins of the same name.
      dim3 grid(static_cast<unsigned int>(number_of_blocks), 1);
      dim3 block(BLOCK_SIZE, 1);
      kernelInitializeArray<T> <<<grid, block>>>(a, value, n, incx);
}
+1

All Articles