I ran into a serialization issue in CUDA kernels where parallel execution is expected. I use cudaEvents as markers to track kernel execution.
In my experiments on parallel cores with multiple threads, we noticed that using events in their respective threads forces serialization of parallel cores.
The code below demonstrates this problem. I tested this on two different devices that have simultaneous kernel execution capabilities, listed below:
- Tesla C2070, driver version 4.10, Runtime version 4.10, CUDA 2.0 feature
- Tesla M2090, driver version 4.10, Runtime version 4.10, CUDA 2.0 feature
You can run the program with and without events by changing the macro USE_EVENTS, and you will see the difference due to simultaneous execution and sequential execution.
#include<cuda.h>
#include<pthread.h>
#include<stdio.h>
#include<stdlib.h>
#include<stdint.h>
#define CUDA_SAFE_CALL( call) do { \
cudaError_t err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(-1); \
} } while (0)
__global__ void VecAdd(uint64_t len)
{
volatile int a;
for(uint64_t n = 0 ; n < len ; n ++)
a++;
return ;
}
#define USE_EVENTS
int
main(int argc, char *argv[])
{
cudaStream_t stream[2];
for(int i = 0 ; i < 2 ; i++)
CUDA_SAFE_CALL(cudaStreamCreate(&stream[i]));
#ifdef USE_EVENTS
cudaEvent_t e[4];
CUDA_SAFE_CALL(cudaEventCreate(&e[0]));
CUDA_SAFE_CALL(cudaEventCreate(&e[1]));
CUDA_SAFE_CALL(cudaEventRecord(e[0],stream[0]));
#endif
VecAdd<<<1, 32, 0, stream[0]>>>(0xfffffff);
#ifdef USE_EVENTS
CUDA_SAFE_CALL(cudaEventRecord(e[1],stream[0]));
#endif
#ifdef USE_EVENTS
CUDA_SAFE_CALL(cudaEventCreate(&e[2]));
CUDA_SAFE_CALL(cudaEventCreate(&e[3]));
CUDA_SAFE_CALL(cudaEventRecord(e[2],stream[1]));
#endif
VecAdd<<<1, 32, 0, stream[1]>>>(0xfffffff);
#ifdef USE_EVENTS
CUDA_SAFE_CALL(cudaEventRecord(e[3],stream[1]));
#endif
CUDA_SAFE_CALL(cudaDeviceSynchronize());
for(int i = 0 ; i < 2 ; i++)
CUDA_SAFE_CALL(cudaStreamDestroy(stream[i]));
return 0;
}
Any suggestions on why this might happen and how to get around this serialization would be helpful.
source
share