I find it hard to understand why my cuda code is slower than my processor code
my desktop configuration is i7 2600S, geforce 560ti
and my code is as follows:
int** kernel_shiftSeam(int **MCEnergyMat, int **newE, int *seam, int width, int height, int direction)
{
float elapsed_time_ms = 0;
cudaEvent_t start, stop;
dim3 threads(16,16);
dim3 blocks((width+threads.x-1)/threads.x, (height+threads.y-1)/threads.y);
int *device_Seam;
int *host_Seam;
int seamSize;
if(direction == 1)
{
seamSize = height*sizeof(int);
host_Seam = (int*)malloc(seamSize);
for(int i=0;i<height;i++)
host_Seam[i] = seam[i];
}
else
{
seamSize = width*sizeof(int);
host_Seam = (int*)malloc(seamSize);
for(int i=0;i<width;i++)
host_Seam[i] = seam[i];
}
cudaMalloc((void**)&device_Seam, seamSize);
cudaMemcpy(device_Seam, host_Seam, seamSize, cudaMemcpyHostToDevice);
global_host_MC = MCEnergyMat;
new_host_MC = newE;
cudaMemcpy(global_MC, global_MC2, sizeof(int*)*width, cudaMemcpyHostToDevice);
for(int i=0;i<width;i++)
cudaMemcpy(global_MC2[i], global_host_MC[i], sizeof(int)*height, cudaMemcpyHostToDevice);
cudaMemcpy(new_MC, new_MC2, sizeof(int*)*width, cudaMemcpyHostToDevice);
for(int i=0;i<width;i++)
cudaMemcpy(new_MC2[i], new_host_MC[i], sizeof(int)*height, cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
gpu_shiftSeam<<< blocks,threads >>>(global_MC, new_MC, device_Seam, width, height);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms, start, stop );
execTime += elapsed_time_ms;
for(int i=0;i<width;i++)
{
cudaMemcpy(newE[i], new_MC2[i], sizeof(int)*height, cudaMemcpyDeviceToHost);
}
return newE;
}
I looped it 800 times and got the following results:
GPU Computing time (part of gpu_shiftseam): 1176 ms Total program execution time: 22 s
Processor Calculation time (same operation as gpu_shiftseam, but on the host): 12522ms Total program execution time: 12 s
-, , ,
- gpu ,
- ? - /,
? "" ?
!