, , , - , . CUDA , , , , , . , "" SM GPU, .
- 128 , . GPU Fermi Kepler , .
:
__global__
void UChar2FloatKernel(float *out, unsigned char *in, int nElem)
{
unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
if(i<nElem)
out[i] = (float) in[i];
}
__global__
void UChar2FloatKernel2(float *out,
const unsigned char *in,
int nElem)
{
unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
for(; i<nElem; i+=gridDim.x*blockDim.x) {
out[i] = (float) in[i];
}
}
__global__
void UChar2FloatKernel3(float4 *out,
const uchar4 *in,
int nElem)
{
unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
for(; i<nElem; i+=gridDim.x*blockDim.x) {
uchar4 ival = in[i];
float4 oval = make_float4(ival.x, ival.y, ival.z, ival.w);
out[i] = oval;
}
}
int main(void)
{
const int n = 2 << 20;
unsigned char *a = new unsigned char[n];
for(int i=0; i<n; i++) {
a[i] = i%255;
}
unsigned char *a_;
cudaMalloc((void **)&a_, sizeof(unsigned char) * size_t(n));
float *b_;
cudaMalloc((void **)&b_, sizeof(float) * size_t(n));
cudaMemset(b_, 0, sizeof(float) * size_t(n));
for(int i=0; i<5; i++)
{
dim3 blocksize(512);
dim3 griddize(n/512);
UChar2FloatKernel<<<griddize, blocksize>>>(b_, a_, n);
}
for(int i=0; i<5; i++)
{
dim3 blocksize(512);
dim3 griddize(8);
UChar2FloatKernel2<<<griddize, blocksize>>>(b_, a_, n);
}
for(int i=0; i<5; i++)
{
dim3 blocksize(512);
dim3 griddize(8);
UChar2FloatKernel3<<<griddize, blocksize>>>((float4*)b_, (uchar4*)a_, n/4);
}
cudaDeviceReset();
return 0;
}
:
>nvcc -m32 -Xptxas="-v" -arch=sm_21 cast.cu
cast.cu
tmpxft_000014c4_00000000-5_cast.cudafe1.gpu
tmpxft_000014c4_00000000-10_cast.cudafe2.gpu
cast.cu
ptxas : info : 0 bytes gmem
ptxas : info : Compiling entry function '_Z18UChar2FloatKernel2PfPKhi' for 'sm_2
1'
ptxas : info : Function properties for _Z18UChar2FloatKernel2PfPKhi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 5 registers, 44 bytes cmem[0]
ptxas : info : Compiling entry function '_Z18UChar2FloatKernel3P6float4PK6uchar4
i' for 'sm_21'
ptxas : info : Function properties for _Z18UChar2FloatKernel3P6float4PK6uchar4i
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 8 registers, 44 bytes cmem[0]
ptxas : info : Compiling entry function '_Z17UChar2FloatKernelPfPhi' for 'sm_21'
ptxas : info : Function properties for _Z17UChar2FloatKernelPfPhi
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 3 registers, 44 bytes cmem[0]
tmpxft_000014c4_00000000-5_cast.cudafe1.cpp
tmpxft_000014c4_00000000-15_cast.ii
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
40.20 6.61ms 5 1.32ms 1.32ms 1.32ms UChar2FloatKernel(float*, unsigned char*, int)
29.43 4.84ms 5 968.32us 966.53us 969.46us UChar2FloatKernel2(float*, unsigned char const *, int)
26.35 4.33ms 5 867.00us 866.26us 868.10us UChar2FloatKernel3(float4*, uchar4 const *, int)
4.02 661.34us 1 661.34us 661.34us 661.34us [CUDA memset]
8 4096 , , - , .