Simple CUDA Core Optimization

In the process of speeding up the application, I have a very simple kernel that does type casting, as shown below:

__global__ void UChar2FloatKernel(float *out, unsigned char *in, int nElem){
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if(i<nElem)
        out[i] = (float) in[i];
}

Global access to memory is integrated, and in my understanding the use of shared memory will also not be beneficial, since there will be no repeated reading of the same memory. Does anyone have any ideas if there is any optimization that can be done to speed up this kernel. Input and output data are already on the device, so a copy of the device’s memory on the host is not required.

+1
source share
4 answers

, , , - , . CUDA , , , , , . , "" SM GPU, .

- 128 , . GPU Fermi Kepler , .

:

__global__ 
void UChar2FloatKernel(float *out, unsigned char *in, int nElem)
{
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if(i<nElem)
        out[i] = (float) in[i];
}

__global__
void UChar2FloatKernel2(float  *out, 
                const unsigned char *in, 
            int nElem)
{
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;    
    for(; i<nElem; i+=gridDim.x*blockDim.x) {
        out[i] = (float) in[i];
    }
}

__global__
void UChar2FloatKernel3(float4  *out, 
                const uchar4 *in, 
            int nElem)
{
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;    
    for(; i<nElem; i+=gridDim.x*blockDim.x) {
        uchar4 ival = in[i]; // 32 bit load
        float4 oval = make_float4(ival.x, ival.y, ival.z, ival.w);
        out[i] = oval; // 128 bit store
    }
}

int main(void)
{

    const int n = 2 << 20;
    unsigned char *a = new unsigned char[n];

    for(int i=0; i<n; i++) {
        a[i] = i%255;
    }

    unsigned char *a_;
    cudaMalloc((void **)&a_, sizeof(unsigned char) * size_t(n));
    float *b_;
    cudaMalloc((void **)&b_, sizeof(float) * size_t(n));
    cudaMemset(b_, 0, sizeof(float) * size_t(n)); // warmup

    for(int i=0; i<5; i++)
    {
        dim3 blocksize(512);
        dim3 griddize(n/512);
        UChar2FloatKernel<<<griddize, blocksize>>>(b_, a_, n);
    }

    for(int i=0; i<5; i++)
    {
        dim3 blocksize(512);
        dim3 griddize(8); // 4 blocks per SM
        UChar2FloatKernel2<<<griddize, blocksize>>>(b_, a_, n);
    }

    for(int i=0; i<5; i++)
    {
        dim3 blocksize(512);
        dim3 griddize(8); // 4 blocks per SM
        UChar2FloatKernel3<<<griddize, blocksize>>>((float4*)b_, (uchar4*)a_, n/4);
    }
    cudaDeviceReset();
    return 0;
}  

:

>nvcc -m32 -Xptxas="-v" -arch=sm_21 cast.cu
cast.cu
tmpxft_000014c4_00000000-5_cast.cudafe1.gpu
tmpxft_000014c4_00000000-10_cast.cudafe2.gpu
cast.cu
ptxas : info : 0 bytes gmem
ptxas : info : Compiling entry function '_Z18UChar2FloatKernel2PfPKhi' for 'sm_2
1'
ptxas : info : Function properties for _Z18UChar2FloatKernel2PfPKhi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 5 registers, 44 bytes cmem[0]
ptxas : info : Compiling entry function '_Z18UChar2FloatKernel3P6float4PK6uchar4
i' for 'sm_21'
ptxas : info : Function properties for _Z18UChar2FloatKernel3P6float4PK6uchar4i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 8 registers, 44 bytes cmem[0]
ptxas : info : Compiling entry function '_Z17UChar2FloatKernelPfPhi' for 'sm_21'

ptxas : info : Function properties for _Z17UChar2FloatKernelPfPhi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 3 registers, 44 bytes cmem[0]
tmpxft_000014c4_00000000-5_cast.cudafe1.cpp
tmpxft_000014c4_00000000-15_cast.ii

>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
 Time(%)      Time   Calls       Avg       Min       Max  Name
   40.20    6.61ms       5    1.32ms    1.32ms    1.32ms  UChar2FloatKernel(float*, unsigned char*, int)
   29.43    4.84ms       5  968.32us  966.53us  969.46us  UChar2FloatKernel2(float*, unsigned char const *, int)
   26.35    4.33ms       5  867.00us  866.26us  868.10us  UChar2FloatKernel3(float4*, uchar4 const *, int)
    4.02  661.34us       1  661.34us  661.34us  661.34us  [CUDA memset]

8 4096 , , - , .

+11

const __restrict__, , . , , , ( , >= 3.5, , ).

__restrict__, .

, DarkZeros.

+1

4 gpu. 3 @talonmies, kernel2, .

// cpu version for comparison
void UChar2Float(unsigned char *a, float *b, const int n){
    for(int i=0;i<n;i++)
        b[i] = (float)a[i];
}

__global__ void UChar2FloatKernel1(float *out, const unsigned char *in, int nElem){
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if(i<nElem)     out[i] = (float) in[i];
}

__global__ void UChar2FloatKernel2(float4  *out, const uchar4 *in, int nElem){
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    if(i<nElem) {
        uchar4 ival = in[i]; // 32 bit load
        float4 oval = make_float4(ival.x, ival.y, ival.z, ival.w);
        out[i] = oval; // 128 bit store
    }
}

__global__ void UChar2FloatKernel3(float  *out, const unsigned char *in, int nElem) {
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    for(; i<nElem; i+=gridDim.x*blockDim.x) 
    {
        out[i] = (float) in[i];
    }
}

__global__ void UChar2FloatKernel4(float4  *out, const uchar4 *in, int nElem) {
    unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    for(; i<nElem; i+=gridDim.x*blockDim.x) 
    {
        uchar4 ival = in[i]; // 32 bit load
        float4 oval = make_float4(ival.x, ival.y, ival.z, ival.w);
        out[i] = oval; // 128 bit store
    }
}

GeForce GT 640 :

simpleKernel (cpu):         0.101463 seconds.
simpleKernel 1 (gpu):       0.007845 seconds.
simpleKernel 2 (gpu):       0.004914 seconds.
simpleKernel 3 (gpu):       0.005461 seconds.
simpleKernel 4 (gpu):       0.005461 seconds.

Thus, we can see that kernel2, which uses only vector types, is the winner. I conducted these tests for (32 * 1024 * 768) elements. The following is the output of nvprof:

Time(%)      Time     Calls       Avg       Min       Max  Name
91.68%  442.45ms         4  110.61ms  107.43ms  119.51ms  [CUDA memcpy DtoH]
3.76%  18.125ms         1  18.125ms  18.125ms  18.125ms  [CUDA memcpy HtoD]
1.43%  6.8959ms         1  6.8959ms  6.8959ms  6.8959ms  UChar2FloatKernel1(float*, unsigned char const *, int)
1.10%  5.3315ms         1  5.3315ms  5.3315ms  5.3315ms  UChar2FloatKernel3(float*, unsigned char const *, int)
1.04%  5.0184ms         1  5.0184ms  5.0184ms  5.0184ms  UChar2FloatKernel4(float4*, uchar4 const *, int)
0.99%  4.7816ms         1  4.7816ms  4.7816ms  4.7816ms  UChar2FloatKernel2(float4*, uchar4 const *, int)
+1
source

It is better to write a vectorized version of your code, immediately setting float4. this should be pretty simple if nElem is a 4x border, otherwise you might need to consider leftovers.

0
source

All Articles