Cuda Bayer / CFA Demo Example

I wrote a CUDA4 Bayer demo procedure, but it's slower than a single-threaded processor code running on a16core GTS250.
Blocksize (16.16), and the image dims is a multiple of 16, but changing this does not improve it.

Am I doing something obviously stupid?

--------------- calling routine ------------------
uchar4 *d_output;
size_t num_bytes; 

cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);    
cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource);

// Do the conversion, leave the result in the PBO fordisplay
kernel_wrapper( imageWidth, imageHeight, blockSize, gridSize, d_output );

cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);

--------------- cuda -------------------------------
texture<uchar, 2, cudaReadModeElementType> tex;
cudaArray *d_imageArray = 0;

__global__ void convertGRBG(uchar4 *d_output, uint width, uint height)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
    uint i = __umul24(y, width) + x;

    // input is GR/BG output is BGRA
    if ((x < width) && (y < height)) {

        if ( y & 0x01 ) {
            if ( x & 0x01 ) {  
                d_output[i].x =  (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2;  // B                
                d_output[i].y = (tex2D(tex,x,y));     // G in B
                d_output[i].z = (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2;  // R                    
            } else {
                d_output[i].x = (tex2D(tex,x,y));        //B
                d_output[i].y = (tex2D(tex,x+1,y) + tex2D(tex,x-1,y)+tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/4;  // G
                d_output[i].z = (tex2D(tex,x+1,y+1) + tex2D(tex,x+1,y-1)+tex2D(tex,x-1,y+1)+tex2D(tex,x-1,y-1))/4;   // R
            }
        } else {
            if ( x & 0x01 ) {
                 // odd col = R
                d_output[i].y = (tex2D(tex,x+1,y+1) + tex2D(tex,x+1,y-1)+tex2D(tex,x-1,y+1)+tex2D(tex,x-1,y-1))/4;  // B
                d_output[i].z = (tex2D(tex,x,y));        //R
                d_output[i].y = (tex2D(tex,x+1,y) + tex2D(tex,x-1,y)+tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/4;  // G    
            } else {    
                d_output[i].x = (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2;  // B
                d_output[i].y = (tex2D(tex,x,y));               // G  in R               
                d_output[i].z = (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2;  // R                    
            }
        }                                
    }
}



void initTexture(int imageWidth, int imageHeight, uchar *imagedata)
{

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cutilSafeCall( cudaMallocArray(&d_imageArray, &channelDesc, imageWidth, imageHeight) ); 
    uint size = imageWidth * imageHeight * sizeof(uchar);
    cutilSafeCall( cudaMemcpyToArray(d_imageArray, 0, 0, imagedata, size, cudaMemcpyHostToDevice) );
    cutFree(imagedata);

    // bind array to texture reference with point sampling
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = false; 

    cutilSafeCall( cudaBindTextureToArray(tex, d_imageArray) );
}
+4
source share
3 answers

There are no obvious errors in your code, but there are some obvious performance features:

1) for best performance, you should use a texture to go into shared memory - see the SobelFilter SDK sample.

2) , , . .

3) , . Tesla , , , 16x4. (64 )

. SSE2 16 , 5 .

+8

Nvidia, ( ) , 2x2 . .

, ,

dim3 blockSize(16, 16); // for example
dim3 gridSize((width/2) / blockSize.x, (height/2) / blockSize.y);


__global__ void d_convertGRBG(uchar4 *d_output, uint width, uint height)
{
    uint x = 2 * (__umul24(blockIdx.x, blockDim.x) + threadIdx.x);
    uint y = 2 * (__umul24(blockIdx.y, blockDim.y) + threadIdx.y);
    uint i = __umul24(y, width) + x;

    // input is GR/BG output is BGRA
    if ((x < width-1) && (y < height-1)) {
        // x+1, y+1:

        d_output[i+width+1] = make_uchar4( (tex2D(tex,x+2,y+1)+tex2D(tex,x,y+1))/2,  // B                
                                             (tex2D(tex,x+1,y+1)),     // G in B
                                             (tex2D(tex,x+1,y+2)+tex2D(tex,x+1,y))/2,  // R                    
                                             0xff);

        // x, y+1:
        d_output[i+width] =   make_uchar4( (tex2D(tex,x,y+1)),        //B
                                             (tex2D(tex,x+1,y+1) + tex2D(tex,x-1,y+1)+tex2D(tex,x,y+2)+tex2D(tex,x,y))/4,  // G
                                             (tex2D(tex,x+1,y+2) + tex2D(tex,x+1,y)+tex2D(tex,x-1,y+2)+tex2D(tex,x-1,y))/4,   // R
                                             0xff);


        // x+1, y:
        d_output[i+1] =       make_uchar4( (tex2D(tex,x,y-1) + tex2D(tex,x+2,y-1)+tex2D(tex,x,y+1)+tex2D(tex,x+2,y-1))/4,  // B
                                            (tex2D(tex,x+2,y) + tex2D(tex,x,y)+tex2D(tex,x+1,y+1)+tex2D(tex,x+1,y-1))/4,  // G
                                            (tex2D(tex,x+1,y)),        //R
                                            0xff);


        // x, y:
        d_output[i] =         make_uchar4( (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2,  // B
                                             (tex2D(tex,x,y)),               // G  in R           
                                             (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2,  // R                    
                                             0xff);

    }
}
+1

There is a lot of if and else in the code. If you structure the code to eliminate all conditional statements, then you will get a huge performance boost since branching is a performance killer. Indeed, you can delete branches. There are exactly 30 cases that you will need to explicitly indicate. I implemented it on a processor and does not contain any conditional statements. I am thinking of making a blog explaining this. Will be published after its completion.

0
source

All Articles