How CUDA Threads Work

I have many doubts about how threads are formed and executed.

First, the documentation describes GPU threads as lightweight threads. Suppose I want to multiply two 100*100 matrices. This would require 100*100 threads if each element were calculated by a different thread. However, the specifications of my GPU (NVIDIA GT 640M LE) show two SMs, each of which can only support 2048 threads. How can the rest of the elements be calculated in parallel, given that my GPU cannot support so many threads?

Also consider the basic vector add code. Suppose I call a kernel with 1 block and 64 threads to add two arrays of 100 elements each as follows:

    // Asker's kernel, reproduced exactly as posted. It does not compile as
    // written; the answers below discuss what is wrong with it.
    __global__ void add(int* a,int* b,int* c)
    {
        // i starts out as this thread's index within its block.
        int i = threadIdx.x;
        // Not a valid for statement: C/CUDA requires `for (init; condition; step)`.
        for(i<100)
        {
            c[i] = a[i] + b[i];
        // The next line opens another brace where a closing brace was presumably intended.
        {    
     }

Since only 64 threads were initialized, I assume that 64 elements are added in parallel.

  • How are other elements added?
  • How does the warp scheduler determine which threads will be assigned to add the last 36 items?

My main problem:

I don’t understand how a thread knows which elements it should work on.

+3
source share
3 answers

Your card has compute capability 3.0; see here.

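According to Table 12 of the CUDA C Programming Guide, the figure of 2048 threads you mention is, for your compute capability, the maximum number of resident threads per multiprocessor. This does not mean that you cannot launch more than 2048 threads overall. For example, a few rows above in the same table you can read that the maximum x-dimension of a grid of thread blocks is 2^31-1. This means that it is perfectly legal to launch, for example, a 1d grid of 8192 threads. The reason is that the card switches context between warps, as discussed in this post: What is the context switching mechanism in GPU?.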

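Regarding the second part of your question, your implementation of the add function is conceptually wrong. You are using the index i both as the thread index and as the index of the for loop. A more correct implementation is the following.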

// Element-wise vector addition: each thread adds one pair of elements.
//
// a, b : input arrays
// c    : output array; c[i] receives a[i] + b[i]
//
// The element index is taken from threadIdx.x only (blockIdx is not used), and
// there is no bounds check: every launched thread reads a[i] and b[i] and
// writes c[i] for its own threadIdx.x, so the launch must supply exactly one
// thread per element.
__global__ void add(int* a,int* b,int* c)
{
    // Index of this thread within its block; each thread has its own copy of i.
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

This means the following: each thread will execute the two lines

    int i = threadIdx.x;
    c[i] = a[i] + b[i];

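independently, so that, for example, thread #3 executes them with threadIdx.x equal to 3. In this case, thread #3 works on its own local copy of the variable i, which is set to 3. It then loads a[3] and b[3] from global memory, adds them, and stores the result in c[3]. Accordingly, when you launch the grid, you cannot launch only 64 threads for 100 elements; you need at least 100.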

, . CUDA .

+6

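Here is a program that adds two 4 * 4 matrices in CUDA. It may help you understand how the threads are launched and how they work.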

// Aborts with a file/line diagnostic when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Defined in the kernel part of this file; declared here so main() can launch it.
__global__ void add_matrices(float *ad, float *bd, float *cd, int N);

// Adds two N x N float matrices on the GPU and prints A, B and the result C.
//
// A is filled with its own flat indices and B with the negated indices, so a
// correct run prints a C matrix made entirely of zeros.
//
// Returns 0 on success and 1 if host memory cannot be allocated; any failing
// CUDA call terminates the program through CUDA_CHECK.
int main()
{
    const int N = 4;                            // number of rows and columns
    const int count = N * N;                    // 16 elements per matrix
    const size_t size = sizeof(float) * count;  // bytes per matrix

    // Host copies of A and B, plus C to receive the result from the device.
    float *a = (float*)malloc(size);
    float *b = (float*)malloc(size);
    float *c = (float*)malloc(size);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    // Device copies of the three matrices.
    float *ad = NULL, *bd = NULL, *cd = NULL;
    CUDA_CHECK(cudaMalloc(&ad, size));
    CUDA_CHECK(cudaMalloc(&bd, size));
    CUDA_CHECK(cudaMalloc(&cd, size));

    // Initialize host memory with its own indices (B gets the negated values).
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a[i * N + j] = (float)(i * N + j);
            b[i * N + j] = -(float)(i * N + j);
        }
    }

    // Copy data from host memory to device memory.
    CUDA_CHECK(cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice));

    // Execution configuration: one block of N * N = 16 threads,
    // each thread calculates 1 data element.
    dim3 grid(1, 1, 1);
    dim3 block(count, 1, 1);

    add_matrices<<<grid, block>>>(ad, bd, cd, N);
    CUDA_CHECK(cudaGetLastError());   // a launch does not return an error itself

    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost));

    printf("Matrix A was---\n");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            printf("%f ", a[i * N + j]);
        printf("\n");
    }

    printf("\nMatrix B was---\n");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            printf("%f ", b[i * N + j]);
        printf("\n");
    }

    printf("\nAddition of A and B gives C----\n");
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            printf("%f ", c[i * N + j]);   // if correctly evaluated, all values will be 0
        printf("\n");
    }

    // Deallocate host and device memories.
    CUDA_CHECK(cudaFree(ad));
    CUDA_CHECK(cudaFree(bd));
    CUDA_CHECK(cudaFree(cd));

    free(a);
    free(b);
    free(c);

    return 0;
}

/////Kernel Part

// Element-wise matrix addition: cd[k] = ad[k] + bd[k] for the N * N elements
// of two N x N matrices, each addressed as a flat array.
//
// ad, bd : device pointers to the input matrices (N * N floats each)
// cd     : device pointer to the output matrix (N * N floats)
// N      : number of rows and columns
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index falls past the last element do nothing, so the launch may
// contain more than N * N threads.
__global__ void add_matrices(float *ad,float *bd,float *cd,int N)
{
  // Global index of this thread across all blocks of the 1D grid.
  const int index = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard: surplus threads must not read or write past the end of the arrays.
  if (index < N * N)
  {
    cd[index] = ad[index] + bd[index];
  }
}

16 * 16 .   A B, 16 * 16..

, . , , GPU.

A grid can contain up to 65,535 blocks in each dimension, so a grid can hold a very large number of blocks (65535 * 65535 * 65535).

A block can contain at most 1024 threads; the maximum block dimensions are (1024 * 1024 * 64).

16 * 16 ..

A | 1  2  3  4 |        B | 1  2  3  4 |      C| 1  2  3  4 |
  | 5  6  7  8 |   +      | 5  6  7  8 |   =   | 5  6  7  8 | 
  | 9 10 11 12 |          | 9 10 11 12 |       | 9 10 11 12 |  
  | 13 14 15 16|          | 13 14 15 16|       | 13 14 15 16|

16 .

i.e. A(1,1) + B (1,1) = C(1,1)
     A(1,2) + B (1,2) = C(1,2) 
     .        .          .
     .        .          . 
     A(4,4) + B (4,4) = C(4,4) 

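So 16 additions are needed, and they are independent of each other. A block of (16 * 1 * 1) threads gives 16 threads, so each of the 16 elements gets its own thread.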

With dim3 Grid(1,1,1) and dim3 block(16,1,1), a single block of 16 threads is launched, and each thread calculates one element.

. ( threadIDs, blockDim, blockID). CUDA. , .! cuda...: -)

+1

The for loop here is very bad: the threads with thread ID < 100 would never leave it. For beginners, this could be explained as follows: each thread has a system-provided value that gives the index of the current thread. The current thread takes a value from a, a value from b, and writes the sum to c, so it will be

int i = threadIdx.x;
c[i] = a[i] + b[i];

If you have an array size such as 100 that is not a multiple of the block size (64), then, so that no thread reads or writes out of bounds, do:

int i = threadIdx.x;
    if(i < 100){

        c[i] = a[i] + b[i];
    }

You will have divergence only in the very last block. Perhaps you would like

0
source

All Articles