4 * 4 CUDA. , .
int main()
{
int *a, *b, *c;
int *ad, *bd, *cd;
int N =16;
size_t size=sizeof(float)* N * N;
a=(float*)malloc(size);
b=(float*)malloc(size);
cudaMalloc(&ad,size);
cudaMalloc(&bd,size);
cudaMalloc(&cd,size);
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
a[i * N + j]=(float)(i * N + j);
b[i * N + j]= -(float)(i * N + j);
}
}
cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice);
dim3 grid (1, 1, 1);
dim3 block (16, 1, 1);
add_matrices<<<grid, block>>>(ad, bd, cd, N);
cudaMemcpy(c,cd,size,cudaMemcpyDeviceToHost);
printf("Matrix A was---\n");
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
printf("%f ",a[i*N+j]);
printf("\n");
}
printf("\nMatrix B was---\n");
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
printf("%f ",b[i*N+j]);
printf("\n");
}
printf("\nAddition of A and B gives C----\n");
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
printf("%f ",c[i*N+j]);
printf("\n");
}
cudaFree(ad);
cudaFree(bd);
cudaFree (cd);
free(a);
free(b);
free(c);
getch();
return 1;
}
__global__ void add_matrices(float *ad,float *bd,float *cd,int N)
{
int index;
index = blockIDx.x * blockDim.x + threadIDx.x
cd[index] = ad[index] + bd[index];
}
16 * 16 .
A B, 16 * 16..
, .
, , GPU.
, .
65,535 , . (65535 * 65535 * 65535).
1024 . (1024 * 1024 * 64)
16 * 16 ..
A | 1 2 3 4 | B | 1 2 3 4 | C| 1 2 3 4 |
| 5 6 7 8 | + | 5 6 7 8 | = | 5 6 7 8 |
| 9 10 11 12 | | 9 10 11 12 | | 9 10 11 12 |
| 13 14 15 16| | 13 14 15 16| | 13 14 15 16|
16 .
i.e. A(1,1) + B (1,1) = C(1,1)
A(1,2) + B (1,2) = C(1,2)
. . .
. . .
A(4,4) + B (4,4) = C(4,4)
.
16 .
(16 * 1 * 1)
16, 16 .
dim3 Grid(1,1,1),
dim3 block(16,1,1), 16 , .
.
( threadIDs, blockDim, blockID). CUDA. , .! cuda...: -)