, .
NVIDIA " ", . , sum_re sum_im, (, , ), , no-op. 15- , , , . , , .
, , , , , - , 1,5 , ( ) 30- .
, , , , , :
#include <iostream>
#include <OpenCL/opencl.h>
size_t source_size;
const char * source_str =
"kernel void ndft( \n" \
" global float *re, global float *im, int num_values, \n" \
" global float *spectrum_re, global float *spectrum_im, \n" \
" global float *spectrum_abs, \n" \
" global float *sin_array, global float *cos_array, \n" \
" float sqrt_num_values_reciprocal) \n" \
"{ \n" \
" // MATH MAGIC - DISREGARD FROM HERE ----------- \n" \
" \n" \
" float x; \n" \
" float y; \n" \
" float sum_re = 0; \n" \
" float sum_im = 0; \n" \
" \n" \
" size_t thread_id = get_global_id(0); \n" \
" \n" \
" for (int i = 0; i < num_values; i++) \n" \
" { \n" \
" x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal; \n" \
" y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal; \n" \
" sum_re += re[i] * x + im[i] * y; \n" \
" sum_im -= re[i] * y + x * im[i]; \n" \
" } \n" \
" \n" \
" // MATH MAGIC DONE ---------------------------- \n" \
" \n" \
" //spectrum_re[thread_id] = sum_re; \n" \
" //spectrum_im[thread_id] = sum_im; \n" \
" //spectrum_abs[thread_id] = hypot(sum_re, sum_im); \n" \
"} \n";
int main(void)
{
int err;
cl_device_id device_id;
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err);
err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_uint program_num_devices;
clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &program_num_devices, NULL);
size_t * binaries_sizes = new size_t[program_num_devices];
clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, program_num_devices*sizeof(size_t), binaries_sizes, NULL);
char **binaries = new char*[program_num_devices];
for (size_t i = 0; i < program_num_devices; i++)
binaries[i] = new char[binaries_sizes[i]+1];
clGetProgramInfo(program, CL_PROGRAM_BINARIES, program_num_devices*sizeof(size_t), binaries, NULL);
for (size_t i = 0; i < program_num_devices; i++)
{
binaries[i][binaries_sizes[i]] = '\0';
std::cout << "Program " << i << ":" << std::endl;
std::cout << binaries[i];
}
return 0;
}
PTX OpenCL:
Program 0:
bplist00?^clBinaryDriver\clBinaryData_clBinaryVersionWCLH 1.0O!.version 1.5
.target sm_12
.target texmode_independent
.reg .b32 r<126>;
.reg .b64 x<126>;
.reg .b32 f<128>;
.reg .pred p<32>;
.reg .u32 sp;
.reg .b8 wb0,wb1,wb2,wb3;
.reg .b16 ws0,ws1,ws2,ws3;
.reg .b32 tb0,tb1,tb2,tb3;
.reg .b64 vl0,vl1;
.reg .b16 cvt16_0,cvt16_1;
.const .align 1 .b8 ndft_gid_base[52];
.local .align 16 .b8 ndft_stack[8];
.entry ndft(
.param.b32 ndft_0 ,
.param.b32 ndft_1 ,
.param.b32 ndft_2 ,
.param.b32 ndft_3 ,
.param.b32 ndft_4 ,
.param.b32 ndft_5 ,
.param.b32 ndft_6 ,
.param.b32 ndft_7 ,
.param.f32 ndft_8
) {
mov.u32 sp, ndft_stack;
mov.u32 r0, 4294967295;
ld.param.u32 r1, [ndft_2 + 0];
LBB1_1:
add.u32 r0, r0, 1;
setp.lt.s32 p0, r0, r1;
@p0 bra LBB1_1;
LBB1_2:
ret;
}
. , . , , :
Program 0:
S.version 1.5inaryDriver\clBinaryData_clBinaryVersionWCLH 1.0O
.target sm_12
.target texmode_independent
.reg .b32 r<126>;
.reg .b64 x<126>;
.reg .b32 f<128>;
.reg .pred p<32>;
.reg .u32 sp;
.reg .b8 wb0,wb1,wb2,wb3;
.reg .b16 ws0,ws1,ws2,ws3;
.reg .b32 tb0,tb1,tb2,tb3;
.reg .b64 vl0,vl1;
.reg .b16 cvt16_0,cvt16_1;
.const .align 1 .b8 ndft_gid_base[52];
.local .align 16 .b8 ndft_stack[8];
.entry ndft(
.param.b32 ndft_0 ,
.param.b32 ndft_1 ,
.param.b32 ndft_2 ,
.param.b32 ndft_3 ,
.param.b32 ndft_4 ,
.param.b32 ndft_5 ,
.param.b32 ndft_6 ,
.param.b32 ndft_7 ,
.param.f32 ndft_8
) {
mov.u32 sp, ndft_stack;
cvt.u32.u16 r0, %tid.x;
cvt.u32.u16 r1, %ntid.x;
cvt.u32.u16 r2, %ctaid.x;
mad24.lo.u32 r0, r2, r1, r0;
mov.u32 r1, 0;
shl.b32 r2, r1, 2;
mov.u32 r3, ndft_gid_base;
add.u32 r2, r2, r3;
ld.const.u32 r2, [r2 + 40];
add.u32 r0, r0, r2;
ld.param.u32 r2, [ndft_2 + 0];
mul.lo.u32 r3, r0, r2;
shl.b32 r3, r3, 2;
mov.f32 f0, 0f00000000 ;
ld.param.f32 f1, [ndft_8 + 0];
ld.param.u32 r4, [ndft_7 + 0];
ld.param.u32 r5, [ndft_6 + 0];
ld.param.u32 r6, [ndft_5 + 0];
ld.param.u32 r7, [ndft_4 + 0];
ld.param.u32 r8, [ndft_3 + 0];
ld.param.u32 r9, [ndft_1 + 0];
ld.param.u32 r10, [ndft_0 + 0];
mov.u32 r11, r1;
mov.f32 f2, f0;
LBB1_1:
setp.ge.s32 p0, r11, r2;
@!p0 bra LBB1_7;
LBB1_2:
shl.b32 r1, r0, 2;
add.u32 r2, r8, r1;
st.global.f32 [r2+0], f0;
add.u32 r1, r7, r1;
st.global.f32 [r1+0], f2;
abs.f32 f1, f2;
abs.f32 f0, f0;
setp.gt.f32 p0, f0, f1;
selp.f32 f2, f0, f1, p0;
abs.f32 f3, f2;
mov.f32 f4, 0f7E800000 ;
setp.gt.f32 p1, f3, f4;
selp.f32 f0, f1, f0, p0;
shl.b32 r0, r0, 2;
add.u32 r0, r6, r0;
@!p1 bra LBB1_8;
LBB1_3:
mul.rn.f32 f3, f2, 0f3E800000 ;
mul.rn.f32 f1, f0, 0f3E800000 ;
LBB1_4:
mov.f32 f4, 0f00000000 ;
setp.eq.f32 p0, f2, f4;
@!p0 bra LBB1_9;
LBB1_5:
add.f32 f1, f2, f0;
LBB1_6:
mov.f32 f3, 0f7F800000 ;
setp.eq.f32 p0, f0, f3;
setp.eq.f32 p1, f2, f3;
or.pred p0, p1, p0;
selp.f32 f0, f3, f1, p0;
st.global.f32 [r0+0], f0;
ret;
LBB1_7:
add.u32 r12, r3, r1;
add.u32 r13, r4, r12;
ld.global.f32 f3, [r13+0];
mul.rn.f32 f3, f3, f1;
add.u32 r13, r9, r1;
ld.global.f32 f4, [r13+0];
mul.rn.f32 f5, f3, f4;
add.u32 r12, r5, r12;
ld.global.f32 f6, [r12+0];
mul.rn.f32 f6, f6, f1;
add.u32 r12, r10, r1;
ld.global.f32 f7, [r12+0];
mul.rn.f32 f8, f7, f6;
add.f32 f5, f8, f5;
sub.f32 f2, f2, f5;
mul.rn.f32 f4, f4, f6;
mul.rn.f32 f3, f7, f3;
add.f32 f3, f3, f4;
add.f32 f0, f0, f3;
add.u32 r11, r11, 1;
add.u32 r1, r1, 4;
bra LBB1_1;
LBB1_8:
mov.f32 f1, f0;
mov.f32 f3, f2;
bra LBB1_4;
LBB1_9:
div.approx.f32 f1, f1, f3;
mul.rn.f32 f1, f1, f1;
add.f32 f1, f1, 0f3F800000 ;
sqrt.approx.ftz.f32 f1, f1;
mul.rn.f32 f1, f2, f1;
bra LBB1_6;
}
, , , , , .
, , ( , ). 1,5 , , , . , . , :
x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
num_values 24, 48. , , L1 GPU Fermi . . , warp ( , ). 24 48 1 .
24 48 re im:
sum_re += re[i] * x + im[i] * y;
sum_im -= re[i] * y + x * im[i];
( GPU , ). , __local . , , , , .