, 60k , . , 10-30 , . , , "float" "double", . , , .
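Params:
- global float * inVector - input vector whose elements are summed.
- global float * outVector - output vector; receives one partial sum per work-group, so it must hold at least get_num_groups(0) elements.
- const int inVectorSize - number of elements in inVector.
- local float * resultScratch - scratch buffer in local memory, one element per work-item. Its size in bytes = sizeof (cl_float) * get_local_size (0). For example, with a work-group of 64 work-items, 64 floats = 256 bytes (512 bytes if the kernel is switched to double). This fits comfortably in the LDS, which per the openCL specification is at least 16 KB. On the host the buffer is only sized, not filled: pass its size and a null pointer (NULL) as the argument value.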
:
- .
- , .
- "resultScratch".
- , outVector
- outVector , .
:
- , . .
inVectorSize ( ) ( ) * ( ). . . ( , cpu ). outVector 5 . .
/*
 * floatSum - computes one partial sum of inVector per work-group.
 *
 * The input is split into get_num_groups(0) contiguous slices. Inside a
 * work-group every work-item walks its slice with a stride of
 * get_local_size(0), accumulating a private sum that it then stores in
 * resultScratch[local id]. After a local-memory barrier the first work-item
 * of the group adds up the scratch entries and writes the group's total to
 * outVector[group id]. The caller still has to add up the
 * get_num_groups(0) values left in outVector.
 *
 * Params:
 *   inVector      - input values to sum.
 *   outVector     - output; element [g] receives the partial sum of group g,
 *                   so it must hold at least get_num_groups(0) elements.
 *   inVectorSize  - number of elements in inVector.
 *   resultScratch - local scratch, at least get_local_size(0) elements.
 */
__kernel void floatSum(__global float* inVector, __global float* outVector, const int inVectorSize, __local float* resultScratch){
    const int wid = get_local_id(0);
    const int wsize = get_local_size(0);
    const int grid = get_group_id(0);
    const int grcount = get_num_groups(0);

    /*
     * Slice length per group, rounded UP so that the tail elements are not
     * dropped when inVectorSize is not a multiple of the group count. The
     * last slice(s) are clamped to the end of the vector below.
     */
    const int workAmount = (inVectorSize + grcount - 1) / grcount;
    const int startOffset = workAmount * grid + wid;
    int maxOffset = workAmount * (grid + 1);
    if (maxOffset > inVectorSize) {
        maxOffset = inVectorSize;
    }

    /* Accumulate privately; touch local memory only once per work-item. */
    float partial = 0.0f;
    for (int i = startOffset; i < maxOffset; i += wsize) {
        partial += inVector[i];
    }
    resultScratch[wid] = partial;

    /* Every work-item's scratch entry must be visible before the reduction. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /*
     * The first work-item of EACH group (local id 0, not global id 0)
     * reduces the scratch buffer and publishes the group's partial sum.
     */
    if (wid == 0) {
        float total = resultScratch[0];
        for (int i = 1; i < wsize; i++) {
            total += resultScratch[i];
        }
        outVector[grid] = total;
    }
}
, :
/*
 * Enable double-precision support: prefer the Khronos extension and fall
 * back to the AMD vendor extension when only that one is defined. If neither
 * macro is defined, no extension pragma is emitted.
 */
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
: AMD APP KernelAnalyzer (v12), , ALU, 5870 6970.