I'm afraid this is a little more complicated than it sounds, but I will do my best to explain here a possible route that you can take to implement this.
, , , , , - , "" . - . CUDA OpenCL , , , , Google CUDA reduction, .
, , , , , . , , . , :

, , 16- . , 8- , 8 .
, 4 - . ...
, - .
RenderScript :
Java:
// Values to sum. Placeholder: the caller must assign this array before the code below runs.
// The halving loop only folds every element into ints[0] when ints.length is a power of two;
// for other lengths the integer division of the stride leaves some elements out of the sum.
int[] ints;
// Script-side copy of the values; the reduction is performed in place in this allocation.
Allocation data = Allocation.createSized(rs, Element.I32(rs), ints.length, Allocation.USAGE_SCRIPT);
data.copy1DRangeFrom(0, ints.length, ints);
ScriptC_Reduce script = new ScriptC_Reduce(rs);
// Make the allocation reachable through the script's global `data` pointer.
script.bind_data(data);
// One kernel launch per pass: each pass adds the upper half of the active range onto the
// lower half, so the active range shrinks by half until only element 0 remains.
for (int stride = ints.length / 2; stride > 0; stride /= 2) {
    script.set_stride(stride);
    // The kernel never reads or writes through its in/out arguments (it works on the bound
    // `data` pointer), so they only size the launch. Pass `data` for both instead of the
    // previously undeclared `input` / `output` allocations.
    script.forEach_root(data, data);
}
// Copies the partially reduced buffer back over `ints`; the total ends up in element 0.
data.copyTo(ints);
int totalsum = ints[0];
Renderscript:
#pragma version(1)
#pragma rs java_package_name(...[your package here]...)
// Distance between the two elements combined in the current pass; set from Java via set_stride().
int stride;
// Buffer being reduced in place; bound from Java via bind_data().
int *data;

// One reduction step for element x: indices below `stride` accumulate the element that lies
// `stride` positions further on. The in/out pointers are part of the kernel signature but are
// not used; all reads and writes go through `data`.
void root(const int32_t *v_in, int32_t *v_out, uint32_t x) {
    if (x >= stride) {
        // Upper half of the active range: nothing to do in this pass.
        return;
    }
    data[x] = data[x] + data[x + stride];
}
RS, :
- , "v_in" "v_out" RS , , , . , " " int, Java , , .
- Java, , . , previous , "data[x + stride]" . RS , , , . , __syncthreads() CUDA, .
, . . , , , , , ints.length , .
, , 2, . 0-pad . , , 0-padding .
, , , , 64 . , , "" 64 . , ( ), 64 . , 64 - , . 2, , , 16 32. , .
EDIT:. , RenderScript , , . , , , , .