, , . " " , ( 22) @talonmies, .
, OP, .
, OP , . -
template <class T>
__device__ void warpReduce(T *sdata, int tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
template <class T>
__global__ void reduce4_no_synchthreads(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;
__syncthreads();
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
__syncthreads();
}
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
, sdata volatile, . , __device__ :
template <class T>
__device__ void warpReduce(volatile T *sdata, int tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
, , .. sdata, volatile volatile ( ).
volatile
MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
SHL R3, R0, 0x1;
S2R R2, SR_TID.X;
IMAD R3, R3, c[0x0][0x8], R2;
IADD R4, R3, c[0x0][0x8];
ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT;
ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT;
@P0 ISCADD R3, R3, c[0x0][0x20], 0x2;
@!P1 ISCADD R4, R4, c[0x0][0x20], 0x2;
@!P0 MOV R5, RZ;
@!P1 LD R4, [R4];
@P0 LD R5, [R3];
SHL R3, R2, 0x2;
NOP;
@!P1 IADD R5, R4, R5;
MOV R4, c[0x0][0x8];
STS [R3], R5;
BAR.RED.POPC RZ, RZ, RZ, PT;
MOV R6, c[0x0][0x8];
ISETP.LT.U32.AND P0, PT, R6, 0x42, PT;
@P0 BRA 0x118;
NOP;
NOP;
MOV R6, R4;
SHR.U32 R4, R4, 0x1;
ISETP.GE.U32.AND P0, PT, R2, R4, PT;
@!P0 IADD R7, R4, R2;
@!P0 SHL R7, R7, 0x2;
@!P0 LDS R7, [R7];
@!P0 IADD R5, R7, R5;
@!P0 STS [R3], R5;
BAR.RED.POPC RZ, RZ, RZ, PT;
ISETP.GT.U32.AND P0, PT, R6, 0x83, PT;
@P0 BRA 0xc0;
ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;
@P0 BRA.U 0x198;
@!P0 LDS R8, [R3];
@!P0 LDS R5, [R3+0x80];
@!P0 LDS R4, [R3+0x40];
@!P0 LDS R7, [R3+0x20];
@!P0 LDS R6, [R3+0x10];
@!P0 IADD R8, R8, R5;
@!P0 IADD R8, R8, R4;
@!P0 LDS R5, [R3+0x8];
@!P0 IADD R7, R8, R7;
@!P0 LDS R4, [R3+0x4];
@!P0 IADD R6, R7, R6;
@!P0 IADD R5, R6, R5;
@!P0 IADD R4, R5, R4;
@!P0 STS [R3], R4;
ISETP.NE.AND P0, PT, R2, RZ, PT;
@P0 BRA.U 0x1c0;
@!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
@!P0 LDS R2, [RZ];
@!P0 ST [R0], R2;
EXIT;
/*0128*/-/*0148*/, /*0160*/ /*0170*/ /*0190*/ . , . , ( ), , .
volatile
MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
SHL R3, R0, 0x1;
S2R R2, SR_TID.X;
IMAD R3, R3, c[0x0][0x8], R2;
IADD R4, R3, c[0x0][0x8];
ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT;
ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT;
@P0 ISCADD R3, R3, c[0x0][0x20], 0x2;
@!P1 ISCADD R4, R4, c[0x0][0x20], 0x2;
@!P0 MOV R5, RZ;
@!P1 LD R4, [R4];
@P0 LD R5, [R3];
SHL R3, R2, 0x2;
NOP;
@!P1 IADD R5, R4, R5;
MOV R4, c[0x0][0x8];
STS [R3], R5;
BAR.RED.POPC RZ, RZ, RZ, PT;
MOV R6, c[0x0][0x8];
ISETP.LT.U32.AND P0, PT, R6, 0x42, PT;
@P0 BRA 0x118;
NOP;
NOP;
MOV R6, R4;
SHR.U32 R4, R4, 0x1;
ISETP.GE.U32.AND P0, PT, R2, R4, PT;
@!P0 IADD R7, R4, R2;
@!P0 SHL R7, R7, 0x2;
@!P0 LDS R7, [R7];
@!P0 IADD R5, R7, R5;
@!P0 STS [R3], R5;
BAR.RED.POPC RZ, RZ, RZ, PT;
ISETP.GT.U32.AND P0, PT, R6, 0x83, PT;
@P0 BRA 0xc0;
ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;
SSY 0x1f0;
@P0 NOP.S;
LDS R5, [R3];
LDS R4, [R3+0x80];
IADD R6, R5, R4;
STS [R3], R6;
LDS R5, [R3];
LDS R4, [R3+0x40];
IADD R6, R5, R4;
STS [R3], R6;
LDS R5, [R3];
LDS R4, [R3+0x20];
IADD R6, R5, R4;
STS [R3], R6;
LDS R5, [R3];
LDS R4, [R3+0x10];
IADD R6, R5, R4;
STS [R3], R6;
LDS R5, [R3];
LDS R4, [R3+0x8];
IADD R6, R5, R4;
STS [R3], R6;
LDS R5, [R3];
LDS R4, [R3+0x4];
IADD R4, R5, R4;
STS.S [R3], R4;
ISETP.NE.AND P0, PT, R2, RZ, PT;
@P0 BRA.U 0x218;
@!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
@!P0 LDS R2, [RZ];
@!P0 ST [R0], R2;
EXIT;
/*0130*/-/*01e8*/, , , .