- . , , . , , , , , .
, , "", , ,
thread 0 works on elements where (i*rows + j) % (thread count) == 0
thread 1 works on elements where (i*rows + j) % (thread count) == 1
(and so on)
, , . ( ). , , .
/*
 * fDeepCopy - build an element-by-element copy of the matrix `in`.
 *
 * in:      source matrix; its `height` and `width` fields give the
 *          number of rows and columns.
 * returns: the matrix obtained from fMallocHandle(rows, cols), with every
 *          element of `in` copied to the same (row, column) position.
 *
 * The column loop is manually unrolled by four. A scalar tail loop then
 * copies the remaining (cols % 4) columns; it must be bounded by `cols`,
 * not `pads`, otherwise those trailing columns are never written.
 *
 * NOTE(review): the result of fMallocHandle is used unchecked, as in the
 * original; confirm how that helper reports allocation failure.
 */
F2D* fDeepCopy(F2D* in)
{
    int i, j;
    int rows, cols, pads;
    F2D* out;

    rows = in->height;
    cols = in->width;
    pads = (cols / 4) * 4;      /* largest multiple of 4 that is <= cols */

    out = fMallocHandle(rows, cols);

    for (i = 0; i < rows; i++) {
        /* Unrolled body: four columns per iteration. */
        for (j = 0; j < pads; j += 4) {
            subsref(out, i, j)     = subsref(in, i, j);
            subsref(out, i, j + 1) = subsref(in, i, j + 1);
            subsref(out, i, j + 2) = subsref(in, i, j + 2);
            subsref(out, i, j + 3) = subsref(in, i, j + 3);
        }
        /* Tail: j == pads here; finish the columns the unrolled loop skipped. */
        for (; j < cols; j++) {
            subsref(out, i, j) = subsref(in, i, j);
        }
    }

    return out;
}
, , , , , .
, SSE, , MMX, , , 32- MMX , .
, -
/* Scalar form: component-wise sum of v1 and v2, one addition per component. */
vec_res.x = v1.x + v2.x;
vec_res.y = v1.y + v2.y;
vec_res.z = v1.z + v2.z;
vec_res.w = v1.w + v2.w;
, (16 ),
; Packed single-precision add: one addps sums all four components at once.
; movaps, and addps with a memory operand, require 16-byte-aligned addresses.
movaps xmm0, [v1] ;xmm0 = v1.w | v1.z | v1.y | v1.x
addps xmm0, [v2] ;xmm0 = v1.w+v2.w | v1.z+v2.z | v1.y+v2.y | v1.x+v2.x
movaps [vec_res], xmm0 ;vec_res = the four sums held in xmm0
, .
, , , .