A pretty simple implementation. I changed your function prototype: it is now a regular function rather than an object method.
This code runs about 3 times faster than the byte-by-byte function (1500 ms for 1,000,000 iterations over an array of 256 elements, about 0.7 GB/s on my old 2.2 GHz Athlon XP).
function Combine(Pixels: PByte; Weights: PInteger; const Size: Cardinal): Integer;
// Weighted sum of Size pixels, 4 byte channels each, using SSE2.
//
// x86 (32-bit), register calling convention:
//   EAX = Pixels  - Size consecutive 4-byte pixels (one byte per channel)
//   EDX = Weights - Size consecutive 32-bit signed integer weights
//   ECX = Size    - number of pixel/weight pairs; 0 is allowed
//
// For every channel c the function accumulates, in single precision,
//   sum(Pixels[i].c * Weights[i]) / Precision
// converts the four sums to integers (rounding follows the current MXCSR
// mode), saturates each to 0..255 and returns the four bytes packed in EAX.
// Size = 0 returns 0 without reading either pointer.
//
// Modifies EAX, EDX, ECX, the flags and XMM0..XMM6.
const
  // Divisor applied to the accumulated sums; 1.0 leaves them unchanged.
  Precision: Single = 1.0;
asm
        pxor      XMM6, XMM6          // constant zero, used for unpacking
        pxor      XMM4, XMM4          // accumulator = 0 (four singles)
        test      ecx, ecx            // nothing to process?
        jz        @@finish            // yes: skip the loop, result is 0
@@cycle:
        movd      XMM1, [eax]         // load one pixel (4 channel bytes)
        movss     XMM3, [edx]         // load its 32-bit integer weight
        punpcklbw XMM1, XMM6          // zero-extend bytes to words
        shufps    XMM3, XMM3, 0       // broadcast the weight to all 4 lanes
        punpcklwd XMM1, XMM6          // zero-extend words to dwords
        cvtdq2ps  XMM2, XMM3          // weight: signed ints to singles
        cvtdq2ps  XMM0, XMM1          // channels: ints to singles
        mulps     XMM0, XMM2          // channel * weight
        addps     XMM4, XMM0          // accumulator += channel * weight
        add       eax, 4              // next pixel
        add       edx, 4              // next weight
        dec       ecx                 // dec/jnz instead of the slow LOOP
        jnz       @@cycle
@@finish:
        movss     XMM5, Precision
        shufps    XMM5, XMM5, 0       // broadcast the precision constant
        divps     XMM4, XMM5          // accumulator / precision
        cvtps2dq  XMM2, XMM4          // singles to 32-bit ints (rounded)
        packssdw  XMM2, XMM2          // ints to 16-bit words, signed saturation
        packuswb  XMM2, XMM2          // words to bytes, unsigned saturation
        movd      eax, XMM2           // result: 4 packed channel bytes
end;
source
share