The problem you are facing is that SSE intrins perform far more memory operations than a version other than SSE. Using your vector class, I wrote the following:
int main (int argc, char *argv [])
{
vec4 a (static_cast <float> (argc));
cout << "argc = " << argc << endl;
a=++a;
cout << "a = (" << a.v.m128_f32 [0] << ", " << a.v.m128_f32 [1] << ", " << a.v.m128_f32 [2] << ", " << a.v.m128_f32 [3] << ", " << ")" << endl;
}
( , SSE):
fild dword ptr [ebp+8] // load argc into FPU
fstp dword ptr [esp+10h] // save argc as a float
movss xmm0,dword ptr [esp+10h] // load argc into SSE
shufps xmm0,xmm0,0 // copy argc to all values in SSE register
movaps xmmword ptr [esp+20h],xmm0 // save to stack frame
fld1 // load 1 into FPU
fst dword ptr [esp+20h]
fst dword ptr [esp+28h]
fst dword ptr [esp+30h]
fstp dword ptr [esp+38h] // create a (1,1,1,1) vector
movaps xmm0,xmmword ptr [esp+2Ch] // load above vector into SSE
addps xmm0,xmmword ptr [esp+1Ch] // add to vector a
movaps xmmword ptr [esp+38h],xmm0 // save back to a
: ESP, , .
:
int main (int argc, char *argv [])
{
float a[4];
for (int i = 0 ; i < 4 ; ++i)
{
a [i] = static_cast <float> (argc + i);
}
cout << "argc = " << argc << endl;
for (int i = 0 ; i < 4 ; ++i)
{
a [i] += 1.0f;
}
cout << "a = (" << a [0] << ", " << a [1] << ", " << a [2] << ", " << a [3] << ", " << ")" << endl;
}
( , )
fild dword ptr [argc]
fstp dword ptr [esp+8]
fild dword ptr [esp+4]
fstp dword ptr [esp+0Ch]
fild dword ptr [esp+8]
fstp dword ptr [esp+18h]
fild dword ptr [esp+10h]
fstp dword ptr [esp+24h]
fld dword ptr [esp+8]
fld1
fadd st(1),st
fxch st(1)
fstp dword ptr [esp+14h]
fld dword ptr [esp+1Ch]
fadd st,st(1)
fstp dword ptr [esp+24h]
fld dword ptr [esp+28h]
fadd st,st(1)
fstp dword ptr [esp+28h]
fadd dword ptr [esp+2Ch]
fstp dword ptr [esp+2Ch]
:
SSE FPU
4xfloat write 1xfloat read
1xsse read 1xfloat write
1xsse read+add 1xfloat read
1xsse write 1xfloat write
1xfloat read
1xfloat write
1xfloat read
1xfloat write
total
8 float reads 4 float reads
8 float writes 4 float writes
, SSE FPU, .
SSE, aglorithm SSE, / . intrinsics .