I called GCC as follows:
$ gcc -I/usr/include/SDL2 -D_REENTRANT -Ibuild -I. -S -fverbose-asm -O2 -m64 -mpc64 -mfpmath=both -fipa-pta -ftree-loop-linear -floop-interchange -floop-strip-mine -floop-block -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops -ftree-vectorize -march=core-avx-i -c algo/collision.c -o build/collision.s
important parameters are:
-S : output assembly
-ftree-vectorize : vectorize loops
-march=core-avx-i : enable "MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C instruction set support."
Here is the C source before compilation:
#include "collision.h"
/* Occupancy grid written by buildField(): FIELD_W rows of FIELD_H bytes.
 * The dimensions are not defined in this listing (presumably they come
 * from collision.h). */
int8_t currentField[FIELD_W][FIELD_H];
/*
 * Rebuild currentField from the objects described by *body.
 *
 * Step 1: every row of the grid is cleared to 0x00.
 * Step 2: for each of the body->object_count entries in body->stuff, a
 *         run of rows is marked with 0x01.  The rows are selected from
 *         pos.x and pos.w; the number of bytes marked in each row is
 *         derived from pos.h.
 *
 * body is only read; the function returns nothing and its only visible
 * effect is the contents of currentField.
 *
 * NOTE(review): neither the row range [Vx[0], Vx[1]) nor the fill length
 * is clamped to FIELD_W / FIELD_H, so an object near or beyond the edge
 * would write outside currentField -- confirm callers guarantee bounds.
 * NOTE(review): the fill starts at column 0 of each row (currentField[y]),
 * not at column Vy[0]; Cy cancels out of (Vy[1] - Vy[0]) and so has no
 * effect on what is written -- confirm this is intended.
 */
void buildField (const gravityWell *body) {
/* x indexes rows in step 1 and objects in step 2; y indexes rows of the
 * grid (first dimension) in the inner fill loop, despite its name. */
int x, y;
/* w, h: halved extents; Cx, Cy: object position as ints. */
int w, h, Cx, Cy;
/* [0] = lower bound, [1] = exclusive upper bound, in grid coordinates. */
int Vx[2], Vy[2];
/* Step 1: clear the grid one row (FIELD_H bytes) at a time. */
for (x = 0; x < FIELD_W; x++) {
memset (currentField[x], 0x00, FIELD_H);
}
/* Step 2: mark the cells covered by each object. */
for (x = 0; x < body->object_count; x++) {
/* Assigning to int converts implicitly if pos.x / pos.y are not ints. */
Cx = body->stuff[x].pos.x;
Cy = body->stuff[x].pos.y;
w = body->stuff[x].pos.w;
h = body->stuff[x].pos.h;
/* Integer division: half extents, rounded toward zero. */
w = w / 2;
h = h / 2;
Vx[0] = Cx - w;
Vx[1] = Cx + w;
Vy[0] = Cy - h;
Vy[1] = Cy + h;
/* Shift object coordinates by half the field size in each dimension. */
Vx[0] += FIELD_W / 2;
Vx[1] += FIELD_W / 2;
Vy[0] += FIELD_H / 2;
Vy[1] += FIELD_H / 2;
/* Make the upper bounds exclusive so the cell at Cx + w / Cy + h is included. */
Vx[1]++;
Vy[1]++;
/* One memset per row; the length is Vy[1] - Vy[0] == 2 * h + 1 bytes.
 * NOTE(review): the int length is converted to size_t, so a negative
 * value (negative pos.h) would become a huge count -- confirm pos.h >= 0. */
for (y = Vx[0]; y < Vx[1]; y++) {
memset (currentField[y], 0x01, (Vy[1] - Vy[0]));
}
}
return;
}
And here is the compiler output (GAS, AT&T syntax):
# ----------------------------------------------------------------------
# void buildField(const gravityWell *body)      -- GCC output, AT&T syntax
#
# In:    %rdi = body
# Out:   nothing; writes the currentField array via memset
# Saved: %rbx, %rbp, %r12, %r13, %r14 (pushed on entry, popped on exit)
#
# Register roles below:
#   %r13   body (kept for the whole function)
#   %rbx   row cursor in the clear loop; end pointer in the fill loop
#   %r12d  object index x
#   %r14   byte offset of object x in the object array (step 128)
#   %rbp   byte count for each fill memset (Vy[1] - Vy[0])
#
# The only vector-unit instructions in this function are vmovss and
# vcvttss2si: VEX-encoded scalar operations on one float each.  No packed
# or %ymm instruction appears; both loops only call memset.
# ----------------------------------------------------------------------
.file "collision.c"
.text
.p2align 4,,15
.globl buildField
.type buildField, @function
buildField:
.LFB24:
.cfi_startproc
# Prologue: five 8-byte pushes plus the return address make 48 bytes, so
# %rsp is 16-byte aligned at both memset calls (given an ABI-aligned entry).
pushq %r14
.cfi_def_cfa_offset 16
.cfi_offset 14, -16
pushq %r13
.cfi_def_cfa_offset 24
.cfi_offset 13, -24
movq %rdi, %r13                 # r13 = body
pushq %r12
.cfi_def_cfa_offset 32
.cfi_offset 12, -32
pushq %rbp
.cfi_def_cfa_offset 40
.cfi_offset 6, -40
pushq %rbx
.cfi_def_cfa_offset 48
.cfi_offset 3, -48
movl $currentField, %ebx        # rbx = &currentField[0] (32-bit absolute address, non-PIC)
.p2align 4,,10
.p2align 3
# --- Clear loop: memset(currentField[x], 0, 4000) for every row --------
.L3:
xorl %esi, %esi                 # fill value 0x00
movq %rbx, %rdi                 # dest = current row
movl $4000, %edx                # length = FIELD_H
call memset
addq $4000, %rbx                # advance to the next row
cmpq $currentField+16000000, %rbx       # reached the end of the array?
jne .L3
# --- Object loop setup -------------------------------------------------
movl 8(%r13), %eax              # eax = body->object_count
xorl %r14d, %r14d               # byte offset of object 0
xorl %r12d, %r12d               # x = 0
testl %eax, %eax
jle .L12                        # no objects: skip straight to the epilogue
.p2align 4,,10
.p2align 3
# --- Per-object body ---------------------------------------------------
.L11:
movq %r14, %rax
addq 0(%r13), %rax              # rax = &body->stuff[x] (array pointer at offset 0)
movl 96(%rax), %edx             # edx = pos.w
vmovss 88(%rax), %xmm0          # xmm0 = pos.x (single float, VEX-encoded scalar load)
vmovss 92(%rax), %xmm1          # xmm1 = pos.y
movl 100(%rax), %eax            # eax = pos.h
vcvttss2si %xmm0, %esi          # esi = Cx = (int)pos.x, truncating
movl %edx, %edi
vcvttss2si %xmm1, %ecx          # ecx = Cy = (int)pos.y, truncating
shrl $31, %edi                  # edi = sign bit of w
addl %edi, %edx                 # bias negative w so the shift rounds toward zero
movl %eax, %edi
sarl %edx                       # edx = w / 2
shrl $31, %edi                  # edi = sign bit of h
movl %ecx, %r8d                 # r8d = Cy
addl %edi, %eax                 # same rounding bias for h
movl %esi, %edi                 # edi = Cx
sarl %eax                       # eax = h / 2
subl %edx, %edi                 # edi = Cx - w
addl %esi, %edx                 # edx = Cx + w
leal 2001(%rcx,%rax), %ebp      # ebp = Cy + h + FIELD_H/2 + 1 = Vy[1]
subl %eax, %r8d                 # r8d = Cy - h
leal 2000(%rdi), %esi           # esi = Cx - w + FIELD_W/2 = Vx[0]
addl $2000, %r8d                # r8d = Vy[0]
leal 2001(%rdx), %eax           # eax = Cx + w + FIELD_W/2 + 1 = Vx[1]
cmpl %eax, %esi
jge .L8                         # Vx[0] >= Vx[1]: no rows to fill
movslq %esi, %rax               # rax = (long)Vx[0]
subl %edi, %edx                 # edx = (Cx + w) - (Cx - w) = row count - 1
subl %r8d, %ebp                 # ebp = Vy[1] - Vy[0]
leaq (%rdx,%rax), %rbx          # rbx = index of the last row to fill
movslq %ebp, %rbp               # rbp = fill length, sign-extended to 64 bits
imulq $4000, %rax, %rcx         # rcx = Vx[0] * row size
imulq $4000, %rbx, %rbx         # rbx = last row * row size
addq $currentField, %rcx        # rcx = &currentField[Vx[0]] (column 0; no Vy[0] offset)
addq $currentField+4000, %rbx   # rbx = one row past the last row (loop end pointer)
.p2align 4,,10
.p2align 3
# --- Fill loop: memset(currentField[y], 1, Vy[1] - Vy[0]) per row ------
.L9:
movq %rcx, %rdi                 # dest = current row
movq %rbp, %rdx                 # length
movl $1, %esi                   # fill value 0x01
call memset
movq %rax, %rcx                 # memset returns dest; rcx is not preserved across the call
addq $4000, %rcx                # next row
cmpq %rbx, %rcx
jne .L9
# --- Advance to the next object ----------------------------------------
.L8:
addl $1, %r12d                  # x++
subq $-128, %r14                # offset += 128; -128 fits an 8-bit immediate, +128 does not
cmpl %r12d, 8(%r13)             # object_count is re-read from memory each iteration
jg .L11                         # loop while object_count > x
# --- Epilogue: restore callee-saved registers in reverse order ---------
.L12:
popq %rbx
.cfi_def_cfa_offset 40
popq %rbp
.cfi_def_cfa_offset 32
popq %r12
.cfi_def_cfa_offset 24
popq %r13
.cfi_def_cfa_offset 16
popq %r14
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size buildField, .-buildField
# currentField: 16000000-byte common symbol (4000 rows of 4000 bytes), 32-byte aligned.
.comm currentField,16000000,32
.ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1"
.section .note.GNU-stack,"",@progbits
GCC uses SSE instructions instead of AVX instructions; in particular, it uses the 128-bit SSE registers %xmm as opposed to the 256-bit AVX registers %ymm.
Why is this and, more importantly, how can I get GCC to use AVX on top of SSE?