GCC generates SSE instructions instead of AVX

Question

GCC generates SSE instructions instead of AVX

I called GCC as follows:

$ gcc -I/usr/include/SDL2 -D_REENTRANT -Ibuild -I. -S -fverbose-asm -O2 -m64 -mpc64 -mfpmath=both -fipa-pta -ftree-loop-linear -floop-interchange -floop-strip-mine -floop-block -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops -ftree-vectorize -march=core-avx-i -c algo/collision.c -o build/collision.s

important parameters are:

-S                      : output assembly
-ftree-vectorize        : vectorize loops
-march=core-avx-i       : enable "MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2,
                        : AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C
                        : instruction set support."

Here is the C source before compilation:

#include "collision.h"

int8_t currentField[FIELD_W][FIELD_H];

// Clear and rebuild the field based on the objects with a gravity well.
//
// Zeroes every row of currentField, then, for each of the
// body->object_count entries of body->stuff, writes 0x01 into a run of
// rows derived from that entry's pos.x / pos.w and a byte count derived
// from its pos.h.
//
// body: read-only; only body->object_count and body->stuff[i].pos are used.
// Returns nothing; the only side effect is overwriting currentField.
//
// NOTE(review): none of the computed indices are range-checked, so an
// object near or beyond the field edge makes memset write outside
// currentField -- confirm callers guarantee objects stay inside the field.
void buildField (const gravityWell *body) {
    int x, y;
    int w, h, Cx, Cy;
    int Vx[2], Vy[2];

    // Clear the field, one row (FIELD_H bytes) at a time
    for (x = 0; x < FIELD_W; x++) {
        memset (currentField[x], 0x00, FIELD_H);
    }

    // Rebuild the field
    for (x = 0; x < body->object_count; x++) {
        // Fetch the position and dimensions of the object as ints.
        // The compiled listing loads pos.x / pos.y with vmovss and
        // converts them with vcvttss2si, i.e. they are floats that get
        // truncated toward zero here, not rounded to nearest.
        Cx =    body->stuff[x].pos.x;
        Cy =    body->stuff[x].pos.y;
        w = body->stuff[x].pos.w;
        h = body->stuff[x].pos.h;

        // Calculate the lower left and upper right edges of a
        // rectangle encompassing the object (half extents around the
        // centre; integer division, so odd sizes lose one unit)
        w = w / 2;
        h = h / 2;
        Vx[0] = Cx - w;
        Vx[1] = Cx + w;
        Vy[0] = Cy - h;
        Vy[1] = Cy + h;

        // Add in the offset for array accesses (moves the origin to
        // the middle of the array)
        Vx[0] += FIELD_W / 2;
        Vx[1] += FIELD_W / 2;
        Vy[0] += FIELD_H / 2;
        Vy[1] += FIELD_H / 2;

        // Make the upper bounds exclusive so the far edge is included
        Vx[1]++;
        Vy[1]++;

        // Set the area occupied by the object to ones.
        // Despite its name, y walks the first (Vx) index of currentField.
        // NOTE(review): each memset starts at column 0 of the row; Vy[0]
        // only contributes to the length, so the fill is not shifted to
        // the object's Vy position -- confirm whether
        // &currentField[y][Vy[0]] was intended.
        // NOTE(review): if pos.h is below -1 the length is negative and
        // converts to a huge size_t.
        for (y = Vx[0]; y < Vx[1]; y++) {
            memset (currentField[y], 0x01, (Vy[1] - Vy[0]));
        }
    }

    return;
}

and here is the compiled source (GAS syntax):

    .file   "collision.c"
# GNU C (Ubuntu/Linaro 4.8.1-10ubuntu9) version 4.8.1 (x86_64-linux-gnu)
#   compiled by GNU C version 4.8.1, GMP version 5.1.2, MPFR version 3.1.1-p2, MPC version 1.0.1
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  -I /usr/include/SDL2 -I build -I .
# -imultiarch x86_64-linux-gnu -D _REENTRANT algo/collision.c -m64 -mpc64
# -mfpmath=both -march=core-avx-i -auxbase-strip build/collision.s -O2
# -fverbose-asm -fipa-pta -floop-interchange -floop-strip-mine -floop-block
# -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops
# -ftree-vectorize -fstack-protector -Wformat -Wformat-security
# options enabled:  -faggressive-loop-optimizations
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fcombine-stack-adjustments -fcommon -fcompare-elim
# -fcprop-registers -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-profile -fipa-pta -fipa-pure-const -fipa-reference -fipa-sra
# -fira-hoist-pressure -fira-share-save-slots -fira-share-spill-slots
# -fivopts -fkeep-static-consts -fleading-underscore -floop-block
# -floop-interchange -floop-strip-mine -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeephole -fpeephole2 -fprefetch-loop-arrays -free
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fshrink-wrap -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-wide-types -fstack-protector -fstrict-aliasing -fstrict-overflow
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-distribution -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops=
# -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop
# -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fverbose-asm -fzero-initialized-in-bss
# -m128bit-long-double -m64 -m80387 -maccumulate-outgoing-args -maes
# -malign-stringops -mavx -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mcx16 -mf16c -mfancy-math-387
# -mfp-ret-in-387 -mfsgsbase -mfxsr -mglibc -mieee-fp -mlong-double-80
# -mmmx -mpc64 -mpclmul -mpopcnt -mpush-args -mrdrnd -mred-zone -msahf
# -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mssse3
# -mtls-direct-seg-refs -mvzeroupper -mxsave -mxsaveopt

    .text
    .p2align 4,,15
    .globl  buildField
    .type   buildField, @function
# ----------------------------------------------------------------------
# void buildField(const gravityWell *body)
# Syntax: AT&T / GAS.  ABI: System V AMD64 (target x86_64-linux-gnu).
# In:     %rdi = body
# Out:    nothing
# Callee-saved registers pushed and restored: %r14 %r13 %r12 %rbp %rbx
# Stack:  five pushes plus the return address give a CFA offset of 48,
#         so %rsp is 16-byte aligned at both memset call sites.
# Register roles:
#   %r13  = body (must survive the memset calls)
#   %r12d = x, the outer loop counter
#   %r14  = byte offset of stuff[x] from body->stuff (advances by 128)
#   %rbx  = row pointer in the clear loop, later end pointer of the fill loop
#   %rbp  = memset length of the fill loop (Vy[1] - Vy[0])
# The only VEX-encoded (v-prefixed) instructions are the scalar
# vmovss / vcvttss2si pairs; everything else is integer code plus calls
# to memset.
# ----------------------------------------------------------------------
buildField:
.LFB24:
    .cfi_startproc
    pushq   %r14    #
    .cfi_def_cfa_offset 16
    .cfi_offset 14, -16
    pushq   %r13    #
    .cfi_def_cfa_offset 24
    .cfi_offset 13, -24
    movq    %rdi, %r13  # body, body
    pushq   %r12    #
    .cfi_def_cfa_offset 32
    .cfi_offset 12, -32
    pushq   %rbp    #
    .cfi_def_cfa_offset 40
    .cfi_offset 6, -40
    pushq   %rbx    #
    .cfi_def_cfa_offset 48
    .cfi_offset 3, -48
    # %rbx = &currentField[0], loaded as a 32-bit absolute address
    # (this is not position-independent code)
    movl    $currentField, %ebx #, ivtmp.26
    .p2align 4,,10
    .p2align 3
    # Clear loop: memset(row, 0, 4000) for each 4000-byte row until
    # %rbx reaches currentField + 16000000
.L3:
    xorl    %esi, %esi  #
    movq    %rbx, %rdi  # ivtmp.26,
    movl    $4000, %edx #,
    call    memset  #
    addq    $4000, %rbx #, ivtmp.26
    cmpq    $currentField+16000000, %rbx    #, ivtmp.26
    jne .L3 #,
    # Rebuild loop setup: x = 0, element offset = 0; skip the whole loop
    # when body->object_count <= 0
    movl    8(%r13), %eax   # body_11(D)->object_count,
    xorl    %r14d, %r14d    # ivtmp.19
    xorl    %r12d, %r12d    # x
    testl   %eax, %eax  #
    jle .L12    #,
    .p2align 4,,10
    .p2align 3
    # Per-object body: %rax = &body->stuff[x] (body->stuff is reloaded on
    # every pass).  pos.x / pos.y are floats truncated to int by
    # vcvttss2si; pos.w / pos.h are loaded as 32-bit integers.
.L11:
    movq    %r14, %rax  # ivtmp.19, D.2657
    addq    0(%r13), %rax   # body_11(D)->stuff, D.2657
    movl    96(%rax), %edx  # _16->pos.w, w
    vmovss  88(%rax), %xmm0 # _16->pos.x,
    vmovss  92(%rax), %xmm1 # _16->pos.y,
    movl    100(%rax), %eax # _16->pos.h, h
    vcvttss2si  %xmm0, %esi #, Cx
    movl    %edx, %edi  # w, tmp125
    vcvttss2si  %xmm1, %ecx #, Cy
    # Each shrl $31 / addl / sarl triple is a signed divide by 2: adding
    # the sign bit before the arithmetic shift makes the result truncate
    # toward zero, as C's w / 2 and h / 2 require.  The two divisions are
    # interleaved with the edge computations by the scheduler.
    shrl    $31, %edi   #, tmp125
    addl    %edi, %edx  # tmp125, tmp127
    movl    %eax, %edi  # h, tmp128
    sarl    %edx    # tmp127
    shrl    $31, %edi   #, tmp128
    movl    %ecx, %r8d  # Cy, D.2655
    addl    %edi, %eax  # tmp128, tmp130
    movl    %esi, %edi  # Cx, D.2655
    sarl    %eax    # tmp130
    # Edges.  The C source's FIELD_W / 2 and FIELD_H / 2 offsets appear
    # here as the constant 2000; 2001 also folds in the ++ on the upper
    # bounds.
    #   %edi = Cx - w/2          %edx = Cx + w/2
    #   %ebp = Cy + h/2 + 2001   (Vy[1])
    #   %r8d = Cy - h/2 + 2000   (Vy[0])
    #   %esi = Cx - w/2 + 2000   (Vx[0], initial y)
    #   %eax = Cx + w/2 + 2001   (Vx[1])
    subl    %edx, %edi  # tmp127, D.2655
    addl    %esi, %edx  # Cx, D.2655
    leal    2001(%rcx,%rax), %ebp   #, D.2655
    subl    %eax, %r8d  # tmp130, D.2655
    leal    2000(%rdi), %esi    #, y
    addl    $2000, %r8d #, D.2655
    leal    2001(%rdx), %eax    #, D.2655
    # Skip the fill when the row range is empty (Vx[0] >= Vx[1])
    cmpl    %eax, %esi  # D.2655, y
    jge .L8 #,
    # Turn the row range into pointers:
    #   %edx = (Cx + w/2) - (Cx - w/2) = Vx[1] - Vx[0] - 1
    #   %rbp = Vy[1] - Vy[0], the memset length
    #   %rcx = currentField + 4000 * Vx[0]            (first row)
    #   %rbx = currentField + 4000 * (last row + 1)   (end pointer)
    movslq  %esi, %rax  # y, D.2660
    subl    %edi, %edx  # D.2655, D.2654
    subl    %r8d, %ebp  # D.2655, D.2655
    leaq    (%rdx,%rax), %rbx   #, D.2654
    movslq  %ebp, %rbp  # D.2655, D.2661
    imulq   $4000, %rax, %rcx   #, D.2660, D.2660
    imulq   $4000, %rbx, %rbx   #, D.2654, D.2654
    addq    $currentField, %rcx #, ivtmp.12
    addq    $currentField+4000, %rbx    #, D.2654
    .p2align 4,,10
    .p2align 3
    # Fill loop: memset(row, 1, %rbp), starting at the beginning of each
    # row.  %rcx is caller-saved, so the row pointer is recovered from
    # memset's return value (the destination) in %rax before stepping by
    # 4000 bytes.
.L9:
    movq    %rcx, %rdi  # ivtmp.12,
    movq    %rbp, %rdx  # D.2661,
    movl    $1, %esi    #,
    call    memset  #
    movq    %rax, %rcx  #, ivtmp.12
    addq    $4000, %rcx #, ivtmp.12
    cmpq    %rbx, %rcx  # D.2654, ivtmp.12
    jne .L9 #,
    # Next object: x++ and advance the element offset by 128 bytes
    # (written as subq $-128 because -128 fits a sign-extended 8-bit
    # immediate while +128 does not).  object_count is reloaded from
    # memory; loop while object_count > x.
.L8:
    addl    $1, %r12d   #, x
    subq    $-128, %r14 #, ivtmp.19
    cmpl    %r12d, 8(%r13)  # x, body_11(D)->object_count
    jg  .L11    #,
    # Epilogue: restore callee-saved registers in reverse push order
.L12:
    popq    %rbx    #
    .cfi_def_cfa_offset 40
    popq    %rbp    #
    .cfi_def_cfa_offset 32
    popq    %r12    #
    .cfi_def_cfa_offset 24
    popq    %r13    #
    .cfi_def_cfa_offset 16
    popq    %r14    #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE24:
    .size   buildField, .-buildField
    # currentField: 16000000-byte common symbol with 32-byte alignment
    .comm   currentField,16000000,32
    .ident  "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1"
    .section    .note.GNU-stack,"",@progbits

GCC seems to be using SSE instructions instead of AVX instructions, especially considering that it uses the 128-bit SSE registers %xmm as opposed to the 256-bit AVX registers %ymm.

Why is this and, more importantly, how can I get GCC to use AVX instead of SSE?

+3

c assembly gcc sse avx

haneefmubarak Feb 17 '14 at 10:36

source share

1 answer

Stephen Canon · Accepted Answer · 2014-02-17T22:44:08+0000

Your code is all integer arithmetic; there are no integer operations in the AVX extension. They were added in AVX2, which you have not enabled.

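Even if you used float arithmetic or enabled AVX2, there is little here for the vectorizer to work with: the loops only call memset and the per-object arithmetic is scalar, so you still would not see 256-bit AVX code. Note that the few floating-point instructions that do appear (vmovss, vcvttss2si) are already VEX-encoded AVX instructions rather than legacy SSE ones.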

GCC generates SSE instructions instead of AVX

More articles: