Which is better for NEON optimization: gcc or armcc?

Referring to @auselen's answer here: Using ARM NEON intrinsics to add alpha and permute, it looks like the armcc compiler is much better than the gcc compiler for NEON optimizations. Is this really true? I haven't actually tried the armcc compiler, but I got fairly well-optimized code from gcc with the -O3 optimization flag. Now I wonder whether armcc is really that good. So which of the two compilers is better, all factors considered?

+5
source share
2 answers

Compilers are software too; they tend to improve over time. Any general claim such as "armcc is better than GCC at NEON" (or, better said, at vectorization) cannot hold true forever, since one development team can close the gap with enough attention. However, it is logical to expect, at least initially, that compilers developed by hardware companies will be superior, because those companies need to demonstrate and market these features.

One recent example I saw was here on Stack Overflow, in an answer about branch prediction. Quoting the last line of its updated section: "This goes to show that even mature, modern compilers can vary wildly in their ability to optimize code...".

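I am a big fan of GCC, but I wouldn't bet on the quality of the code it produces against compilers from Intel or ARM. I expect any mainstream commercial compiler to produce code at least as good as GCC's.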

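One empirical answer to this question is to take hilbert-space's NEON optimization example and see how different compilers optimize it.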

void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
  int i;
  uint8x8_t rfac = vdup_n_u8 (77);
  uint8x8_t gfac = vdup_n_u8 (151);
  uint8x8_t bfac = vdup_n_u8 (28);
  n/=8;

  for (i=0; i<n; i++)
  {
    uint16x8_t  temp;
    uint8x8x3_t rgb  = vld3_u8 (src);
    uint8x8_t result;

    temp = vmull_u8 (rgb.val[0],      rfac);
    temp = vmlal_u8 (temp,rgb.val[1], gfac);
    temp = vmlal_u8 (temp,rgb.val[2], bfac);

    result = vshrn_n_u16 (temp, 8);
    vst1_u8 (dest, result);
    src  += 8*3;
    dest += 8;
  }
}

armcc 5.01

  20:   f421140d    vld3.8  {d1-d3}, [r1]!
  24:   e2822001    add r2, r2, #1
  28:   f3810c04    vmull.u8    q0, d1, d4
  2c:   f3820805    vmlal.u8    q0, d2, d5
  30:   f3830806    vmlal.u8    q0, d3, d6
  34:   f2880810    vshrn.i16   d0, q0, #8
  38:   f400070d    vst1.8  {d0}, [r0]!
  3c:   e1520003    cmp r2, r3
  40:   bafffff6    blt 20 <neon_convert+0x20>

GCC 4.4.3-4.7.1

  1e:   f961 040d   vld3.8  {d16-d18}, [r1]!
  22:   3301        adds    r3, #1
  24:   4293        cmp r3, r2
  26:   ffc0 4ca3   vmull.u8    q10, d16, d19
  2a:   ffc1 48a6   vmlal.u8    q10, d17, d22
  2e:   ffc2 48a7   vmlal.u8    q10, d18, d23
  32:   efc8 4834   vshrn.i16   d20, q10, #8
  36:   f940 470d   vst1.8  {d20}, [r0]!
  3a:   d1f0        bne.n   1e <neon_convert+0x1e>

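Both listings look extremely similar, so this one is a draw. After seeing this, I tried the add-alpha-and-permute example mentioned in the question again.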

void neonPermuteRGBtoBGRA(unsigned char* src, unsigned char* dst, int numPix)
{
    numPix /= 8; //process 8 pixels at a time

    uint8x8_t alpha = vdup_n_u8 (0xff);

    for (int i=0; i<numPix; i++)
    {
        uint8x8x3_t rgb  = vld3_u8 (src);
        uint8x8x4_t bgra;

        bgra.val[0] = rgb.val[2]; //these lines are slow
        bgra.val[1] = rgb.val[1]; //these lines are slow 
        bgra.val[2] = rgb.val[0]; //these lines are slow

        bgra.val[3] = alpha;

        vst4_u8(dst, bgra);

        src += 8*3;
        dst += 8*4;
    }
}

gcc...

$ arm-linux-gnueabihf-gcc --version
arm-linux-gnueabihf-gcc (crosstool-NG linaro-1.13.1-2012.05-20120523 - Linaro GCC 2012.05) 4.7.1 20120514 (prerelease)
$ arm-linux-gnueabihf-gcc -std=c99 -O3 -c ~/temp/permute.c -marm -mfpu=neon-vfpv4 -mcpu=cortex-a9 -o ~/temp/permute_gcc.o

00000000 <neonPermuteRGBtoBGRA>:
   0:   e3520000    cmp r2, #0
   4:   e2823007    add r3, r2, #7
   8:   b1a02003    movlt   r2, r3
   c:   e92d01f0    push    {r4, r5, r6, r7, r8}
  10:   e1a021c2    asr r2, r2, #3
  14:   e24dd01c    sub sp, sp, #28
  18:   e3520000    cmp r2, #0
  1c:   da000019    ble 88 <neonPermuteRGBtoBGRA+0x88>
  20:   e3a03000    mov r3, #0
  24:   f460040d    vld3.8  {d16-d18}, [r0]!
  28:   eccd0b06    vstmia  sp, {d16-d18}
  2c:   e59dc014    ldr ip, [sp, #20]
  30:   e2833001    add r3, r3, #1
  34:   e59d6010    ldr r6, [sp, #16]
  38:   e1530002    cmp r3, r2
  3c:   e59d8008    ldr r8, [sp, #8]
  40:   e1a0500c    mov r5, ip
  44:   e59dc00c    ldr ip, [sp, #12]
  48:   e1a04006    mov r4, r6
  4c:   f3c73e1f    vmov.i8 d19, #255   ; 0xff
  50:   e1a06008    mov r6, r8
  54:   e59d8000    ldr r8, [sp]
  58:   e1a0700c    mov r7, ip
  5c:   e59dc004    ldr ip, [sp, #4]
  60:   ec454b34    vmov    d20, r4, r5
  64:   e1a04008    mov r4, r8
  68:   f26401b4    vorr    d16, d20, d20
  6c:   e1a0500c    mov r5, ip
  70:   ec476b35    vmov    d21, r6, r7
  74:   f26511b5    vorr    d17, d21, d21
  78:   ec454b34    vmov    d20, r4, r5
  7c:   f26421b4    vorr    d18, d20, d20
  80:   f441000d    vst4.8  {d16-d19}, [r1]!
  84:   1affffe6    bne 24 <neonPermuteRGBtoBGRA+0x24>
  88:   e28dd01c    add sp, sp, #28
  8c:   e8bd01f0    pop {r4, r5, r6, r7, r8}
  90:   e12fff1e    bx  lr

armcc...

$ armcc
ARM C/C++ Compiler, 5.01 [Build 113]
$ armcc --C99 --cpu=Cortex-A9 -O3 -c permute.c -o permute_arm.o

00000000 <neonPermuteRGBtoBGRA>:
   0:   e1a03fc2    asr r3, r2, #31
   4:   f3870e1f    vmov.i8 d0, #255    ; 0xff
   8:   e0822ea3    add r2, r2, r3, lsr #29
   c:   e1a031c2    asr r3, r2, #3
  10:   e3a02000    mov r2, #0
  14:   ea000006    b   34 <neonPermuteRGBtoBGRA+0x34>
  18:   f420440d    vld3.8  {d4-d6}, [r0]!
  1c:   e2822001    add r2, r2, #1
  20:   eeb01b45    vmov.f64    d1, d5
  24:   eeb02b46    vmov.f64    d2, d6
  28:   eeb05b40    vmov.f64    d5, d0
  2c:   eeb03b41    vmov.f64    d3, d1
  30:   f401200d    vst4.8  {d2-d5}, [r1]!
  34:   e1520003    cmp r2, r3
  38:   bafffff6    blt 18 <neonPermuteRGBtoBGRA+0x18>
  3c:   e12fff1e    bx  lr

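In this case armcc produces much better code. I think this justifies fgp's answer. Most of the time GCC will produce good enough code, but you should keep an eye on the critical parts and, most importantly, first measure/profile.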

+8

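If you use NEON intrinsics, the compiler shouldn't matter that much. Most (if not all) NEON intrinsics translate to a single NEON instruction, so the only things left to the compiler are register allocation and instruction scheduling. In my experience, both GCC 4.2 and Clang 3.1 do reasonably well at those tasks.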

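Note, however, that the NEON instructions are a bit more expressive than the NEON intrinsics. For example, the load/store NEON instructions have pre- and post-increment addressing modes, which combine a load or store with an increment of the address register, thus saving you one instruction. The NEON intrinsics don't provide an explicit way to do that, but instead rely on the compiler to combine a regular NEON load/store intrinsic and an address increment into a load/store instruction with post-increment. Similarly, some load/store instructions let you specify the alignment of the memory address, and execute faster if you give stricter alignment guarantees. The NEON intrinsics, again, don't let you specify alignment explicitly, but instead rely on the compiler to deduce the correct alignment specifier. In theory you can use "align" attributes on your pointers to give the compiler suitable hints, but, at least with Clang, those seem to be ignored...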

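In my experience, neither Clang nor GCC is very bright when it comes to these kinds of optimizations. Fortunately, the additional performance benefit of such optimizations usually isn't all that high - it's more like 10% than 100%.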

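Another area where these two compilers aren't particularly smart is avoiding stack spilling. If your code uses more vector-valued variables than there are NEON registers, I've seen both compilers produce horrible code. Basically, what they seem to do is schedule instructions on the assumption that enough registers are available. Register allocation seems to come afterwards, and simply spills values to the stack once it runs out of registers. So make sure your code has a working set of fewer than 16 128-bit vectors or 32 64-bit vectors at any time!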

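Overall, I've had pretty good results from both GCC and Clang, but I regularly had to reorganize the code a bit to avoid compiler idiosyncrasies. My advice would be to stick with GCC or Clang, but check the disassembly regularly.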

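So, overall, I'd say sticking with GCC is fine. You may want to look at the disassembly of the performance-critical parts, though, and check whether it looks reasonable.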

+7

All Articles