128-bit rotation using ARM Neon intrinsics

I am trying to optimize my code using Neon intrinsics. I have a 24-bit rotation over a 128-bit array (eight uint16_t elements).

Here is my c code:

// Scalar reference version of the rotation: each output word is assembled from
// two neighbouring input words.
// NOTE(review): temp is never filled in this snippet; presumably it is loaded elsewhere.
uint16_t rotated[8];
uint16_t temp[8];
uint16_t j;
for(j = 0; j < 8; j++)
{
     // Rotation <<< 24 over 128 bits, built per word as (x << shift) | (x >> (16 - shift)):
     // the low byte of temp[j+1] becomes the high byte of rotated[j] and the high byte
     // of temp[j+2] becomes its low byte; both indices wrap modulo 8.
     rotated[j] = ((temp[(j+1) % 8] << 8) & 0xffff) | ((temp[(j+2) % 8] >> 8) & 0x00ff);
}

I checked the GCC documentation on Neon intrinsics and it has no intrinsic for rotating a vector. I also tried to do this with vshlq_n_u16(temp, 8), but all bits shifted beyond the uint16_t word boundary are lost.

How can I achieve this using Neon intrinsics? By the way, is there any better documentation on the GCC Neon intrinsics?

+5
source share
3 answers

After some reading on Arm Community Blogs , I found this:

Neon Arm Bitwise Rotation

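VEXT: Extraction. VEXT extracts a new vector of bytes from a pair of existing vectors. The bytes in the new vector come from the top of the first operand and the bottom of the second operand, which lets you produce a new vector containing elements that straddle a pair of existing vectors. VEXT can be used to implement a moving window over data from two vectors, which is useful in FIR filters. For permutation, it can also simulate a byte-wise rotate when the same vector is used for both input operands.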

The following Neon GCC intrinsic does the same as the assembly shown in the picture:

uint16x8_t vextq_u16 (uint16x8_t, uint16x8_t, const int)

So the 24-bit rotation over a full 128-bit vector (not over each element) can be done as follows:

/* NOTE(review): input is never loaded in this snippet; presumably it is set elsewhere. */
uint16x8_t input;
uint16x8_t t0;
uint16x8_t t1;
uint16x8_t rotated;

/* t0 lane j = input lane (j+1) % 8: the vector rotated by one 16-bit lane. */
t0 = vextq_u16(input, input, 1);
/* Move each lane's low byte into its high byte; bits shifted out of a lane are dropped. */
t0 = vshlq_n_u16(t0, 8);
/* t1 lane j = input lane (j+2) % 8: the vector rotated by two 16-bit lanes. */
t1 = vextq_u16(input, input, 2);
/* Move each lane's high byte into its low byte. */
t1 = vshrq_n_u16(t1, 8);
/* Combine the two byte halves of every lane. */
rotated = vorrq_u16(t0, t1);
+6

I'm not 100% sure, but I don't think NEON has rotate instructions.

You can compose the rotation you need from a left shift, a right shift, and an OR, for example:

/*
 * Rotate an 8-bit value right by `rotation` bit positions.
 *
 * The count is reduced modulo 8, so any int is accepted: counts of 0 and 8
 * return `in` unchanged, and a negative count rotates left by the same
 * amount. Results for counts 0..8 are the same as the plain
 * (in >> n) | (in << (8 - n)) form, which has undefined behaviour once the
 * count falls outside that range (negative or oversized shift count).
 */
uint8_t ror(uint8_t in, int rotation)
{
    /* Conversion to unsigned is well defined for negative values. */
    const unsigned count = (unsigned)rotation & 7u;

    /* (8 - count) & 7 keeps the left-shift count in 0..7 when count is 0. */
    return (uint8_t)((in >> count) | (in << ((8u - count) & 7u)));
}

Then do the same with the Neon intrinsics for left shift, right shift, and OR.

/* NOTE(review): temp and rot are not set in this snippet; presumably they are filled elsewhere. */
uint16x8_t temp;
uint8_t rot;   /* per-lane left-rotate count, expected in 0..16 */

/*
 * Rotate every 16-bit lane of temp left by rot bits.
 * vshlq_n_u16 / vshrq_n_u16 only accept a compile-time immediate, so a
 * run-time count has to use the register form vshlq_u16, whose per-lane
 * count is signed: a negative count shifts right. Shifting left by rot and
 * by (rot - 16), i.e. right by (16 - rot), yields the two halves of the
 * rotate, which are then ORed together.
 */
uint16x8_t rotated =  vorrq_u16 ( vshlq_u16(temp, vdupq_n_s16((int16_t)rot)) ,
                                  vshlq_u16(temp, vdupq_n_s16((int16_t)(rot - 16))) );

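See http://en.wikipedia.org/wiki/Circular_shift, section "Implementing circular shifts".

This rotates the values inside each lane. If you want to rotate the lanes of the vector itself, use VEXT, as described in the other answer.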

+4

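Use vext.8 to concatenate the vector with itself and extract the 16-byte window you want (in this case offset by 3 bytes).

Doing this with intrinsics requires casts to keep the compiler happy, but it is still a single instruction: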

#include <arm_neon.h>

/*
 * Rotate the 16 bytes of `input` by three byte positions with one VEXT.
 *
 * The vector is viewed as 16 bytes and concatenated with itself; the result
 * is the 16-byte window that starts at byte 13 (16 - 3) of that pair, so
 * result byte i is input byte (i + 13) % 16. The reinterpret casts only
 * change the lane type seen by the compiler.
 */
uint16x8_t byterotate3(uint16x8_t input) {
    enum { VECTOR_BYTES = 16, ROTATE_BYTES = 3 };

    const uint8x16_t bytes = vreinterpretq_u8_u16(input);
    return vreinterpretq_u16_u8(
        vextq_u8(bytes, bytes, VECTOR_BYTES - ROTATE_BYTES));
}

g++ 5.4 with -O3 -march=armv7-a -mfloat-abi=hard -mfpu=neon (on Godbolt) compiles it to this:

byterotate3(__simd128_uint16_t):
    vext.8  q0, q0, q0, #13
    bx      lr

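The 16-3 should make this a left rotate by 3 bytes. (That is, the window holds 13 bytes from one copy of the vector and 3 bytes from the other, so it is equally a right rotate by 13.)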


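Related: x86 also has an instruction that takes a sliding window over the concatenation of two registers: palignr (added in SSSE3).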


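Maybe I am missing something about NEON, but I don't understand why the OP's self-answer uses vext.16 (vextq_u16), which has 16-bit granularity. It is not even a different instruction, just an alias for vext.8 that makes it impossible to use an odd byte count, so extra instructions are needed. The manual for vext.8 says: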

VEXT pseudo-instruction

You can specify a data type of 16, 32, or 64 instead of 8. In this case, #imm refers to halfwords, words, or doublewords instead of bytes, and the permitted ranges are correspondingly reduced.

+2
source

All Articles