, . , . SSE2, SSE3, SSE4, AVX AVX2, . . .
: 8008, 64000 2560 * 1920 = 4915200. . . vectorize8_unroll2 . vectorize8_unroll2_parallel. vec16_loop_unroll2_fix vec16_loop_unroll2_parallel_fix - , , , . AVX, AVX, SSE4 SSE2
, : "W * H 8, W H 8".
, W * H 16 . vectorize8_unroll2 , 16 ( = 8008 , , ). .
Ander Fog . lib dll. . OpenMP . :
Intel Xeon E5630 @2.53GHz (supports upto SSE4.2)
size 8008, size2 8032, iterations 1000000
default_loop time: 7.935 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000
size 64000, size2 64000, iterations 100000
default_loop time: 6.387 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 5.125 seconds, diff 0.000000
vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000
: AVX, GCC .
. , .
http://www.agner.org/optimize/#vectorclass. (vectorclass.h, instrset.h, vectorf128.h, vectorf256.h, vectorf256e.h, vectori128.h, vectori256.h, vectori256e.h) , . /D __SSE4_2__ ++/CommandLine. . AVX, /arch: AVX. OpenMP / ++.
In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -mSSE4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp
vec16_loop_unroll2_parallel , 32. 32 ( 2), , vec16_loop_unroll2_parallel_fix, . .
#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
inline void* aligned_malloc(size_t size, size_t align) {
void *result;
#ifdef _MSC_VER
result = _aligned_malloc(size, align);
#else
if(posix_memalign(&result, align, size)) result = 0;
#endif
return result;
}
inline void aligned_free(void *ptr) {
#ifdef _MSC_VER
_aligned_free(ptr);
#else
free(ptr);
#endif
}
void default_loop(float *destination, const unsigned short* source, float value, int size){
float factor = 1.0f/value;
for (int i = 0; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
float factor = 1.0f / value;
#pragma omp parallel for
for (int i = 0; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
for (; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for (; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
#pragma omp parallel for
for (int i=0; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for(int i = ROUND_DOWN(size, 32); i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 8)
{
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void copy_arrays(float* a, float*b, const int size) {
float sum = 0;
for(int i=0; i<size; i++) {
b[i] = a[i];
}
}
float compare_arrays(float* a, float*b, const int size) {
float sum = 0;
for(int i=0; i<size; i++) {
float diff = a[i] - b[i];
if(diff!=0) {
printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
break;
}
sum += diff;
}
return sum;
}
void randomize_array(unsigned short* a, const int size) {
for(int i=0; i<size; i++) {
float r = (float)rand()/RAND_MAX;
a[i] = (int)(65536*r);
}
}
void run(int size, int iterations) {
int rd = ROUND_DOWN(size, 32);
int size2 = rd == size ? size : rd + 32;
float value = 1.1f;
printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);
void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
fp[0] = default_loop;
fp[1] = vec8_loop;
fp[2] = vec8_loop_unroll2;
fp[3] = vec16_loop;
fp[4] = vec16_loop_unroll2;
fp[5] = vec16_loop_unroll2_fix;
fp[6] = vectorize8_unroll1;
fp[7] = vectorize8_unroll2;
fp[8] = default_loop_parallel;
fp[9] = vec8_loop_parallel;
fp[10] = vec8_loop_unroll2_parallel;
fp[11] = vec16_loop_parallel;
fp[12] = vec16_loop_unroll2_parallel;
fp[13] = vec16_loop_unroll2_parallel_fix;
fp[14] = vectorize8_unroll1_parallel;
fp[15] = vectorize8_unroll2_parallel;
char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
"default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
"vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};
randomize_array(source, size2);
copy_arrays(destination_old, destination_ref, size);
fp[0](destination_ref, source, value, size);
for(int i=0; i<16; i++) {
copy_arrays(destination_old, destination, size);
double dtime = omp_get_wtime();
for (int it = 0; it < iterations; it++){
fp[i](destination, source, value, size);
}
dtime = omp_get_wtime() - dtime;
float diff = compare_arrays(destination, destination_ref, size);
printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
}
printf("\n");
aligned_free(source);
aligned_free(destination);
aligned_free(destination_old);
aligned_free(destination_ref);
}
int main() {
run(8008, 1000000);
run(64000, 100000);
run(2560*1920, 1000);
}
GCC AVX. GCC (Visual Studio - , , int). . . 8008 OpenMP . , 128000 OpenMP . 4915200 , OpenMP .
i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
default_loop time: 1.319 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
vectorize8_unroll2 time: 1.227 seconds, diff 0.000000
vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000
size 128000, size2 128000, iterations 100000
default_loop time: 2.902 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
vectorize8_unroll2 time: 2.844 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 2.313 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000
vectorize8_unroll2 time: 2.318 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000