Loop cycle optimization in c

I looked online and in my books, but I can’t understand it. I was asked to optimize a small part of the program. In particular, to take an array and add its contents over a short period of time, with vi and gcc, without using the built-in optimizer. I tried a loop reversal and a couple of other product optimizations. Could you help me?

int length = ARRAY_SIZE;
int limit = length-4;
for (j=0; j < limit; j+=5) {
    sum += array[j] + array[j+1] + array[j+2] + array[j+3] + array[j+4];
}
for(; j < length; j++){
    sum += array[j];    
}

The array values ​​are volatile intand all values ​​have been initialized.

+3
source share
7 answers

Create sub-units, which are then summed up to the sum.

Here is the basic version of what it might look like

for (j=0; j < limit; j+=4) {
    sum1 += array[j];
    sum2 += array[j+1];
    sum3 += array[j+2];
    sum4 += array[j+3];
}
sum = sum1 + sum2 + sum3 + sum4;

, .. sum2 sum1, .

+9

sse/mmx set:

__m128i sum;
for (j=0; j < limit; j+=4) {
    sum = _mm_add_epi32(sum, array+j);
}
+3

, 5.

, .

:

int* p = array;
for (j = 0; j < ARRAY_SIZE - 4; j += 5, p += 5){
  sum += p[0] + p[1] + p[2] + p[3] + p[4];
}

( j sizeof(int) ).

: , ARRAY_SIZE , , , , ( ), :

sum += array[0];
sum += array[1];
...
sum += array[ARRAY_SIZE - 1];

, ARRAY_SIZE 2, 64, :

#define FOO64(i) FOO32(i); FOO32((i)+32)
#define FOO32(i) FOO16(i); FOO16((i)+16)
#define FOO16(i) FOO8(i); FOO8((i)+8)
#define FOO8(i) FOO4(i); FOO4((i)+4)
#define FOO4(i) FOO2(i); FOO2((i)+2)
#define FOO2(i) FOO1(i); FOO1((i)+1)
#define FOO1(i) sum += array[i]

FOO64(0);

, , 10.

+1

. , , , , , .

0

, , , , , "" :-) , - , , , 0,01% , , 20%.

, .

, " ", , .


( , " " ), . "" . :

def initArray (sz):
    allocate data as sz+1 integers
    foreach i 0 thru sz:
        set data[i] to 0

def killArray(data):
    free data

def getArray (data,indx):
    return data[indx+1]

def setArray (data,indx,val):
    data[0] = data[0] - data[indx] + val
    data[indx+1] = val

def sumArray(data):
    return data[0]

.


C , :

#include <stdio.h>
#include <stdlib.h>

static int *initArray (int sz) {
    int i;
    int *ret = malloc (sizeof (int) * (sz + 1));
    for (i = 0; i <= sz; i++)
        ret[i] = 0;
    return ret;
}

static void killArray(int *data) {
    free (data);
}

static int getArray (int *data, int indx) {
    return data[indx+1];
}

static void setArray (int *data, int indx, int val) {
    data[0] = data[0] - data[indx] + val;
    data[indx+1] = val;
}

static int sumArray (int *data) {
    return data[0];
}

 

int main (void) {
    int i;
    int *mydata = initArray (10);
    if (mydata != NULL) {
        setArray (mydata, 5, 27);
        setArray (mydata, 9, -7);
        setArray (mydata, 7, 42);
        for (i = 0; i < 10; i++)
            printf ("Element %d is %3d\n", i, getArray (mydata, i));
        printf ("Sum is %3d\n", sumArray (mydata));
    }
    killArray (mydata);
    return 0;
}

:

Element 0 is   0
Element 1 is   0
Element 2 is   0
Element 3 is   0
Element 4 is   0
Element 5 is  27
Element 6 is   0
Element 7 is  42
Element 8 is   0
Element 9 is  -7
Sum is  62

, , , , .


, , : , -bounds, indx .

0

, , . 2, . , . - , , . , , .

int sum1, sum2, sum3, sum4;

for(j = ARRAY_SIZE; j%5; j--){
    sum += array[j]; 
}
sum1 = sum2 = sum3 = sum4 = 0;
for (; j; j-=5) {
    sum += array[j-1];
    sum1 += array[j-2];
    sum2 += array[j-3];
    sum3 += array[j-4];
    sum4 += array[j-5];
}
sum += sum1+sum2+sum3+sum4;
0

You can increase productivity by pre-programming the data inside the loop. I will describe Drew's answer:

register int value1, value2, value3, value4;
or (j=0; j < limit; j+=4)
{
    // Prefetch the data
    value1 = array[j];
    value2 = array[j + 1];
    value3 = array[j + 2];
    value4 = array[j + 4];

    // Use the prefetched data
    sum1 += value1;
    sum2 += value2;
    sum3 += value3;
    sum4 += value4;
}
sum = sum1 + sum2 + sum3 + sum4;

The idea here is that the processor loads continuous data into the cache and then works with cached data. For this to be effective, the compiler does not have to optimize prefetching; this can be done by declaring temporary variables as volatile. I do not know if it can volatilebe combined with register.

Search the Internet for "Data driven design."

0
source

All Articles