Optimization of nested loops

We have an assignment in which we are given a very inefficient program and must optimize the code so that it runs faster. I have most of the functions running pretty fast, with the exception of two, which bothers me a lot because they are VERY simple functions. One of them basically sets all the values in a two-dimensional array to the same value, and the other basically copies the values from one two-dimensional array into another. And that is the problem: they take the most time, but since they are so simple, I can't figure out how to speed them up without breaking them. And I know they can be made faster, because other students in the class are getting ridiculous speedups. These are the two functions in question:

/*
 * fSetArray - obtain a rows x cols handle from fMallocHandle and store
 * val into every (i, j) element of it.
 *
 * rows, cols: forwarded to fMallocHandle and used as the loop bounds.
 * val:        value assigned to each element through subsref.
 *
 * Returns the handle produced by fMallocHandle.
 *
 * The return type is spelled out explicitly: the function returns an
 * F2D pointer, and relying on implicit int is invalid since C99 and can
 * truncate the pointer on targets where int is narrower than a pointer.
 *
 * NOTE(review): the result of fMallocHandle is used without a check;
 * confirm whether it can fail and how callers expect that to be reported.
 */
F2D *fSetArray(int rows, int cols, float val)
{
    int i, j;
    F2D *out;

    out = fMallocHandle(rows, cols);

    /* Visit elements row by row, in the same order as before. */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            subsref(out, i, j) = val;
        }
    }

    return out;
}

- . , () (cols), val. ( ). - . cols rows , .

:

/*
 * fDeepCopy - create a new handle with the same height and width as `in`
 * and copy every (i, j) element of `in` into it.
 *
 * in: source handle; its height and width fields give the row and
 *     column counts.
 *
 * Returns the newly obtained handle holding the copied elements.
 *
 * The return type is spelled out explicitly: the function returns an
 * F2D pointer, and relying on implicit int is invalid since C99 and can
 * truncate the pointer on targets where int is narrower than a pointer.
 *
 * NOTE(review): neither `in` nor the result of fMallocHandle is checked
 * for NULL; confirm the callers' guarantees.
 */
F2D *fDeepCopy(F2D *in)
{
    int i, j;
    F2D *out;
    int rows, cols;

    rows = in->height;
    cols = in->width;

    out = fMallocHandle(rows, cols);

    /* Element-wise copy, row by row, in the same order as before. */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            subsref(out, i, j) = subsref(in, i, j);
        }
    }

    return out;
}

, , . , , , .

- , - , , , . . , .

, OpenMP, , . .

EDIT: , ! , !

+3
2

- . , , . , , , , , .

, , "", , ,

thread 0 works on the elements where (i*cols + j) % (thread count) == 0
thread 1 works on the elements where (i*cols + j) % (thread count) == 1
(and so on)

, , . ( ). , , .

/*
 * fDeepCopy - create a new handle with the same height and width as `in`
 * and copy every (i, j) element of `in` into it, with the column loop
 * unrolled four times.
 *
 * in: source handle; its height and width fields give the row and
 *     column counts.
 *
 * Returns the newly obtained handle holding the copied elements.
 *
 * Fixes relative to the earlier unrolled version:
 *  - the remainder loop now runs up to `cols`; it previously tested
 *    `j < pads`, which is already false when the unrolled loop exits,
 *    so the last cols % 4 columns of each row were never copied;
 *  - the unused outer `j` (shadowed by the per-row `j`) is removed;
 *  - `pads` depends only on `cols`, so it is computed once instead of
 *    once per row;
 *  - the return type is explicit rather than implicit int.
 *
 * NOTE(review): neither `in` nor the result of fMallocHandle is checked
 * for NULL; confirm the callers' guarantees.
 */
F2D *fDeepCopy(F2D *in)
{
    int i;
    F2D *out;
    int rows, cols;
    int pads;

    rows = in->height;
    cols = in->width;

    out = fMallocHandle(rows, cols);

    /* Largest multiple of 4 that does not exceed cols: the unrolled
     * loop stops here so that j+3 never reaches past the row. */
    pads = (cols / 4) * 4;

    for (i = 0; i < rows; i++) {
        int j = 0;

        /* Four columns per iteration. */
        for (; j < pads; j += 4) {
            subsref(out, i, j)     = subsref(in, i, j);
            subsref(out, i, j + 1) = subsref(in, i, j + 1);
            subsref(out, i, j + 2) = subsref(in, i, j + 2);
            subsref(out, i, j + 3) = subsref(in, i, j + 3);
        }

        /* Remaining cols % 4 columns, one at a time. */
        for (; j < cols; j++) {
            subsref(out, i, j) = subsref(in, i, j);
        }
    }

    return out;
}

, , , , ​​ , .

, SSE, , MMX, , , 32- MMX , .

, -

// Scalar form: one separate addition per component (x, y, z, w).
vec_res.x = v1.x + v2.x;
vec_res.y = v1.y + v2.y;
vec_res.z = v1.z + v2.z;
vec_res.w = v1.w + v2.w;

, (16 ),

; Packed single-precision add: all four components (x, y, z, w) are
; processed by one instruction instead of four scalar additions.
; movaps is the aligned move, and addps with a memory operand has the same
; requirement, so v1, v2 and vec_res must be known to be 16-byte aligned
; (use movups where that is not guaranteed).
movaps xmm0, [v1]          ;xmm0 = v1.w | v1.z | v1.y | v1.x
addps xmm0, [v2]           ;xmm0 = v1.w+v2.w | v1.z+v2.z | v1.y+v2.y | v1.x+v2.x
movaps [vec_res], xmm0     ;vec_res = xmm0 (all four sums stored at once)

, .

, , , .

+1

memset .

0

All Articles