I don't know Fortran, but when told to optimize, C compilers are usually smart enough to do this. Here's what GCC does with the following equivalent C code:
/*
 * Overwrite x in place, element by element:
 *   x[k] = y[k]                 when alpha compares equal to zero
 *   x[k] = alpha * x[k] + y[k]  otherwise
 *
 * n     - number of elements to process (nothing is done when n <= 0)
 * x     - input/output array of at least n floats
 * y     - input array of at least n floats
 * alpha - scale factor applied to x
 *
 * The test on alpha does not depend on the loop index; it is deliberately
 * left inside the loop so that the compiler's handling of it can be observed.
 */
void foo(int n, float x[], float y[], float alpha)
{
    int k;

    for (k = 0; k < n; k++) {
        if (alpha != 0.0) {
            x[k] = alpha * x[k] + y[k];
        } else {
            x[k] = y[k];
        }
    }
}
Here are two snippets of the compiled code at optimization level 3 (-O3). The comments are mine.
Vectorized loop without multiplication:
; Vectorized loop for the alpha == 0 case: x[i] = y[i], four floats per iteration.
; Register roles as used below: %edi = n, %rsi = x (stored to), %rdx = y (loaded from),
; %rax = byte offset into both arrays, %ecx = vector-iteration counter, %r8d = n >> 2.
movl %edi, %r8d ; r8d = n
xorps %xmm1, %xmm1 ; xmm1 = 0, kept as a zero source for the loop
shrl $2, %r8d ; r8d = n >> 2 = number of 4-float iterations
xorl %eax, %eax ; eax = 0: byte offset into x and y
xorl %ecx, %ecx ; ecx = 0: vector-iteration counter (i >> 2)
leal 0(,%r8,4), %r9d ; r9d = 4*(n>>2); not used in this excerpt (presumably consumed by the scalar tail)
.L19:
movaps %xmm1, %xmm0 ; xmm0 = 0
addl $1, %ecx ; counter++
movlps (%rdx,%rax), %xmm0 ; xmm0 low half  = y[i], y[i+1]
movhps 8(%rdx,%rax), %xmm0 ; xmm0 high half = y[i+2], y[i+3]
movlps %xmm0, (%rsi,%rax) ; x[i], x[i+1]     = xmm0 low half
movhps %xmm0, 8(%rsi,%rax) ; x[i+2], x[i+3]   = xmm0 high half
addq $16, %rax ; advance the byte offset by 16 (four floats)
cmpl %r8d, %ecx ; counter vs. n >> 2 (unsigned compare)
jb .L19 ; loop back to .L19 while counter < n >> 2
; otherwise fall through to the non-vectorized code that finishes the remaining elements (not shown)
Vectorized loop with multiplication:
; Vectorized loop for the alpha != 0 case: x[i] = alpha*x[i] + y[i], four floats per iteration.
; Register roles as used below: %edi = n, %rsi = x (loaded and stored), %rdx = y (loaded),
; %xmm0 = alpha on entry, %xmm4 = alpha in all four lanes, %rax = byte offset into both arrays,
; %ecx = vector-iteration counter, %r8d = n >> 2.
movaps %xmm0, %xmm4 ; xmm4 = alpha (copied from xmm0)
movl %edi, %r8d ; r8d = n
shrl $2, %r8d ; r8d = n >> 2 = number of 4-float iterations
xorps %xmm3, %xmm3 ; xmm3 = 0, kept as a zero source for the loop
shufps $0, %xmm4, %xmm4 ; broadcast lane 0 of xmm4 (alpha) to all four lanes
leal 0(,%r8,4), %r9d ; r9d = 4*(n>>2); not used in this excerpt (presumably consumed by the scalar tail)
xorl %eax, %eax ; eax = 0: byte offset into x and y
xorl %ecx, %ecx ; ecx = 0: vector-iteration counter (i >> 2)
.L11:
movaps %xmm3, %xmm1 ; xmm1 = 0
addl $1, %ecx ; counter++
movaps %xmm3, %xmm2 ; xmm2 = 0
movlps (%rsi,%rax), %xmm1 ; xmm1 low half  = x[i], x[i+1]
movlps (%rdx,%rax), %xmm2 ; xmm2 low half  = y[i], y[i+1]
movhps 8(%rsi,%rax), %xmm1 ; xmm1 high half = x[i+2], x[i+3]
movhps 8(%rdx,%rax), %xmm2 ; xmm2 high half = y[i+2], y[i+3]
mulps %xmm4, %xmm1 ; xmm1 = alpha * x (all four lanes)
addps %xmm2, %xmm1 ; xmm1 = alpha * x + y
movlps %xmm1, (%rsi,%rax) ; x[i], x[i+1]     = xmm1 low half
movhps %xmm1, 8(%rsi,%rax) ; x[i+2], x[i+3]   = xmm1 high half
addq $16, %rax ; advance the byte offset by 16 (four floats)
cmpl %r8d, %ecx ; counter vs. n >> 2 (unsigned compare)
jb .L11 ; loop back to .L11 while counter < n >> 2
; otherwise fall through to the non-vectorized code that finishes the remaining elements (not shown)
- , -, n 4. , , . , Fortran .
SO , , , , , , . ,
for i in 1..n for j in 1..n
for j in 1..n for i in 1..n
: , , , http://en.wikipedia.org/wiki/Polytope_model