I am writing inline asm with gcc. Almost everything works, except for one behavior that puzzles me. I evaluate a rational polynomial that requires 80-bit constants. The generated code looks correct, but at run time one of the 80-bit coefficients is loaded into the FPU as 0, even though its bytes in memory are non-zero (and I believe it is a valid 80-bit real, since the exact same constant loads fine when the code is generated by masm). Here is the gdb session:
(gdb) disassemble
Dump of assembler code for function poly4(double):
0x00402d7c <+0>: push %ebp
0x00402d7d <+1>: mov %esp,%ebp
0x00402d7f <+3>: sub $0x8,%esp
0x00402d82 <+6>: mov 0x8(%ebp),%eax
0x00402d85 <+9>: mov %eax,-0x8(%ebp)
0x00402d88 <+12>: mov 0xc(%ebp),%eax
0x00402d8b <+15>: mov %eax,-0x4(%ebp)
0x00402d8e <+18>: fld1
0x00402d90 <+20>: fldl -0x8(%ebp)
0x00402d93 <+23>: fmul %st(0),%st
0x00402d95 <+25>: fdivrp %st,%st(1)
0x00402d97 <+27>: fldt 0x40470e
0x00402d9d <+33>: fadd %st(1),%st
0x00402d9f <+35>: fmul %st(1),%st
0x00402da1 <+37>: fldt 0x404704
0x00402da7 <+43>: faddp %st,%st(1)
0x00402da9 <+45>: fmul %st(1),%st
0x00402dab <+47>: fldt 0x4046fa
0x00402db1 <+53>: faddp %st,%st(1)
0x00402db3 <+55>: fmul %st(1),%st
0x00402db5 <+57>: fldt 0x4046f0
0x00402dbb <+63>: faddp %st,%st(1)
0x00402dbd <+65>: fmul %st(1),%st
=> 0x00402dbf <+67>: fldt 0x4046e6
0x00402dc5 <+73>: faddp %st,%st(1)
...snip....
End of assembler dump.
(gdb) info registers st0 st1 st2 st3 st4 st5
st0 2.7412088761933612e-006 (raw 0x3fecb7f59c22579f9f60)
st1 0.00071574511983807409 (raw 0x3ff4bba0d78724c01468)
st2 <invalid float value> (raw 0x00077c81cc3b0002021e)
st3 <invalid float value> (raw 0x00020098007c00f8f0c0)
st4 0 (raw 0x000013af076300003654)
st5 <invalid float value> (raw 0x0762000000000002021e)
(gdb) x/5xh 0x4046e6
0x4046e6 <_ZL11s_NORMAL_q5>: 0x8996 0xa5d6 0x3d00 0x990a 0x3ff1
(gdb) stepi
0x00402dc5 1577 );
(gdb) info registers st0 st1 st2 st3 st4 st5
st0 0 (raw 0x00000000000000000000)
st1 2.7412088761933612e-006 (raw 0x3fecb7f59c22579f9f60)
st2 0.00071574511983807409 (raw 0x3ff4bba0d78724c01468)
st3 <invalid float value> (raw 0x00077c81cc3b0002021e)
st4 <invalid float value> (raw 0x00020098007c00f8f0c0)
st5 0 (raw 0x000013af076300003654)
(gdb) disassemble
Dump of assembler code for function poly4(double):
0x00402d7c <+0>: push %ebp
0x00402d7d <+1>: mov %esp,%ebp
0x00402d7f <+3>: sub $0x8,%esp
0x00402d82 <+6>: mov 0x8(%ebp),%eax
0x00402d85 <+9>: mov %eax,-0x8(%ebp)
0x00402d88 <+12>: mov 0xc(%ebp),%eax
0x00402d8b <+15>: mov %eax,-0x4(%ebp)
0x00402d8e <+18>: fld1
0x00402d90 <+20>: fldl -0x8(%ebp)
0x00402d93 <+23>: fmul %st(0),%st
0x00402d95 <+25>: fdivrp %st,%st(1)
0x00402d97 <+27>: fldt 0x40470e
0x00402d9d <+33>: fadd %st(1),%st
0x00402d9f <+35>: fmul %st(1),%st
0x00402da1 <+37>: fldt 0x404704
0x00402da7 <+43>: faddp %st,%st(1)
0x00402da9 <+45>: fmul %st(1),%st
0x00402dab <+47>: fldt 0x4046fa
0x00402db1 <+53>: faddp %st,%st(1)
0x00402db3 <+55>: fmul %st(1),%st
0x00402db5 <+57>: fldt 0x4046f0
0x00402dbb <+63>: faddp %st,%st(1)
0x00402dbd <+65>: fmul %st(1),%st
0x00402dbf <+67>: fldt 0x4046e6
=> 0x00402dc5 <+73>: faddp %st,%st(1)
...snip...
End of assembler dump.
(gdb)
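So, the problem: before executing "fldt 0x4046e6", the memory at 0x4046e6 holds a non-zero value, as shown above. Yet after "fldt 0x4046e6" executes, st0 contains zero. The other fldt instructions load their constants correctly; (the same constants also load correctly in the masm build).
Here are the constants: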
// Coefficients of the rational polynomial evaluated by poly4: the s_NORMAL_p*
// values feed the numerator chain and the s_NORMAL_q* values feed the
// denominator chain.
//
// Each initializer lists ten bytes in memory order (lowest address first).
// Read back-to-front they form the 80-bit x87 extended-precision pattern that
// fldt loads; e.g. s_NORMAL_q4 below is 0x3ff6f7e2148bebef37f8 as a raw
// register value, so the final two bytes are the sign/exponent word.
//
// NOTE(review): the Double80 type is declared elsewhere; it is presumably a
// 10-byte array type (poly4 passes *s_NORMAL_xx as an "m" operand) -- confirm.
Double80 s_NORMAL_p5 = { 0xE0, 0x14, 0x24, 0x6E, 0x43, 0x6C, 0x37, 0xF4, 0xEF, 0x3F};
Double80 s_NORMAL_p4 = { 0x74, 0x5B, 0x7C, 0x72, 0xE2, 0x9F, 0x55, 0xBA, 0xF5, 0x3F};
Double80 s_NORMAL_p3 = { 0x3B, 0xD1, 0x83, 0xB3, 0xE8, 0xC1, 0x26, 0xB6, 0xF9, 0x3F};
Double80 s_NORMAL_p2 = { 0x4B, 0xA2, 0x6C, 0x9F, 0x32, 0x73, 0x75, 0x82, 0xFC, 0x3F};
Double80 s_NORMAL_p1 = { 0x49, 0xDC, 0x10, 0x22, 0x5C, 0x81, 0x14, 0xDD, 0xFC, 0x3F};
Double80 s_NORMAL_p0 = { 0x3E, 0xCE, 0xA6, 0x2B, 0xB9, 0x83, 0x04, 0xBD, 0xF9, 0x3F};
Double80 s_NORMAL_q5 = { 0x96, 0x89, 0xD6, 0xA5, 0x00, 0x3D, 0x0A, 0x99, 0xF1, 0x3F};
Double80 s_NORMAL_q4 = { 0xF8, 0x37, 0xEF, 0xEB, 0x8B, 0x14, 0xE2, 0xF7, 0xF6, 0x3F};
Double80 s_NORMAL_q3 = { 0x35, 0xC5, 0x61, 0x91, 0xF0, 0xC9, 0x24, 0x87, 0xFB, 0x3F};
Double80 s_NORMAL_q2 = { 0xCC, 0x68, 0x85, 0xAF, 0x42, 0xEB, 0xBC, 0xEF, 0xFD, 0x3F};
Double80 s_NORMAL_q1 = { 0xF3, 0xDB, 0x06, 0x40, 0x84, 0xA2, 0x62, 0xA4, 0xFF, 0x3F};
poly4:
// Evaluates a rational polynomial with 80-bit coefficients on the x87 FPU,
// combines it with s_oneOverRootTwoPi and y, and multiplies the outcome by the
// value exp_X2_2(y) leaves in st(0).
//
// Target: 32-bit x86, GCC extended asm, AT&T syntax. The argument of
// exp_X2_2 is passed as a double on the stack and its result is taken from
// st(0), exactly as the instruction sequence below already assumes.
//
// Parameters:
//   y - input value; loaded twice from memory with fldl.
// Returns:
//   the value left in st(0) by the asm sequence, delivered through the "=t"
//   output operand.
//
// Differences from the earlier revision of this function:
//   * The asm no longer executes its own "leave; ret". Returning from inside
//     an asm statement bypasses the compiler-generated epilogue and, because
//     this function is `inline`, would return from the *caller* whenever the
//     body is inlined.
//   * The result is handed to the compiler through an "=t" (st(0)) output and
//     returned with a real `return` statement.
//   * Everything the asm destroys is declared: the remaining x87 stack slots
//     (the sequence itself is up to four deep and the callee may use more),
//     the call-clobbered integer registers, the flags and memory.
//
// NOTE(review): GNU as has a documented AT&T-syntax quirk that swaps
// fdiv/fdivr (and fsub/fsubr) when the destination is not st(0); the gdb
// disassembly of this build shows the fdivp/fdiv lines encoded as
// fdivrp/fdivr. The mnemonics are kept exactly as they were -- confirm the
// operand order against the intended formula.
inline long double poly4(double y)
{
    long double result;

    __asm__ __volatile__(
    // ---- z = quotient of 1.0 and y*y (kept on the stack below everything) --
    "\n\t" "fld1"
    "\n\t" "fldl %[y]"
    "\n\t" "fmul %%st(0), %%st(0)"          // st(0) = y * y
    "\n\t" "fdivp %%st(0), %%st(1)"         // divide and pop -> st(0) = z
    // ---- Q = ((((q1 + z)*z + q2)*z + q3)*z + q4)*z + q5, z read from st(1) -
    "\n\t" "fldt %[s_NORMAL_q1]"
    "\n\t" "fadd %%st(1), %%st(0)"
    "\n\t" "fmul %%st(1), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_q2]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(1), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_q3]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(1), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_q4]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(1), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_q5]"
    "\n\t" "faddp %%st(0), %%st(1)"         // stack: Q, z
    // ---- P = ((((p0*z + p1)*z + p2)*z + p3)*z + p4)*z + p5, z now in st(2) -
    "\n\t" "fldt %[s_NORMAL_p0]"
    "\n\t" "fmul %%st(2), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_p1]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(2), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_p2]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(2), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_p3]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(2), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_p4]"
    "\n\t" "faddp %%st(0), %%st(1)"
    "\n\t" "fmul %%st(2), %%st(0)"
    "\n\t" "fldt %[s_NORMAL_p5]"
    "\n\t" "faddp %%st(0), %%st(1)"         // stack: P, Q, z
    // ---- fold P, Q and z into a single value --------------------------------
    "\n\t" "fmulp %%st(0), %%st(2)"         // st(2) = z * P, pop -> Q, z*P
    "\n\t" "fdivp %%st(0), %%st(1)"         // divide and pop -> one value left
    "\n\t" "fldt %[s_oneOverRootTwoPi]"
    "\n\t" "fsubrp %%st(0), %%st(1)"        // subtract and pop -> one value left
    "\n\t" "fldl %[y]"
    "\n\t" "fdiv %%st(0), %%st(1)"          // st(1) combined with y; y stays in st(0)
    // ---- call exp_X2_2(y): y is stored as the stack argument ----------------
    "\n\t" "sub $8, %%esp"                  // room for one double argument
    "\n\t" "fstpl (%%esp)"                  // pop y into the argument slot
    "\n\t" "call (%P[exp_X2_2])"            // result expected in st(0)
    "\n\t" "add $8, %%esp"                  // caller removes the argument
    "\n\t" "fmulp %%st(0), %%st(1)"         // final product, left in st(0)
    : "=t" (result)
    : [y] "m" (y)
    , [s_oneOverRootTwoPi] "m" (*s_oneOverRootTwoPi)
    , [s_NORMAL_p0] "m" (*s_NORMAL_p0)
    , [s_NORMAL_p1] "m" (*s_NORMAL_p1)
    , [s_NORMAL_p2] "m" (*s_NORMAL_p2)
    , [s_NORMAL_p3] "m" (*s_NORMAL_p3)
    , [s_NORMAL_p4] "m" (*s_NORMAL_p4)
    , [s_NORMAL_p5] "m" (*s_NORMAL_p5)
    , [s_NORMAL_q1] "m" (*s_NORMAL_q1)
    , [s_NORMAL_q2] "m" (*s_NORMAL_q2)
    , [s_NORMAL_q3] "m" (*s_NORMAL_q3)
    , [s_NORMAL_q4] "m" (*s_NORMAL_q4)
    , [s_NORMAL_q5] "m" (*s_NORMAL_q5)
    , [exp_X2_2] "i" (exp_X2_2)
    : "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
    , "eax", "ecx", "edx", "cc", "memory"
    );

    return result;
}
FPU state just before the failing fldt:
(gdb) info float
R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409
=>R6: Valid 0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
R5: Empty 0x3ff6f7e2148bebef37f8
R4: Empty 0x000000020a0d00000007
R3: Empty 0xf1be000000000002021e
R2: Empty 0x00001697f1bf00003654
R1: Empty 0x00020098007c00f8f0c0
R0: Empty 0x00077c81cc3b0002021e
Status Word: 0xffff3320 PE C0 C1
TOP: 6
Control Word: 0xffff037f IM DM ZM OM UM PM
PC: Extended Precision (64-bits)
RC: Round to nearest
Tag Word: 0xffff0fff
Instruction Pointer: 0x1b:0x00402dbd
Operand Pointer: 0xffff0023:0x004046f0
Opcode: 0xd8c9
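Note that the "C1" flag is set in the status word - I do not know whether that matters.
FPU state after the fldt (one stepi):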
(gdb) stepi
0x00402dc5 1485 );
(gdb) info float
R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409
R6: Valid 0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
=>R5: Zero 0x00000000000000000000 +0
R4: Empty 0x000000020a0d00000007
R3: Empty 0xf1be000000000002021e
R2: Empty 0x00001697f1bf00003654
R1: Empty 0x00020098007c00f8f0c0
R0: Empty 0x00077c81cc3b0002021e
Status Word: 0xffff2920 PE C0
TOP: 5
Control Word: 0xffff037f IM DM ZM OM UM PM
PC: Extended Precision (64-bits)
RC: Round to nearest
Tag Word: 0xffff07ff
Instruction Pointer: 0x1b:0x00402dbf
Operand Pointer: 0xffff0023:0x0040cce6
Opcode: 0xdb2d
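To investigate further, I changed the code so that the fldt instructions at 0x00402db5 and 0x00402dbf load the same constant. The first one loads it correctly, the second one still loads zero. Below is the gdb session, with the fpu state shown around both loads. The only difference I can see is that the C1 flag is set before the failing fldt: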
(gdb) disassemble
Dump of assembler code for function poly4(double):
0x00402d7c <+0>: push %ebp
0x00402d7d <+1>: mov %esp,%ebp
0x00402d7f <+3>: sub $0x8,%esp
0x00402d82 <+6>: mov 0x8(%ebp),%eax
0x00402d85 <+9>: mov %eax,-0x8(%ebp)
0x00402d88 <+12>: mov 0xc(%ebp),%eax
0x00402d8b <+15>: mov %eax,-0x4(%ebp)
=> 0x00402d8e <+18>: fld1
0x00402d90 <+20>: fldl -0x8(%ebp)
0x00402d93 <+23>: fmul %st(0),%st
0x00402d95 <+25>: fdivrp %st,%st(1)
0x00402d97 <+27>: fldt 0x40470e
0x00402d9d <+33>: fadd %st(1),%st
0x00402d9f <+35>: fmul %st(1),%st
0x00402da1 <+37>: fldt 0x404704
0x00402da7 <+43>: faddp %st,%st(1)
0x00402da9 <+45>: fmul %st(1),%st
0x00402dab <+47>: fldt 0x4046fa
0x00402db1 <+53>: faddp %st,%st(1)
0x00402db3 <+55>: fmul %st(1),%st
0x00402db5 <+57>: fldt 0x4046f0
0x00402dbb <+63>: faddp %st,%st(1)
0x00402dbd <+65>: fmul %st(1),%st
0x00402dbf <+67>: fldt 0x4046f0
0x00402dc5 <+73>: faddp %st,%st(1)
0x00402dc7 <+75>: fldt 0x4046dc
0x00402dcd <+81>: fmul %st(2),%st
0x00402dcf <+83>: fldt 0x4046d2
0x00402dd5 <+89>: faddp %st,%st(1)
0x00402dd7 <+91>: fmul %st(2),%st
0x00402dd9 <+93>: fldt 0x4046c8
0x00402ddf <+99>: faddp %st,%st(1)
0x00402de1 <+101>: fmul %st(2),%st
0x00402de3 <+103>: fldt 0x4046be
0x00402de9 <+109>: faddp %st,%st(1)
0x00402deb <+111>: fmul %st(2),%st
0x00402ded <+113>: fldt 0x4046b4
0x00402df3 <+119>: faddp %st,%st(1)
0x00402df5 <+121>: fmul %st(2),%st
0x00402df7 <+123>: fldt 0x4046aa
0x00402dfd <+129>: faddp %st,%st(1)
0x00402dff <+131>: fmulp %st,%st(2)
0x00402e01 <+133>: fdivrp %st,%st(1)
0x00402e03 <+135>: fldt 0x40408e
0x00402e09 <+141>: fsubrp %st,%st(1)
0x00402e0b <+143>: fldl -0x8(%ebp)
0x00402e0e <+146>: fdivr %st,%st(1)
0x00402e10 <+148>: sub $0x8,%esp
0x00402e13 <+151>: fstpl (%esp)
0x00402e16 <+154>: fwait
0x00402e17 <+155>: call 0x4013c0 <exp_X2_2(double)>
0x00402e1c <+160>: add $0x8,%esp
0x00402e1f <+163>: fmulp %st,%st(1)
0x00402e21 <+165>: fstl 0x406020
0x00402e27 <+171>: fld %st(0)
0x00402e29 <+173>: fsubl 0x406020
0x00402e2f <+179>: fildll 0x403020
0x00402e35 <+185>: fmulp %st,%st(1)
0x00402e37 <+187>: fstpl 0x406020
0x00402e3d <+193>: leave
0x00402e3e <+194>: ret
0x00402e3f <+195>: flds 0x40472c
0x00402e45 <+201>: leave
0x00402e46 <+202>: ret
End of assembler dump.
(gdb) tbreak *0x00402db5
Temporary breakpoint 61 at 0x402db5: file cody2.cpp, line 1489.
(gdb) continue
Continuing.
Temporary breakpoint 61, 0x00402db5 in poly4 (y=37.37840817302294) at cody2.cpp:1489
1489 );
(gdb) info float
R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409
=>R6: Valid 0x3ff0c71ba235b8f6a603 +4.7471033066735141e-005
R5: Empty 0x3ffb8724c9f09161c535
R4: Empty 0xf13d00000a0d00000007
R3: Empty 0x07ec000000000002021e
R2: Empty 0x000016cbc40900003654
R1: Empty 0x00020098007c00f8f0c0
R0: Empty 0x00077c81cc3b0002021e
Status Word: 0xffff3120 PE C0
TOP: 6
Control Word: 0xffff037f IM DM ZM OM UM PM
PC: Extended Precision (64-bits)
RC: Round to nearest
Tag Word: 0xffff0fff
Instruction Pointer: 0x1b:0x00402db3
Operand Pointer: 0xffff0023:0x004046fa
Opcode: 0xd8c9
(gdb) stepi
0x00402dbb 1489 );
(gdb) info float
R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409
R6: Valid 0x3ff0c71ba235b8f6a603 +4.7471033066735141e-005
=>R5: Valid 0x3ff6f7e2148bebef37f8 +0.0037823963320275824
R4: Empty 0xf13d00000a0d00000007
R3: Empty 0x07ec000000000002021e
R2: Empty 0x000016cbc40900003654
R1: Empty 0x00020098007c00f8f0c0
R0: Empty 0x00077c81cc3b0002021e
Status Word: 0xffff2920 PE C0
TOP: 5
Control Word: 0xffff037f IM DM ZM OM UM PM
PC: Extended Precision (64-bits)
RC: Round to nearest
Tag Word: 0xffff03ff
Instruction Pointer: 0x1b:0x00402db5
Operand Pointer: 0xffff0023:0x004046f0
Opcode: 0xdb2d
(gdb) stepi
0x00402dbd 1489 );
(gdb) stepi
0x00402dbf 1489 );
(gdb) info float
R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409
=>R6: Valid 0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
R5: Empty 0x3ff6f7e2148bebef37f8
R4: Empty 0xf13d00000a0d00000007
R3: Empty 0x07ec000000000002021e
R2: Empty 0x000016cbc40900003654
R1: Empty 0x00020098007c00f8f0c0
R0: Empty 0x00077c81cc3b0002021e
Status Word: 0xffff3320 PE C0 C1
TOP: 6
Control Word: 0xffff037f IM DM ZM OM UM PM
PC: Extended Precision (64-bits)
RC: Round to nearest
Tag Word: 0xffff0fff
Instruction Pointer: 0x1b:0x00402dbd
Operand Pointer: 0xffff0023:0x004046f0
Opcode: 0xd8c9
(gdb) stepi
0x00402dc5 1489 );
(gdb) info float
R7: Valid 0x3ff4bba0d78724c01468 +0.00071574511983807409
R6: Valid 0x3fecb7f59c22579f9f60 +2.7412088761933612e-006
=>R5: Zero 0x00000000000000000000 +0
R4: Empty 0xf13d00000a0d00000007
R3: Empty 0x07ec000000000002021e
R2: Empty 0x000016cbc40900003654
R1: Empty 0x00020098007c00f8f0c0
R0: Empty 0x00077c81cc3b0002021e
Status Word: 0xffff2920 PE C0
TOP: 5
Control Word: 0xffff037f IM DM ZM OM UM PM
PC: Extended Precision (64-bits)
RC: Round to nearest
Tag Word: 0xffff07ff
Instruction Pointer: 0x1b:0x00402dbf
Operand Pointer: 0xffff0023:0x0040ccf0
Opcode: 0xdb2d
(gdb)