359950 – Wrong result comparing doubles on x87

Bug 359950 - Wrong result comparing doubles on x87

Summary: Wrong result comparing doubles on x87

Status:	RESOLVED NOT A BUG

Alias:	None

Product:	valgrind
Classification:	Developer tools
Component:	general (show other bugs)
Version:	3.10.0
Platform:	Ubuntu Linux

Importance:	NOR normal
Target Milestone:	---
Assignee:	Julian Seward

URL:
Keywords:

Depends on:
Blocks:

Reported:	2016-03-01 13:36 UTC by Nick Wellnhofer
Modified:	2016-07-05 08:39 UTC (History)
CC List:	1 user (show)

See Also:
Latest Commit:
Version Fixed In:
Sentry Crash Report:

Attachments
Add an attachment

Note You need to log in before you can comment on or make changes to this bug.

Description Nick Wellnhofer 2016-03-01 13:36:47 UTC

This is on Ubuntu 15.04, x86 (32-bit). Using stock valgrind 1:3.10.1-1ubuntu3~15.04, gcc 4:4.9.2-2ubuntu2.

Test program:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int test(int64_t i, double d) {
    double a = (double)i;
    int comparison = (a == d || a < d || a > d);
    printf("%lld %f\n", i, d);
    return comparison;
}

int main() {
    int64_t i = (INT64_C(1) << 60) - 1;
    double  d = pow(2.0, 60.0);
    printf("%d\n", test(i, d));
    return 0;
}

When compiled with -O2 and run under valgrind, the result is:

==3514== Memcheck, a memory error detector
==3514== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==3514== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==3514== Command: ./vg_bug
==3514==
1152921504606846975 1152921504606846976.000000
0
==3514==
==3514== HEAP SUMMARY:
==3514==     in use at exit: 0 bytes in 0 blocks
==3514==   total heap usage: 0 allocs, 0 frees, 0 bytes allocated
==3514==
==3514== All heap blocks were freed -- no leaks are possible
==3514==
==3514== For counts of detected and suppressed errors, rerun with: -v
==3514== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

All three comparisons returned false.

Dump of assembler code for function test:
   0x08048480 <+0>:     push   %esi
   0x08048481 <+1>:     push   %ebx
   0x08048482 <+2>:     sub    $0x14,%esp
   0x08048485 <+5>:     mov    0x20(%esp),%ecx
   0x08048489 <+9>:     mov    0x24(%esp),%ebx
   0x0804848d <+13>:    fldl   0x28(%esp)
   0x08048491 <+17>:    mov    %ecx,(%esp)
   0x08048494 <+20>:    mov    %ebx,0x4(%esp)
   0x08048498 <+24>:    fildll (%esp)
   0x0804849b <+27>:    fucomi %st(1),%st
   0x0804849d <+29>:    jp     0x80484a6 <test+38>
   0x0804849f <+31>:    mov    $0x1,%esi
   0x080484a4 <+36>:    je     0x80484b8 <test+56>
   0x080484a6 <+38>:    xor    %eax,%eax
   0x080484a8 <+40>:    fucomip %st(1),%st
   0x080484aa <+42>:    setne  %al
   0x080484ad <+45>:    mov    %eax,%esi
   0x080484af <+47>:    jmp    0x80484ba <test+58>
   0x080484b1 <+49>:    lea    0x0(%esi,%eiz,1),%esi
   0x080484b8 <+56>:    fstp   %st(0)
   0x080484ba <+58>:    sub    $0x10,%esp
   0x080484bd <+61>:    fstpl  (%esp)
   0x080484c0 <+64>:    push   %ebx
   0x080484c1 <+65>:    push   %ecx
   0x080484c2 <+66>:    push   $0x8048560
   0x080484c7 <+71>:    push   $0x1
   0x080484c9 <+73>:    call   0x8048330 <__printf_chk@plt>
   0x080484ce <+78>:    add    $0x34,%esp
   0x080484d1 <+81>:    mov    %esi,%eax
   0x080484d3 <+83>:    pop    %ebx
   0x080484d4 <+84>:    pop    %esi
   0x080484d5 <+85>:    ret

The strange thing is that this works as expected if the first printf statement in function test is omitted. Without the first printf:

Dump of assembler code for function test:
   0x08048460 <+0>:     sub    $0xc,%esp
   0x08048463 <+3>:     fldl   0x18(%esp)
   0x08048467 <+7>:     fildll 0x10(%esp)
   0x0804846b <+11>:    fstpl  (%esp)
   0x0804846e <+14>:    fldl   (%esp)
   0x08048471 <+17>:    fucomi %st(1),%st
   0x08048473 <+19>:    jp     0x804847c <test+28>
   0x08048475 <+21>:    mov    $0x1,%eax
   0x0804847a <+26>:    je     0x8048490 <test+48>
   0x0804847c <+28>:    xor    %eax,%eax
   0x0804847e <+30>:    fucomip %st(1),%st
   0x08048480 <+32>:    fstp   %st(0)
   0x08048482 <+34>:    setne  %al
   0x08048485 <+37>:    jmp    0x8048494 <test+52>
   0x08048487 <+39>:    mov    %esi,%esi
   0x08048489 <+41>:    lea    0x0(%edi,%eiz,1),%edi
   0x08048490 <+48>:    fstp   %st(0)
   0x08048492 <+50>:    fstp   %st(0)
   0x08048494 <+52>:    add    $0xc,%esp
   0x08048497 <+55>:    ret


Reproducible: Always

Steps to Reproduce:
1. gcc -Wall -O2 vg_bug.c -o vg_bug
2. valgrind ./vg_bug


Actual Results:  
==3617== Memcheck, a memory error detector
==3617== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==3617== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==3617== Command: ./vg_bug
==3617==
1152921504606846975 1152921504606846976.000000
0
==3617==
==3617== HEAP SUMMARY:
==3617==     in use at exit: 0 bytes in 0 blocks
==3617==   total heap usage: 0 allocs, 0 frees, 0 bytes allocated
==3617==
==3617== All heap blocks were freed -- no leaks are possible
==3617==
==3617== For counts of detected and suppressed errors, rerun with: -v
==3617== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)


Expected Results:  
==3617== Memcheck, a memory error detector
==3617== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==3617== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==3617== Command: ./vg_bug
==3617==
1152921504606846975 1152921504606846976.000000
1
==3617==
==3617== HEAP SUMMARY:
==3617==     in use at exit: 0 bytes in 0 blocks
==3617==   total heap usage: 0 allocs, 0 frees, 0 bytes allocated
==3617==
==3617== All heap blocks were freed -- no leaks are possible
==3617==
==3617== For counts of detected and suppressed errors, rerun with: -v
==3617== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

Comment 1 Tom Hughes 2016-03-01 14:20:56 UTC

Please read the "limitations" section of the manual at http://valgrind.org/docs/manual/manual-core.html#manual-core.limits especially the part about x86 floating point, and in particular this:

"Precision: There is no support for 80 bit arithmetic. Internally, Valgrind represents all such "long double" numbers in 64 bits, and so there may be some differences in results."

That is likely why your program behaves differently under valgrind.

Your program is essentially not well defined anyway though, as floating point calculations are always imprecise so you should not expect to be able to compare a floating point result for equality with an integer version of the same calculation.

Comment 2 Nick Wellnhofer 2016-03-01 14:44:12 UTC

No, this bug report is not about precision. Note that the crucial line is:

int comparison = (a == d || a < d || a > d);

Under valgrind, this evaluates to 0. This is only possible if a or d is NaN, but both numbers clearly aren't.

Comment 3 Tom Hughes 2016-03-01 15:22:52 UTC

Although I'm not clear if that is what has happened here (and this should only happen when not running under valgrind) that's not actually true with x87 because if the compiler chooses to move one of the values from the FP stack to memory in the middle of the comparisons for any reason it will get converted from 80 bit to 64 bit and it's value will change...

Comment 4 Nick Wellnhofer 2016-05-15 12:28:03 UTC

You're correct. Given the statement

int comparison = (a == d || a < d || a > d);

and two values a and d in extended precision floating point registers that are very close to each other, and equal in double precision, the following could happen:

1. (a == d)  is evaluated with the extended precision values in registers, returning false.
2. The values are stored in memory and reloaded, casting them to double precision. The rounded double precision values are now equal.
3. (a < d) and (a > d) both return false.

And I think that's exactly what happens under Valgrind. IIUC, Valgrind doesn't support extended precision but before the floating state is saved in double precision, any instruction block still operates with extended precision.

Comment 5 Julian Seward 2016-07-05 06:41:41 UTC

(In reply to Tom Hughes from comment #3)
> Although I'm not clear if that is what has happened here (and this should
> only happen when not running under valgrind) that's not actually true with
> x87 because if the compiler chooses to move one of the values from the FP
> stack to memory in the middle of the comparisons for any reason it will get
> converted from 80 bit to 64 bit and it's value will change...

As a side note .. no .. x87 spills/reloads are done with 80-bit loads/stores,
exactly to avoid the situation that the values computed depend on the 
specifics of register allocation.

That said .. I'm not really clear what's going on here.  It might be some kind
of precision problem, given that there are only really 53 mantissa bits available
because V simulates 80 bit registers using normal double-precision values.
It would require further investigation.

Comment 6 Tom Hughes 2016-07-05 08:15:10 UTC

Note that I didn't mean spills/reloads that valgrind's internal implementation does, but rather any spills/reloads that the original compiler generated.

My understanding was that those would use the normal memory location for the variable, which as a double would only be 64 bits and hence would lose precision. Are you saying that gcc instead allocates an 80 bit temporary location for the spill to avoid losing precision?

Comment 7 Julian Seward 2016-07-05 08:39:13 UTC

(In reply to Tom Hughes from comment #6)
> Note that I didn't mean spills/reloads that valgrind's internal
> implementation does, but rather any spills/reloads that the original
> compiler generated.

Yes -- that's also what I meant.

> Are you saying that gcc instead allocates an 80 bit temporary
> location for the spill to avoid losing precision?

Well I *thought* that was the case, but it seems I am wrong.  Using
this spill-inducing loop

  int main ( int argc, char** argv )
  {
     double a, b, c, d, e, f, g, h;
     a=b=c=d=e=f=g=h=1.2345;
     int i;
     for (i = 0; i < 1234 * argc; i++) {
        a += i * 1.2;
        b += a * 1.3;
        c += d / 1.4;
        d += e - 1.5;
        e += f + 1.6;
        f += g - 1.7;
        g += h * 1.8;
        h += a - (b*c) + (d*e*f/g);
     }
     return (a-b-c-d-e-f-g-h > 987.345 ? 1 : 0);
  }

I get the assembly (for the loop body) below, which uses 64-bit loads
and stores (fstpl -16(%ebp), fmull -16(%ebp), etc) with gcc6 -m32 -O2.
To get 80-bit accesses (fstpt, fldt) I need to change the declaration
to be "long double".

Ah well.  Live and learn.  Now I am wondering why I believed gcc did
always do 80-bit spills/reloads.

.L3:
        movl    %eax, -28(%ebp)
        addl    $1, %eax
        fildl   -28(%ebp)
        cmpl    %edx, %eax
        fmull   .LC1
        faddp   %st, %st(1)
        fldl    .LC2
        fmul    %st(1), %st
        faddl   -24(%ebp)
        fld     %st(0)
        fstpl   -24(%ebp)
        fldl    .LC3
        fdivr   %st(7), %st
        faddl   -16(%ebp)
        fstpl   -16(%ebp)
        fld     %st(3)
        fsubs   .LC4
        faddp   %st, %st(7)
        fldl    .LC5
        fadd    %st(5), %st
        faddp   %st, %st(4)
        fldl    .LC6
        fsubr   %st(6), %st
        faddp   %st, %st(5)
        fldl    .LC7
        fmul    %st(3), %st
        faddp   %st, %st(6)
        fld     %st(6)
        fmul    %st(4), %st
        fmul    %st(5), %st
        fdiv    %st(6), %st
        fxch    %st(1)
        fmull   -16(%ebp)
        fsubr   %st(2), %st
        faddp   %st, %st(1)
        faddp   %st, %st(2)
        jne     .L3