math/big: fix rounding to smallest denormal for Float.Float32/64

author Robert Griesemer <gri@golang.org>

Fri, 18 Mar 2016 18:16:35 +0000 (11:16 -0700)

committer Robert Griesemer <gri@golang.org>

Mon, 21 Mar 2016 20:24:06 +0000 (20:24 +0000)
author Robert Griesemer <gri@golang.org>
Fri, 18 Mar 2016 18:16:35 +0000 (11:16 -0700)
committer Robert Griesemer <gri@golang.org>
Mon, 21 Mar 2016 20:24:06 +0000 (20:24 +0000)
diff --git a/src/math/big/float.go b/src/math/big/float.go

index eca85d4bb023dba357d82e0ac22900f7cd33e0bc..f19f21f068128eb2e6c24b4479b1e295d3b5f348 100644 (file)
--- a/src/math/big/float.go
+++ b/src/math/big/float.go
@@ -874,21 +874,43 @@ func (x *Float) Float32() (float32, Accuracy) {
                         emax  = bias              //   127  largest unbiased exponent (normal)
                 )
  
-               // Float mantissa m is 0.5 <= m < 1.0; compute exponent for float32 mantissa.
-               e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
-               p := mbits + 1 // precision of normal float
+               // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
+               e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
  
-               // If the exponent is too small, we may have a denormal number
-               // in which case we have fewer mantissa bits available: recompute
-               // precision.
+               // Compute precision p for float32 mantissa.
+               // If the exponent is too small, we have a denormal number before
+               // rounding and fewer than p mantissa bits of precision available
+               // (the exponent remains fixed but the mantissa gets shifted right).
+               p := mbits + 1 // precision of normal float
                 if e < emin {
+                       // recompute precision
                         p = mbits + 1 - emin + int(e)
-                       // Make sure we have at least 1 bit so that we don't
-                       // lose numbers rounded up to the smallest denormal.
-                       if p < 1 {
-                               p = 1
+                       // If p == 0, the mantissa of x is shifted so much to the right
+                       // that its msb falls immediately to the right of the float32
+                       // mantissa space. In other words, if the smallest denormal is
+                       // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
+                       // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
+                       // If m == 0.5, it is rounded down to even, i.e., 0.0.
+                       // If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
+                       if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
+                               // underflow to ±0
+                               if x.neg {
+                                       var z float32
+                                       return -z, Above
+                               }
+                               return 0.0, Below
+                       }
+                       // otherwise, round up
+                       // We handle p == 0 explicitly because it's easy and because
+                       // Float.round doesn't support rounding to 0 bits of precision.
+                       if p == 0 {
+                               if x.neg {
+                                       return -math.SmallestNonzeroFloat32, Below
+                               }
+                               return math.SmallestNonzeroFloat32, Above
                         }
                 }
+               // p > 0
  
                 // round
                 var r Float
@@ -898,12 +920,8 @@ func (x *Float) Float32() (float32, Accuracy) {
  
                 // Rounding may have caused r to overflow to ±Inf
                 // (rounding never causes underflows to 0).
-               if r.form == inf {
-                       e = emax + 1 // cause overflow below
-               }
-
-               // If the exponent is too large, overflow to ±Inf.
-               if e > emax {
+               // If the exponent is too large, also overflow to ±Inf.
+               if r.form == inf || e > emax {
                         // overflow
                         if x.neg {
                                 return float32(math.Inf(-1)), Below
@@ -921,17 +939,10 @@ func (x *Float) Float32() (float32, Accuracy) {
                 // Rounding may have caused a denormal number to
                 // become normal. Check again.
                 if e < emin {
-                       // denormal number
-                       if e < dmin {
-                               // underflow to ±0
-                               if x.neg {
-                                       var z float32
-                                       return -z, Above
-                               }
-                               return 0.0, Below
-                       }
-                       // bexp = 0
-                       // recompute precision
+                       // denormal number: recompute precision
+                       // Since rounding may have at best increased precision
+                       // and we have eliminated p <= 0 early, we know p > 0.
+                       // bexp == 0 for denormals
                         p = mbits + 1 - emin + int(e)
                         mant = msb32(r.mant) >> uint(fbits-p)
                 } else {
@@ -983,21 +994,43 @@ func (x *Float) Float64() (float64, Accuracy) {
                         emax  = bias              //  1023  largest unbiased exponent (normal)
                 )
  
-               // Float mantissa m is 0.5 <= m < 1.0; compute exponent for float64 mantissa.
-               e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0
-               p := mbits + 1 // precision of normal float
+               // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
+               e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
  
-               // If the exponent is too small, we may have a denormal number
-               // in which case we have fewer mantissa bits available: recompute
-               // precision.
+               // Compute precision p for float64 mantissa.
+               // If the exponent is too small, we have a denormal number before
+               // rounding and fewer than p mantissa bits of precision available
+               // (the exponent remains fixed but the mantissa gets shifted right).
+               p := mbits + 1 // precision of normal float
                 if e < emin {
+                       // recompute precision
                         p = mbits + 1 - emin + int(e)
-                       // Make sure we have at least 1 bit so that we don't
-                       // lose numbers rounded up to the smallest denormal.
-                       if p < 1 {
-                               p = 1
+                       // If p == 0, the mantissa of x is shifted so much to the right
+                       // that its msb falls immediately to the right of the float64
+                       // mantissa space. In other words, if the smallest denormal is
+                       // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
+                       // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
+                       // If m == 0.5, it is rounded down to even, i.e., 0.0.
+                       // If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
+                       if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
+                               // underflow to ±0
+                               if x.neg {
+                                       var z float64
+                                       return -z, Above
+                               }
+                               return 0.0, Below
+                       }
+                       // otherwise, round up
+                       // We handle p == 0 explicitly because it's easy and because
+                       // Float.round doesn't support rounding to 0 bits of precision.
+                       if p == 0 {
+                               if x.neg {
+                                       return -math.SmallestNonzeroFloat64, Below
+                               }
+                               return math.SmallestNonzeroFloat64, Above
                         }
                 }
+               // p > 0
  
                 // round
                 var r Float
@@ -1007,17 +1040,13 @@ func (x *Float) Float64() (float64, Accuracy) {
  
                 // Rounding may have caused r to overflow to ±Inf
                 // (rounding never causes underflows to 0).
-               if r.form == inf {
-                       e = emax + 1 // cause overflow below
-               }
-
-               // If the exponent is too large, overflow to ±Inf.
-               if e > emax {
+               // If the exponent is too large, also overflow to ±Inf.
+               if r.form == inf || e > emax {
                         // overflow
                         if x.neg {
-                               return math.Inf(-1), Below
+                               return float64(math.Inf(-1)), Below
                         }
-                       return math.Inf(+1), Above
+                       return float64(math.Inf(+1)), Above
                 }
                 // e <= emax
  
@@ -1030,17 +1059,10 @@ func (x *Float) Float64() (float64, Accuracy) {
                 // Rounding may have caused a denormal number to
                 // become normal. Check again.
                 if e < emin {
-                       // denormal number
-                       if e < dmin {
-                               // underflow to ±0
-                               if x.neg {
-                                       var z float64
-                                       return -z, Above
-                               }
-                               return 0.0, Below
-                       }
-                       // bexp = 0
-                       // recompute precision
+                       // denormal number: recompute precision
+                       // Since rounding may have at best increased precision
+                       // and we have eliminated p <= 0 early, we know p > 0.
+                       // bexp == 0 for denormals
                         p = mbits + 1 - emin + int(e)
                         mant = msb64(r.mant) >> uint(fbits-p)
                 } else {
diff --git a/src/math/big/float_test.go b/src/math/big/float_test.go

index 6fb44026decf3656c75a1d23a54708fbffe74859..464619b3380f492f2e63df27163f5b612f40b4ee 100644 (file)
--- a/src/math/big/float_test.go
+++ b/src/math/big/float_test.go
@@ -829,7 +829,7 @@ func TestFloatFloat32(t *testing.T) {
         }{
                 {"0", 0, Exact},
  
-               // underflow
+               // underflow to zero
                 {"1e-1000", 0, Below},
                 {"0x0.000002p-127", 0, Below},
                 {"0x.0000010p-126", 0, Below},
@@ -843,25 +843,39 @@ func TestFloatFloat32(t *testing.T) {
                 {"1p-149", math.SmallestNonzeroFloat32, Exact},
                 {"0x.fffffep-126", math.Float32frombits(0x7fffff), Exact}, // largest denormal
  
-               // special cases (see issue 14553)
-               {"0x0.bp-149", math.Float32frombits(0x000000000), Below}, // ToNearestEven rounds down (to even)
-               {"0x0.cp-149", math.Float32frombits(0x000000001), Above},
-
-               {"0x1.0p-149", math.Float32frombits(0x000000001), Exact},
+               // special denormal cases (see issues 14553, 14651)
+               {"0x0.0000001p-126", math.Float32frombits(0x00000000), Below}, // underflow to zero
+               {"0x0.0000008p-126", math.Float32frombits(0x00000000), Below}, // underflow to zero
+               {"0x0.0000010p-126", math.Float32frombits(0x00000000), Below}, // rounded down to even
+               {"0x0.0000011p-126", math.Float32frombits(0x00000001), Above}, // rounded up to smallest denormal
+               {"0x0.0000018p-126", math.Float32frombits(0x00000001), Above}, // rounded up to smallest denormal
+
+               {"0x1.0000000p-149", math.Float32frombits(0x00000001), Exact}, // smallest denormal
+               {"0x0.0000020p-126", math.Float32frombits(0x00000001), Exact}, // smallest denormal
+               {"0x0.fffffe0p-126", math.Float32frombits(0x007fffff), Exact}, // largest denormal
+               {"0x1.0000000p-126", math.Float32frombits(0x00800000), Exact}, // smallest normal
+
+               {"0x0.8p-149", math.Float32frombits(0x000000000), Below}, // rounded down to even
+               {"0x0.9p-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
+               {"0x0.ap-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
+               {"0x0.bp-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
+               {"0x0.cp-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal
+
+               {"0x1.0p-149", math.Float32frombits(0x000000001), Exact}, // smallest denormal
                 {"0x1.7p-149", math.Float32frombits(0x000000001), Below},
                 {"0x1.8p-149", math.Float32frombits(0x000000002), Above},
                 {"0x1.9p-149", math.Float32frombits(0x000000002), Above},
  
                 {"0x2.0p-149", math.Float32frombits(0x000000002), Exact},
-               {"0x2.8p-149", math.Float32frombits(0x000000002), Below}, // ToNearestEven rounds down (to even)
+               {"0x2.8p-149", math.Float32frombits(0x000000002), Below}, // rounded down to even
                 {"0x2.9p-149", math.Float32frombits(0x000000003), Above},
  
                 {"0x3.0p-149", math.Float32frombits(0x000000003), Exact},
                 {"0x3.7p-149", math.Float32frombits(0x000000003), Below},
-               {"0x3.8p-149", math.Float32frombits(0x000000004), Above}, // ToNearestEven rounds up (to even)
+               {"0x3.8p-149", math.Float32frombits(0x000000004), Above}, // rounded up to even
  
                 {"0x4.0p-149", math.Float32frombits(0x000000004), Exact},
-               {"0x4.8p-149", math.Float32frombits(0x000000004), Below}, // ToNearestEven rounds down (to even)
+               {"0x4.8p-149", math.Float32frombits(0x000000004), Below}, // rounded down to even
                 {"0x4.9p-149", math.Float32frombits(0x000000005), Above},
  
                 // specific case from issue 14553
@@ -907,7 +921,7 @@ func TestFloatFloat32(t *testing.T) {
                         x := makeFloat(tx)
                         out, acc := x.Float32()
                         if !alike32(out, tout) || acc != tacc {
-                               t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", tx, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), tacc)
+                               t.Errorf("%s: got %g (%#08x, %s); want %g (%#08x, %s)", tx, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), tacc)
                         }
  
                         // test that x.SetFloat64(float64(f)).Float32() == f
@@ -929,21 +943,30 @@ func TestFloatFloat64(t *testing.T) {
         }{
                 {"0", 0, Exact},
  
-               // underflow
+               // underflow to zero
                 {"1e-1000", 0, Below},
                 {"0x0.0000000000001p-1023", 0, Below},
                 {"0x0.00000000000008p-1022", 0, Below},
  
                 // denormals
                 {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal
-               {"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact},  // smallest denormal
+               {"0x0.00000000000010p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal
                 {"0x.8p-1073", math.SmallestNonzeroFloat64, Exact},
                 {"1p-1074", math.SmallestNonzeroFloat64, Exact},
                 {"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal
  
-               // special cases (see issue 14553)
-               {"0x0.bp-1074", math.Float64frombits(0x00000000000000000), Below}, // ToNearestEven rounds down (to even)
-               {"0x0.cp-1074", math.Float64frombits(0x00000000000000001), Above},
+               // special denormal cases (see issues 14553, 14651)
+               {"0x0.00000000000001p-1022", math.Float64frombits(0x00000000000000000), Below}, // underflow to zero
+               {"0x0.00000000000004p-1022", math.Float64frombits(0x00000000000000000), Below}, // underflow to zero
+               {"0x0.00000000000008p-1022", math.Float64frombits(0x00000000000000000), Below}, // rounded down to even
+               {"0x0.00000000000009p-1022", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
+               {"0x0.0000000000000ap-1022", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
+
+               {"0x0.8p-1074", math.Float64frombits(0x00000000000000000), Below}, // rounded down to even
+               {"0x0.9p-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
+               {"0x0.ap-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
+               {"0x0.bp-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
+               {"0x0.cp-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal
  
                 {"0x1.0p-1074", math.Float64frombits(0x00000000000000001), Exact},
                 {"0x1.7p-1074", math.Float64frombits(0x00000000000000001), Below},
@@ -951,15 +974,15 @@ func TestFloatFloat64(t *testing.T) {
                 {"0x1.9p-1074", math.Float64frombits(0x00000000000000002), Above},
  
                 {"0x2.0p-1074", math.Float64frombits(0x00000000000000002), Exact},
-               {"0x2.8p-1074", math.Float64frombits(0x00000000000000002), Below}, // ToNearestEven rounds down (to even)
+               {"0x2.8p-1074", math.Float64frombits(0x00000000000000002), Below}, // rounded down to even
                 {"0x2.9p-1074", math.Float64frombits(0x00000000000000003), Above},
  
                 {"0x3.0p-1074", math.Float64frombits(0x00000000000000003), Exact},
                 {"0x3.7p-1074", math.Float64frombits(0x00000000000000003), Below},
-               {"0x3.8p-1074", math.Float64frombits(0x00000000000000004), Above}, // ToNearestEven rounds up (to even)
+               {"0x3.8p-1074", math.Float64frombits(0x00000000000000004), Above}, // rounded up to even
  
                 {"0x4.0p-1074", math.Float64frombits(0x00000000000000004), Exact},
-               {"0x4.8p-1074", math.Float64frombits(0x00000000000000004), Below}, // ToNearestEven rounds down (to even)
+               {"0x4.8p-1074", math.Float64frombits(0x00000000000000004), Below}, // rounded down to even
                 {"0x4.9p-1074", math.Float64frombits(0x00000000000000005), Above},
  
                 // normals
@@ -1005,7 +1028,7 @@ func TestFloatFloat64(t *testing.T) {
                         x := makeFloat(tx)
                         out, acc := x.Float64()
                         if !alike64(out, tout) || acc != tacc {
-                               t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", tx, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), tacc)
+                               t.Errorf("%s: got %g (%#016x, %s); want %g (%#016x, %s)", tx, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), tacc)
                         }
  
                         // test that x.SetFloat64(f).Float64() == f
author	Robert Griesemer <gri@golang.org>
	Fri, 18 Mar 2016 18:16:35 +0000 (11:16 -0700)
committer	Robert Griesemer <gri@golang.org>
	Mon, 21 Mar 2016 20:24:06 +0000 (20:24 +0000)
src/math/big/float.go		patch \| blob \| history
src/math/big/float_test.go		patch \| blob \| history