return z
}
+func high32(x nat) uint32 {
+ // TODO(gri) This can be done more efficiently on 32bit platforms.
+ return uint32(high64(x) >> 32)
+}
+
func high64(x nat) uint64 {
i := len(x)
if i == 0 {
panic("unreachable")
}
-// Float64 returns the float64 value nearest to x by rounding ToNearestEven
-// with 53 bits of precision.
-// If x is too small to be represented by a float64
-// (|x| < math.SmallestNonzeroFloat64), the result is (0, Below) or
-// (-0, Above), respectively, depending on the sign of x.
-// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
+// TODO(gri) Float32 and Float64 are very similar internally but for the
+// floatxx parameters and some conversions. Should factor out shared code.
+
+// Float32 returns the float32 value nearest to x. If x is too small to be
+// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
+// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
// The result is (NaN, Undef) for NaNs.
-func (x *Float) Float64() (float64, Accuracy) {
+func (x *Float) Float32() (float32, Accuracy) {
if debugFloat {
x.validate()
}
switch x.form {
case finite:
// 0 < |x| < +Inf
+
+ const (
+ fbits = 32 // float size
+ mbits = 23 // mantissa size (excluding implicit msb)
+ ebits = fbits - mbits - 1 // 8 exponent size
+ bias = 1<<(ebits-1) - 1 // 127 exponent bias
+ dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal)
+ emin = 1 - bias // -126 smallest unbiased exponent (normal)
+ emax = bias // 127 largest unbiased exponent (normal)
+ )
+
+ // Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
+ // floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
+ // For a given mantissa m, we need to add 1 to a floatxx exponent to get the
+ // corresponding Float exponent.
+ // (see also implementation of math.Ldexp for similar code)
+
+ if x.exp < dmin+1 {
+ // underflow
+ if x.neg {
+ var z float32
+ return -z, Above
+ }
+ return 0.0, Below
+ }
+ // x.exp >= dmin+1
+
var r Float
- r.prec = 53
+ r.prec = mbits + 1 // +1 for implicit msb
+ if x.exp < emin+1 {
+ // denormal number - round to fewer bits
+ r.prec = uint32(x.exp - dmin)
+ }
r.Set(x)
- // Rounding via Set may have caused r to overflow
- // to ±Inf (rounding never causes underflows to 0).
+ // Rounding may have caused r to overflow to ±Inf
+ // (rounding never causes underflows to 0).
if r.form == inf {
- r.exp = 10000 // cause overflow below
+ r.exp = emax + 2 // cause overflow below
}
- // see also implementation of math.Ldexp
+ if r.exp > emax+1 {
+ // overflow
+ if x.neg {
+ return float32(math.Inf(-1)), Below
+ }
+ return float32(math.Inf(+1)), Above
+ }
+ // dmin+1 <= r.exp <= emax+1
+
+ var s uint32
+ if r.neg {
+ s = 1 << (fbits - 1)
+ }
+
+ m := high32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
+
+ // Rounding may have caused a denormal number to
+ // become normal. Check again.
+ c := float32(1.0)
+ if r.exp < emin+1 {
+ // denormal number
+ r.exp += mbits
+ c = 1.0 / (1 << mbits) // 2**-mbits
+ }
+ // emin+1 <= r.exp <= emax+1
+ e := uint32(r.exp-emin) << mbits
+
+ return c * math.Float32frombits(s|e|m), r.acc
- e := int64(r.exp) + 1022
- if e <= -52 {
+ case zero:
+ if x.neg {
+ var z float32
+ return -z, Exact
+ }
+ return 0.0, Exact
+
+ case inf:
+ if x.neg {
+ return float32(math.Inf(-1)), Exact
+ }
+ return float32(math.Inf(+1)), Exact
+
+ case nan:
+ return float32(math.NaN()), Undef
+ }
+
+ panic("unreachable")
+}
+
+// Float64 returns the float64 value nearest to x. If x is too small to be
+// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
+// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
+// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
+// The result is (NaN, Undef) for NaNs.
+func (x *Float) Float64() (float64, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ switch x.form {
+ case finite:
+ // 0 < |x| < +Inf
+
+ const (
+ fbits = 64 // float size
+ mbits = 52 // mantissa size (excluding implicit msb)
+ ebits = fbits - mbits - 1 // 11 exponent size
+ bias = 1<<(ebits-1) - 1 // 1023 exponent bias
+ dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal)
+ emin = 1 - bias // -1022 smallest unbiased exponent (normal)
+ emax = bias // 1023 largest unbiased exponent (normal)
+ )
+
+ // Float mantissae m have an explicit msb and are in the range 0.5 <= m < 1.0.
+ // floatxx mantissae have an implicit msb and are in the range 1.0 <= m < 2.0.
+ // For a given mantissa m, we need to add 1 to a floatxx exponent to get the
+ // corresponding Float exponent.
+ // (see also implementation of math.Ldexp for similar code)
+
+ if x.exp < dmin+1 {
// underflow
if x.neg {
- z := 0.0
+ var z float64
return -z, Above
}
return 0.0, Below
}
- // e > -52
+ // x.exp >= dmin+1
+
+ var r Float
+ r.prec = mbits + 1 // +1 for implicit msb
+ if x.exp < emin+1 {
+ // denormal number - round to fewer bits
+ r.prec = uint32(x.exp - dmin)
+ }
+ r.Set(x)
+
+ // Rounding may have caused r to overflow to ±Inf
+ // (rounding never causes underflows to 0).
+ if r.form == inf {
+ r.exp = emax + 2 // cause overflow below
+ }
- if e >= 2047 {
+ if r.exp > emax+1 {
// overflow
if x.neg {
return math.Inf(-1), Below
}
return math.Inf(+1), Above
}
- // -52 < e < 2047
-
- denormal := false
- if e < 0 {
- denormal = true
- e += 52
- }
- // 0 < e < 2047
+ // dmin+1 <= r.exp <= emax+1
- s := uint64(0)
+ var s uint64
if r.neg {
- s = 1 << 63
+ s = 1 << (fbits - 1)
}
- m := high64(r.mant) >> 11 & (1<<52 - 1) // cut off msb (implicit 1 bit)
- z := math.Float64frombits(s | uint64(e)<<52 | m)
- if denormal {
- // adjust for denormal
- // TODO(gri) does this change accuracy?
- z /= 1 << 52
+
+ m := high64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
+
+ // Rounding may have caused a denormal number to
+ // become normal. Check again.
+ c := 1.0
+ if r.exp < emin+1 {
+ // denormal number
+ r.exp += mbits
+ c = 1.0 / (1 << mbits) // 2**-mbits
}
- return z, r.acc
+ // emin+1 <= r.exp <= emax+1
+ e := uint64(r.exp-emin) << mbits
+
+ return c * math.Float64frombits(s|e|m), r.acc
case zero:
if x.neg {
- z := 0.0
+ var z float64
return -z, Exact
}
return 0.0, Exact
// TestFloatRound24 tests that rounding a float64 to 24 bits
// matches IEEE-754 rounding to nearest when converting a
-// float64 to a float32.
+// float64 to a float32 (excluding denormal numbers).
func TestFloatRound24(t *testing.T) {
const x0 = 1<<26 - 0x10 // 11...110000 (26 bits)
for d := 0; d <= 0x10; d++ {
x := float64(x0 + d)
f := new(Float).SetPrec(24).SetFloat64(x)
- got, _ := f.Float64()
- want := float64(float32(x))
+ got, _ := f.Float32()
+ want := float32(x)
if got != want {
t.Errorf("Round(%g, 24) = %g; want %g", x, got, want)
}
}
}
+func TestFloatFloat32(t *testing.T) {
+ for _, test := range []struct {
+ x string
+ out float32
+ acc Accuracy
+ }{
+ {"-Inf", float32(math.Inf(-1)), Exact},
+ {"-0x1.ffffff0p2147483646", float32(-math.Inf(+1)), Below}, // overflow in rounding
+ {"-1e10000", float32(math.Inf(-1)), Below}, // overflow
+ {"-0x1p128", float32(math.Inf(-1)), Below}, // overflow
+ {"-0x1.ffffff0p127", float32(-math.Inf(+1)), Below}, // overflow
+ {"-0x1.fffffe8p127", -math.MaxFloat32, Above},
+ {"-0x1.fffffe0p127", -math.MaxFloat32, Exact},
+ {"-12345.000000000000000000001", -12345, Above},
+ {"-12345.0", -12345, Exact},
+ {"-1.000000000000000000001", -1, Above},
+ {"-1", -1, Exact},
+ {"-0x0.000002p-126", -math.SmallestNonzeroFloat32, Exact},
+ {"-0x0.000002p-127", -0, Above}, // underflow
+ {"-1e-1000", -0, Above}, // underflow
+ {"0", 0, Exact},
+ {"1e-1000", 0, Below}, // underflow
+ {"0x0.000002p-127", 0, Below}, // underflow
+ {"0x0.000002p-126", math.SmallestNonzeroFloat32, Exact},
+ {"1", 1, Exact},
+ {"1.000000000000000000001", 1, Below},
+ {"12345.0", 12345, Exact},
+ {"12345.000000000000000000001", 12345, Below},
+ {"0x1.fffffe0p127", math.MaxFloat32, Exact},
+ {"0x1.fffffe8p127", math.MaxFloat32, Below},
+ {"0x1.ffffff0p127", float32(math.Inf(+1)), Above}, // overflow
+ {"0x1p128", float32(math.Inf(+1)), Above}, // overflow
+ {"1e10000", float32(math.Inf(+1)), Above}, // overflow
+ {"0x1.ffffff0p2147483646", float32(math.Inf(+1)), Above}, // overflow in rounding
+ {"+Inf", float32(math.Inf(+1)), Exact},
+ } {
+ // conversion should match strconv where syntax is agreeable
+ if f, err := strconv.ParseFloat(test.x, 32); err == nil && float32(f) != test.out {
+ t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out)
+ }
+
+ x := makeFloat(test.x)
+ out, acc := x.Float32()
+ if out != test.out || acc != test.acc {
+ t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), test.acc)
+ }
+
+ // test that x.SetFloat64(float64(f)).Float32() == f
+ var x2 Float
+ out2, acc2 := x2.SetFloat64(float64(out)).Float32()
+ if out2 != out || acc2 != Exact {
+ t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out)
+ }
+ }
+
+ // test NaN
+ x := makeFloat("NaN")
+ if out, acc := x.Float32(); out == out || acc != Undef {
+ t.Errorf("NaN: got %g (%s); want NaN (Undef)", out, acc)
+ }
+}
+
func TestFloatFloat64(t *testing.T) {
+ const smallestNormalFloat64 = 2.2250738585072014e-308 // 1p-1022
for _, test := range []struct {
x string
out float64
{"-0x1p1024", math.Inf(-1), Below}, // overflow
{"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below}, // overflow
{"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above},
- {"-0x1.fffffffffffffp1023", -math.MaxFloat64, Exact},
+ {"-0x1.fffffffffffff0p1023", -math.MaxFloat64, Exact},
{"-12345.000000000000000000001", -12345, Above},
{"-12345.0", -12345, Exact},
{"-1.000000000000000000001", -1, Above},
{"1.000000000000000000001", 1, Below},
{"12345.0", 12345, Exact},
{"12345.000000000000000000001", 12345, Below},
- {"0x1.fffffffffffffp1023", math.MaxFloat64, Exact},
+ {"0x1.fffffffffffff0p1023", math.MaxFloat64, Exact},
{"0x1.fffffffffffff4p1023", math.MaxFloat64, Below},
{"0x1.fffffffffffff8p1023", math.Inf(+1), Above}, // overflow
{"0x1p1024", math.Inf(+1), Above}, // overflow
{"1e10000", math.Inf(+1), Above}, // overflow
{"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow in rounding
{"+Inf", math.Inf(+1), Exact},
+
+ // selected denormalized values that were handled incorrectly in the past
+ {"0x.fffffffffffffp-1022", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact},
+ {"4503599627370495p-1074", smallestNormalFloat64 - math.SmallestNonzeroFloat64, Exact},
+
+ // http://www.exploringbinary.com/php-hangs-on-numeric-value-2-2250738585072011e-308/
+ {"2.2250738585072011e-308", 2.225073858507201e-308, Below},
+ // http://www.exploringbinary.com/java-hangs-when-converting-2-2250738585072012e-308/
+ {"2.2250738585072012e-308", 2.2250738585072014e-308, Above},
} {
+ // conversion should match strconv where syntax is agreeable
+ if f, err := strconv.ParseFloat(test.x, 64); err == nil && f != test.out {
+ t.Errorf("%s: got %g; want %g (incorrect test data)", test.x, f, test.out)
+ }
+
x := makeFloat(test.x)
out, acc := x.Float64()
if out != test.out || acc != test.acc {
- t.Errorf("%s: got %g (%s); want %g (%s)", test.x, out, acc, test.out, test.acc)
+ t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", test.x, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), test.acc)
+ }
+
+ // test that x.SetFloat64(f).Float64() == f
+ var x2 Float
+ out2, acc2 := x2.SetFloat64(out).Float64()
+ if out2 != out || acc2 != Exact {
+ t.Errorf("idempotency test: got %g (%s); want %g (Exact)", out2, acc2, out)
}
}
}
// TestFloatAdd32 tests that Float.Add/Sub of numbers with
-// 24bit mantissa behaves like float32 addition/subtraction.
+// 24bit mantissa behaves like float32 addition/subtraction
+// (excluding denormal numbers).
func TestFloatAdd32(t *testing.T) {
// chose base such that we cross the mantissa precision limit
const base = 1<<26 - 0x10 // 11...110000 (26 bits)
z := new(Float).SetPrec(24)
z.Add(x, y)
- got, acc := z.Float64()
- want := float64(float32(y0) + float32(x0))
+ got, acc := z.Float32()
+ want := float32(y0) + float32(x0)
if got != want || acc != Exact {
t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want)
}
z.Sub(z, y)
- got, acc = z.Float64()
- want = float64(float32(want) - float32(y0))
+ got, acc = z.Float32()
+ want = float32(want) - float32(y0)
if got != want || acc != Exact {
t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want)
}