From 067acd51b01f43681d9196c01a293ee5047b69a7 Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Wed, 7 Jan 2015 11:11:57 -0800 Subject: [PATCH] math/big: faster "pure Go" addition/subtraction for long vectors (platforms w/o corresponding assembly kernels) For short vector adds there's some erradic slow-down, but overall these routines have become significantly faster. This only matters for platforms w/o native (assembly) versions of these kernels, so we are not concerned about the minor slow-down for short vectors. This code was already reviewed under Mercurial (golang.org/cl/172810043) but wasn't submitted before the switch to git. Benchmarks run on 2.3GHz Intel Core i7, running OS X 10.9.5, with the respective AddVV and AddVW assembly routines disabled. benchmark old ns/op new ns/op delta BenchmarkAddVV_1 6.59 7.09 +7.59% BenchmarkAddVV_2 10.3 10.1 -1.94% BenchmarkAddVV_3 10.9 12.6 +15.60% BenchmarkAddVV_4 13.9 15.6 +12.23% BenchmarkAddVV_5 16.8 17.3 +2.98% BenchmarkAddVV_1e1 29.5 29.9 +1.36% BenchmarkAddVV_1e2 246 232 -5.69% BenchmarkAddVV_1e3 2374 2185 -7.96% BenchmarkAddVV_1e4 58942 22292 -62.18% BenchmarkAddVV_1e5 668622 225279 -66.31% BenchmarkAddVW_1 6.81 5.58 -18.06% BenchmarkAddVW_2 7.69 6.86 -10.79% BenchmarkAddVW_3 9.56 8.32 -12.97% BenchmarkAddVW_4 12.1 9.53 -21.24% BenchmarkAddVW_5 13.2 10.9 -17.42% BenchmarkAddVW_1e1 23.4 18.0 -23.08% BenchmarkAddVW_1e2 175 141 -19.43% BenchmarkAddVW_1e3 1568 1266 -19.26% BenchmarkAddVW_1e4 15425 12596 -18.34% BenchmarkAddVW_1e5 156737 133539 -14.80% BenchmarkFibo 381678466 132958666 -65.16% benchmark old MB/s new MB/s speedup BenchmarkAddVV_1 9715.25 9028.30 0.93x BenchmarkAddVV_2 12461.72 12622.60 1.01x BenchmarkAddVV_3 17549.64 15243.82 0.87x BenchmarkAddVV_4 18392.54 16398.29 0.89x BenchmarkAddVV_5 18995.23 18496.57 0.97x BenchmarkAddVV_1e1 21708.98 21438.28 0.99x BenchmarkAddVV_1e2 25956.53 27506.88 1.06x BenchmarkAddVV_1e3 26947.93 29286.66 1.09x BenchmarkAddVV_1e4 10857.96 28709.46 2.64x BenchmarkAddVV_1e5 9571.91 28409.21 2.97x BenchmarkAddVW_1 1175.28 1433.98 1.22x BenchmarkAddVW_2 2080.01 2332.54 1.12x BenchmarkAddVW_3 2509.28 2883.97 1.15x BenchmarkAddVW_4 2646.09 3356.83 1.27x BenchmarkAddVW_5 3020.69 3671.07 1.22x BenchmarkAddVW_1e1 3425.76 4441.40 1.30x BenchmarkAddVW_1e2 4553.17 5642.96 1.24x BenchmarkAddVW_1e3 5100.14 6318.72 1.24x BenchmarkAddVW_1e4 5186.15 6350.96 1.22x BenchmarkAddVW_1e5 5104.07 5990.74 1.17x Change-Id: I7a62023b1105248a0e85e5b9819d3fd4266123d4 Reviewed-on: https://go-review.googlesource.com/2480 Reviewed-by: Russ Cox Reviewed-by: Alan Donovan --- src/math/big/arith.go | 69 +++++++++++++++++++++++++++++++++----- src/math/big/arith_test.go | 2 +- src/math/big/nat_test.go | 52 ++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 10 deletions(-) diff --git a/src/math/big/arith.go b/src/math/big/arith.go index 3d5a8682d9..328c85c4f7 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -70,7 +70,7 @@ func mulWW_g(x, y Word) (z1, z0 Word) { // z1<<_W + z0 = x*y + c func mulAddWWW_g(x, y, c Word) (z1, z0 Word) { - z1, zz0 := mulWW(x, y) + z1, zz0 := mulWW_g(x, y) if z0 = zz0 + c; z0 < zz0 { z1++ } @@ -154,32 +154,82 @@ func divWW_g(u1, u0, v Word) (q, r Word) { return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s } +// Keep for performance debugging. +// Using addWW_g is likely slower. +const use_addWW_g = false + +// The resulting carry c is either 0 or 1. func addVV_g(z, x, y []Word) (c Word) { - for i := range z { - c, z[i] = addWW_g(x[i], y[i], c) + if use_addWW_g { + for i := range z { + c, z[i] = addWW_g(x[i], y[i], c) + } + return + } + + for i, xi := range x[:len(z)] { + yi := y[i] + zi := xi + yi + c + z[i] = zi + // see "Hacker's Delight", section 2-12 (overflow detection) + c = (xi&yi | (xi|yi)&^zi) >> (_W - 1) } return } +// The resulting carry c is either 0 or 1. func subVV_g(z, x, y []Word) (c Word) { - for i := range z { - c, z[i] = subWW_g(x[i], y[i], c) + if use_addWW_g { + for i := range z { + c, z[i] = subWW_g(x[i], y[i], c) + } + return + } + + for i, xi := range x[:len(z)] { + yi := y[i] + zi := xi - yi - c + z[i] = zi + // see "Hacker's Delight", section 2-12 (overflow detection) + c = (yi&^xi | (yi|^xi)&zi) >> (_W - 1) } return } +// Argument y must be either 0 or 1. +// The resulting carry c is either 0 or 1. func addVW_g(z, x []Word, y Word) (c Word) { + if use_addWW_g { + c = y + for i := range z { + c, z[i] = addWW_g(x[i], c, 0) + } + return + } + c = y - for i := range z { - c, z[i] = addWW_g(x[i], c, 0) + for i, xi := range x[:len(z)] { + zi := xi + c + z[i] = zi + c = xi &^ zi >> (_W - 1) } return } func subVW_g(z, x []Word, y Word) (c Word) { + if use_addWW_g { + c = y + for i := range z { + c, z[i] = subWW_g(x[i], c, 0) + } + return + } + c = y - for i := range z { - c, z[i] = subWW_g(x[i], c, 0) + for i, xi := range x[:len(z)] { + zi := xi - c + z[i] = zi + c = (zi &^ xi) >> (_W - 1) } return } @@ -222,6 +272,7 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) { return } +// TODO(gri) Remove use of addWW_g here and then we can remove addWW_g and subWW_g. func addMulVVW_g(z, x []Word, y Word) (c Word) { for i := range z { z1, z0 := mulAddWWW_g(x[i], y, z[i]) diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go index 3615a659c3..cd92dd7173 100644 --- a/src/math/big/arith_test.go +++ b/src/math/big/arith_test.go @@ -254,7 +254,7 @@ func benchmarkFunVW(b *testing.B, f funVW, n int) { x := rndV(n) y := rndW() z := make([]Word, n) - b.SetBytes(int64(n * _W)) + b.SetBytes(int64(n * _S)) b.ResetTimer() for i := 0; i < b.N; i++ { f(z, x, y) diff --git a/src/math/big/nat_test.go b/src/math/big/nat_test.go index a2ae53385c..5d93df735d 100644 --- a/src/math/big/nat_test.go +++ b/src/math/big/nat_test.go @@ -769,3 +769,55 @@ func BenchmarkExp3Power0x10000(b *testing.B) { ExpHelper(b, 3, 0x10000) } func BenchmarkExp3Power0x40000(b *testing.B) { ExpHelper(b, 3, 0x40000) } func BenchmarkExp3Power0x100000(b *testing.B) { ExpHelper(b, 3, 0x100000) } func BenchmarkExp3Power0x400000(b *testing.B) { ExpHelper(b, 3, 0x400000) } + +func fibo(n int) nat { + switch n { + case 0: + return nil + case 1: + return nat{1} + } + f0 := fibo(0) + f1 := fibo(1) + var f2 nat + for i := 1; i < n; i++ { + f2 = f2.add(f0, f1) + f0, f1, f2 = f1, f2, f0 + } + return f1 +} + +var fiboNums = []string{ + "0", + "55", + "6765", + "832040", + "102334155", + "12586269025", + "1548008755920", + "190392490709135", + "23416728348467685", + "2880067194370816120", + "354224848179261915075", +} + +func TestFibo(t *testing.T) { + for i, want := range fiboNums { + n := i * 10 + got := fibo(n).decimalString() + if got != want { + t.Errorf("fibo(%d) failed: got %s want %s", n, got, want) + } + } +} + +func BenchmarkFibo(b *testing.B) { + for i := 0; i < b.N; i++ { + fibo(1e0) + fibo(1e1) + fibo(1e2) + fibo(1e3) + fibo(1e4) + fibo(1e5) + } +} -- 2.48.1