From fe24837c4de6dac36f3496e6bac85f72209ee841 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Sun, 3 Mar 2019 14:47:20 -0800 Subject: [PATCH] math/big: add fast path for pure Go addVW for large z MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In the normal case, only a few words have to be updated when adding a word to a vector. When that happens, we can simply copy the rest of the words, which is much faster. However, the overhead of that makes it prohibitive for small vectors, so we check the size at the beginning. The implementation is a bit weird to allow addVW to continued to be inlined; see #30548. The AddVW benchmarks are surprising, but fully repeatable. The SubVW benchmarks are more or less as expected. I expect that removing the indirect function call will help both and make them a bit more normal. name old time/op new time/op delta AddVW/1-8 4.27ns ± 2% 3.81ns ± 3% -10.83% (p=0.000 n=89+90) AddVW/2-8 4.91ns ± 2% 4.34ns ± 1% -11.60% (p=0.000 n=83+90) AddVW/3-8 5.77ns ± 4% 5.76ns ± 2% ~ (p=0.365 n=91+87) AddVW/4-8 6.03ns ± 1% 6.03ns ± 1% ~ (p=0.392 n=80+76) AddVW/5-8 6.48ns ± 2% 6.63ns ± 1% +2.27% (p=0.000 n=76+74) AddVW/10-8 9.56ns ± 2% 9.56ns ± 1% -0.02% (p=0.002 n=69+76) AddVW/100-8 90.6ns ± 0% 18.1ns ± 4% -79.99% (p=0.000 n=72+94) AddVW/1000-8 865ns ± 0% 85ns ± 6% -90.14% (p=0.000 n=66+96) AddVW/10000-8 8.57µs ± 2% 1.82µs ± 3% -78.73% (p=0.000 n=99+94) AddVW/100000-8 84.4µs ± 2% 31.8µs ± 4% -62.29% (p=0.000 n=93+98) name old time/op new time/op delta SubVW/1-8 3.90ns ± 2% 4.13ns ± 4% +6.02% (p=0.000 n=92+95) SubVW/2-8 4.15ns ± 1% 5.20ns ± 1% +25.22% (p=0.000 n=83+85) SubVW/3-8 5.50ns ± 2% 6.22ns ± 6% +13.21% (p=0.000 n=91+97) SubVW/4-8 5.99ns ± 1% 6.63ns ± 1% +10.63% (p=0.000 n=79+61) SubVW/5-8 6.75ns ± 4% 6.88ns ± 2% +1.82% (p=0.000 n=98+73) SubVW/10-8 9.57ns ± 1% 9.56ns ± 1% -0.13% (p=0.000 n=77+64) SubVW/100-8 90.3ns ± 1% 18.1ns ± 2% -80.00% (p=0.000 n=75+94) SubVW/1000-8 860ns ± 4% 85ns ± 7% -90.14% (p=0.000 n=97+99) SubVW/10000-8 8.51µs ± 3% 1.77µs ± 6% -79.21% (p=0.000 n=100+97) SubVW/100000-8 84.4µs ± 3% 31.5µs ± 3% -62.66% (p=0.000 n=92+92) Change-Id: I721d7031d40f245b4a284f5bdd93e7bb85e7e937 Reviewed-on: https://go-review.googlesource.com/c/go/+/164968 Run-TryBot: Josh Bleecher Snyder TryBot-Result: Gobot Gobot Reviewed-by: Robert Griesemer --- src/math/big/arith.go | 44 +++++++++++++++++++++++++++++++-- src/math/big/arith_decl_pure.go | 14 +++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/math/big/arith.go b/src/math/big/arith.go index c291f74db6..ed51f38836 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -3,8 +3,10 @@ // license that can be found in the LICENSE file. // This file provides Go implementations of elementary multi-precision -// arithmetic operations on word vectors. Needed for platforms without -// assembly implementations of these routines. +// arithmetic operations on word vectors. These have the suffix _g. +// These are needed for platforms without assembly implementations of these routines. +// This file also contains elementary operations that can be implemented +// sufficiently efficiently in Go. package big @@ -98,6 +100,28 @@ func addVW_g(z, x []Word, y Word) (c Word) { return } +// addVWlarge is addVW, but intended for large z. +// The only difference is that we check on every iteration +// whether we are done with carries, +// and if so, switch to a much faster copy instead. +// This is only a good idea for large z, +// because the overhead of the check and the function call +// outweigh the benefits when z is small. +func addVWlarge(z, x []Word, y Word) (c Word) { + c = y + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + if c == 0 { + copy(z[i:], x[i:]) + return + } + zi, cc := bits.Add(uint(x[i]), uint(c), 0) + z[i] = Word(zi) + c = Word(cc) + } + return +} + func subVW_g(z, x []Word, y Word) (c Word) { c = y // The comment near the top of this file discusses this for loop condition. @@ -109,6 +133,22 @@ func subVW_g(z, x []Word, y Word) (c Word) { return } +// subVWlarge is to subVW as addVWlarge is to addVW. +func subVWlarge(z, x []Word, y Word) (c Word) { + c = y + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + if c == 0 { + copy(z[i:], x[i:]) + return + } + zi, cc := bits.Sub(uint(x[i]), uint(c), 0) + z[i] = Word(zi) + c = Word(cc) + } + return +} + func shlVU_g(z, x []Word, s uint) (c Word) { if s == 0 { copy(z, x) diff --git a/src/math/big/arith_decl_pure.go b/src/math/big/arith_decl_pure.go index 4ae49c123d..305f7ee03b 100644 --- a/src/math/big/arith_decl_pure.go +++ b/src/math/big/arith_decl_pure.go @@ -23,11 +23,21 @@ func subVV(z, x, y []Word) (c Word) { } func addVW(z, x []Word, y Word) (c Word) { - return addVW_g(z, x, y) + // TODO: remove indirect function call when golang.org/issue/30548 is fixed + fn := addVW_g + if len(z) > 32 { + fn = addVWlarge + } + return fn(z, x, y) } func subVW(z, x []Word, y Word) (c Word) { - return subVW_g(z, x, y) + // TODO: remove indirect function call when golang.org/issue/30548 is fixed + fn := subVW_g + if len(z) > 32 { + fn = subVWlarge + } + return fn(z, x, y) } func shlVU(z, x []Word, s uint) (c Word) { -- 2.50.0