From 067acd51b01f43681d9196c01a293ee5047b69a7 Mon Sep 17 00:00:00 2001
From: Robert Griesemer <gri@golang.org>
Date: Wed, 7 Jan 2015 11:11:57 -0800
Subject: [PATCH] math/big: faster "pure Go" addition/subtraction for long
 vectors           (platforms w/o corresponding assembly kernels)

For short vector adds there's some erradic slow-down, but overall
these routines have become significantly faster. This only matters
for platforms w/o native (assembly) versions of these kernels, so
we are not concerned about the minor slow-down for short vectors.

This code was already reviewed under Mercurial (golang.org/cl/172810043)
but wasn't submitted before the switch to git.

Benchmarks run on 2.3GHz Intel Core i7, running OS X 10.9.5,
with the respective AddVV and AddVW assembly routines disabled.

benchmark              old ns/op     new ns/op     delta
BenchmarkAddVV_1       6.59          7.09          +7.59%
BenchmarkAddVV_2       10.3          10.1          -1.94%
BenchmarkAddVV_3       10.9          12.6          +15.60%
BenchmarkAddVV_4       13.9          15.6          +12.23%
BenchmarkAddVV_5       16.8          17.3          +2.98%
BenchmarkAddVV_1e1     29.5          29.9          +1.36%
BenchmarkAddVV_1e2     246           232           -5.69%
BenchmarkAddVV_1e3     2374          2185          -7.96%
BenchmarkAddVV_1e4     58942         22292         -62.18%
BenchmarkAddVV_1e5     668622        225279        -66.31%
BenchmarkAddVW_1       6.81          5.58          -18.06%
BenchmarkAddVW_2       7.69          6.86          -10.79%
BenchmarkAddVW_3       9.56          8.32          -12.97%
BenchmarkAddVW_4       12.1          9.53          -21.24%
BenchmarkAddVW_5       13.2          10.9          -17.42%
BenchmarkAddVW_1e1     23.4          18.0          -23.08%
BenchmarkAddVW_1e2     175           141           -19.43%
BenchmarkAddVW_1e3     1568          1266          -19.26%
BenchmarkAddVW_1e4     15425         12596         -18.34%
BenchmarkAddVW_1e5     156737        133539        -14.80%
BenchmarkFibo          381678466     132958666     -65.16%

benchmark              old MB/s     new MB/s     speedup
BenchmarkAddVV_1       9715.25      9028.30      0.93x
BenchmarkAddVV_2       12461.72     12622.60     1.01x
BenchmarkAddVV_3       17549.64     15243.82     0.87x
BenchmarkAddVV_4       18392.54     16398.29     0.89x
BenchmarkAddVV_5       18995.23     18496.57     0.97x
BenchmarkAddVV_1e1     21708.98     21438.28     0.99x
BenchmarkAddVV_1e2     25956.53     27506.88     1.06x
BenchmarkAddVV_1e3     26947.93     29286.66     1.09x
BenchmarkAddVV_1e4     10857.96     28709.46     2.64x
BenchmarkAddVV_1e5     9571.91      28409.21     2.97x
BenchmarkAddVW_1       1175.28      1433.98      1.22x
BenchmarkAddVW_2       2080.01      2332.54      1.12x
BenchmarkAddVW_3       2509.28      2883.97      1.15x
BenchmarkAddVW_4       2646.09      3356.83      1.27x
BenchmarkAddVW_5       3020.69      3671.07      1.22x
BenchmarkAddVW_1e1     3425.76      4441.40      1.30x
BenchmarkAddVW_1e2     4553.17      5642.96      1.24x
BenchmarkAddVW_1e3     5100.14      6318.72      1.24x
BenchmarkAddVW_1e4     5186.15      6350.96      1.22x
BenchmarkAddVW_1e5     5104.07      5990.74      1.17x

Change-Id: I7a62023b1105248a0e85e5b9819d3fd4266123d4
Reviewed-on: https://go-review.googlesource.com/2480
Reviewed-by: Russ Cox <rsc@golang.org>
Reviewed-by: Alan Donovan <adonovan@google.com>
---
 src/math/big/arith.go      | 69 +++++++++++++++++++++++++++++++++-----
 src/math/big/arith_test.go |  2 +-
 src/math/big/nat_test.go   | 52 ++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/src/math/big/arith.go b/src/math/big/arith.go
index 3d5a8682d9..328c85c4f7 100644
--- a/src/math/big/arith.go
+++ b/src/math/big/arith.go
@@ -70,7 +70,7 @@ func mulWW_g(x, y Word) (z1, z0 Word) {
 
 // z1<<_W + z0 = x*y + c
 func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
-	z1, zz0 := mulWW(x, y)
+	z1, zz0 := mulWW_g(x, y)
 	if z0 = zz0 + c; z0 < zz0 {
 		z1++
 	}
@@ -154,32 +154,82 @@ func divWW_g(u1, u0, v Word) (q, r Word) {
 	return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s
 }
 
+// Keep for performance debugging.
+// Using addWW_g is likely slower.
+const use_addWW_g = false
+
+// The resulting carry c is either 0 or 1.
 func addVV_g(z, x, y []Word) (c Word) {
-	for i := range z {
-		c, z[i] = addWW_g(x[i], y[i], c)
+	if use_addWW_g {
+		for i := range z {
+			c, z[i] = addWW_g(x[i], y[i], c)
+		}
+		return
+	}
+
+	for i, xi := range x[:len(z)] {
+		yi := y[i]
+		zi := xi + yi + c
+		z[i] = zi
+		// see "Hacker's Delight", section 2-12 (overflow detection)
+		c = (xi&yi | (xi|yi)&^zi) >> (_W - 1)
 	}
 	return
 }
 
+// The resulting carry c is either 0 or 1.
 func subVV_g(z, x, y []Word) (c Word) {
-	for i := range z {
-		c, z[i] = subWW_g(x[i], y[i], c)
+	if use_addWW_g {
+		for i := range z {
+			c, z[i] = subWW_g(x[i], y[i], c)
+		}
+		return
+	}
+
+	for i, xi := range x[:len(z)] {
+		yi := y[i]
+		zi := xi - yi - c
+		z[i] = zi
+		// see "Hacker's Delight", section 2-12 (overflow detection)
+		c = (yi&^xi | (yi|^xi)&zi) >> (_W - 1)
 	}
 	return
 }
 
+// Argument y must be either 0 or 1.
+// The resulting carry c is either 0 or 1.
 func addVW_g(z, x []Word, y Word) (c Word) {
+	if use_addWW_g {
+		c = y
+		for i := range z {
+			c, z[i] = addWW_g(x[i], c, 0)
+		}
+		return
+	}
+
 	c = y
-	for i := range z {
-		c, z[i] = addWW_g(x[i], c, 0)
+	for i, xi := range x[:len(z)] {
+		zi := xi + c
+		z[i] = zi
+		c = xi &^ zi >> (_W - 1)
 	}
 	return
 }
 
 func subVW_g(z, x []Word, y Word) (c Word) {
+	if use_addWW_g {
+		c = y
+		for i := range z {
+			c, z[i] = subWW_g(x[i], c, 0)
+		}
+		return
+	}
+
 	c = y
-	for i := range z {
-		c, z[i] = subWW_g(x[i], c, 0)
+	for i, xi := range x[:len(z)] {
+		zi := xi - c
+		z[i] = zi
+		c = (zi &^ xi) >> (_W - 1)
 	}
 	return
 }
@@ -222,6 +272,7 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
 	return
 }
 
+// TODO(gri) Remove use of addWW_g here and then we can remove addWW_g and subWW_g.
 func addMulVVW_g(z, x []Word, y Word) (c Word) {
 	for i := range z {
 		z1, z0 := mulAddWWW_g(x[i], y, z[i])
diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go
index 3615a659c3..cd92dd7173 100644
--- a/src/math/big/arith_test.go
+++ b/src/math/big/arith_test.go
@@ -254,7 +254,7 @@ func benchmarkFunVW(b *testing.B, f funVW, n int) {
 	x := rndV(n)
 	y := rndW()
 	z := make([]Word, n)
-	b.SetBytes(int64(n * _W))
+	b.SetBytes(int64(n * _S))
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		f(z, x, y)
diff --git a/src/math/big/nat_test.go b/src/math/big/nat_test.go
index a2ae53385c..5d93df735d 100644
--- a/src/math/big/nat_test.go
+++ b/src/math/big/nat_test.go
@@ -769,3 +769,55 @@ func BenchmarkExp3Power0x10000(b *testing.B)  { ExpHelper(b, 3, 0x10000) }
 func BenchmarkExp3Power0x40000(b *testing.B)  { ExpHelper(b, 3, 0x40000) }
 func BenchmarkExp3Power0x100000(b *testing.B) { ExpHelper(b, 3, 0x100000) }
 func BenchmarkExp3Power0x400000(b *testing.B) { ExpHelper(b, 3, 0x400000) }
+
+func fibo(n int) nat {
+	switch n {
+	case 0:
+		return nil
+	case 1:
+		return nat{1}
+	}
+	f0 := fibo(0)
+	f1 := fibo(1)
+	var f2 nat
+	for i := 1; i < n; i++ {
+		f2 = f2.add(f0, f1)
+		f0, f1, f2 = f1, f2, f0
+	}
+	return f1
+}
+
+var fiboNums = []string{
+	"0",
+	"55",
+	"6765",
+	"832040",
+	"102334155",
+	"12586269025",
+	"1548008755920",
+	"190392490709135",
+	"23416728348467685",
+	"2880067194370816120",
+	"354224848179261915075",
+}
+
+func TestFibo(t *testing.T) {
+	for i, want := range fiboNums {
+		n := i * 10
+		got := fibo(n).decimalString()
+		if got != want {
+			t.Errorf("fibo(%d) failed: got %s want %s", n, got, want)
+		}
+	}
+}
+
+func BenchmarkFibo(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		fibo(1e0)
+		fibo(1e1)
+		fibo(1e2)
+		fibo(1e3)
+		fibo(1e4)
+		fibo(1e5)
+	}
+}
-- 
2.52.0