From b66d83cceaa016297c436fa3da8c7821edf90989 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Thu, 16 Jan 2025 11:23:46 -0500
Subject: [PATCH] math/big: clean up GCD a little
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The GCD code was setting one *Int to the value of another
by smashing one struct on top of the other, instead of using Set.
That was safe in this one case, but it's not idiomatic in math/big
nor safe in general, so rewrite the code not to do that.
(In one case, by swapping variables around; in another, by calling Set.)

The added Set call does slow down GCDs by a small amount,
since the answer has to be copied out. To compensate for that,
optimize a bit: remove the s, t temporaries entirely and handle
vector x word multiplication directly. The net result is that almost
all GCDs are faster, except for small ones, which are a few
nanoseconds slower.

goos: darwin
goarch: arm64
pkg: math/big
cpu: Apple M3 Pro
                              â bench.before â             bench.after             â
                              â    sec/op    â   sec/op     vs base                â
GCD10x10/WithoutXY-12            23.80n Â± 1%   31.71n Â± 1%  +33.24% (p=0.000 n=10)
GCD10x10/WithXY-12              100.40n Â± 0%   92.14n Â± 1%   -8.22% (p=0.000 n=10)
GCD10x100/WithoutXY-12           63.70n Â± 0%   70.73n Â± 0%  +11.05% (p=0.000 n=10)
GCD10x100/WithXY-12              278.6n Â± 0%   233.1n Â± 1%  -16.35% (p=0.000 n=10)
GCD10x1000/WithoutXY-12          153.4n Â± 0%   162.2n Â± 1%   +5.74% (p=0.000 n=10)
GCD10x1000/WithXY-12             456.0n Â± 0%   411.8n Â± 1%   -9.69% (p=0.000 n=10)
GCD10x10000/WithoutXY-12         1.002Âµ Â± 1%   1.036Âµ Â± 0%   +3.39% (p=0.000 n=10)
GCD10x10000/WithXY-12            2.330Âµ Â± 1%   2.210Âµ Â± 0%   -5.13% (p=0.000 n=10)
GCD10x100000/WithoutXY-12        8.894Âµ Â± 0%   8.889Âµ Â± 1%        ~ (p=0.754 n=10)
GCD10x100000/WithXY-12           20.84Âµ Â± 0%   20.24Âµ Â± 0%   -2.84% (p=0.000 n=10)
GCD100x100/WithoutXY-12          373.3n Â± 3%   314.4n Â± 0%  -15.76% (p=0.000 n=10)
GCD100x100/WithXY-12             662.5n Â± 0%   572.4n Â± 1%  -13.59% (p=0.000 n=10)
GCD100x1000/WithoutXY-12         641.8n Â± 0%   598.1n Â± 1%   -6.81% (p=0.000 n=10)
GCD100x1000/WithXY-12            1.123Âµ Â± 0%   1.019Âµ Â± 1%   -9.26% (p=0.000 n=10)
GCD100x10000/WithoutXY-12        2.870Âµ Â± 0%   2.831Âµ Â± 0%   -1.38% (p=0.000 n=10)
GCD100x10000/WithXY-12           4.930Âµ Â± 1%   4.675Âµ Â± 0%   -5.16% (p=0.000 n=10)
GCD100x100000/WithoutXY-12       24.08Âµ Â± 0%   23.97Âµ Â± 0%   -0.48% (p=0.007 n=10)
GCD100x100000/WithXY-12          43.66Âµ Â± 0%   42.52Âµ Â± 0%   -2.61% (p=0.001 n=10)
GCD1000x1000/WithoutXY-12        3.999Âµ Â± 0%   3.569Âµ Â± 1%  -10.75% (p=0.000 n=10)
GCD1000x1000/WithXY-12           6.397Âµ Â± 0%   5.534Âµ Â± 0%  -13.49% (p=0.000 n=10)
GCD1000x10000/WithoutXY-12       6.875Âµ Â± 0%   6.450Âµ Â± 0%   -6.18% (p=0.000 n=10)
GCD1000x10000/WithXY-12          20.75Âµ Â± 1%   19.17Âµ Â± 1%   -7.64% (p=0.000 n=10)
GCD1000x100000/WithoutXY-12      36.38Âµ Â± 0%   35.60Âµ Â± 1%   -2.13% (p=0.000 n=10)
GCD1000x100000/WithXY-12         172.1Âµ Â± 0%   174.4Âµ Â± 3%        ~ (p=0.052 n=10)
GCD10000x10000/WithoutXY-12      79.89Âµ Â± 1%   75.16Âµ Â± 2%   -5.92% (p=0.000 n=10)
GCD10000x10000/WithXY-12         160.1Âµ Â± 0%   150.0Âµ Â± 0%   -6.33% (p=0.000 n=10)
GCD10000x100000/WithoutXY-12     213.2Âµ Â± 1%   209.0Âµ Â± 1%   -1.98% (p=0.000 n=10)
GCD10000x100000/WithXY-12        1.399m Â± 0%   1.342m Â± 3%   -4.08% (p=0.002 n=10)
GCD100000x100000/WithoutXY-12    5.463m Â± 1%   5.504m Â± 2%        ~ (p=0.190 n=10)
GCD100000x100000/WithXY-12       11.36m Â± 0%   11.46m Â± 1%   +0.86% (p=0.000 n=10)
geomean                          6.953Âµ        6.695Âµ        -3.71%

goos: linux
goarch: amd64
pkg: math/big
cpu: AMD Ryzen 9 7950X 16-Core Processor
                              â bench.before â             bench.after             â
                              â    sec/op    â   sec/op     vs base                â
GCD10x10/WithoutXY-32           39.66n Â±  4%   44.34n Â± 4%  +11.77% (p=0.000 n=10)
GCD10x10/WithXY-32              156.7n Â± 12%   130.8n Â± 2%  -16.53% (p=0.000 n=10)
GCD10x100/WithoutXY-32          115.8n Â±  5%   120.2n Â± 2%   +3.89% (p=0.000 n=10)
GCD10x100/WithXY-32             465.3n Â±  3%   368.1n Â± 2%  -20.91% (p=0.000 n=10)
GCD10x1000/WithoutXY-32         201.1n Â±  1%   210.8n Â± 2%   +4.82% (p=0.000 n=10)
GCD10x1000/WithXY-32            652.9n Â±  4%   605.0n Â± 1%   -7.32% (p=0.002 n=10)
GCD10x10000/WithoutXY-32        1.046Âµ Â±  2%   1.143Âµ Â± 1%   +9.33% (p=0.000 n=10)
GCD10x10000/WithXY-32           3.360Âµ Â±  1%   3.258Âµ Â± 1%   -3.04% (p=0.000 n=10)
GCD10x100000/WithoutXY-32       9.391Âµ Â±  3%   9.997Âµ Â± 1%   +6.46% (p=0.000 n=10)
GCD10x100000/WithXY-32          27.92Âµ Â±  1%   28.21Âµ Â± 0%   +1.04% (p=0.043 n=10)
GCD100x100/WithoutXY-32         443.7n Â±  5%   320.0n Â± 2%  -27.88% (p=0.000 n=10)
GCD100x100/WithXY-32            789.9n Â±  2%   690.4n Â± 1%  -12.60% (p=0.000 n=10)
GCD100x1000/WithoutXY-32        718.4n Â±  3%   600.0n Â± 1%  -16.48% (p=0.000 n=10)
GCD100x1000/WithXY-32           1.388Âµ Â±  4%   1.175Âµ Â± 1%  -15.28% (p=0.000 n=10)
GCD100x10000/WithoutXY-32       2.750Âµ Â±  1%   2.668Âµ Â± 1%   -2.96% (p=0.000 n=10)
GCD100x10000/WithXY-32          6.016Âµ Â±  1%   5.590Âµ Â± 1%   -7.09% (p=0.000 n=10)
GCD100x100000/WithoutXY-32      21.40Âµ Â±  1%   22.30Âµ Â± 1%   +4.21% (p=0.000 n=10)
GCD100x100000/WithXY-32         47.02Âµ Â±  4%   48.80Âµ Â± 0%   +3.78% (p=0.015 n=10)
GCD1000x1000/WithoutXY-32       3.417Âµ Â±  4%   3.020Âµ Â± 1%  -11.65% (p=0.000 n=10)
GCD1000x1000/WithXY-32          5.752Âµ Â±  0%   5.418Âµ Â± 2%   -5.81% (p=0.000 n=10)
GCD1000x10000/WithoutXY-32      6.150Âµ Â±  0%   6.246Âµ Â± 1%   +1.55% (p=0.000 n=10)
GCD1000x10000/WithXY-32         24.68Âµ Â±  3%   25.07Âµ Â± 1%        ~ (p=0.051 n=10)
GCD1000x100000/WithoutXY-32     34.60Âµ Â±  2%   36.85Âµ Â± 1%   +6.51% (p=0.000 n=10)
GCD1000x100000/WithXY-32        209.5Âµ Â±  4%   227.4Âµ Â± 0%   +8.56% (p=0.000 n=10)
GCD10000x10000/WithoutXY-32     90.69Âµ Â±  0%   88.48Âµ Â± 0%   -2.44% (p=0.000 n=10)
GCD10000x10000/WithXY-32        197.1Âµ Â±  0%   200.5Âµ Â± 0%   +1.73% (p=0.000 n=10)
GCD10000x100000/WithoutXY-32    239.1Âµ Â±  0%   242.5Âµ Â± 0%   +1.42% (p=0.000 n=10)
GCD10000x100000/WithXY-32       1.963m Â±  3%   2.028m Â± 0%   +3.28% (p=0.000 n=10)
GCD100000x100000/WithoutXY-32   7.466m Â±  0%   7.412m Â± 0%   -0.71% (p=0.000 n=10)
GCD100000x100000/WithXY-32      16.10m Â±  2%   16.47m Â± 0%   +2.25% (p=0.000 n=10)
geomean                         8.388Âµ         8.127Âµ        -3.12%

Change-Id: I161dc409bad11bcc553bc8116449905ae5b06742
Reviewed-on: https://go-review.googlesource.com/c/go/+/650636
Reviewed-by: Robert Griesemer <gri@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
---
 src/math/big/int.go | 76 ++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 45 deletions(-)

diff --git a/src/math/big/int.go b/src/math/big/int.go
index df44e9dccf..0b710c6968 100644
--- a/src/math/big/int.go
+++ b/src/math/big/int.go
@@ -707,42 +707,36 @@ func lehmerSimulate(A, B *Int) (u0, u1, v0, v1 Word, even bool) {
 // For even == true: u0, v1 >= 0 && u1, v0 <= 0
 // For even == false: u0, v1 <= 0 && u1, v0 >= 0
 // q, r, s, t are temporary variables to avoid allocations in the multiplication.
-func lehmerUpdate(A, B, q, r, s, t *Int, u0, u1, v0, v1 Word, even bool) {
-
-	t.abs = t.abs.setWord(u0)
-	s.abs = s.abs.setWord(v0)
-	t.neg = !even
-	s.neg = even
-
-	t.Mul(A, t)
-	s.Mul(B, s)
-
-	r.abs = r.abs.setWord(u1)
-	q.abs = q.abs.setWord(v1)
-	r.neg = even
-	q.neg = !even
-
-	r.Mul(A, r)
-	q.Mul(B, q)
+func lehmerUpdate(A, B, q, r *Int, u0, u1, v0, v1 Word, even bool) {
+	mulW(q, B, even, v0)
+	mulW(r, A, even, u1)
+	mulW(A, A, !even, u0)
+	mulW(B, B, !even, v1)
+	A.Add(A, q)
+	B.Add(B, r)
+}
 
-	A.Add(t, s)
-	B.Add(r, q)
+// mulW sets z = x * (-?)w
+// where the minus sign is present when neg is true.
+func mulW(z, x *Int, neg bool, w Word) {
+	z.abs = z.abs.mulAddWW(x.abs, w, 0)
+	z.neg = x.neg != neg
 }
 
 // euclidUpdate performs a single step of the Euclidean GCD algorithm
 // if extended is true, it also updates the cosequence Ua, Ub.
-func euclidUpdate(A, B, Ua, Ub, q, r, s, t *Int, extended bool) {
-	q, r = q.QuoRem(A, B, r)
-
-	*A, *B, *r = *B, *r, *A
+// q and r are used as temporaries; the initial values are ignored.
+func euclidUpdate(A, B, Ua, Ub, q, r *Int, extended bool) (nA, nB, nr, nUa, nUb *Int) {
+	q.QuoRem(A, B, r)
 
 	if extended {
-		// Ua, Ub = Ub, Ua - q*Ub
-		t.Set(Ub)
-		s.Mul(Ub, q)
-		Ub.Sub(Ua, s)
-		Ua.Set(t)
+		// Ua, Ub = Ub, Ua-q*Ub
+		q.Mul(q, Ub)
+		Ua, Ub = Ub, Ua
+		Ub.Sub(Ub, q)
 	}
+
+	return B, r, A, Ua, Ub
 }
 
 // lehmerGCD sets z to the greatest common divisor of a and b,
@@ -772,8 +766,6 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
 	// temp variables for multiprecision update
 	q := new(Int)
 	r := new(Int)
-	s := new(Int)
-	t := new(Int)
 
 	// ensure A >= B
 	if A.abs.cmp(B.abs) < 0 {
@@ -791,18 +783,18 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
 			// Simulate the effect of the single-precision steps using the cosequences.
 			// A = u0*A + v0*B
 			// B = u1*A + v1*B
-			lehmerUpdate(A, B, q, r, s, t, u0, u1, v0, v1, even)
+			lehmerUpdate(A, B, q, r, u0, u1, v0, v1, even)
 
 			if extended {
 				// Ua = u0*Ua + v0*Ub
 				// Ub = u1*Ua + v1*Ub
-				lehmerUpdate(Ua, Ub, q, r, s, t, u0, u1, v0, v1, even)
+				lehmerUpdate(Ua, Ub, q, r, u0, u1, v0, v1, even)
 			}
 
 		} else {
 			// Single-digit calculations failed to simulate any quotients.
 			// Do a standard Euclidean step.
-			euclidUpdate(A, B, Ua, Ub, q, r, s, t, extended)
+			A, B, r, Ua, Ub = euclidUpdate(A, B, Ua, Ub, q, r, extended)
 		}
 	}
 
@@ -810,7 +802,7 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
 		// extended Euclidean algorithm base case if B is a single Word
 		if len(A.abs) > 1 {
 			// A is longer than a single Word, so one update is needed.
-			euclidUpdate(A, B, Ua, Ub, q, r, s, t, extended)
+			A, B, r, Ua, Ub = euclidUpdate(A, B, Ua, Ub, q, r, extended)
 		}
 		if len(B.abs) > 0 {
 			// A and B are both a single Word.
@@ -828,15 +820,9 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
 					even = !even
 				}
 
-				t.abs = t.abs.setWord(ua)
-				s.abs = s.abs.setWord(va)
-				t.neg = !even
-				s.neg = even
-
-				t.Mul(Ua, t)
-				s.Mul(Ub, s)
-
-				Ua.Add(t, s)
+				mulW(Ua, Ua, !even, ua)
+				mulW(Ub, Ub, even, va)
+				Ua.Add(Ua, Ub)
 			} else {
 				for bWord != 0 {
 					aWord, bWord = bWord, aWord%bWord
@@ -863,13 +849,13 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
 	}
 
 	if x != nil {
-		*x = *Ua
+		x.Set(Ua)
 		if negA {
 			x.neg = !x.neg
 		}
 	}
 
-	*z = *A
+	z.Set(A)
 
 	return z
 }
-- 
2.52.0