From 80b3ff9f827b5e27f03c3a51034745157ebb3301 Mon Sep 17 00:00:00 2001
From: Robert Griesemer
Date: Wed, 7 Jan 2015 17:16:59 -0800
Subject: [PATCH] math/big: faster assembly kernels for AddVx/SubVx for amd64.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Replaced use of rotate instructions (RCRQ, RCLQ) with ADDQ/SBBQ for
restoring/saving the carry flag per suggestion from Torbjörn Granlund
(author of GMP bignum libs for C). The rotate instructions tend to be
slower on today's machines.

benchmark             old ns/op     new ns/op     delta
BenchmarkAddVV_1      5.69          5.51          -3.16%
BenchmarkAddVV_2      7.15          6.87          -3.92%
BenchmarkAddVV_3      8.69          8.06          -7.25%
BenchmarkAddVV_4      8.10          8.13          +0.37%
BenchmarkAddVV_5      8.37          8.47          +1.19%
BenchmarkAddVV_1e1    13.1          12.0          -8.40%
BenchmarkAddVV_1e2    78.1          69.4          -11.14%
BenchmarkAddVV_1e3    815           656           -19.51%
BenchmarkAddVV_1e4    8137          7345          -9.73%
BenchmarkAddVV_1e5    100127        93909         -6.21%
BenchmarkAddVW_1      4.86          4.71          -3.09%
BenchmarkAddVW_2      5.67          5.50          -3.00%
BenchmarkAddVW_3      6.51          6.34          -2.61%
BenchmarkAddVW_4      6.69          6.66          -0.45%
BenchmarkAddVW_5      7.20          7.21          +0.14%
BenchmarkAddVW_1e1    10.0          9.34          -6.60%
BenchmarkAddVW_1e2    45.4          52.3          +15.20%
BenchmarkAddVW_1e3    417           491           +17.75%
BenchmarkAddVW_1e4    4760          4852          +1.93%
BenchmarkAddVW_1e5    69107         67717         -2.01%

benchmark             old MB/s      new MB/s      speedup
BenchmarkAddVV_1      11241.82      11610.28      1.03x
BenchmarkAddVV_2      17902.68      18631.82      1.04x
BenchmarkAddVV_3      22082.43      23835.64      1.08x
BenchmarkAddVV_4      31588.18      31492.06      1.00x
BenchmarkAddVV_5      38229.90      37783.17      0.99x
BenchmarkAddVV_1e1    48891.67      53340.91      1.09x
BenchmarkAddVV_1e2    81940.61      92191.86      1.13x
BenchmarkAddVV_1e3    78443.09      97480.44      1.24x
BenchmarkAddVV_1e4    78644.18      87129.50      1.11x
BenchmarkAddVV_1e5    63918.48      68150.84      1.07x
BenchmarkAddVW_1      13165.09      13581.00      1.03x
BenchmarkAddVW_2      22588.04      23275.41      1.03x
BenchmarkAddVW_3      29483.82      30303.96      1.03x
BenchmarkAddVW_4      38286.54      38453.21      1.00x
BenchmarkAddVW_5      44414.57      44370.59      1.00x
BenchmarkAddVW_1e1    63816.84      68494.08      1.07x
BenchmarkAddVW_1e2    140885.41     122427.16     0.87x
BenchmarkAddVW_1e3    153258.31     130325.28     0.85x
BenchmarkAddVW_1e4    134447.63     131904.02     0.98x
BenchmarkAddVW_1e5    92609.41      94509.88      1.02x

Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae
Reviewed-on: https://go-review.googlesource.com/2503
Reviewed-by: Keith Randall
---
 src/math/big/arith_386.s   |  2 ++
 src/math/big/arith_amd64.s | 57 ++++++++++++++++++--------------------
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s
index 1b47c898f9..649bc4dc88 100644
--- a/src/math/big/arith_386.s
+++ b/src/math/big/arith_386.s
@@ -7,6 +7,8 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
+// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
+
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVL x+0(FP), AX
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index 56c4cb050e..bb06e69b78 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -7,16 +7,6 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
-// Literal instruction for MOVQ $0, CX.
-// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
-#define ZERO_CX BYTE $0x48; \
-		BYTE $0xc7; \
-		BYTE $0xc1; \
-		BYTE $0x00; \
-		BYTE $0x00; \
-		BYTE $0x00; \
-		BYTE $0x00
-
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVQ x+0(FP), AX
@@ -35,6 +25,11 @@ TEXT ·divWW(SB),NOSPLIT,$0
 	MOVQ DX, r+32(FP)
 	RET
 
+// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
+// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
+// This is faster than using rotate instructions.
+//
+// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
 
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),NOSPLIT,$0
@@ -52,7 +47,7 @@ TEXT ·addVV(SB),NOSPLIT,$0
 
 U1:	// n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -65,7 +60,7 @@ U1:	// n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
@@ -75,17 +70,18 @@ V1:	ADDQ $4, DI		// n += 4
 	JLE E1			// if n <= 0 goto E1
 
 L1:	// n > 0
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	ADCQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 	JG L1			// if n > 0 goto L1
 
-E1:	MOVQ CX, c+72(FP)	// return c
+E1:	NEGQ CX
+	MOVQ CX, c+72(FP)	// return c
 	RET
 
 
@@ -106,7 +102,7 @@ TEXT ·subVV(SB),NOSPLIT,$0
 
 U2:	// n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -119,7 +115,7 @@ U2:	// n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
@@ -129,17 +125,18 @@ V2:	ADDQ $4, DI		// n += 4
 	JLE E2			// if n <= 0 goto E2
 
 L2:	// n > 0
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	SBBQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 	JG L2			// if n > 0 goto L2
 
-E2:	MOVQ CX, c+72(FP)	// return c
+E2:	NEGQ CX
+	MOVQ CX, c+72(FP)	// return c
 	RET
 
 
@@ -163,11 +160,11 @@ U3:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
-	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
-	SETCS CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -183,8 +180,8 @@ V3:	ADDQ $4, DI		// n += 4
 L3:	// n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
@@ -201,7 +198,7 @@ TEXT ·subVW(SB),NOSPLIT,$0
 	MOVQ x+24(FP), R8
 	MOVQ y+48(FP), CX	// c = y
 	MOVQ z+0(FP), R10
-	
+
 	MOVQ $0, SI		// i = 0
 
 	// s/JL/JMP/ below to disable the unrolled loop
@@ -215,11 +212,11 @@ U4:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
-	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
-	SETCS CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -236,8 +233,8 @@ L4:	// n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
@@ -306,7 +303,7 @@ L9:	MOVQ AX, DX	// w = w1
 	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
 	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
 	ADDQ $1, BX		// i++
-	
+
E9:	CMPQ BX, R11
 	JL L9			// i < n-1
 
-- 
2.50.0
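
Aside (not part of the CL): the SBBQ/ADDQ carry idiom described in the comment block above can be modeled in a few lines of Go. The sketch below is illustrative only; the function name, the use of math/bits, and the uint64 slices are assumptions made for the example, while the real kernels operate on big.Word and stay in assembly.

```go
package main

import (
	"fmt"
	"math/bits"
)

// addVV mirrors the patched addVV inner loop in plain Go.
// The carry lives in c the same way the assembly keeps it in CX:
// 0 when the carry flag was clear, all ones (the value SBBQ CX, CX
// produces) when it was set. ADDQ CX, CX then regenerates the flag,
// because adding an all-ones value to itself overflows while adding
// 0 to itself does not.
func addVV(z, x, y []uint64) (c uint64) {
	for i := range z {
		// "ADDQ CX, CX": recover a 0/1 carry-in from the saved mask.
		// (The assembly reads the top bit via the add; here the low
		// bit is used, and both are 1 exactly when c is all ones.)
		carryIn := c & 1
		// "ADCQ 0(R9)(SI*8), R11": add with carry.
		sum, carryOut := bits.Add64(x[i], y[i], carryIn)
		z[i] = sum
		// "SBBQ CX, CX": store the carry as 0 or all ones.
		c = -carryOut
	}
	// "NEGQ CX" before returning: map 0/all-ones back to the 0/1
	// word that callers of addVV expect.
	return -c
}

func main() {
	x := []uint64{^uint64(0), ^uint64(0)} // 2^128 - 1
	y := []uint64{1, 0}
	z := make([]uint64, 2)
	fmt.Println(addVV(z, x, y), z) // 1 [0 0]: the sum overflows into the carry
}
```

The point of the idiom is that saving and restoring the flag costs one ordinary ALU instruction each (SBBQ/ADDQ on a register), instead of the slower rotate-through-carry instructions the old code used.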