From baf426f10fdcb3c11aa20d5034c05c5ba9b0c239 Mon Sep 17 00:00:00 2001
From: Christopher Swenson
Date: Tue, 28 Aug 2012 09:29:45 -0700
Subject: [PATCH] math/big: Replace RCLQ + ANDQ with SETCS in unrolled
 arithmetic assembly.

benchmark             old ns/op    new ns/op    delta
BenchmarkAddVW_1              8            8    +0.60%
BenchmarkAddVW_2             10            9    -8.64%
BenchmarkAddVW_3             10           10    -4.63%
BenchmarkAddVW_4             10           11    +3.67%
BenchmarkAddVW_5             11           12    +5.98%
BenchmarkAddVW_1e1           18           20    +6.38%
BenchmarkAddVW_1e2          129          115   -10.85%
BenchmarkAddVW_1e3         1270         1089   -14.25%
BenchmarkAddVW_1e4        13376        12145    -9.20%
BenchmarkAddVW_1e5       130392       125260    -3.94%

benchmark             old MB/s     new MB/s     speedup
BenchmarkAddVW_1       7709.10      7661.92     0.99x
BenchmarkAddVW_2      12451.10     13604.00     1.09x
BenchmarkAddVW_3      17727.81     18721.54     1.06x
BenchmarkAddVW_4      23552.64     22708.81     0.96x
BenchmarkAddVW_5      27411.40     25816.22     0.94x
BenchmarkAddVW_1e1    34063.19     32023.06     0.94x
BenchmarkAddVW_1e2    49529.97     55360.55     1.12x
BenchmarkAddVW_1e3    50380.44     58764.18     1.17x
BenchmarkAddVW_1e4    47843.59     52696.10     1.10x
BenchmarkAddVW_1e5    49082.60     51093.66     1.04x

R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/6480063
---
 src/pkg/math/big/arith_amd64.s | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/pkg/math/big/arith_amd64.s b/src/pkg/math/big/arith_amd64.s
index 2d10793e63..d4d2d2f1b1 100644
--- a/src/pkg/math/big/arith_amd64.s
+++ b/src/pkg/math/big/arith_amd64.s
@@ -5,6 +5,16 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
+// Literal instruction for MOVQ $0, CX.
+// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
+#define ZERO_CX BYTE $0x48; \
+	BYTE $0xc7; \
+	BYTE $0xc1; \
+	BYTE $0x00; \
+	BYTE $0x00; \
+	BYTE $0x00; \
+	BYTE $0x00
+
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),7,$0
 	MOVQ x+0(FP), AX
@@ -137,7 +147,7 @@ TEXT ·addVW(SB),7,$0
 	MOVQ x+16(FP), R8
 	MOVQ y+32(FP), CX	// c = y
 	MOVQ z+0(FP), R10
-	
+
 	MOVQ $0, SI		// i = 0
 
 	// s/JL/JMP/ below to disable the unrolled loop
@@ -151,15 +161,15 @@ U3:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
+	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
+	SETCS CX		// c = CF
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
 
@@ -171,8 +181,8 @@ V3:	ADDQ $4, DI		// n += 4
 L3:	// n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
+	ZERO_CX
 	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 
@@ -203,15 +213,15 @@ U4:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
+	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
+	SETCS CX		// c = CF
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
 
@@ -224,8 +234,8 @@ L4:	// n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
+	ZERO_CX
 	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 
-- 
2.48.1
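
[Note, not part of the patch] For context: addVW computes z = x + y for a word
vector x and a single word y, returning the final carry; subVW is the analogous
subtraction. The assembly keeps the carry in CX across iterations. The patch
zeroes CX with literal MOVQ bytes so the flags from the ADDQ/ADCQ chain survive
(the assembler would otherwise rewrite MOVQ $0, CX as a flag-clobbering XORQ),
then captures CF into the already-zeroed register with a single SETCS instead
of the two carry-dependent instructions RCLQ + ANDQ. A minimal pure-Go sketch
of the same addVW contract, using math/bits.Add64 (a much later standard-library
addition, shown only to illustrate the semantics, not the patch's code):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// addVW mirrors the assembly routine's contract: z = x + y, where y
	// is a single word whose carry ripples through the vector. The return
	// value is the final carry-out (0 or 1) -- the quantity SETCS extracts
	// from CF in the patched code. Illustrative sketch only.
	func addVW(z, x []uint64, y uint64) (c uint64) {
		c = y
		for i := range x {
			z[i], c = bits.Add64(x[i], c, 0)
		}
		return c
	}

	func main() {
		x := []uint64{^uint64(0), ^uint64(0)} // the two-word value 2^128 - 1
		z := make([]uint64, len(x))
		c := addVW(z, x, 1) // adding 1 overflows every word
		fmt.Println(z, c)   // [0 0] 1
	}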