From baf426f10fdcb3c11aa20d5034c05c5ba9b0c239 Mon Sep 17 00:00:00 2001
From: Christopher Swenson
Date: Tue, 28 Aug 2012 09:29:45 -0700
Subject: [PATCH] math/big: Replace RCLQ + ANDQ with SETCS in unrolled
 arithmetic assembly.

benchmark             old ns/op    new ns/op    delta
BenchmarkAddVW_1              8            8    +0.60%
BenchmarkAddVW_2             10            9    -8.64%
BenchmarkAddVW_3             10           10    -4.63%
BenchmarkAddVW_4             10           11    +3.67%
BenchmarkAddVW_5             11           12    +5.98%
BenchmarkAddVW_1e1           18           20    +6.38%
BenchmarkAddVW_1e2          129          115   -10.85%
BenchmarkAddVW_1e3         1270         1089   -14.25%
BenchmarkAddVW_1e4        13376        12145    -9.20%
BenchmarkAddVW_1e5       130392       125260    -3.94%

benchmark             old MB/s     new MB/s     speedup
BenchmarkAddVW_1       7709.10      7661.92     0.99x
BenchmarkAddVW_2      12451.10     13604.00     1.09x
BenchmarkAddVW_3      17727.81     18721.54     1.06x
BenchmarkAddVW_4      23552.64     22708.81     0.96x
BenchmarkAddVW_5      27411.40     25816.22     0.94x
BenchmarkAddVW_1e1    34063.19     32023.06     0.94x
BenchmarkAddVW_1e2    49529.97     55360.55     1.12x
BenchmarkAddVW_1e3    50380.44     58764.18     1.17x
BenchmarkAddVW_1e4    47843.59     52696.10     1.10x
BenchmarkAddVW_1e5    49082.60     51093.66     1.04x

R=gri, rsc, r
CC=golang-dev
https://golang.org/cl/6480063
---
 src/pkg/math/big/arith_amd64.s | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/pkg/math/big/arith_amd64.s b/src/pkg/math/big/arith_amd64.s
index 2d10793e63..d4d2d2f1b1 100644
--- a/src/pkg/math/big/arith_amd64.s
+++ b/src/pkg/math/big/arith_amd64.s
@@ -5,6 +5,16 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
+// Literal instruction for MOVQ $0, CX.
+// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
+#define ZERO_CX BYTE $0x48; \
+	BYTE $0xc7; \
+	BYTE $0xc1; \
+	BYTE $0x00; \
+	BYTE $0x00; \
+	BYTE $0x00; \
+	BYTE $0x00
+
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),7,$0
 	MOVQ x+0(FP), AX
@@ -137,7 +147,7 @@ TEXT ·addVW(SB),7,$0
 	MOVQ x+16(FP), R8
 	MOVQ y+32(FP), CX	// c = y
 	MOVQ z+0(FP), R10
-	
+
 	MOVQ $0, SI		// i = 0
 
 	// s/JL/JMP/ below to disable the unrolled loop
@@ -151,15 +161,15 @@ U3:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
+	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
+	SETCS CX		// c = CF
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
 
@@ -171,8 +181,8 @@ V3:	ADDQ $4, DI		// n += 4
 L3:	// n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
+	ZERO_CX
 	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 
@@ -203,15 +213,15 @@ U4:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
+	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
+	SETCS CX		// c = CF
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
 
@@ -224,8 +234,8 @@ L4:	// n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
+	ZERO_CX
 	RCLQ $1, CX		// c = CF
-	ANDQ $1, CX
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 
-- 
2.48.1
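
[Note, not part of the patch] For context: addVW computes z = x + y for a word
vector x and a single word y, returning the final carry; subVW is the analogous
subtraction. The assembly keeps the carry in CX across iterations. The patch
zeroes CX with literal MOVQ bytes so the flags from the ADDQ/ADCQ chain survive
(the assembler would otherwise rewrite MOVQ $0, CX as a flag-clobbering XORQ),
then captures CF into the already-zeroed register with a single SETCS instead
of the two carry-dependent instructions RCLQ + ANDQ. A minimal pure-Go sketch
of the same addVW contract, using math/bits.Add64 (a much later standard-library
addition, shown only to illustrate the semantics, not the patch's code):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// addVW mirrors the assembly routine's contract: z = x + y, where y
	// is a single word whose carry ripples through the vector. The return
	// value is the final carry-out (0 or 1) -- the quantity SETCS extracts
	// from CF in the patched code. Illustrative sketch only.
	func addVW(z, x []uint64, y uint64) (c uint64) {
		c = y
		for i := range x {
			z[i], c = bits.Add64(x[i], c, 0)
		}
		return c
	}

	func main() {
		x := []uint64{^uint64(0), ^uint64(0)} // the two-word value 2^128 - 1
		z := make([]uint64, len(x))
		c := addVW(z, x, 1) // adding 1 overflows every word
		fmt.Println(z, c)   // [0 0] 1
	}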