From 80b3ff9f827b5e27f03c3a51034745157ebb3301 Mon Sep 17 00:00:00 2001
From: Robert Griesemer
Date: Wed, 7 Jan 2015 17:16:59 -0800
Subject: [PATCH] math/big: faster assembly kernels for AddVx/SubVx for amd64.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Replaced use of rotate instructions (RCRQ, RCLQ) with ADDQ/SBBQ for
restoring/saving the carry flag per suggestion from Torbjörn Granlund
(author of GMP bignum libs for C). The rotate instructions tend to be
slower on today's machines.

benchmark             old ns/op     new ns/op     delta
BenchmarkAddVV_1      5.69          5.51          -3.16%
BenchmarkAddVV_2      7.15          6.87          -3.92%
BenchmarkAddVV_3      8.69          8.06          -7.25%
BenchmarkAddVV_4      8.10          8.13          +0.37%
BenchmarkAddVV_5      8.37          8.47          +1.19%
BenchmarkAddVV_1e1    13.1          12.0          -8.40%
BenchmarkAddVV_1e2    78.1          69.4          -11.14%
BenchmarkAddVV_1e3    815           656           -19.51%
BenchmarkAddVV_1e4    8137          7345          -9.73%
BenchmarkAddVV_1e5    100127        93909         -6.21%
BenchmarkAddVW_1      4.86          4.71          -3.09%
BenchmarkAddVW_2      5.67          5.50          -3.00%
BenchmarkAddVW_3      6.51          6.34          -2.61%
BenchmarkAddVW_4      6.69          6.66          -0.45%
BenchmarkAddVW_5      7.20          7.21          +0.14%
BenchmarkAddVW_1e1    10.0          9.34          -6.60%
BenchmarkAddVW_1e2    45.4          52.3          +15.20%
BenchmarkAddVW_1e3    417           491           +17.75%
BenchmarkAddVW_1e4    4760          4852          +1.93%
BenchmarkAddVW_1e5    69107         67717         -2.01%

benchmark             old MB/s      new MB/s      speedup
BenchmarkAddVV_1      11241.82      11610.28      1.03x
BenchmarkAddVV_2      17902.68      18631.82      1.04x
BenchmarkAddVV_3      22082.43      23835.64      1.08x
BenchmarkAddVV_4      31588.18      31492.06      1.00x
BenchmarkAddVV_5      38229.90      37783.17      0.99x
BenchmarkAddVV_1e1    48891.67      53340.91      1.09x
BenchmarkAddVV_1e2    81940.61      92191.86      1.13x
BenchmarkAddVV_1e3    78443.09      97480.44      1.24x
BenchmarkAddVV_1e4    78644.18      87129.50      1.11x
BenchmarkAddVV_1e5    63918.48      68150.84      1.07x
BenchmarkAddVW_1      13165.09      13581.00      1.03x
BenchmarkAddVW_2      22588.04      23275.41      1.03x
BenchmarkAddVW_3      29483.82      30303.96      1.03x
BenchmarkAddVW_4      38286.54      38453.21      1.00x
BenchmarkAddVW_5      44414.57      44370.59      1.00x
BenchmarkAddVW_1e1    63816.84      68494.08      1.07x
BenchmarkAddVW_1e2    140885.41     122427.16     0.87x
BenchmarkAddVW_1e3    153258.31     130325.28     0.85x
BenchmarkAddVW_1e4    134447.63     131904.02     0.98x
BenchmarkAddVW_1e5    92609.41      94509.88      1.02x

Change-Id: Ia473e9ab9c63a955c252426684176bca566645ae
Reviewed-on: https://go-review.googlesource.com/2503
Reviewed-by: Keith Randall
---
 src/math/big/arith_386.s   |  2 ++
 src/math/big/arith_amd64.s | 57 ++++++++++++++++++--------------------
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s
index 1b47c898f9..649bc4dc88 100644
--- a/src/math/big/arith_386.s
+++ b/src/math/big/arith_386.s
@@ -7,6 +7,8 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
+// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
+
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVL x+0(FP), AX
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index 56c4cb050e..bb06e69b78 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -7,16 +7,6 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
-// Literal instruction for MOVQ $0, CX.
-// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
-#define ZERO_CX BYTE $0x48; \
-		BYTE $0xc7; \
-		BYTE $0xc1; \
-		BYTE $0x00; \
-		BYTE $0x00; \
-		BYTE $0x00; \
-		BYTE $0x00
-
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),NOSPLIT,$0
 	MOVQ x+0(FP), AX
@@ -35,6 +25,11 @@ TEXT ·divWW(SB),NOSPLIT,$0
 	MOVQ DX, r+32(FP)
 	RET
 
+// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
+// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
+// This is faster than using rotate instructions.
+//
+// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
 
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),NOSPLIT,$0
@@ -52,7 +47,7 @@ TEXT ·addVV(SB),NOSPLIT,$0
 
 U1:	// n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -65,7 +60,7 @@ U1:	// n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
@@ -75,17 +70,18 @@ V1:	ADDQ $4, DI		// n += 4
 	JLE E1			// if n <= 0 goto E1
 
 L1:	// n > 0
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	ADCQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 	JG L1			// if n > 0 goto L1
 
-E1:	MOVQ CX, c+72(FP)	// return c
+E1:	NEGQ CX
+	MOVQ CX, c+72(FP)	// return c
 	RET
 
 
@@ -106,7 +102,7 @@ TEXT ·subVV(SB),NOSPLIT,$0
 
 U2:	// n >= 0
 	// regular loop body unrolled 4x
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	MOVQ 8(R8)(SI*8), R12
 	MOVQ 16(R8)(SI*8), R13
@@ -119,7 +115,7 @@ U2:	// n >= 0
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
 	MOVQ R14, 24(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $4, SI		// i += 4
 	SUBQ $4, DI		// n -= 4
@@ -129,17 +125,18 @@ V2:	ADDQ $4, DI		// n += 4
 	JLE E2			// if n <= 0 goto E2
 
 L2:	// n > 0
-	RCRQ $1, CX		// CF = c
+	ADDQ CX, CX		// restore CF
 	MOVQ 0(R8)(SI*8), R11
 	SBBQ 0(R9)(SI*8), R11
 	MOVQ R11, 0(R10)(SI*8)
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
 	JG L2			// if n > 0 goto L2
 
-E2:	MOVQ CX, c+72(FP)	// return c
+E2:	NEGQ CX
+	MOVQ CX, c+72(FP)	// return c
 	RET
 
 
@@ -163,11 +160,11 @@ U3:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	ADDQ CX, R11
-	ZERO_CX
 	ADCQ $0, R12
 	ADCQ $0, R13
 	ADCQ $0, R14
-	SETCS CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -183,8 +180,8 @@ V3:	ADDQ $4, DI		// n += 4
 L3:	// n > 0
 	ADDQ 0(R8)(SI*8), CX
 	MOVQ CX, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
@@ -201,7 +198,7 @@ TEXT ·subVW(SB),NOSPLIT,$0
 	MOVQ x+24(FP), R8
 	MOVQ y+48(FP), CX	// c = y
 	MOVQ z+0(FP), R10
-	
+
 	MOVQ $0, SI		// i = 0
 
 	// s/JL/JMP/ below to disable the unrolled loop
@@ -215,11 +212,11 @@ U4:	// n >= 0
 	MOVQ 16(R8)(SI*8), R13
 	MOVQ 24(R8)(SI*8), R14
 	SUBQ CX, R11
-	ZERO_CX
 	SBBQ $0, R12
 	SBBQ $0, R13
 	SBBQ $0, R14
-	SETCS CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
 	MOVQ R13, 16(R10)(SI*8)
@@ -236,8 +233,8 @@ L4:	// n > 0
 	MOVQ 0(R8)(SI*8), R11
 	SUBQ CX, R11
 	MOVQ R11, 0(R10)(SI*8)
-	ZERO_CX
-	RCLQ $1, CX		// c = CF
+	SBBQ CX, CX		// save CF
+	NEGQ CX
 
 	ADDQ $1, SI		// i++
 	SUBQ $1, DI		// n--
@@ -306,7 +303,7 @@ L9:	MOVQ AX, DX	// w = w1
 	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
 	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
 	ADDQ $1, BX		// i++
-	
+
E9:	CMPQ BX, R11
 	JL L9			// i < n-1
 
-- 
2.50.0
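
Aside (not part of the CL): the SBBQ/ADDQ carry idiom described in the comment block above can be modeled in a few lines of Go. The sketch below is illustrative only; the function name, the use of math/bits, and the uint64 slices are assumptions made for the example, while the real kernels operate on big.Word and stay in assembly.

```go
package main

import (
	"fmt"
	"math/bits"
)

// addVV mirrors the patched addVV inner loop in plain Go.
// The carry lives in c the same way the assembly keeps it in CX:
// 0 when the carry flag was clear, all ones (the value SBBQ CX, CX
// produces) when it was set. ADDQ CX, CX then regenerates the flag,
// because adding an all-ones value to itself overflows while adding
// 0 to itself does not.
func addVV(z, x, y []uint64) (c uint64) {
	for i := range z {
		// "ADDQ CX, CX": recover a 0/1 carry-in from the saved mask.
		// (The assembly reads the top bit via the add; here the low
		// bit is used, and both are 1 exactly when c is all ones.)
		carryIn := c & 1
		// "ADCQ 0(R9)(SI*8), R11": add with carry.
		sum, carryOut := bits.Add64(x[i], y[i], carryIn)
		z[i] = sum
		// "SBBQ CX, CX": store the carry as 0 or all ones.
		c = -carryOut
	}
	// "NEGQ CX" before returning: map 0/all-ones back to the 0/1
	// word that callers of addVV expect.
	return -c
}

func main() {
	x := []uint64{^uint64(0), ^uint64(0)} // 2^128 - 1
	y := []uint64{1, 0}
	z := make([]uint64, 2)
	fmt.Println(addVV(z, x, y), z) // 1 [0 0]: the sum overflows into the carry
}
```

The point of the idiom is that saving and restoring the flag costs one ordinary ALU instruction each (SBBQ/ADDQ on a register), instead of the slower rotate-through-carry instructions the old code used.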