math/big: faster assembly kernels for AddVx/SubVx for amd64.

author Robert Griesemer <gri@golang.org>

Thu, 8 Jan 2015 01:16:59 +0000 (17:16 -0800)

committer Robert Griesemer <gri@golang.org>

Thu, 8 Jan 2015 16:57:11 +0000 (16:57 +0000)
author Robert Griesemer <gri@golang.org>
Thu, 8 Jan 2015 01:16:59 +0000 (17:16 -0800)
committer Robert Griesemer <gri@golang.org>
Thu, 8 Jan 2015 16:57:11 +0000 (16:57 +0000)
diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s

index 1b47c898f91ee417ae1b82d58a297a49db354583..649bc4dc88f336d150c63dc533af3a0428a6bc93 100644 (file)
--- a/src/math/big/arith_386.s
+++ b/src/math/big/arith_386.s
@@ -7,6 +7,8 @@
  // This file provides fast assembly versions for the elementary
  // arithmetic operations on vectors implemented in arith.go.
  
+// TODO(gri) Replace uses of RCRL/RCLL with ADDL/SBBL respectively.
+
  // func mulWW(x, y Word) (z1, z0 Word)
  TEXT ·mulWW(SB),NOSPLIT,$0
         MOVL x+0(FP), AX
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s

index 56c4cb050e4f00cc9a55996281c1905ee891c3e0..bb06e69b782b3430d8babb403c0d54ad26e6fc8e 100644 (file)
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -7,16 +7,6 @@
  // This file provides fast assembly versions for the elementary
  // arithmetic operations on vectors implemented in arith.go.
  
-// Literal instruction for MOVQ $0, CX.
-// (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
-#define ZERO_CX BYTE $0x48; \
-               BYTE $0xc7; \
-               BYTE $0xc1; \
-               BYTE $0x00; \
-               BYTE $0x00; \
-               BYTE $0x00; \
-               BYTE $0x00
-
  // func mulWW(x, y Word) (z1, z0 Word)
  TEXT ·mulWW(SB),NOSPLIT,$0
         MOVQ x+0(FP), AX
@@ -35,6 +25,11 @@ TEXT ·divWW(SB),NOSPLIT,$0
         MOVQ DX, r+32(FP)
         RET
  
+// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
+// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
+// This is faster than using rotate instructions.
+//
+// CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
  
  // func addVV(z, x, y []Word) (c Word)
  TEXT ·addVV(SB),NOSPLIT,$0
@@ -52,7 +47,7 @@ TEXT ·addVV(SB),NOSPLIT,$0
  
  U1:    // n >= 0
         // regular loop body unrolled 4x
-       RCRQ $1, CX             // CF = c
+       ADDQ CX, CX             // restore CF
         MOVQ 0(R8)(SI*8), R11
         MOVQ 8(R8)(SI*8), R12
         MOVQ 16(R8)(SI*8), R13
@@ -65,7 +60,7 @@ U1:   // n >= 0
         MOVQ R12, 8(R10)(SI*8)
         MOVQ R13, 16(R10)(SI*8)
         MOVQ R14, 24(R10)(SI*8)
-       RCLQ $1, CX             // c = CF
+       SBBQ CX, CX             // save CF
  
         ADDQ $4, SI             // i += 4
         SUBQ $4, DI             // n -= 4
@@ -75,17 +70,18 @@ V1: ADDQ $4, DI             // n += 4
         JLE E1                  // if n <= 0 goto E1
  
  L1:    // n > 0
-       RCRQ $1, CX             // CF = c
+       ADDQ CX, CX             // restore CF
         MOVQ 0(R8)(SI*8), R11
         ADCQ 0(R9)(SI*8), R11
         MOVQ R11, 0(R10)(SI*8)
-       RCLQ $1, CX             // c = CF
+       SBBQ CX, CX             // save CF
  
         ADDQ $1, SI             // i++
         SUBQ $1, DI             // n--
         JG L1                   // if n > 0 goto L1
  
-E1:    MOVQ CX, c+72(FP)       // return c
+E1:    NEGQ CX
+       MOVQ CX, c+72(FP)       // return c
         RET
  
  
@@ -106,7 +102,7 @@ TEXT ·subVV(SB),NOSPLIT,$0
  
  U2:    // n >= 0
         // regular loop body unrolled 4x
-       RCRQ $1, CX             // CF = c
+       ADDQ CX, CX             // restore CF
         MOVQ 0(R8)(SI*8), R11
         MOVQ 8(R8)(SI*8), R12
         MOVQ 16(R8)(SI*8), R13
@@ -119,7 +115,7 @@ U2: // n >= 0
         MOVQ R12, 8(R10)(SI*8)
         MOVQ R13, 16(R10)(SI*8)
         MOVQ R14, 24(R10)(SI*8)
-       RCLQ $1, CX             // c = CF
+       SBBQ CX, CX             // save CF
  
         ADDQ $4, SI             // i += 4
         SUBQ $4, DI             // n -= 4
@@ -129,17 +125,18 @@ V2:       ADDQ $4, DI             // n += 4
         JLE E2                  // if n <= 0 goto E2
  
  L2:    // n > 0
-       RCRQ $1, CX             // CF = c
+       ADDQ CX, CX             // restore CF
         MOVQ 0(R8)(SI*8), R11
         SBBQ 0(R9)(SI*8), R11
         MOVQ R11, 0(R10)(SI*8)
-       RCLQ $1, CX             // c = CF
+       SBBQ CX, CX             // save CF
  
         ADDQ $1, SI             // i++
         SUBQ $1, DI             // n--
         JG L2                   // if n > 0 goto L2
  
-E2:    MOVQ CX, c+72(FP)       // return c
+E2:    NEGQ CX
+       MOVQ CX, c+72(FP)       // return c
         RET
  
  
@@ -163,11 +160,11 @@ U3:       // n >= 0
         MOVQ 16(R8)(SI*8), R13
         MOVQ 24(R8)(SI*8), R14
         ADDQ CX, R11
-       ZERO_CX
         ADCQ $0, R12
         ADCQ $0, R13
         ADCQ $0, R14
-       SETCS CX                // c = CF
+       SBBQ CX, CX             // save CF
+       NEGQ CX
         MOVQ R11, 0(R10)(SI*8)
         MOVQ R12, 8(R10)(SI*8)
         MOVQ R13, 16(R10)(SI*8)
@@ -183,8 +180,8 @@ V3: ADDQ $4, DI             // n += 4
  L3:    // n > 0
         ADDQ 0(R8)(SI*8), CX
         MOVQ CX, 0(R10)(SI*8)
-       ZERO_CX
-       RCLQ $1, CX             // c = CF
+       SBBQ CX, CX             // save CF
+       NEGQ CX
  
         ADDQ $1, SI             // i++
         SUBQ $1, DI             // n--
@@ -201,7 +198,7 @@ TEXT ·subVW(SB),NOSPLIT,$0
         MOVQ x+24(FP), R8
         MOVQ y+48(FP), CX       // c = y
         MOVQ z+0(FP), R10
-       
+
         MOVQ $0, SI             // i = 0
  
         // s/JL/JMP/ below to disable the unrolled loop
@@ -215,11 +212,11 @@ U4:       // n >= 0
         MOVQ 16(R8)(SI*8), R13
         MOVQ 24(R8)(SI*8), R14
         SUBQ CX, R11
-       ZERO_CX
         SBBQ $0, R12
         SBBQ $0, R13
         SBBQ $0, R14
-       SETCS CX                // c = CF
+       SBBQ CX, CX             // save CF
+       NEGQ CX
         MOVQ R11, 0(R10)(SI*8)
         MOVQ R12, 8(R10)(SI*8)
         MOVQ R13, 16(R10)(SI*8)
@@ -236,8 +233,8 @@ L4: // n > 0
         MOVQ 0(R8)(SI*8), R11
         SUBQ CX, R11
         MOVQ R11, 0(R10)(SI*8)
-       ZERO_CX
-       RCLQ $1, CX             // c = CF
+       SBBQ CX, CX             // save CF
+       NEGQ CX
  
         ADDQ $1, SI             // i++
         SUBQ $1, DI             // n--
@@ -306,7 +303,7 @@ L9: MOVQ AX, DX             // w = w1
         SHRQ CX, DX:AX          // w>>s | w1<<ŝ
         MOVQ DX, (R10)(BX*8)    // z[i] = w>>s | w1<<ŝ
         ADDQ $1, BX             // i++
-       
+
  E9:    CMPQ BX, R11
         JL L9                   // i < n-1
author	Robert Griesemer <gri@golang.org>
	Thu, 8 Jan 2015 01:16:59 +0000 (17:16 -0800)
committer	Robert Griesemer <gri@golang.org>
	Thu, 8 Jan 2015 16:57:11 +0000 (16:57 +0000)
src/math/big/arith_386.s		patch | blob | history
src/math/big/arith_amd64.s		patch | blob | history