math/big: rewrite subVW to use fast path on s390x

author Ruixin(Peter) Bao <ruixin.bao@ibm.com>

Tue, 24 Mar 2020 15:51:22 +0000 (11:51 -0400)

committer Michael Munday <mike.munday@ibm.com>

Fri, 24 Apr 2020 14:50:59 +0000 (14:50 +0000)
author Ruixin(Peter) Bao <ruixin.bao@ibm.com>
Tue, 24 Mar 2020 15:51:22 +0000 (11:51 -0400)
committer Michael Munday <mike.munday@ibm.com>
Fri, 24 Apr 2020 14:50:59 +0000 (14:50 +0000)
diff --git a/src/math/big/arith_decl_s390x.go b/src/math/big/arith_decl_s390x.go

index 38fefcc7b0e555a5226d02425e21834eecd4292e..5973d3cfc1c5696f9e979ca26748e29b06ac4b5d 100644 (file)
--- a/src/math/big/arith_decl_s390x.go
+++ b/src/math/big/arith_decl_s390x.go
@@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word)
  func subVV_check(z, x, y []Word) (c Word)
  func subVV_vec(z, x, y []Word) (c Word)
  func subVV_novec(z, x, y []Word) (c Word)
-func subVW_check(z, x []Word, y Word) (c Word)
-func subVW_vec(z, x []Word, y Word) (c Word)
-func subVW_novec(z, x []Word, y Word) (c Word)
  func hasVectorFacility() bool
  
  var hasVX = hasVectorFacility()
diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s

index f0464df201ecf9d54146d855dd2760576c0db4d0..ef0192224fe80e7d7b873570f344666da27e42b3 100644 (file)
--- a/src/math/big/arith_s390x.s
+++ b/src/math/big/arith_s390x.s
@@ -631,220 +631,95 @@ returnC:
         RET
  
  TEXT ·subVW(SB), NOSPLIT, $0
-       MOVD subwvectorfacility+0x00(SB), R1
-       BR   (R1)
-
-TEXT ·subVW_check(SB), NOSPLIT, $0
-       MOVB   ·hasVX(SB), R1
-       CMPBEQ R1, $1, vectorimpl               // vectorfacility = 1, vector supported
-       MOVD   $subwvectorfacility+0x00(SB), R1
-       MOVD   $·subVW_novec(SB), R2
-       MOVD   R2, 0(R1)
-
-       // MOVD $·subVW_novec(SB), 0(R1)
-       BR ·subVW_novec(SB)
-
-vectorimpl:
-       MOVD $subwvectorfacility+0x00(SB), R1
-       MOVD $·subVW_vec(SB), R2
-       MOVD R2, 0(R1)
-
-       // MOVD $·subVW_vec(SB), 0(R1)
-       BR ·subVW_vec(SB)
-
-GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
-DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW_vec(SB), NOSPLIT, $0
-       MOVD z_len+8(FP), R3
-       MOVD x+24(FP), R8
-       MOVD y+48(FP), R4    // c = y
-       MOVD z+0(FP), R2
-
-       MOVD $0, R0  // make sure it's zero
-       MOVD $0, R10 // i = 0
-       MOVD R8, R5
-       MOVD R2, R7
-
-       // s/JL/JMP/ below to disable the unrolled loop
-       SUB $4, R3  // n -= 4
-       BLT v11     // if n < 0 goto v11
-       SUB $12, R3
-       BLT A11
-
-       VZERO V0
-       MOVD  $1, R6     // prepare V0 to be final carry register
-       VLVGG $1, R6, V0 // borrow is initially "no borrow"
-       VZERO V9         // to ensure upper half is zero
-       VLVGG $1, R4, V9
-
-       // n >= 0
-       // regular loop body unrolled 16x
-
-UU1:
-       VLM  0(R5), V1, V4    // 64-bytes into V1..V4
-       ADD  $64, R5
-       VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
-       VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
-
-       VSBCBIQ V1, V9, V0, V25
-       VSBIQ   V1, V9, V0, V17
-       VZERO   V9
-       VSBCBIQ V2, V9, V25, V26
-       VSBIQ   V2, V9, V25, V18
-
-       VLM 0(R5), V5, V6 // 32-bytes into V5..V6
-       ADD $32, R5
-
-       VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
-       VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
-
-       VSBCBIQ V3, V9, V26, V27
-       VSBIQ   V3, V9, V26, V19
-       VSBCBIQ V4, V9, V27, V28
-       VSBIQ   V4, V9, V27, V20
-
-       VLM 0(R5), V7, V8 // 32-bytes into V7..V8
-       ADD $32, R5
-
-       VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
-       VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
-
-       VSBCBIQ V5, V9, V28, V29
-       VSBIQ   V5, V9, V28, V21
-       VSBCBIQ V6, V9, V29, V30
-       VSBIQ   V6, V9, V29, V22
-
-       VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
-       VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
-
-       VSBCBIQ V7, V9, V30, V31
-       VSBIQ   V7, V9, V30, V23
-       VSBCBIQ V8, V9, V31, V0  // V0 has carry-over
-       VSBIQ   V8, V9, V31, V24
-
-       VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
-       VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
-       VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
-       VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
-       VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
-       VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
-       VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
-       VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
-       VSTM  V17, V24, 0(R7)     // 128-bytes into z
-       ADD   $128, R7
-       ADD   $128, R10           // i += 16
-       SUB   $16, R3             // n -= 16
-       BGE   UU1                 // if n >= 0 goto U1
-       VLGVG $1, V0, R4          // put cf into R4 in case we branch to v10
-       SUB   $1, R4              // save cf
-       NEG   R4, R4
-
-A11:
-       ADD $12, R3 // n += 16
-
-       BLT v11 // if n < 0 goto v11
-
-       // n >= 0
-       // regular loop body unrolled 4x
-
-U4:  // n >= 0
-       // regular loop body unrolled 4x
-       MOVD 0(R8)(R10*1), R5
-       MOVD 8(R8)(R10*1), R6
-       MOVD 16(R8)(R10*1), R7
-       MOVD 24(R8)(R10*1), R1
-       SUBC R4, R5            // SLGR  -> SUBC
-       SUBE R0, R6            // SLBGR -> SUBE
-       SUBE R0, R7
-       SUBE R0, R1
-       SUBE R4, R4            // save CF
-       NEG  R4, R4
-       MOVD R5, 0(R2)(R10*1)
-       MOVD R6, 8(R2)(R10*1)
-       MOVD R7, 16(R2)(R10*1)
-       MOVD R1, 24(R2)(R10*1)
+       MOVD z_len+8(FP), R5
+       MOVD x+24(FP), R6
+       MOVD y+48(FP), R7    // The borrow bit passed in
+       MOVD z+0(FP), R8
+       MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.
  
-       ADD $32, R10 // i += 4 -> i +=32
-       SUB $4, R3   // n -= 4
-       BGE U4       // if n >= 0 goto U4
+       CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
  
-v11:
-       ADD $4, R3 // n += 4
-       BLE E11    // if n <= 0 goto E4
+       // Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
+       MOVD   0(R6), R9
+       SUBC   R7, R9
+       MOVD   R9, 0(R8)
+       CMPBEQ R5, $1, returnResult
+       MOVD   8(R6), R9
+       SUBE   R0, R9
+       MOVD   R9, 8(R8)
+       CMPBEQ R5, $2, returnResult
  
-L4:  // n > 0
+       // Update the counters
+       MOVD $16, R12    // i = 2
+       MOVD $-2(R5), R5 // n = n - 2
  
-       MOVD 0(R8)(R10*1), R5
-       SUBC R4, R5
-       SUBE R4, R4           // save CF
-       NEG  R4, R4
-       MOVD R5, 0(R2)(R10*1)
+loopOverEachWord:
+       BRC  $3, copySetup    // no borrow, copy the rest
+       MOVD 0(R6)(R12*1), R9
  
-       ADD $8, R10 // i++
-       SUB $1, R3  // n--
-       BGT L4      // if n > 0 goto L4
+       // Originally we used the borrow flag generated in the previous iteration
+       // (i.e: SUBE could be used here to do the subtraction). However, since we
+       // already know borrow is 1 (otherwise we will go to copy section), we can
+       // use SUBC here so the current iteration does not depend on the borrow flag
+       // generated in the previous iteration. This could be useful when branch prediction happens.
+       SUBC $1, R9
+       MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
  
-E11:
-       MOVD R4, c+56(FP) // return c
+       MOVD  $8(R12), R12         // i++
+       BRCTG R5, loopOverEachWord // n--
  
+// return the current borrow value
+returnResult:
+       SUBE R0, R0
+       NEG  R0, R0
+       MOVD R0, c+56(FP)
         RET
  
-// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
-// func subVW(z, x []Word, y Word) (c Word)
-// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
-TEXT ·subVW_novec(SB), NOSPLIT, $0
-       MOVD z_len+8(FP), R3
-       MOVD x+24(FP), R8
-       MOVD y+48(FP), R4    // c = y
-       MOVD z+0(FP), R2
-       MOVD $0, R0          // make sure it's 0
-       MOVD $0, R10         // i = 0
-
-       // s/JL/JMP/ below to disable the unrolled loop
-       SUB $4, R3 // n -= 4
-       BLT v4     // if n < 4 goto v4
+// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
+// With the assumption that x and z will not overlap with each other or x and z will
+// point to same memory region, we can use a faster version of copy using only MVC here.
+// In the following implementation, we have three copy loops, each copying a word, 4 words, and
+// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
+copySetup:
+       ADD R12, R6
+       ADD R12, R8
  
-U4:  // n >= 0
-       // regular loop body unrolled 4x
-       MOVD 0(R8)(R10*1), R5
-       MOVD 8(R8)(R10*1), R6
-       MOVD 16(R8)(R10*1), R7
-       MOVD 24(R8)(R10*1), R1
-       SUBC R4, R5            // SLGR  -> SUBC
-       SUBE R0, R6            // SLBGR -> SUBE
-       SUBE R0, R7
-       SUBE R0, R1
-       SUBE R4, R4            // save CF
-       NEG  R4, R4
-       MOVD R5, 0(R2)(R10*1)
-       MOVD R6, 8(R2)(R10*1)
-       MOVD R7, 16(R2)(R10*1)
-       MOVD R1, 24(R2)(R10*1)
+       CMPBGE R5, $4, mediumLoop
  
-       ADD $32, R10 // i += 4 -> i +=32
-       SUB $4, R3   // n -= 4
-       BGE U4       // if n >= 0 goto U4
+smallLoop:  // does a loop unrolling to copy word when n < 4
+       CMPBEQ R5, $0, returnZero
+       MVC    $8, 0(R6), 0(R8)
+       CMPBEQ R5, $1, returnZero
+       MVC    $8, 8(R6), 8(R8)
+       CMPBEQ R5, $2, returnZero
+       MVC    $8, 16(R6), 16(R8)
  
-v4:
-       ADD $4, R3 // n += 4
-       BLE E4     // if n <= 0 goto E4
+returnZero:
+       MOVD $0, c+56(FP) // return 0 as borrow
+       RET
  
-L4:  // n > 0
-       MOVD 0(R8)(R10*1), R5
-       SUBC R4, R5
-       SUBE R4, R4           // save CF
-       NEG  R4, R4
-       MOVD R5, 0(R2)(R10*1)
+mediumLoop:
+       CMPBLT R5, $4, smallLoop
+       CMPBLT R5, $32, mediumLoopBody
  
-       ADD $8, R10 // i++
-       SUB $1, R3  // n--
-       BGT L4      // if n > 0 goto L4
+largeLoop:  // Copying 256 bytes at a time
+       MVC    $256, 0(R6), 0(R8)
+       MOVD   $256(R6), R6
+       MOVD   $256(R8), R8
+       MOVD   $-32(R5), R5
+       CMPBGE R5, $32, largeLoop
+       BR     mediumLoop
  
-E4:
-       MOVD R4, c+56(FP) // return c
+mediumLoopBody:  // Copying 32 bytes at a time
+       MVC    $32, 0(R6), 0(R8)
+       MOVD   $32(R6), R6
+       MOVD   $32(R8), R8
+       MOVD   $-4(R5), R5
+       CMPBGE R5, $4, mediumLoopBody
+       BR     smallLoop
  
+returnC:
+       MOVD R7, c+56(FP)
         RET
  
  // func shlVU(z, x []Word, s uint) (c Word)
diff --git a/src/math/big/arith_s390x_test.go b/src/math/big/arith_s390x_test.go

index e7f1b57ea580ecabac24b34cdf6179f8ff8aa9dc..ce6bca888598884de39df44db27cfb8bafc04da5 100644 (file)
--- a/src/math/big/arith_s390x_test.go
+++ b/src/math/big/arith_s390x_test.go
@@ -30,12 +30,3 @@ func TestFunVVnovec(t *testing.T) {
                 }
         }
  }
-
-func TestFunVWnovec(t *testing.T) {
-       if hasVX == true {
-               for _, a := range sumVW {
-                       arg := argVW{a.x, a.z, a.y, a.c}
-                       testFunVW(t, "subVW_novec", subVW_novec, arg)
-               }
-       }
-}
author	Ruixin(Peter) Bao <ruixin.bao@ibm.com>
	Tue, 24 Mar 2020 15:51:22 +0000 (11:51 -0400)
committer	Michael Munday <mike.munday@ibm.com>
	Fri, 24 Apr 2020 14:50:59 +0000 (14:50 +0000)
src/math/big/arith_decl_s390x.go		patch \| blob \| history
src/math/big/arith_s390x.s		patch \| blob \| history
src/math/big/arith_s390x_test.go		patch \| blob \| history