RET
TEXT ·subVW(SB), NOSPLIT, $0
- MOVD subwvectorfacility+0x00(SB), R1
- BR (R1)
-
-TEXT ·subVW_check(SB), NOSPLIT, $0
- MOVB ·hasVX(SB), R1
- CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
- MOVD $subwvectorfacility+0x00(SB), R1
- MOVD $·subVW_novec(SB), R2
- MOVD R2, 0(R1)
-
- // MOVD $·subVW_novec(SB), 0(R1)
- BR ·subVW_novec(SB)
-
-vectorimpl:
- MOVD $subwvectorfacility+0x00(SB), R1
- MOVD $·subVW_vec(SB), R2
- MOVD R2, 0(R1)
-
- // MOVD $·subVW_vec(SB), 0(R1)
- BR ·subVW_vec(SB)
-
-GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
-DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW_vec(SB), NOSPLIT, $0
- MOVD z_len+8(FP), R3
- MOVD x+24(FP), R8
- MOVD y+48(FP), R4 // c = y
- MOVD z+0(FP), R2
-
- MOVD $0, R0 // make sure it's zero
- MOVD $0, R10 // i = 0
- MOVD R8, R5
- MOVD R2, R7
-
- // s/JL/JMP/ below to disable the unrolled loop
- SUB $4, R3 // n -= 4
- BLT v11 // if n < 0 goto v11
- SUB $12, R3
- BLT A11
-
- VZERO V0
- MOVD $1, R6 // prepare V0 to be final carry register
- VLVGG $1, R6, V0 // borrow is initially "no borrow"
- VZERO V9 // to ensure upper half is zero
- VLVGG $1, R4, V9
-
- // n >= 0
- // regular loop body unrolled 16x
-
-UU1:
- VLM 0(R5), V1, V4 // 64-bytes into V1..V4
- ADD $64, R5
- VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
- VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
-
- VSBCBIQ V1, V9, V0, V25
- VSBIQ V1, V9, V0, V17
- VZERO V9
- VSBCBIQ V2, V9, V25, V26
- VSBIQ V2, V9, V25, V18
-
- VLM 0(R5), V5, V6 // 32-bytes into V5..V6
- ADD $32, R5
-
- VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
- VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
-
- VSBCBIQ V3, V9, V26, V27
- VSBIQ V3, V9, V26, V19
- VSBCBIQ V4, V9, V27, V28
- VSBIQ V4, V9, V27, V20
-
- VLM 0(R5), V7, V8 // 32-bytes into V7..V8
- ADD $32, R5
-
- VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
- VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
-
- VSBCBIQ V5, V9, V28, V29
- VSBIQ V5, V9, V28, V21
- VSBCBIQ V6, V9, V29, V30
- VSBIQ V6, V9, V29, V22
-
- VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
- VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
-
- VSBCBIQ V7, V9, V30, V31
- VSBIQ V7, V9, V30, V23
- VSBCBIQ V8, V9, V31, V0 // V0 has carry-over
- VSBIQ V8, V9, V31, V24
-
- VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
- VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
- VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
- VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
- VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
- VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
- VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
- VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
- VSTM V17, V24, 0(R7) // 128-bytes into z
- ADD $128, R7
- ADD $128, R10 // i += 16
- SUB $16, R3 // n -= 16
- BGE UU1 // if n >= 0 goto U1
- VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
- SUB $1, R4 // save cf
- NEG R4, R4
-
-A11:
- ADD $12, R3 // n += 16
-
- BLT v11 // if n < 0 goto v11
-
- // n >= 0
- // regular loop body unrolled 4x
-
-U4: // n >= 0
- // regular loop body unrolled 4x
- MOVD 0(R8)(R10*1), R5
- MOVD 8(R8)(R10*1), R6
- MOVD 16(R8)(R10*1), R7
- MOVD 24(R8)(R10*1), R1
- SUBC R4, R5 // SLGR -> SUBC
- SUBE R0, R6 // SLBGR -> SUBE
- SUBE R0, R7
- SUBE R0, R1
- SUBE R4, R4 // save CF
- NEG R4, R4
- MOVD R5, 0(R2)(R10*1)
- MOVD R6, 8(R2)(R10*1)
- MOVD R7, 16(R2)(R10*1)
- MOVD R1, 24(R2)(R10*1)
+ MOVD z_len+8(FP), R5
+ MOVD x+24(FP), R6
+ MOVD y+48(FP), R7 // y: the full word subtracted from x[0] (not just a single borrow bit)
+ MOVD z+0(FP), R8
+ MOVD $0, R0 // R0 must hold zero: it is the zero operand of SUBE below and the scratch register in returnResult.
- ADD $32, R10 // i += 4 -> i +=32
- SUB $4, R3 // n -= 4
- BGE U4 // if n >= 0 goto U4
+ CMPBEQ R5, $0, returnC // len(z) == 0: nothing to subtract, return y unchanged as c
-v11:
- ADD $4, R3 // n += 4
- BLE E11 // if n <= 0 goto E4
+ // Subtract y from the first word and propagate the borrow into the second word. The borrow flag left behind selects the path for the remaining words: copy path (no borrow) or loop path (borrow).
+ MOVD 0(R6), R9
+ SUBC R7, R9 // x[0] - y
+ MOVD R9, 0(R8) // z[0] = x[0] - y
+ CMPBEQ R5, $1, returnResult // len(z) == 1: return the borrow
+ MOVD 8(R6), R9
+ SUBE R0, R9 // x[1] - borrow (R0 is zero)
+ MOVD R9, 8(R8) // z[1] = x[1] - borrow
+ CMPBEQ R5, $2, returnResult // len(z) == 2: return the borrow
-L4: // n > 0
+ // Set up the counters for the remaining words
+ MOVD $16, R12 // i = 2 (R12 is a byte offset: 2 words * 8 bytes)
+ MOVD $-2(R5), R5 // n = n - 2
- MOVD 0(R8)(R10*1), R5
- SUBC R4, R5
- SUBE R4, R4 // save CF
- NEG R4, R4
- MOVD R5, 0(R2)(R10*1)
+loopOverEachWord:
+ BRC $3, copySetup // no borrow from the previous subtraction, copy the rest
+ MOVD 0(R6)(R12*1), R9 // R9 = x[i]
- ADD $8, R10 // i++
- SUB $1, R3 // n--
- BGT L4 // if n > 0 goto L4
+ // SUBE could be used here to consume the borrow flag generated by the
+ // previous iteration. However, reaching this point means the borrow is
+ // known to be 1 (otherwise we would have branched to copySetup), so we
+ // subtract the constant 1 with SUBC instead. That way this iteration does
+ // not depend on the flag value produced by the previous one, which may help when the branch is predicted.
+ SUBC $1, R9
+ MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
-E11:
- MOVD R4, c+56(FP) // return c
+ MOVD $8(R12), R12 // i++
+ BRCTG R5, loopOverEachWord // n--; loop again while n != 0
+// return the current borrow value
+returnResult:
+ SUBE R0, R0 // R0 = -borrow (0 or -1)
+ NEG R0, R0 // R0 = borrow (0 or 1)
+ MOVD R0, c+56(FP)
RET
-// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
-// func subVW(z, x []Word, y Word) (c Word)
-// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
-TEXT ·subVW_novec(SB), NOSPLIT, $0
- MOVD z_len+8(FP), R3
- MOVD x+24(FP), R8
- MOVD y+48(FP), R4 // c = y
- MOVD z+0(FP), R2
- MOVD $0, R0 // make sure it's 0
- MOVD $0, R10 // i = 0
-
- // s/JL/JMP/ below to disable the unrolled loop
- SUB $4, R3 // n -= 4
- BLT v4 // if n < 4 goto v4
+// No borrow remains, so the rest of z is a plain copy of the rest of x.
+// Advance x (R6) and z (R8) by the current byte offset (R12), then copy the remaining R5 words.
+// Assuming x and z either do not overlap at all or refer to exactly the same memory,
+// the copy can be done with MVC alone. There are three copy paths, moving one word,
+// 4 words, or 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
+copySetup:
+ ADD R12, R6 // R6 = &x[i]
+ ADD R12, R8 // R8 = &z[i]
-U4: // n >= 0
- // regular loop body unrolled 4x
- MOVD 0(R8)(R10*1), R5
- MOVD 8(R8)(R10*1), R6
- MOVD 16(R8)(R10*1), R7
- MOVD 24(R8)(R10*1), R1
- SUBC R4, R5 // SLGR -> SUBC
- SUBE R0, R6 // SLBGR -> SUBE
- SUBE R0, R7
- SUBE R0, R1
- SUBE R4, R4 // save CF
- NEG R4, R4
- MOVD R5, 0(R2)(R10*1)
- MOVD R6, 8(R2)(R10*1)
- MOVD R7, 16(R2)(R10*1)
- MOVD R1, 24(R2)(R10*1)
+ CMPBGE R5, $4, mediumLoop // at least 4 words left: use the wider copies
- ADD $32, R10 // i += 4 -> i +=32
- SUB $4, R3 // n -= 4
- BGE U4 // if n >= 0 goto U4
+smallLoop: // unrolled word-by-word copy of the last n < 4 words
+ CMPBEQ R5, $0, returnZero
+ MVC $8, 0(R6), 0(R8)
+ CMPBEQ R5, $1, returnZero
+ MVC $8, 8(R6), 8(R8)
+ CMPBEQ R5, $2, returnZero
+ MVC $8, 16(R6), 16(R8)
-v4:
- ADD $4, R3 // n += 4
- BLE E4 // if n <= 0 goto E4
+returnZero:
+ MOVD $0, c+56(FP) // return 0 as borrow
+ RET
-L4: // n > 0
- MOVD 0(R8)(R10*1), R5
- SUBC R4, R5
- SUBE R4, R4 // save CF
- NEG R4, R4
- MOVD R5, 0(R2)(R10*1)
+mediumLoop:
+ CMPBLT R5, $4, smallLoop // fewer than 4 words left
+ CMPBLT R5, $32, mediumLoopBody // fewer than 32 words left: copy 4 words at a time
- ADD $8, R10 // i++
- SUB $1, R3 // n--
- BGT L4 // if n > 0 goto L4
+largeLoop: // copies 256 bytes (32 words) per iteration
+ MVC $256, 0(R6), 0(R8)
+ MOVD $256(R6), R6
+ MOVD $256(R8), R8
+ MOVD $-32(R5), R5 // n -= 32
+ CMPBGE R5, $32, largeLoop
+ BR mediumLoop
-E4:
- MOVD R4, c+56(FP) // return c
+mediumLoopBody: // copies 32 bytes (4 words) per iteration
+ MVC $32, 0(R6), 0(R8)
+ MOVD $32(R6), R6
+ MOVD $32(R8), R8
+ MOVD $-4(R5), R5 // n -= 4
+ CMPBGE R5, $4, mediumLoopBody
+ BR smallLoop
+returnC:
+ MOVD R7, c+56(FP) // len(z) == 0: c = y
RET
// func shlVU(z, x []Word, s uint) (c Word)