},
"math/big": {
"bigEndianWord",
- // The following functions require the math_big_pure_go build tag.
- "addVW",
- "subVW",
},
"math/rand": {
"(*rngSource).Int63",
package big
-import "math/bits"
+import (
+ "math/bits"
+ _ "unsafe" // for go:linkname
+)
// A Word represents a single digit of a multi-precision unsigned integer.
type Word uint
return
}
-// The resulting carry c is either 0 or 1.
-func addVW_g(z, x []Word, y Word) (c Word) {
- c = y
- // The comment near the top of this file discusses this for loop condition.
- for i := 0; i < len(z) && i < len(x); i++ {
- zi, cc := bits.Add(uint(x[i]), uint(c), 0)
- z[i] = Word(zi)
- c = Word(cc)
+// addVW sets z = x + y, where x is a multi-word value and y is a single Word, returning the final carry c.
+// The behavior is undefined if len(x) != len(z). When z and x start at the same word, unchanged words are left in place rather than copied.
+// If len(z) == 0, c = y; otherwise, c is 0 or 1.
+//
+// addVW should be an internal detail,
+// but widely used packages access it using linkname.
+// Notable members of the hall of shame include:
+// - github.com/remyoudompheng/bigfft
+//
+// Do not remove or change the type signature.
+// See go.dev/issue/67401.
+//
+//go:linkname addVW
+func addVW(z, x []Word, y Word) (c Word) {
+ x = x[:len(z)] // reslice x to z's length; panics if len(z) > cap(x)
+ if len(z) == 0 {
+ return y // no words to add into: y itself is handed back as the carry
}
- return
+ zi, cc := bits.Add(uint(x[0]), uint(y), 0) // low word: x[0] + y, cc is the carry out (0 or 1)
+ z[0] = Word(zi)
+ if cc == 0 {
+ if &z[0] != &x[0] { // skip the copy when z and x start at the same word
+ copy(z[1:], x[1:]) // no carry: the remaining words pass through unchanged
+ }
+ return 0
+ }
+ for i := 1; i < len(z); i++ { // a carry of 1 is pending on entry to every iteration
+ xi := x[i]
+ if xi != ^Word(0) { // xi + 1 cannot overflow, so the carry is absorbed here
+ z[i] = xi + 1
+ if &z[0] != &x[0] { // same aliasing shortcut as above
+ copy(z[i+1:], x[i+1:]) // words above i are unaffected by the addition
+ }
+ return 0
+ }
+ z[i] = 0 // all-ones word plus 1 wraps to 0; the carry keeps going
+ }
+ return 1 // the carry propagated past the top word
}
-// addVWlarge is addVW, but intended for large z.
-// The only difference is that we check on every iteration
-// whether we are done with carries,
-// and if so, switch to a much faster copy instead.
-// This is only a good idea for large z,
-// because the overhead of the check and the function call
-// outweigh the benefits when z is small.
-func addVWlarge(z, x []Word, y Word) (c Word) {
+// addVW_ref is the reference implementation for addVW, used only for testing.
+func addVW_ref(z, x []Word, y Word) (c Word) {
c = y
- // The comment near the top of this file discusses this for loop condition.
- for i := 0; i < len(z) && i < len(x); i++ {
- if c == 0 {
- copy(z[i:], x[i:])
- return
- }
+ for i := range z {
zi, cc := bits.Add(uint(x[i]), uint(c), 0)
z[i] = Word(zi)
c = Word(cc)
return
}
-func subVW_g(z, x []Word, y Word) (c Word) {
- c = y
- // The comment near the top of this file discusses this for loop condition.
- for i := 0; i < len(z) && i < len(x); i++ {
- zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
- z[i] = Word(zi)
- c = Word(cc)
+// subVW sets z = x - y, where x is a multi-word value and y is a single Word, returning the final carry (borrow) c.
+// The behavior is undefined if len(x) != len(z). When z and x start at the same word, unchanged words are left in place rather than copied.
+// If len(z) == 0, c = y; otherwise, c is 0 or 1.
+//
+// subVW should be an internal detail,
+// but widely used packages access it using linkname.
+// Notable members of the hall of shame include:
+// - github.com/remyoudompheng/bigfft
+//
+// Do not remove or change the type signature.
+// See go.dev/issue/67401.
+//
+//go:linkname subVW
+func subVW(z, x []Word, y Word) (c Word) {
+ x = x[:len(z)] // reslice x to z's length; panics if len(z) > cap(x)
+ if len(z) == 0 {
+ return y // no words to subtract from: y itself is handed back as the borrow
}
- return
+ zi, cc := bits.Sub(uint(x[0]), uint(y), 0) // low word: x[0] - y, cc is the borrow out (0 or 1)
+ z[0] = Word(zi)
+ if cc == 0 {
+ if &z[0] != &x[0] { // skip the copy when z and x start at the same word
+ copy(z[1:], x[1:]) // no borrow: the remaining words pass through unchanged
+ }
+ return 0
+ }
+ for i := 1; i < len(z); i++ { // a borrow of 1 is pending on entry to every iteration
+ xi := x[i]
+ if xi != 0 { // xi - 1 cannot underflow, so the borrow is absorbed here
+ z[i] = xi - 1
+ if &z[0] != &x[0] { // same aliasing shortcut as above
+ copy(z[i+1:], x[i+1:]) // words above i are unaffected by the subtraction
+ }
+ return 0
+ }
+ z[i] = ^Word(0) // zero word minus 1 wraps to all ones; the borrow keeps going
+ }
+ return 1 // the borrow propagated past the top word
}
-// subVWlarge is to subVW as addVWlarge is to addVW.
-func subVWlarge(z, x []Word, y Word) (c Word) {
+// subVW_ref is the reference implementation for subVW, used only for testing: a plain word-by-word borrow chain with no shortcuts.
+func subVW_ref(z, x []Word, y Word) (c Word) {
c = y
- // The comment near the top of this file discusses this for loop condition.
- for i := 0; i < len(z) && i < len(x); i++ {
- if c == 0 {
- copy(z[i:], x[i:])
- return
- }
+ for i := range z { // indexes x by z's positions, so x must be at least as long as z
zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
z[i] = Word(zi)
c = Word(cc)
}
- return
+ return c // y itself when z is empty, otherwise the borrow out of the last word
}
func lshVU_g(z, x []Word, s uint) (c Word) {
RET
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
- MOVL z+0(FP), DI
- MOVL x+12(FP), SI
- MOVL y+24(FP), AX // c = y
- MOVL z_len+4(FP), BP
- MOVL $0, BX // i = 0
- JMP E3
-
-L3: ADDL (SI)(BX*4), AX
- MOVL AX, (DI)(BX*4)
- SBBL AX, AX // save CF
- NEGL AX
- ADDL $1, BX // i++
-
-E3: CMPL BX, BP // i < n
- JL L3
-
- MOVL AX, c+28(FP)
- RET
-
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB),NOSPLIT,$0
- MOVL z+0(FP), DI
- MOVL x+12(FP), SI
- MOVL y+24(FP), AX // c = y
- MOVL z_len+4(FP), BP
- MOVL $0, BX // i = 0
- JMP E4
-
-L4: MOVL (SI)(BX*4), DX
- SUBL AX, DX
- MOVL DX, (DI)(BX*4)
- SBBL AX, AX // save CF
- NEGL AX
- ADDL $1, BX // i++
-
-E4: CMPL BX, BP // i < n
- JL L4
-
- MOVL AX, c+28(FP)
- RET
-
-
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVL z_len+4(FP), BX // i = z
MOVQ CX, c+72(FP) // return c
RET
-
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
- MOVQ z_len+8(FP), DI
- CMPQ DI, $32
- JG large
- MOVQ x+24(FP), R8
- MOVQ y+48(FP), CX // c = y
- MOVQ z+0(FP), R10
-
- MOVQ $0, SI // i = 0
-
- // s/JL/JMP/ below to disable the unrolled loop
- SUBQ $4, DI // n -= 4
- JL V3 // if n < 4 goto V3
-
-U3: // n >= 0
- // regular loop body unrolled 4x
- MOVQ 0(R8)(SI*8), R11
- MOVQ 8(R8)(SI*8), R12
- MOVQ 16(R8)(SI*8), R13
- MOVQ 24(R8)(SI*8), R14
- ADDQ CX, R11
- ADCQ $0, R12
- ADCQ $0, R13
- ADCQ $0, R14
- SBBQ CX, CX // save CF
- NEGQ CX
- MOVQ R11, 0(R10)(SI*8)
- MOVQ R12, 8(R10)(SI*8)
- MOVQ R13, 16(R10)(SI*8)
- MOVQ R14, 24(R10)(SI*8)
-
- ADDQ $4, SI // i += 4
- SUBQ $4, DI // n -= 4
- JGE U3 // if n >= 0 goto U3
-
-V3: ADDQ $4, DI // n += 4
- JLE E3 // if n <= 0 goto E3
-
-L3: // n > 0
- ADDQ 0(R8)(SI*8), CX
- MOVQ CX, 0(R10)(SI*8)
- SBBQ CX, CX // save CF
- NEGQ CX
-
- ADDQ $1, SI // i++
- SUBQ $1, DI // n--
- JG L3 // if n > 0 goto L3
-
-E3: MOVQ CX, c+56(FP) // return c
- RET
-large:
- JMP ·addVWlarge(SB)
-
-
-// func subVW(z, x []Word, y Word) (c Word)
-// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
-TEXT ·subVW(SB),NOSPLIT,$0
- MOVQ z_len+8(FP), DI
- CMPQ DI, $32
- JG large
- MOVQ x+24(FP), R8
- MOVQ y+48(FP), CX // c = y
- MOVQ z+0(FP), R10
-
- MOVQ $0, SI // i = 0
-
- // s/JL/JMP/ below to disable the unrolled loop
- SUBQ $4, DI // n -= 4
- JL V4 // if n < 4 goto V4
-
-U4: // n >= 0
- // regular loop body unrolled 4x
- MOVQ 0(R8)(SI*8), R11
- MOVQ 8(R8)(SI*8), R12
- MOVQ 16(R8)(SI*8), R13
- MOVQ 24(R8)(SI*8), R14
- SUBQ CX, R11
- SBBQ $0, R12
- SBBQ $0, R13
- SBBQ $0, R14
- SBBQ CX, CX // save CF
- NEGQ CX
- MOVQ R11, 0(R10)(SI*8)
- MOVQ R12, 8(R10)(SI*8)
- MOVQ R13, 16(R10)(SI*8)
- MOVQ R14, 24(R10)(SI*8)
-
- ADDQ $4, SI // i += 4
- SUBQ $4, DI // n -= 4
- JGE U4 // if n >= 0 goto U4
-
-V4: ADDQ $4, DI // n += 4
- JLE E4 // if n <= 0 goto E4
-
-L4: // n > 0
- MOVQ 0(R8)(SI*8), R11
- SUBQ CX, R11
- MOVQ R11, 0(R10)(SI*8)
- SBBQ CX, CX // save CF
- NEGQ CX
-
- ADDQ $1, SI // i++
- SUBQ $1, DI // n--
- JG L4 // if n > 0 goto L4
-
-E4: MOVQ CX, c+56(FP) // return c
- RET
-large:
- JMP ·subVWlarge(SB)
-
-
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVQ z_len+8(FP), BX // i = z
RET
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
- MOVW z+0(FP), R1
- MOVW z_len+4(FP), R4
- MOVW x+12(FP), R2
- MOVW y+24(FP), R3
- ADD R4<<2, R1, R4
- TEQ R1, R4
- BNE L3a
- MOVW R3, c+28(FP)
- RET
-L3a:
- MOVW.P 4(R2), R5
- ADD.S R3, R5
- MOVW.P R5, 4(R1)
- B E3
-L3:
- MOVW.P 4(R2), R5
- ADC.S $0, R5
- MOVW.P R5, 4(R1)
-E3:
- TEQ R1, R4
- BNE L3
-
- MOVW $0, R0
- MOVW.CS $1, R0
- MOVW R0, c+28(FP)
- RET
-
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB),NOSPLIT,$0
- MOVW z+0(FP), R1
- MOVW z_len+4(FP), R4
- MOVW x+12(FP), R2
- MOVW y+24(FP), R3
- ADD R4<<2, R1, R4
- TEQ R1, R4
- BNE L4a
- MOVW R3, c+28(FP)
- RET
-L4a:
- MOVW.P 4(R2), R5
- SUB.S R3, R5
- MOVW.P R5, 4(R1)
- B E4
-L4:
- MOVW.P 4(R2), R5
- SBC.S $0, R5
- MOVW.P R5, 4(R1)
-E4:
- TEQ R1, R4
- BNE L4
-
- MOVW $0, R0
- MOVW.CC $1, R0
- MOVW R0, c+28(FP)
- RET
-
-
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB),NOSPLIT,$0
MOVW z_len+4(FP), R5
MOVD R0, c+72(FP)
RET
-#define vwOneOp(instr, op1) \
- MOVD.P 8(R1), R4; \
- instr op1, R4; \
- MOVD.P R4, 8(R3);
-
-// handle the first 1~4 elements before starting iteration in addVW/subVW
-#define vwPreIter(instr1, instr2, counter, target) \
- vwOneOp(instr1, R2); \
- SUB $1, counter; \
- CBZ counter, target; \
- vwOneOp(instr2, $0); \
- SUB $1, counter; \
- CBZ counter, target; \
- vwOneOp(instr2, $0); \
- SUB $1, counter; \
- CBZ counter, target; \
- vwOneOp(instr2, $0);
-
-// do one iteration of add or sub in addVW/subVW
-#define vwOneIter(instr, counter, exit) \
- CBZ counter, exit; \ // careful not to touch the carry flag
- LDP.P 32(R1), (R4, R5); \
- LDP -16(R1), (R6, R7); \
- instr $0, R4, R8; \
- instr $0, R5, R9; \
- instr $0, R6, R10; \
- instr $0, R7, R11; \
- STP.P (R8, R9), 32(R3); \
- STP (R10, R11), -16(R3); \
- SUB $4, counter;
-
-// do one iteration of copy in addVW/subVW
-#define vwOneIterCopy(counter, exit) \
- CBZ counter, exit; \
- LDP.P 32(R1), (R4, R5); \
- LDP -16(R1), (R6, R7); \
- STP.P (R4, R5), 32(R3); \
- STP (R6, R7), -16(R3); \
- SUB $4, counter;
-
-// func addVW(z, x []Word, y Word) (c Word)
-// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
-// and switches to copy if we are done with carries. The copying is skipped as well
-// if 'x' and 'z' happen to share the same underlying storage.
-// The overhead of the checking and branching is visible when 'z' are small (~5%),
-// so set a threshold of 32, and remain the small-sized part entirely untouched.
-TEXT ·addVW(SB),NOSPLIT,$0
- MOVD z+0(FP), R3
- MOVD z_len+8(FP), R0
- MOVD x+24(FP), R1
- MOVD y+48(FP), R2
- CMP $32, R0
- BGE large // large-sized 'z' and 'x'
- CBZ R0, len0 // the length of z is 0
- MOVD.P 8(R1), R4
- ADDS R2, R4 // z[0] = x[0] + y, set carry
- MOVD.P R4, 8(R3)
- SUB $1, R0
- CBZ R0, len1 // the length of z is 1
- TBZ $0, R0, two
- MOVD.P 8(R1), R4 // do it once
- ADCS $0, R4
- MOVD.P R4, 8(R3)
- SUB $1, R0
-two: // do it twice
- TBZ $1, R0, loop
- LDP.P 16(R1), (R4, R5)
- ADCS $0, R4, R8 // c, z[i] = x[i] + c
- ADCS $0, R5, R9
- STP.P (R8, R9), 16(R3)
- SUB $2, R0
-loop: // do four times per round
- vwOneIter(ADCS, R0, len1)
- B loop
-len1:
- CSET HS, R2 // extract carry flag
-len0:
- MOVD R2, c+56(FP)
-done:
- RET
-large:
- AND $0x3, R0, R10
- AND $~0x3, R0
- // unrolling for the first 1~4 elements to avoid saving the carry
- // flag in each step, adjust $R0 if we unrolled 4 elements
- vwPreIter(ADDS, ADCS, R10, add4)
- SUB $4, R0
-add4:
- BCC copy
- vwOneIter(ADCS, R0, len1)
- B add4
-copy:
- MOVD ZR, c+56(FP)
- CMP R1, R3
- BEQ done
-copy_4: // no carry flag, copy the rest
- vwOneIterCopy(R0, done)
- B copy_4
-
-// func subVW(z, x []Word, y Word) (c Word)
-// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
-// and switches to copy if we are done with carries. The copying is skipped as well
-// if 'x' and 'z' happen to share the same underlying storage.
-// The overhead of the checking and branching is visible when 'z' are small (~5%),
-// so set a threshold of 32, and remain the small-sized part entirely untouched.
-TEXT ·subVW(SB),NOSPLIT,$0
- MOVD z+0(FP), R3
- MOVD z_len+8(FP), R0
- MOVD x+24(FP), R1
- MOVD y+48(FP), R2
- CMP $32, R0
- BGE large // large-sized 'z' and 'x'
- CBZ R0, len0 // the length of z is 0
- MOVD.P 8(R1), R4
- SUBS R2, R4 // z[0] = x[0] - y, set carry
- MOVD.P R4, 8(R3)
- SUB $1, R0
- CBZ R0, len1 // the length of z is 1
- TBZ $0, R0, two // do it once
- MOVD.P 8(R1), R4
- SBCS $0, R4
- MOVD.P R4, 8(R3)
- SUB $1, R0
-two: // do it twice
- TBZ $1, R0, loop
- LDP.P 16(R1), (R4, R5)
- SBCS $0, R4, R8 // c, z[i] = x[i] + c
- SBCS $0, R5, R9
- STP.P (R8, R9), 16(R3)
- SUB $2, R0
-loop: // do four times per round
- vwOneIter(SBCS, R0, len1)
- B loop
-len1:
- CSET LO, R2 // extract carry flag
-len0:
- MOVD R2, c+56(FP)
-done:
- RET
-large:
- AND $0x3, R0, R10
- AND $~0x3, R0
- // unrolling for the first 1~4 elements to avoid saving the carry
- // flag in each step, adjust $R0 if we unrolled 4 elements
- vwPreIter(SUBS, SBCS, R10, sub4)
- SUB $4, R0
-sub4:
- BCS copy
- vwOneIter(SBCS, R0, len1)
- B sub4
-copy:
- MOVD ZR, c+56(FP)
- CMP R1, R3
- BEQ done
-copy_4: // no carry flag, copy the rest
- vwOneIterCopy(R0, done)
- B copy_4
-
// func lshVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,
// which may be an error for the case where the low word of x overlaps with the high
//go:noescape
func subVV(z, x, y []Word) (c Word)
-// addVW should be an internal detail,
-// but widely used packages access it using linkname.
-// Notable members of the hall of shame include:
-// - github.com/remyoudompheng/bigfft
-//
-// Do not remove or change the type signature.
-// See go.dev/issue/67401.
-//
-//go:linkname addVW
-//go:noescape
-func addVW(z, x []Word, y Word) (c Word)
-
-// subVW should be an internal detail,
-// but widely used packages access it using linkname.
-// Notable members of the hall of shame include:
-// - github.com/remyoudompheng/bigfft
-//
-// Do not remove or change the type signature.
-// See go.dev/issue/67401.
-//
-//go:linkname subVW
-//go:noescape
-func subVW(z, x []Word, y Word) (c Word)
-
// shlVU should be an internal detail (and a stale one at that),
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
return subVV_g(z, x, y)
}
-func addVW(z, x []Word, y Word) (c Word) {
- // TODO: remove indirect function call when golang.org/issue/30548 is fixed
- fn := addVW_g
- if len(z) > 32 {
- fn = addVWlarge
- }
- return fn(z, x, y)
-}
-
-func subVW(z, x []Word, y Word) (c Word) {
- // TODO: remove indirect function call when golang.org/issue/30548 is fixed
- fn := subVW_g
- if len(z) > 32 {
- fn = subVWlarge
- }
- return fn(z, x, y)
-}
-
func lshVU(z, x []Word, s uint) (c Word) {
return lshVU_g(z, x, s)
}
MOVV R8, c+72(FP)
RET
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB),NOSPLIT,$0
- // input:
- // R4: z
- // R5: z_len
- // R7: x
- // R10: y
- MOVV z+0(FP), R4
- MOVV z_len+8(FP), R5
- MOVV x+24(FP), R7
- MOVV y+48(FP), R10
- MOVV $0, R6
- SLLV $3, R5
-loop:
- BEQ R5, R6, done
- MOVV (R6)(R7), R8
- ADDV R8, R10, R9 // x1 + c = z1, if z1 < x1 then z1 overflow
- SGTU R8, R9, R10
- MOVV R9, (R6)(R4)
- ADDV $8, R6
- JMP loop
-done:
- MOVV R10, c+56(FP)
- RET
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB),NOSPLIT,$0
- // input:
- // R4: z
- // R5: z_len
- // R7: x
- // R10: y
- MOVV z+0(FP), R4
- MOVV z_len+8(FP), R5
- MOVV x+24(FP), R7
- MOVV y+48(FP), R10
- MOVV $0, R6
- SLLV $3, R5
-loop:
- BEQ R5, R6, done
- MOVV (R6)(R7), R8
- SUBV R10, R8, R11 // x1 - c = z1, if z1 > x1 then overflow
- SGTU R11, R8, R10
- MOVV R11, (R6)(R4)
- ADDV $8, R6
- JMP loop
-done:
- MOVV R10, c+56(FP)
- RET
-
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
-TEXT ·addVW(SB),NOSPLIT,$0
- JMP ·addVW_g(SB)
-
-TEXT ·subVW(SB),NOSPLIT,$0
- JMP ·subVW_g(SB)
-
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
-TEXT ·addVW(SB),NOSPLIT,$0
- JMP ·addVW_g(SB)
-
-TEXT ·subVW(SB),NOSPLIT,$0
- JMP ·subVW_g(SB)
-
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
MOVD R4, c+72(FP)
RET
-// func addVW(z, x []Word, y Word) (c Word)
-TEXT ·addVW(SB), NOSPLIT, $0
- MOVD z+0(FP), R10 // R10 = z[]
- MOVD x+24(FP), R8 // R8 = x[]
- MOVD y+48(FP), R4 // R4 = y = c
- MOVD z_len+8(FP), R11 // R11 = z_len
-
- CMP R11, $0 // If z_len is zero, return
- BEQ done
-
- // We will process the first iteration out of the loop so we capture
- // the value of c. In the subsequent iterations, we will rely on the
- // value of CA set here.
- MOVD 0(R8), R20 // R20 = x[i]
- ADD $-1, R11 // R11 = z_len - 1
- ADDC R20, R4, R6 // R6 = x[i] + c
- CMP R11, $0 // If z_len was 1, we are done
- MOVD R6, 0(R10) // z[i]
- BEQ final
-
- // We will read 4 elements per iteration
- SRDCC $2, R11, R9 // R9 = z_len/4
- DCBT (R8)
- MOVD R9, CTR // Set up the loop counter
- BEQ tail // If R9 = 0, we can't use the loop
- PCALIGN $16
-
-loop:
- MOVD 8(R8), R20 // R20 = x[i]
- MOVD 16(R8), R21 // R21 = x[i+1]
- MOVD 24(R8), R22 // R22 = x[i+2]
- MOVDU 32(R8), R23 // R23 = x[i+3]
- ADDZE R20, R24 // R24 = x[i] + CA
- ADDZE R21, R25 // R25 = x[i+1] + CA
- ADDZE R22, R26 // R26 = x[i+2] + CA
- ADDZE R23, R27 // R27 = x[i+3] + CA
- MOVD R24, 8(R10) // z[i]
- MOVD R25, 16(R10) // z[i+1]
- MOVD R26, 24(R10) // z[i+2]
- MOVDU R27, 32(R10) // z[i+3]
- ADD $-4, R11 // R11 = z_len - 4
- BDNZ loop
-
- // We may have some elements to read
- CMP R11, $0
- BEQ final
-
-tail:
- MOVDU 8(R8), R20
- ADDZE R20, R24
- ADD $-1, R11
- MOVDU R24, 8(R10)
- CMP R11, $0
- BEQ final
-
- MOVDU 8(R8), R20
- ADDZE R20, R24
- ADD $-1, R11
- MOVDU R24, 8(R10)
- CMP R11, $0
- BEQ final
-
- MOVD 8(R8), R20
- ADDZE R20, R24
- MOVD R24, 8(R10)
-
-final:
- ADDZE R0, R4 // c = CA
-done:
- MOVD R4, c+56(FP)
- RET
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW(SB), NOSPLIT, $0
- MOVD z+0(FP), R10 // R10 = z[]
- MOVD x+24(FP), R8 // R8 = x[]
- MOVD y+48(FP), R4 // R4 = y = c
- MOVD z_len+8(FP), R11 // R11 = z_len
-
- CMP R11, $0 // If z_len is zero, return
- BEQ done
-
- // We will process the first iteration out of the loop so we capture
- // the value of c. In the subsequent iterations, we will rely on the
- // value of CA set here.
- MOVD 0(R8), R20 // R20 = x[i]
- ADD $-1, R11 // R11 = z_len - 1
- SUBC R4, R20, R6 // R6 = x[i] - c
- CMP R11, $0 // If z_len was 1, we are done
- MOVD R6, 0(R10) // z[i]
- BEQ final
-
- // We will read 4 elements per iteration
- SRDCC $2, R11, R9 // R9 = z_len/4
- DCBT (R8)
- MOVD R9, CTR // Set up the loop counter
- BEQ tail // If R9 = 0, we can't use the loop
-
- // The loop here is almost the same as the one used in s390x, but
- // we don't need to capture CA every iteration because we've already
- // done that above.
-
- PCALIGN $16
-loop:
- MOVD 8(R8), R20
- MOVD 16(R8), R21
- MOVD 24(R8), R22
- MOVDU 32(R8), R23
- SUBE R0, R20
- SUBE R0, R21
- SUBE R0, R22
- SUBE R0, R23
- MOVD R20, 8(R10)
- MOVD R21, 16(R10)
- MOVD R22, 24(R10)
- MOVDU R23, 32(R10)
- ADD $-4, R11
- BDNZ loop
-
- // We may have some elements to read
- CMP R11, $0
- BEQ final
-
-tail:
- MOVDU 8(R8), R20
- SUBE R0, R20
- ADD $-1, R11
- MOVDU R20, 8(R10)
- CMP R11, $0
- BEQ final
-
- MOVDU 8(R8), R20
- SUBE R0, R20
- ADD $-1, R11
- MOVDU R20, 8(R10)
- CMP R11, $0
- BEQ final
-
- MOVD 8(R8), R20
- SUBE R0, R20
- MOVD R20, 8(R10)
-
-final:
- // Capture CA
- SUBE R4, R4
- NEG R4, R4
-
-done:
- MOVD R4, c+56(FP)
- RET
-
//func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
MOVD z+0(FP), R3
MOV X29, c+72(FP) // return b
RET
-TEXT ·addVW(SB),NOSPLIT,$0
- MOV x+24(FP), X5
- MOV y+48(FP), X6
- MOV z+0(FP), X7
- MOV z_len+8(FP), X30
-
- MOV $4, X28
- MOV X6, X29 // c = y
-
- BEQZ X30, done
- BLTU X30, X28, loop1
-
-loop4:
- MOV 0(X5), X8 // x[0]
- MOV 8(X5), X11 // x[1]
- MOV 16(X5), X14 // x[2]
- MOV 24(X5), X17 // x[3]
-
- ADD X8, X29, X10 // z[0] = x[0] + c
- SLTU X8, X10, X29 // next c
-
- ADD X11, X29, X13 // z[1] = x[1] + c
- SLTU X11, X13, X29 // next c
-
- ADD X14, X29, X16 // z[2] = x[2] + c
- SLTU X14, X16, X29 // next c
-
- ADD X17, X29, X19 // z[3] = x[3] + c
- SLTU X17, X19, X29 // next c
-
- MOV X10, 0(X7) // z[0]
- MOV X13, 8(X7) // z[1]
- MOV X16, 16(X7) // z[2]
- MOV X19, 24(X7) // z[3]
-
- ADD $32, X5
- ADD $32, X7
- SUB $4, X30
-
- BGEU X30, X28, loop4
- BEQZ X30, done
-
-loop1:
- MOV 0(X5), X10 // x
-
- ADD X10, X29, X12 // z = x + c
- SLTU X10, X12, X29 // next c
-
- MOV X12, 0(X7) // z
-
- ADD $8, X5
- ADD $8, X7
- SUB $1, X30
-
- BNEZ X30, loop1
-
-done:
- MOV X29, c+56(FP) // return c
- RET
-
-TEXT ·subVW(SB),NOSPLIT,$0
- MOV x+24(FP), X5
- MOV y+48(FP), X6
- MOV z+0(FP), X7
- MOV z_len+8(FP), X30
-
- MOV $4, X28
- MOV X6, X29 // b = y
-
- BEQZ X30, done
- BLTU X30, X28, loop1
-
-loop4:
- MOV 0(X5), X8 // x[0]
- MOV 8(X5), X11 // x[1]
- MOV 16(X5), X14 // x[2]
- MOV 24(X5), X17 // x[3]
-
- SUB X29, X8, X10 // z[0] = x[0] - b
- SLTU X10, X8, X29 // next b
-
- SUB X29, X11, X13 // z[1] = x[1] - b
- SLTU X13, X11, X29 // next b
-
- SUB X29, X14, X16 // z[2] = x[2] - b
- SLTU X16, X14, X29 // next b
-
- SUB X29, X17, X19 // z[3] = x[3] - b
- SLTU X19, X17, X29 // next b
-
- MOV X10, 0(X7) // z[0]
- MOV X13, 8(X7) // z[1]
- MOV X16, 16(X7) // z[2]
- MOV X19, 24(X7) // z[3]
-
- ADD $32, X5
- ADD $32, X7
- SUB $4, X30
-
- BGEU X30, X28, loop4
- BEQZ X30, done
-
-loop1:
- MOV 0(X5), X10 // x
-
- SUB X29, X10, X12 // z = x - b
- SLTU X12, X10, X29 // next b
-
- MOV X12, 0(X7) // z
-
- ADD $8, X5
- ADD $8, X7
- SUB $1, X30
-
- BNEZ X30, loop1
-
-done:
- MOV X29, c+56(FP) // return b
- RET
-
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)
MOVD R4, c+72(FP) // return c
RET
-TEXT ·addVW(SB), NOSPLIT, $0
- MOVD z_len+8(FP), R5 // length of z
- MOVD x+24(FP), R6
- MOVD y+48(FP), R7 // c = y
- MOVD z+0(FP), R8
-
- CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
-
- // Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
- ADDC 0(R6), R7
- MOVD R7, 0(R8)
- CMPBEQ R5, $1, returnResult // len(z) == 1
- MOVD $0, R9
- ADDE 8(R6), R9
- MOVD R9, 8(R8)
- CMPBEQ R5, $2, returnResult // len(z) == 2
-
- // Update the counters
- MOVD $16, R12 // i = 2
- MOVD $-2(R5), R5 // n = n - 2
-
-loopOverEachWord:
- BRC $12, copySetup // carry = 0, copy the rest
- MOVD $1, R9
-
- // Originally we used the carry flag generated in the previous iteration
- // (i.e: ADDE could be used here to do the addition). However, since we
- // already know carry is 1 (otherwise we will go to copy section), we can use
- // ADDC here so the current iteration does not depend on the carry flag
- // generated in the previous iteration. This could be useful when branch prediction happens.
- ADDC 0(R6)(R12*1), R9
- MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
-
- MOVD $8(R12), R12 // i++
- BRCTG R5, loopOverEachWord // n--
-
-// Return the current carry value
-returnResult:
- MOVD $0, R0
- ADDE R0, R0
- MOVD R0, c+56(FP)
- RET
-
-// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
-// With the assumption that x and z will not overlap with each other or x and z will
-// point to same memory region, we can use a faster version of copy using only MVC here.
-// In the following implementation, we have three copy loops, each copying a word, 4 words, and
-// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
-copySetup:
- ADD R12, R6
- ADD R12, R8
-
- CMPBGE R5, $4, mediumLoop
-
-smallLoop: // does a loop unrolling to copy word when n < 4
- CMPBEQ R5, $0, returnZero
- MVC $8, 0(R6), 0(R8)
- CMPBEQ R5, $1, returnZero
- MVC $8, 8(R6), 8(R8)
- CMPBEQ R5, $2, returnZero
- MVC $8, 16(R6), 16(R8)
-
-returnZero:
- MOVD $0, c+56(FP) // return 0 as carry
- RET
-
-mediumLoop:
- CMPBLT R5, $4, smallLoop
- CMPBLT R5, $32, mediumLoopBody
-
-largeLoop: // Copying 256 bytes at a time.
- MVC $256, 0(R6), 0(R8)
- MOVD $256(R6), R6
- MOVD $256(R8), R8
- MOVD $-32(R5), R5
- CMPBGE R5, $32, largeLoop
- BR mediumLoop
-
-mediumLoopBody: // Copying 32 bytes at a time
- MVC $32, 0(R6), 0(R8)
- MOVD $32(R6), R6
- MOVD $32(R8), R8
- MOVD $-4(R5), R5
- CMPBGE R5, $4, mediumLoopBody
- BR smallLoop
-
-returnC:
- MOVD R7, c+56(FP)
- RET
-
-TEXT ·subVW(SB), NOSPLIT, $0
- MOVD z_len+8(FP), R5
- MOVD x+24(FP), R6
- MOVD y+48(FP), R7 // The borrow bit passed in
- MOVD z+0(FP), R8
- MOVD $0, R0 // R0 is a temporary variable used during computation. Ensure it has zero in it.
-
- CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
-
- // Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
- MOVD 0(R6), R9
- SUBC R7, R9
- MOVD R9, 0(R8)
- CMPBEQ R5, $1, returnResult
- MOVD 8(R6), R9
- SUBE R0, R9
- MOVD R9, 8(R8)
- CMPBEQ R5, $2, returnResult
-
- // Update the counters
- MOVD $16, R12 // i = 2
- MOVD $-2(R5), R5 // n = n - 2
-
-loopOverEachWord:
- BRC $3, copySetup // no borrow, copy the rest
- MOVD 0(R6)(R12*1), R9
-
- // Originally we used the borrow flag generated in the previous iteration
- // (i.e: SUBE could be used here to do the subtraction). However, since we
- // already know borrow is 1 (otherwise we will go to copy section), we can
- // use SUBC here so the current iteration does not depend on the borrow flag
- // generated in the previous iteration. This could be useful when branch prediction happens.
- SUBC $1, R9
- MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
-
- MOVD $8(R12), R12 // i++
- BRCTG R5, loopOverEachWord // n--
-
-// return the current borrow value
-returnResult:
- SUBE R0, R0
- NEG R0, R0
- MOVD R0, c+56(FP)
- RET
-
-// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
-// With the assumption that x and z will not overlap with each other or x and z will
-// point to same memory region, we can use a faster version of copy using only MVC here.
-// In the following implementation, we have three copy loops, each copying a word, 4 words, and
-// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
-copySetup:
- ADD R12, R6
- ADD R12, R8
-
- CMPBGE R5, $4, mediumLoop
-
-smallLoop: // does a loop unrolling to copy word when n < 4
- CMPBEQ R5, $0, returnZero
- MVC $8, 0(R6), 0(R8)
- CMPBEQ R5, $1, returnZero
- MVC $8, 8(R6), 8(R8)
- CMPBEQ R5, $2, returnZero
- MVC $8, 16(R6), 16(R8)
-
-returnZero:
- MOVD $0, c+56(FP) // return 0 as borrow
- RET
-
-mediumLoop:
- CMPBLT R5, $4, smallLoop
- CMPBLT R5, $32, mediumLoopBody
-
-largeLoop: // Copying 256 bytes at a time
- MVC $256, 0(R6), 0(R8)
- MOVD $256(R6), R6
- MOVD $256(R8), R8
- MOVD $-32(R5), R5
- CMPBGE R5, $32, largeLoop
- BR mediumLoop
-
-mediumLoopBody: // Copying 32 bytes at a time
- MVC $32, 0(R6), 0(R8)
- MOVD $32(R6), R6
- MOVD $32(R8), R8
- MOVD $-4(R5), R5
- CMPBGE R5, $4, mediumLoopBody
- BR smallLoop
-
-returnC:
- MOVD R7, c+56(FP)
- RET
-
// func lshVU(z, x []Word, s uint) (c Word)
TEXT ·lshVU(SB), NOSPLIT, $0
BR ·lshVU_g(SB)
func TestAddVV(t *testing.T) { testVV(t, "addVV", addVV, addVV_g) }
func TestSubVV(t *testing.T) { testVV(t, "subVV", subVV, subVV_g) }
-func TestAddVW(t *testing.T) { testVW(t, "addVW", addVW, addVW_g, words4) }
-func TestSubVW(t *testing.T) { testVW(t, "subVW", subVW, subVW_g, words4) }
+func TestAddVW(t *testing.T) { testVW(t, "addVW", addVW, addVW_ref, words4) } // NOTE(review): presumably testVW checks addVW against addVW_ref over the words4 cases — confirm in testVW
+func TestSubVW(t *testing.T) { testVW(t, "subVW", subVW, subVW_ref, words4) } // NOTE(review): presumably testVW checks subVW against subVW_ref over the words4 cases — confirm in testVW
func TestLshVU(t *testing.T) { testVU(t, "lshVU", lshVU, lshVU_g, shifts) }
func TestRshVU(t *testing.T) { testVU(t, "rshVU", rshVU, rshVU_g, shifts) }
func TestMulAddVWW(t *testing.T) { testVWW(t, "mulAddVWW", mulAddVWW, mulAddVWW_g, muls) }
}
func BenchmarkAddVW(b *testing.B) {
- bench(b, "/impl=asm/data=random", benchVW(addVW, 123))
- bench(b, "/impl=asm/data=carry", benchCarryVW(addVW, ^Word(0), 1))
- bench(b, "/impl=asm/data=shortcut", benchShortVW(addVW, 123))
- bench(b, "/impl=go/data=random", benchVW(addVW_g, 123))
- bench(b, "/impl=go/data=carry", benchCarryVW(addVW_g, ^Word(0), 1))
- bench(b, "/impl=go/data=shortcut", benchShortVW(addVW_g, 123))
+ bench(b, "/data=random", benchVW(addVW, 123)) // only one implementation is benchmarked here, so the name carries no impl= component
+ bench(b, "/data=carry", benchCarryVW(addVW, ^Word(0), 1)) // NOTE(review): presumably x is all-ones words and y is 1, so the carry ripples through every word — confirm in benchCarryVW
+ bench(b, "/data=shortcut", benchShortVW(addVW, 123)) // NOTE(review): presumably exercises the early no-carry exit — confirm in benchShortVW
}
func BenchmarkSubVW(b *testing.B) {
- bench(b, "/impl=asm/data=random", benchVW(subVW, 123))
- bench(b, "/impl=asm/data=carry", benchCarryVW(subVW, 0, 1))
- bench(b, "/impl=asm/data=shortcut", benchShortVW(subVW, 123))
- bench(b, "/impl=go/data=random", benchVW(subVW_g, 123))
- bench(b, "/impl=go/data=carry", benchCarryVW(subVW_g, 0, 1))
- bench(b, "/impl=go/data=shortcut", benchShortVW(subVW_g, 123))
+ bench(b, "/data=random", benchVW(subVW, 123)) // only one implementation is benchmarked here, so the name carries no impl= component
+ bench(b, "/data=carry", benchCarryVW(subVW, 0, 1)) // NOTE(review): presumably x is all-zero words and y is 1, so the borrow ripples through every word — confirm in benchCarryVW
+ bench(b, "/data=shortcut", benchShortVW(subVW, 123)) // NOTE(review): presumably exercises the early no-borrow exit — confirm in benchShortVW
}
func benchVW(fn func(z, x []Word, w Word) Word, w Word) benchFunc {
TEXT ·subVV(SB),NOSPLIT,$0
JMP ·subVV_g(SB)
-TEXT ·addVW(SB),NOSPLIT,$0
- JMP ·addVW_g(SB)
-
-TEXT ·subVW(SB),NOSPLIT,$0
- JMP ·subVW_g(SB)
-
TEXT ·lshVU(SB),NOSPLIT,$0
JMP ·lshVU_g(SB)