//go:build !purego
+// Register usage (z13 convention):
+// R2 = rp (result pointer)
+// R3 = ap (source pointer)
+// R4 = an / idx (loop counter)
+// R5 = b0 (multiplier limb)
+// R6 = cy (carry)
+
#include "textflag.h"
// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Entry point for 1024-bit operands. It only fixes the limb count at 16
// (16 x 64-bit words) and tail-jumps into the shared addMulVVWx body, which
// reads z, x and y from this function's argument frame and writes c.
TEXT ·addMulVVW1024(SB), $0-32
- MOVD $16, R5
- JMP addMulVVWx(SB)
+ MOVD $16, R4 // R4 = n, the limb count addMulVVWx expects
+ JMP addMulVVWx(SB) // tail jump: addMulVVWx stores c and executes RET
// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Entry point for 1536-bit operands. It only fixes the limb count at 24
// (24 x 64-bit words) and tail-jumps into the shared addMulVVWx body, which
// reads z, x and y from this function's argument frame and writes c.
TEXT ·addMulVVW1536(SB), $0-32
- MOVD $24, R5
- JMP addMulVVWx(SB)
+ MOVD $24, R4 // R4 = n, the limb count addMulVVWx expects
+ JMP addMulVVWx(SB) // tail jump: addMulVVWx stores c and executes RET
// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Entry point for 2048-bit operands. It only fixes the limb count at 32
// (32 x 64-bit words) and tail-jumps into the shared addMulVVWx body, which
// reads z, x and y from this function's argument frame and writes c.
TEXT ·addMulVVW2048(SB), $0-32
- MOVD $32, R5
- JMP addMulVVWx(SB)
+ MOVD $32, R4 // R4 = n, the limb count addMulVVWx expects
+ JMP addMulVVWx(SB) // tail jump: addMulVVWx stores c and executes RET
// addMulVVWx is the shared body behind the addMulVVW* wrappers. It is
// entered by JMP, not CALL, with the limb count n already in R4, and it
// addresses the wrapper's arguments directly: z+0(FP), x+8(FP), y+16(FP)
// and the result slot c+24(FP).
//
// For i in [0, n) it performs z[i] += x[i]*y with carry propagation and
// stores the final carry-out word in c.
//
// Vector version: two limbs are handled per 128-bit step and the main loop
// is unrolled to four limbs per iteration (L_top + L_mid). Multiplies for
// the next pair are issued before the adds for the current pair, so L_end
// drains the last pending pair. The low two bits of n select one of four
// entry sequences (L_b00, L_b01, L_b10, L_b11) that handle n mod 4 leading
// limbs and bias the index R4 so the loop's fixed displacements line up.
// The three wrappers visible here pass n = 16, 24 or 32, i.e. n mod 4 == 0.
//
// Carry state while looping:
//   V0, V2 - carry bits out of the two 128-bit add stages
//   R6/R8  - high 64 bits of the most recent odd-limb product (alternating)
// R0, R1, R7, R9 are product scratch (MLGR writes even:odd pairs), R10 is
// the loop counter n>>2.
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
MOVD z+0(FP), R2 // R2 = z, accumulated in place
- MOVD x+8(FP), R8
- MOVD y+16(FP), R9
-
- MOVD $0, R1 // i*8 = 0
- MOVD $0, R7 // i = 0
- MOVD $0, R0 // make sure it's zero
- MOVD $0, R4 // c = 0
-
- MOVD R5, R12
- AND $-2, R12
- CMPBGE R5, $2, A6
- BR E6
-
-A6:
- MOVD (R8)(R1*1), R6
- MULHDU R9, R6
- MOVD (R2)(R1*1), R10
- ADDC R10, R11 // add to low order bits
- ADDE R0, R6
- ADDC R4, R11
- ADDE R0, R6
- MOVD R6, R4
- MOVD R11, (R2)(R1*1)
-
- MOVD (8)(R8)(R1*1), R6
- MULHDU R9, R6
- MOVD (8)(R2)(R1*1), R10
- ADDC R10, R11 // add to low order bits
- ADDE R0, R6
- ADDC R4, R11
- ADDE R0, R6
- MOVD R6, R4
- MOVD R11, (8)(R2)(R1*1)
-
- ADD $16, R1 // i*8 + 8
- ADD $2, R7 // i++
-
- CMPBLT R7, R12, A6
- BR E6
-
-L6:
- // TODO: drop unused single-step loop.
- MOVD (R8)(R1*1), R6
- MULHDU R9, R6
- MOVD (R2)(R1*1), R10
- ADDC R10, R11 // add to low order bits
- ADDE R0, R6
- ADDC R4, R11
- ADDE R0, R6
- MOVD R6, R4
- MOVD R11, (R2)(R1*1)
-
- ADD $8, R1 // i*8 + 8
- ADD $1, R7 // i++
-
-E6:
- CMPBLT R7, R5, L6 // i < n
-
- MOVD R4, c+24(FP)
+ MOVD x+8(FP), R3 // R3 = x, source limbs
+ MOVD y+16(FP), R5 // R5 = y, the single multiplier word
+
+ MOVD $0, R6 // carry-in = 0
+
+L_ent:
+ VZERO V0 // clear carry of first add stage
+ VZERO V2 // clear carry of second add stage
+ SRD $2, R4, R10 // R10 = n >> 2, loop trip count
+ TMLL R4, $1 // test bit 0 of n
+ BRC $8, L_bx0 // bit 0 clear -> n is even
+
+L_bx1: // n is odd: limb 0 is handled on its own before the paired loop
+ VLEG $1, 0(R2), V2 // low doubleword of V2 = z[0]
+ VZERO V4
+ TMLL R4, $2 // test bit 1 of n
+ BRC $7, L_b11 // bit 1 set -> n mod 4 == 3
+
+L_b01: // n mod 4 == 1
+ MOVD $-24, R4 // index bias: 32(R4) then addresses limb 1
+ MOVD R6, R0 // save carry-in; MLGR below overwrites R6
+ MOVD 0(R3), R7 // R7 = x[0]
+ MLGR R5, R6 // R6:R7 = x[0] * y (high:low)
+ ADDC R0, R7 // low += carry-in
+ MOVD $0, R0
+ ADDE R0, R6 // propagate carry into the high word
+ VLVGG $1, R7, V4 // low doubleword of V4 = low product word
+ VAQ V2, V4, V2 // 128-bit add: z[0] + low product; carry lands in high doubleword
+ VSTEG $1, V2, 0(R2) // z[0] = low doubleword of the sum
+ VMRHG V2, V2, V2 // replicate the high doubleword (the carry) into both halves
+ CMPBEQ R10, $0, L_1 // n == 1: nothing left, go form the result
+ BR L_cj0
+
+L_b11: // n mod 4 == 3
+ MOVD $-8, R4 // index bias: 16(R4) then addresses limb 1
+ MOVD 0(R3), R9 // R9 = x[0]
+ MLGR R5, R8 // R8:R9 = x[0] * y (high:low)
+ ADDC R6, R9 // low += carry-in
+ MOVD $0, R6
+ ADDE R6, R8 // propagate carry into the high word (kept in R8 for L_cj1)
+ VLVGG $1, R9, V4 // low doubleword of V4 = low product word
+ VAQ V2, V4, V2 // 128-bit add: z[0] + low product; carry lands in high doubleword
+ VSTEG $1, V2, 0(R2) // z[0] = low doubleword of the sum
+ VMRHG V2, V2, V2 // replicate the high doubleword (the carry) into both halves
+ BR L_cj1
+
+L_bx0: // n is even
+ TMLL R4, $2 // test bit 1 of n
+ BRC $7, L_b10 // bit 1 set -> n mod 4 == 2
+
+L_b00: // n mod 4 == 0
+ MOVD $-32, R4 // index bias: 32(R4) addresses limb 0
+
+L_cj0: // prime the pipeline for entry at L_mid
+ MOVD 32(R3)(R4), R1 // first x limb of the pair
+ MOVD 40(R3)(R4), R9 // second x limb of the pair
+ MLGR R5, R0 // R0:R1 = first limb * y
+ MLGR R5, R8 // R8:R9 = second limb * y
+ VL 32(R4)(R2), V1 // load the matching two z limbs
+ VPDI $4, V1, V1, V1 // swap doublewords so the lower-index limb is the low half
+ VLVGP R0, R1, V6 // V6 = full 128-bit first product
+ VLVGP R9, R6, V7 // V7 = low word of second product : pending high word in R6
+ BR L_mid
+
+L_b10: // n mod 4 == 2
+ MOVD $-16, R4 // index bias: 16(R4) addresses limb 0
+ MOVD R6, R8 // L_cj1 takes the pending high word from R8
+
+L_cj1: // prime the pipeline for entry at L_top
+ MOVD 16(R4)(R3), R1 // first x limb of the pair
+ MOVD 24(R4)(R3), R7 // second x limb of the pair
+ MLGR R5, R0 // R0:R1 = first limb * y
+ MLGR R5, R6 // R6:R7 = second limb * y
+ VL 16(R4)(R2), V1 // load the matching two z limbs
+ VPDI $4, V1, V1, V1 // swap doublewords so the lower-index limb is the low half
+ VLVGP R0, R1, V6 // V6 = full 128-bit first product
+ VLVGP R7, R8, V7 // V7 = low word of second product : pending high word in R8
+ CMPBEQ R10, $0, L_end // n < 4: only this pair remains, drain it
+
+L_top: // first half of the unrolled loop
+ MOVD 32(R4)(R3), R1 // next pair of x limbs
+ MOVD 40(R4)(R3), R9
+ MLGR R5, R0 // R0:R1 = first limb * y
+ MLGR R5, R8 // R8:R9 = second limb * y
+ VACQ V6, V1, V0, V5 // V5 = V6 + V1 + carry(V0)
+ VACCCQ V6, V1, V0, V0 // V0 = carry out of that add
+ VACQ V5, V7, V2, V3 // V3 = V5 + V7 + carry(V2)
+ VACCCQ V5, V7, V2, V2 // V2 = carry out of that add
+ VPDI $4, V3, V3, V3 // swap doublewords back to memory limb order
+ VL 32(R4)(R2), V1 // load the next two z limbs
+ VPDI $4, V1, V1, V1
+ VST V3, 16(R4)(R2) // store the finished pair back to z
+ VLVGP R0, R1, V6 // stage products for the next add
+ VLVGP R9, R6, V7 // pending high word comes from the previous R6:R7 product
+
+L_mid: // second half of the unrolled loop
+ MOVD 48(R4)(R3), R1 // next pair of x limbs
+ MOVD 56(R4)(R3), R7
+ MLGR R5, R0 // R0:R1 = first limb * y
+ MLGR R5, R6 // R6:R7 = second limb * y
+ VACQ V6, V1, V0, V5 // V5 = V6 + V1 + carry(V0)
+ VACCCQ V6, V1, V0, V0 // V0 = carry out of that add
+ VACQ V5, V7, V2, V3 // V3 = V5 + V7 + carry(V2)
+ VACCCQ V5, V7, V2, V2 // V2 = carry out of that add
+ VPDI $4, V3, V3, V3 // swap doublewords back to memory limb order
+ VL 48(R4)(R2), V1 // load the next two z limbs
+ VPDI $4, V1, V1, V1
+ VST V3, 32(R4)(R2) // store the finished pair back to z
+ VLVGP R0, R1, V6 // stage products for the next add
+ VLVGP R7, R8, V7 // pending high word comes from the previous R8:R9 product
+ MOVD $32(R4), R4 // advance index by 4 limbs (32 bytes)
+ BRCTG R10, L_top // decrement R10 and loop while it is non-zero
+
+L_end: // drain: add and store the last staged pair
+ VACQ V6, V1, V0, V5
+ VACCCQ V6, V1, V0, V0
+ VACQ V5, V7, V2, V3
+ VACCCQ V5, V7, V2, V2
+ VPDI $4, V3, V3, V3 // swap doublewords back to memory limb order
+ VST V3, 16(R2)(R4) // store the final two limbs of z
+ VAG V0, V2, V2 // combine the two outstanding carry bits
+
+L_1: // form the return value
+ VLGVG $1, V2, R2 // R2 = outstanding vector carry (low doubleword of V2)
+ ADDC R6, R2 // add the high word of the last product
+ MOVD R2, c+24(FP) // c = final carry-out word
RET
+