// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0
- MOVD z+0(FP), R10
- MOVD x+24(FP), R8
- MOVD y+48(FP), R9
- MOVD r+56(FP), R4 // c = r
- MOVD z_len+8(FP), R11
- MOVD $0, R3 // i = 0
- MOVD $8, R18
- MOVD $1, R19
-
- JMP e5
-
-l5:
- MULLD R18, R3, R5
- MOVD (R8)(R5), R20
- MULLD R9, R20, R6
- MULHDU R9, R20, R7
- ADDC R4, R6
- ADDZE R7
- MOVD R6, (R10)(R5)
- MOVD R7, R4
- ADD R19, R3
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y
+ MOVD r+56(FP), R4 // R4 = r = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
-e5:
- CMP R3, R11
- BLT l5
+ MOVD R0, R3 // R3 will be the index register
+ CMP R0, R11
+ MOVD R11, CTR // Initialize loop counter
+ BEQ done
+loop:
+ MOVD (R8)(R3), R20 // x[i]
+ MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
+ MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
+ ADDC R4, R6 // Compute sum for z1 and z0
+ ADDZE R7
+ MOVD R6, (R10)(R3) // z[i]
+ MOVD R7, R4 // c
+ ADD $8, R3
+ BC 16, 0, loop // bdnz
+
+done:
MOVD R4, c+64(FP)
RET
// func addMulVVW(z, x []Word, y Word) (c Word)
TEXT ·addMulVVW(SB), NOSPLIT, $0
- MOVD z+0(FP), R10
- MOVD x+24(FP), R8
- MOVD y+48(FP), R9
- MOVD z_len+8(FP), R22
-
- MOVD $0, R5 // i = 0
- MOVD $0, R4 // c = 0
- MOVD $8, R28
- MOVD $-2, R23
- AND R22, R23 // mask the last bit of z.len
- MOVD $2, R24
- CMP R23, R24
- BGE unrolled
- JMP end
-
-unrolled:
- MOVD $8, R19 // no (RA)(RB*8) on power
- MULLD R5, R19
- MOVD (R10)(R19), R11 // R11 = z[i]
- MOVD (R8)(R19), R16 // R16 = x[i]
- ADD R28, R19, R25
- MOVD (R10)(R25), R17
- MOVD (R8)(R25), R18
-
- MULLD R9, R16, R12
- MULHDU R9, R16, R14
- MULLD R9, R18, R6
- MULHDU R9, R18, R7
- ADDC R4, R12
- ADDZE R14
- ADDC R11, R12 // z[i] = (x[i]*y) + z[i] + carry
- ADDZE R14 // carry = high order bits + add carry
- MOVD R12, (R10)(R19)
- ADDC R14, R6
- ADDZE R7
- ADDC R17, R6
- ADDZE R7
- MOVD R6, (R10)(R25)
- MOVD R7, R4
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y
+ MOVD z_len+8(FP), R22 // R22 = z_len
- ADD R24, R5
- CMP R5, R23
- BLT unrolled
- JMP end
+ MOVD R0, R3 // R3 will be the index register
+ CMP R0, R22
+ MOVD R0, R4 // R4 = c = 0
+ MOVD R22, CTR // Initialize loop counter
+ BEQ done
loop:
- MOVD $8, R19
- MULLD R5, R19
- MOVD (R10)(R19), R11
- MOVD (R8)(R19), R16
- MULLD R9, R16, R12
- MULHDU R9, R16, R14
- ADDC R4, R12
- ADDZE R14
- ADDC R11, R12
- ADDZE R14
- MOVD R12, (R10)(R19)
- MOVD R14, R4
-
- MOVD $1, R15
- ADD R15, R5
-
-end:
- CMP R5, R22
- BLT loop
+ MOVD (R8)(R3), R20 // Load x[i]
+ MOVD (R10)(R3), R21 // Load z[i]
+ MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
+ MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
+ ADDC R21, R6 // R6 = z0
+ ADDZE R7 // R7 = z1
+ ADDC R4, R6 // R6 = z0 + c + 0
+ ADDZE R7, R4 // c += z1
+ MOVD R6, (R10)(R3) // Store z[i]
+ ADD $8, R3
+ BC 16, 0, loop // bdnz
+done:
MOVD R4, c+56(FP)
RET