MOVD R4, c+72(FP)
RET
+// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB), NOSPLIT, $0
- BR ·addVW_g(SB)
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R4 // R4 = y = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11 // If z_len is zero, return
+ BEQ done
+ // We will process the first iteration out of the loop so we capture
+ // the value of c. In the subsequent iterations, we will rely on the
+ // value of CA set here.
+ MOVD 0(R8), R20 // R20 = x[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ ADDC R20, R4, R6 // R6 = x[i] + c
+ CMP R0, R11 // If z_len was 1, we are done
+ MOVD R6, 0(R10) // z[i]
+ BEQ final
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R9 // R9 = z_len/4
+ DCBT (R8)
+ CMP R0, R9
+ MOVD R9, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+
+loop:
+ MOVD 8(R8), R20 // R20 = x[i]
+ MOVD 16(R8), R21 // R21 = x[i+1]
+ MOVD 24(R8), R22 // R22 = x[i+2]
+ MOVDU 32(R8), R23 // R23 = x[i+3]
+ ADDZE R20, R24 // R24 = x[i] + CA
+ ADDZE R21, R25 // R25 = x[i+1] + CA
+ ADDZE R22, R26 // R26 = x[i+2] + CA
+ ADDZE R23, R27 // R27 = x[i+3] + CA
+ MOVD R24, 8(R10) // z[i]
+ MOVD R25, 16(R10) // z[i+1]
+ MOVD R26, 24(R10) // z[i+2]
+ MOVDU R27, 32(R10) // z[i+3]
+ ADD $-4, R11 // R11 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ final
+
+tail:
+ MOVDU 8(R8), R20
+ ADDZE R20, R24
+ ADD $-1, R11
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVDU 8(R8), R20
+ ADDZE R20, R24
+ ADD $-1, R11
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVD 8(R8), R20
+ ADDZE R20, R24
+ MOVD R24, 8(R10)
+
+final:
+ ADDZE R0, R4 // c = CA
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+// func subVW(z, x []Word, y Word) (c Word)
TEXT ·subVW(SB), NOSPLIT, $0
- BR ·subVW_g(SB)
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R4 // R4 = y = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11 // If z_len is zero, return
+ BEQ done
+
+ // We will process the first iteration out of the loop so we capture
+ // the value of c. In the subsequent iterations, we will rely on the
+ // value of CA set here.
+ MOVD 0(R8), R20 // R20 = x[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ SUBC R4, R20, R6 // R6 = x[i] - c
+ CMP R0, R11 // If z_len was 1, we are done
+ MOVD R6, 0(R10) // z[i]
+ BEQ final
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R9 // R9 = z_len/4
+ DCBT (R8)
+ CMP R0, R9
+ MOVD R9, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+
+ // The loop here is almost the same as the one used in s390x, but
+ // we don't need to capture CA every iteration because we've already
+ // done that above.
+loop:
+ MOVD 8(R8), R20
+ MOVD 16(R8), R21
+ MOVD 24(R8), R22
+ MOVDU 32(R8), R23
+ SUBE R0, R20
+ SUBE R0, R21
+ SUBE R0, R22
+ SUBE R0, R23
+ MOVD R20, 8(R10)
+ MOVD R21, 16(R10)
+ MOVD R22, 24(R10)
+ MOVDU R23, 32(R10)
+ ADD $-4, R11
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ final
+
+tail:
+ MOVDU 8(R8), R20
+ SUBE R0, R20
+ ADD $-1, R11
+ MOVDU R20, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVDU 8(R8), R20
+ SUBE R0, R20
+ ADD $-1, R11
+ MOVDU R20, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVD 8(R8), R20
+ SUBE R0, R20
+ MOVD R20, 8(R10)
+
+final:
+ // Capture CA
+ SUBE R4, R4
+ NEG R4, R4
+
+done:
+ MOVD R4, c+56(FP)
+ RET
TEXT ·shlVU(SB), NOSPLIT, $0
BR ·shlVU_g(SB)