From 0329c915a03199d202581d0b33b092371fde08dc Mon Sep 17 00:00:00 2001 From: "Ruixin(Peter) Bao" Date: Tue, 21 Apr 2020 15:14:11 -0400 Subject: [PATCH] math/big: clean up whitespace in arith_s390x.s file This CL looks big but it only does formatting changes to arith_s390x.s. The file was formatted using asmfmt(https://github.com/klauspost/asmfmt) , so there should not be any functional impact. I verified that the generated assembly of big.test file is identical. Change-Id: I8b4035ef082a4d0357881869327e25253f2d8be1 Reviewed-on: https://go-review.googlesource.com/c/go/+/229302 Reviewed-by: Michael Munday Run-TryBot: Michael Munday TryBot-Result: Gobot Gobot --- src/math/big/arith_s390x.s | 1869 ++++++++++++++++++------------------ 1 file changed, 958 insertions(+), 911 deletions(-) diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s index 9156d9debe..319a7754ed 100644 --- a/src/math/big/arith_s390x.s +++ b/src/math/big/arith_s390x.s @@ -9,303 +9,312 @@ // This file provides fast assembly versions for the elementary // arithmetic operations on vectors implemented in arith.go. -TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 - MOVD $x-24(SP), R1 - XC $24, 0(R1), 0(R1) // clear the storage - MOVD $2, R0 // R0 is the number of double words stored -1 - WORD $0xB2B01000 // STFLE 0(R1) - XOR R0, R0 // reset the value of R0 - MOVBZ z-8(SP), R1 - AND $0x40, R1 - BEQ novector +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector + vectorinstalled: - // check if the vector instruction has been enabled - VLEIB $0, $0xF, V16 - VLGVB $0, V16, R1 - CMPBNE R1, $0xF, novector - MOVB $1, ret+0(FP) // have vx - RET + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET + novector: - MOVB $0, ret+0(FP) // no vx - RET - -TEXT ·mulWW(SB),NOSPLIT,$0 - MOVD x+0(FP), R3 - MOVD y+8(FP), R4 - MULHDU R3, R4 - MOVD R10, z1+16(FP) - MOVD R11, z0+24(FP) + MOVB $0, ret+0(FP) // no vx + RET + +TEXT ·mulWW(SB), NOSPLIT, $0 + MOVD x+0(FP), R3 + MOVD y+8(FP), R4 + MULHDU R3, R4 + MOVD R10, z1+16(FP) + MOVD R11, z0+24(FP) RET // func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - MOVD x1+0(FP), R10 - MOVD x0+8(FP), R11 - MOVD y+16(FP), R5 - WORD $0xb98700a5 // dlgr r10,r5 - MOVD R11, q+24(FP) - MOVD R10, r+32(FP) +TEXT ·divWW(SB), NOSPLIT, $0 + MOVD x1+0(FP), R10 + MOVD x0+8(FP), R11 + MOVD y+16(FP), R5 + WORD $0xb98700a5 // dlgr r10,r5 + MOVD R11, q+24(FP) + MOVD R10, r+32(FP) RET // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 // func addVV(z, x, y []Word) (c Word) +TEXT ·addVV(SB), NOSPLIT, $0 + MOVD addvectorfacility+0x00(SB), R1 + BR (R1) + +TEXT ·addVV_check(SB), NOSPLIT, $0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $addvectorfacility+0x00(SB), R1 + MOVD $·addVV_novec(SB), R2 + MOVD R2, 0(R1) -TEXT ·addVV(SB),NOSPLIT,$0 - MOVD addvectorfacility+0x00(SB),R1 - BR (R1) + // MOVD $·addVV_novec(SB), 0(R1) + BR ·addVV_novec(SB) -TEXT ·addVV_check(SB),NOSPLIT, $0 - MOVB ·hasVX(SB), R1 - CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported - MOVD $addvectorfacility+0x00(SB), R1 - MOVD $·addVV_novec(SB), R2 - MOVD R2, 0(R1) - 
//MOVD $·addVV_novec(SB), 0(R1) - BR ·addVV_novec(SB) vectorimpl: - MOVD $addvectorfacility+0x00(SB), R1 - MOVD $·addVV_vec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·addVV_vec(SB), 0(R1) - BR ·addVV_vec(SB) + MOVD $addvectorfacility+0x00(SB), R1 + MOVD $·addVV_vec(SB), R2 + MOVD R2, 0(R1) + + // MOVD $·addVV_vec(SB), 0(R1) + BR ·addVV_vec(SB) GLOBL addvectorfacility+0x00(SB), NOPTR, $8 DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB) -TEXT ·addVV_vec(SB),NOSPLIT,$0 - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD z+0(FP), R2 - - MOVD $0, R4 // c = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 +TEXT ·addVV_vec(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 - BLT v1 - SUB $12, R3 // n -= 16 - BLT A1 // if n < 0 goto A1 - - MOVD R8, R5 - MOVD R9, R6 - MOVD R2, R7 + SUB $4, R3 + BLT v1 + SUB $12, R3 // n -= 16 + BLT A1 // if n < 0 goto A1 + + MOVD R8, R5 + MOVD R9, R6 + MOVD R2, R7 + // n >= 0 // regular loop body unrolled 16x - VZERO V0 // c = 0 -UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V8 - ADD $64, R5 - VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order - VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order - - - VLM 0(R6), V9, V12 // 64-bytes into V9..V16 - ADD $64, R6 - VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order - VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order - - VACCCQ V1, V9, V0, V25 - VACQ V1, V9, V0, V17 - VACCCQ V2, V10, V25, V26 - VACQ V2, V10, V25, V18 - - - VLM 0(R5), V5, V6 // 32-bytes into V1..V8 - VLM 0(R6), V13, V14 // 32-bytes into V9..V16 - ADD $32, R5 - ADD $32, R6 - - VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order - VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order - VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order - VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order - - VACCCQ V3, V11, V26, V27 - VACQ V3, V11, V26, V19 - VACCCQ V4, V12, V27, V28 - VACQ V4, V12, V27, V20 - - VLM 0(R5), V7, V8 // 32-bytes into V1..V8 - VLM 0(R6), V15, V16 // 32-bytes into V9..V16 - ADD $32, R5 - ADD $32, R6 - - VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order - VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order - VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order - VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order - - VACCCQ V5, V13, V28, V29 - VACQ V5, V13, V28, V21 - VACCCQ V6, V14, V29, V30 - VACQ V6, V14, V29, V22 - - VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order - VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order - VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order - VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order - - VACCCQ V7, V15, V30, V31 - VACQ V7, V15, V30, V23 - VACCCQ V8, V16, V31, V0 //V0 has carry-over - VACQ V8, V16, V31, V24 - - VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order - VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order - VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order - VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order - VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order - VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order - VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order - VPDI 
$0x4,V24,V24,V24 // flip the doublewords to big-endian order - VSTM V17, V24, 0(R7) // 128-bytes into z - ADD $128, R7 - ADD $128, R10 // i += 16 - SUB $16, R3 // n -= 16 - BGE UU1 // if n >= 0 goto U1 - VLGVG $1, V0, R4 // put cf into R4 - NEG R4, R4 // save cf - -A1: ADD $12, R3 // n += 16 - + VZERO V0 // c = 0 + +UU1: + VLM 0(R5), V1, V4 // 64-bytes into V1..V8 + ADD $64, R5 + VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order + VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order + + VLM 0(R6), V9, V12 // 64-bytes into V9..V16 + ADD $64, R6 + VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order + VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order + + VACCCQ V1, V9, V0, V25 + VACQ V1, V9, V0, V17 + VACCCQ V2, V10, V25, V26 + VACQ V2, V10, V25, V18 + + VLM 0(R5), V5, V6 // 32-bytes into V1..V8 + VLM 0(R6), V13, V14 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order + VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order + VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order + VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order + + VACCCQ V3, V11, V26, V27 + VACQ V3, V11, V26, V19 + VACCCQ V4, V12, V27, V28 + VACQ V4, V12, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V1..V8 + VLM 0(R6), V15, V16 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order + VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order + VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order + VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order + + VACCCQ V5, V13, V28, V29 + VACQ V5, V13, V28, V21 + VACCCQ V6, V14, V29, V30 + VACQ V6, V14, V29, V22 + + VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order + VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order + VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order + VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order + + VACCCQ V7, V15, V30, V31 + VACQ V7, V15, V30, V23 + VACCCQ V8, V16, V31, V0 // V0 has carry-over + VACQ V8, V16, V31, V24 + + VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order + VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order + VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order + VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order + VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order + VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order + VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order + VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 + NEG R4, R4 // save cf + +A1: + ADD $12, R3 // n += 16 // s/JL/JMP/ below to disable the unrolled loop - BLT v1 // if n < 0 goto v1 + BLT v1 // if n < 0 goto v1 -U1: // n >= 0 +U1: // n >= 0 // regular loop body unrolled 4x - MOVD 0(R8)(R10*1), R5 - MOVD 8(R8)(R10*1), R6 - MOVD 16(R8)(R10*1), R7 - MOVD 24(R8)(R10*1), R1 - ADDC R4, R4 // restore CF - MOVD 0(R9)(R10*1), R11 - ADDE R11, R5 - MOVD 8(R9)(R10*1), R11 - ADDE R11, R6 - MOVD 16(R9)(R10*1), R11 - ADDE R11, R7 - MOVD 24(R9)(R10*1), R11 - ADDE R11, R1 - MOVD R0, R4 - ADDE R4, R4 // save CF - NEG R4, R4 - 
MOVD R5, 0(R2)(R10*1) - MOVD R6, 8(R2)(R10*1) - MOVD R7, 16(R2)(R10*1) - MOVD R1, 24(R2)(R10*1) - - - ADD $32, R10 // i += 4 - SUB $4, R3 // n -= 4 - BGE U1 // if n >= 0 goto U1 - -v1: ADD $4, R3 // n += 4 - BLE E1 // if n <= 0 goto E1 - -L1: // n > 0 - ADDC R4, R4 // restore CF - MOVD 0(R8)(R10*1), R5 - MOVD 0(R9)(R10*1), R11 - ADDE R11, R5 - MOVD R5, 0(R2)(R10*1) - MOVD R0, R4 - ADDE R4, R4 // save CF - NEG R4, R4 - - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L1 // if n > 0 goto L1 - -E1: NEG R4, R4 - MOVD R4, c+72(FP) // return c + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + ADDC R4, R4 // restore CF + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD 8(R9)(R10*1), R11 + ADDE R11, R6 + MOVD 16(R9)(R10*1), R11 + ADDE R11, R7 + MOVD 24(R9)(R10*1), R11 + ADDE R11, R1 + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1 // if n >= 0 goto U1 + +v1: + ADD $4, R3 // n += 4 + BLE E1 // if n <= 0 goto E1 + +L1: // n > 0 + ADDC R4, R4 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1 // if n > 0 goto L1 + +E1: + NEG R4, R4 + MOVD R4, c+72(FP) // return c RET -TEXT ·addVV_novec(SB),NOSPLIT,$0 +TEXT ·addVV_novec(SB), NOSPLIT, $0 novec: - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD z+0(FP), R2 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 - MOVD $0, R4 // c = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v1n // if n < 0 goto v1n -U1n: // n >= 0 + SUB $4, R3 // n -= 4 + BLT v1n // if n < 0 goto v1n + +U1n: // n >= 0 // regular loop body unrolled 4x - MOVD 0(R8)(R10*1), R5 - MOVD 8(R8)(R10*1), R6 - MOVD 16(R8)(R10*1), R7 - MOVD 24(R8)(R10*1), R1 - ADDC R4, R4 // restore CF - MOVD 0(R9)(R10*1), R11 - ADDE R11, R5 - MOVD 8(R9)(R10*1), R11 - ADDE R11, R6 - MOVD 16(R9)(R10*1), R11 - ADDE R11, R7 - MOVD 24(R9)(R10*1), R11 - ADDE R11, R1 - MOVD R0, R4 - ADDE R4, R4 // save CF - NEG R4, R4 - MOVD R5, 0(R2)(R10*1) - MOVD R6, 8(R2)(R10*1) - MOVD R7, 16(R2)(R10*1) - MOVD R1, 24(R2)(R10*1) - - - ADD $32, R10 // i += 4 - SUB $4, R3 // n -= 4 - BGE U1n // if n >= 0 goto U1n - -v1n: ADD $4, R3 // n += 4 - BLE E1n // if n <= 0 goto E1n - -L1n: // n > 0 - ADDC R4, R4 // restore CF - MOVD 0(R8)(R10*1), R5 - MOVD 0(R9)(R10*1), R11 - ADDE R11, R5 - MOVD R5, 0(R2)(R10*1) - MOVD R0, R4 - ADDE R4, R4 // save CF - NEG R4, R4 - - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L1n // if n > 0 goto L1n - -E1n: NEG R4, R4 - MOVD R4, c+72(FP) // return c + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + ADDC R4, R4 // restore CF + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD 8(R9)(R10*1), R11 + ADDE R11, R6 + MOVD 16(R9)(R10*1), R11 + ADDE R11, R7 + MOVD 24(R9)(R10*1), R11 + ADDE R11, R1 + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1n // if n >= 0 goto U1n + +v1n: + ADD $4, R3 // n += 4 + BLE E1n // if n <= 0 goto E1n + +L1n: // n > 
0 + ADDC R4, R4 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1n // if n > 0 goto L1n + +E1n: + NEG R4, R4 + MOVD R4, c+72(FP) // return c RET +TEXT ·subVV(SB), NOSPLIT, $0 + MOVD subvectorfacility+0x00(SB), R1 + BR (R1) + +TEXT ·subVV_check(SB), NOSPLIT, $0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $subvectorfacility+0x00(SB), R1 + MOVD $·subVV_novec(SB), R2 + MOVD R2, 0(R1) -TEXT ·subVV(SB),NOSPLIT,$0 - MOVD subvectorfacility+0x00(SB),R1 - BR (R1) + // MOVD $·subVV_novec(SB), 0(R1) + BR ·subVV_novec(SB) -TEXT ·subVV_check(SB),NOSPLIT,$0 - MOVB ·hasVX(SB), R1 - CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported - MOVD $subvectorfacility+0x00(SB), R1 - MOVD $·subVV_novec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·subVV_novec(SB), 0(R1) - BR ·subVV_novec(SB) vectorimpl: - MOVD $subvectorfacility+0x00(SB), R1 - MOVD $·subVV_vec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·subVV_vec(SB), 0(R1) - BR ·subVV_vec(SB) + MOVD $subvectorfacility+0x00(SB), R1 + MOVD $·subVV_vec(SB), R2 + MOVD R2, 0(R1) + + // MOVD $·subVV_vec(SB), 0(R1) + BR ·subVV_vec(SB) GLOBL subvectorfacility+0x00(SB), NOPTR, $8 DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) @@ -313,181 +322,182 @@ DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 // func subVV(z, x, y []Word) (c Word) // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names) -TEXT ·subVV_vec(SB),NOSPLIT,$0 - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD z+0(FP), R2 - MOVD $0, R4 // c = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 +TEXT ·subVV_vec(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v1 // if n < 0 goto v1 - SUB $12, R3 // n -= 16 - BLT A1 // if n < 0 goto A1 + SUB $4, R3 // n -= 4 + BLT v1 // if n < 0 goto v1 + SUB $12, R3 // n -= 16 + BLT A1 // if n < 0 goto A1 - MOVD R8, R5 - MOVD R9, R6 - MOVD R2, R7 + MOVD R8, R5 + MOVD R9, R6 + MOVD R2, R7 // n >= 0 // regular loop body unrolled 16x - VZERO V0 // cf = 0 - MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow) - VLVGG $1, R4, V0 //put carry into V0 - -UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V8 - ADD $64, R5 - VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order - VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order - - - VLM 0(R6), V9, V12 // 64-bytes into V9..V16 - ADD $64, R6 - VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order - VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order - - VSBCBIQ V1, V9, V0, V25 - VSBIQ V1, V9, V0, V17 - VSBCBIQ V2, V10, V25, V26 - VSBIQ V2, V10, V25, V18 - - - VLM 0(R5), V5, V6 // 32-bytes into V1..V8 - VLM 0(R6), V13, V14 // 32-bytes into V9..V16 - ADD $32, R5 - ADD $32, R6 - - VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order - VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order - VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order - VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order - - VSBCBIQ V3, V11, V26, V27 - VSBIQ V3, V11, V26, V19 - VSBCBIQ V4, 
V12, V27, V28 - VSBIQ V4, V12, V27, V20 - - VLM 0(R5), V7, V8 // 32-bytes into V1..V8 - VLM 0(R6), V15, V16 // 32-bytes into V9..V16 - ADD $32, R5 - ADD $32, R6 - - VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order - VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order - VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order - VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order - - VSBCBIQ V5, V13, V28, V29 - VSBIQ V5, V13, V28, V21 - VSBCBIQ V6, V14, V29, V30 - VSBIQ V6, V14, V29, V22 - - VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order - VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order - VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order - VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order - - VSBCBIQ V7, V15, V30, V31 - VSBIQ V7, V15, V30, V23 - VSBCBIQ V8, V16, V31, V0 //V0 has carry-over - VSBIQ V8, V16, V31, V24 - - VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order - VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order - VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order - VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order - VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order - VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order - VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order - VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order - VSTM V17, V24, 0(R7) // 128-bytes into z - ADD $128, R7 - ADD $128, R10 // i += 16 - SUB $16, R3 // n -= 16 - BGE UU1 // if n >= 0 goto U1 - VLGVG $1, V0, R4 // put cf into R4 - SUB $1, R4 // save cf - -A1: ADD $12, R3 // n += 16 - BLT v1 // if n < 0 goto v1 - -U1: // n >= 0 + VZERO V0 // cf = 0 + MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow) + VLVGG $1, R4, V0 // put carry into V0 + +UU1: + VLM 0(R5), V1, V4 // 64-bytes into V1..V8 + ADD $64, R5 + VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order + VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order + + VLM 0(R6), V9, V12 // 64-bytes into V9..V16 + ADD $64, R6 + VPDI $0x4, V9, V9, V9 // flip the doublewords to big-endian order + VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order + + VSBCBIQ V1, V9, V0, V25 + VSBIQ V1, V9, V0, V17 + VSBCBIQ V2, V10, V25, V26 + VSBIQ V2, V10, V25, V18 + + VLM 0(R5), V5, V6 // 32-bytes into V1..V8 + VLM 0(R6), V13, V14 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order + VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order + VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order + VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order + + VSBCBIQ V3, V11, V26, V27 + VSBIQ V3, V11, V26, V19 + VSBCBIQ V4, V12, V27, V28 + VSBIQ V4, V12, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V1..V8 + VLM 0(R6), V15, V16 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order + VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order + VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order + VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order + + VSBCBIQ V5, V13, V28, V29 + VSBIQ V5, V13, V28, V21 + VSBCBIQ V6, V14, V29, V30 + VSBIQ V6, V14, V29, V22 + + VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order + VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order 
+ VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order + VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order + + VSBCBIQ V7, V15, V30, V31 + VSBIQ V7, V15, V30, V23 + VSBCBIQ V8, V16, V31, V0 // V0 has carry-over + VSBIQ V8, V16, V31, V24 + + VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order + VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order + VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order + VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order + VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order + VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order + VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order + VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 + SUB $1, R4 // save cf + +A1: + ADD $12, R3 // n += 16 + BLT v1 // if n < 0 goto v1 + +U1: // n >= 0 // regular loop body unrolled 4x - MOVD 0(R8)(R10*1), R5 - MOVD 8(R8)(R10*1), R6 - MOVD 16(R8)(R10*1), R7 - MOVD 24(R8)(R10*1), R1 - MOVD R0, R11 - SUBC R4, R11 // restore CF - MOVD 0(R9)(R10*1), R11 - SUBE R11, R5 - MOVD 8(R9)(R10*1), R11 - SUBE R11, R6 - MOVD 16(R9)(R10*1), R11 - SUBE R11, R7 - MOVD 24(R9)(R10*1), R11 - SUBE R11, R1 - MOVD R0, R4 - SUBE R4, R4 // save CF - MOVD R5, 0(R2)(R10*1) - MOVD R6, 8(R2)(R10*1) - MOVD R7, 16(R2)(R10*1) - MOVD R1, 24(R2)(R10*1) - - ADD $32, R10 // i += 4 - SUB $4, R3 // n -= 4 - BGE U1 // if n >= 0 goto U1n - -v1: ADD $4, R3 // n += 4 - BLE E1 // if n <= 0 goto E1 - -L1: // n > 0 - MOVD R0, R11 - SUBC R4, R11 // restore CF - MOVD 0(R8)(R10*1), R5 - MOVD 0(R9)(R10*1), R11 - SUBE R11, R5 - MOVD R5, 0(R2)(R10*1) - MOVD R0, R4 - SUBE R4, R4 // save CF - - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L1 // if n > 0 goto L1n - -E1: NEG R4, R4 - MOVD R4, c+72(FP) // return c - RET + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + MOVD R0, R11 + SUBC R4, R11 // restore CF + MOVD 0(R9)(R10*1), R11 + SUBE R11, R5 + MOVD 8(R9)(R10*1), R11 + SUBE R11, R6 + MOVD 16(R9)(R10*1), R11 + SUBE R11, R7 + MOVD 24(R9)(R10*1), R11 + SUBE R11, R1 + MOVD R0, R4 + SUBE R4, R4 // save CF + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1 // if n >= 0 goto U1n +v1: + ADD $4, R3 // n += 4 + BLE E1 // if n <= 0 goto E1 + +L1: // n > 0 + MOVD R0, R11 + SUBC R4, R11 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + SUBE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + SUBE R4, R4 // save CF + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1 // if n > 0 goto L1n + +E1: + NEG R4, R4 + MOVD R4, c+72(FP) // return c + RET // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 // func subVV(z, x, y []Word) (c Word) // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names) -TEXT ·subVV_novec(SB),NOSPLIT,$0 +TEXT ·subVV_novec(SB), NOSPLIT, $0 MOVD z_len+8(FP), R3 MOVD x+24(FP), R8 MOVD y+48(FP), R9 MOVD z+0(FP), R2 - MOVD $0, R4 // c = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 // s/JL/JMP/ below to disable the unrolled loop - SUB 
$4, R3 // n -= 4 - BLT v1 // if n < 0 goto v1 + SUB $4, R3 // n -= 4 + BLT v1 // if n < 0 goto v1 -U1: // n >= 0 +U1: // n >= 0 // regular loop body unrolled 4x MOVD 0(R8)(R10*1), R5 MOVD 8(R8)(R10*1), R6 MOVD 16(R8)(R10*1), R7 MOVD 24(R8)(R10*1), R1 MOVD R0, R11 - SUBC R4, R11 // restore CF + SUBC R4, R11 // restore CF MOVD 0(R9)(R10*1), R11 SUBE R11, R5 MOVD 8(R9)(R10*1), R11 @@ -497,152 +507,154 @@ U1: // n >= 0 MOVD 24(R9)(R10*1), R11 SUBE R11, R1 MOVD R0, R4 - SUBE R4, R4 // save CF + SUBE R4, R4 // save CF MOVD R5, 0(R2)(R10*1) MOVD R6, 8(R2)(R10*1) MOVD R7, 16(R2)(R10*1) MOVD R1, 24(R2)(R10*1) + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1 // if n >= 0 goto U1 - ADD $32, R10 // i += 4 - SUB $4, R3 // n -= 4 - BGE U1 // if n >= 0 goto U1 - -v1: ADD $4, R3 // n += 4 - BLE E1 // if n <= 0 goto E1 +v1: + ADD $4, R3 // n += 4 + BLE E1 // if n <= 0 goto E1 -L1: // n > 0 +L1: // n > 0 MOVD R0, R11 - SUBC R4, R11 // restore CF + SUBC R4, R11 // restore CF MOVD 0(R8)(R10*1), R5 MOVD 0(R9)(R10*1), R11 SUBE R11, R5 MOVD R5, 0(R2)(R10*1) MOVD R0, R4 - SUBE R4, R4 // save CF + SUBE R4, R4 // save CF - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L1 // if n > 0 goto L1 + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1 // if n > 0 goto L1 -E1: NEG R4, R4 - MOVD R4, c+72(FP) // return c +E1: + NEG R4, R4 + MOVD R4, c+72(FP) // return c RET -TEXT ·addVW(SB),NOSPLIT,$0 - MOVD addwvectorfacility+0x00(SB),R1 - BR (R1) - -TEXT ·addVW_check(SB),NOSPLIT,$0 - MOVB ·hasVX(SB), R1 - CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported - MOVD $addwvectorfacility+0x00(SB), R1 - MOVD $·addVW_novec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·addVW_novec(SB), 0(R1) - BR ·addVW_novec(SB) +TEXT ·addVW(SB), NOSPLIT, $0 + MOVD addwvectorfacility+0x00(SB), R1 + BR (R1) + +TEXT ·addVW_check(SB), NOSPLIT, $0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $addwvectorfacility+0x00(SB), R1 + MOVD $·addVW_novec(SB), R2 + MOVD R2, 0(R1) + + // MOVD $·addVW_novec(SB), 0(R1) + BR ·addVW_novec(SB) + vectorimpl: - MOVD $addwvectorfacility+0x00(SB), R1 - MOVD $·addVW_vec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·addVW_vec(SB), 0(R1) - BR ·addVW_vec(SB) + MOVD $addwvectorfacility+0x00(SB), R1 + MOVD $·addVW_vec(SB), R2 + MOVD R2, 0(R1) + + // MOVD $·addVW_vec(SB), 0(R1) + BR ·addVW_vec(SB) GLOBL addwvectorfacility+0x00(SB), NOPTR, $8 DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB) - // func addVW_vec(z, x []Word, y Word) (c Word) -TEXT ·addVW_vec(SB),NOSPLIT,$0 - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R4 // c = y - MOVD z+0(FP), R2 +TEXT ·addVW_vec(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R4 // c = y + MOVD z+0(FP), R2 - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 - MOVD R8, R5 - MOVD R2, R7 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + MOVD R8, R5 + MOVD R2, R7 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v10 // if n < 0 goto v10 - SUB $12, R3 - BLT A10 + SUB $4, R3 // n -= 4 + BLT v10 // if n < 0 goto v10 + SUB $12, R3 + BLT A10 // n >= 0 // regular loop body unrolled 16x - VZERO V0 // prepare V0 to be final carry register - VZERO V9 // to ensure upper half is zero - VLVGG $1, R4, V9 -UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4 - ADD $64, R5 - VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order - VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order - - - VACCCQ V1, V9, V0, V25 - VACQ V1, V9, V0, V17 - VZERO V9 - VACCCQ V2, 
V9, V25, V26 - VACQ V2, V9, V25, V18 - - - VLM 0(R5), V5, V6 // 32-bytes into V5..V6 - ADD $32, R5 - - VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order - VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order - - VACCCQ V3, V9, V26, V27 - VACQ V3, V9, V26, V19 - VACCCQ V4, V9, V27, V28 - VACQ V4, V9, V27, V20 - - VLM 0(R5), V7, V8 // 32-bytes into V7..V8 - ADD $32, R5 - - VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order - VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order - - VACCCQ V5, V9, V28, V29 - VACQ V5, V9, V28, V21 - VACCCQ V6, V9, V29, V30 - VACQ V6, V9, V29, V22 - - VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order - VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order - - VACCCQ V7, V9, V30, V31 - VACQ V7, V9, V30, V23 - VACCCQ V8, V9, V31, V0 //V0 has carry-over - VACQ V8, V9, V31, V24 - - VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order - VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order - VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order - VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order - VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order - VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order - VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order - VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order - VSTM V17, V24, 0(R7) // 128-bytes into z - ADD $128, R7 - ADD $128, R10 // i += 16 - SUB $16, R3 // n -= 16 - BGE UU1 // if n >= 0 goto U1 - VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10 - -A10: ADD $12, R3 // n += 16 - + VZERO V0 // prepare V0 to be final carry register + VZERO V9 // to ensure upper half is zero + VLVGG $1, R4, V9 + +UU1: + VLM 0(R5), V1, V4 // 64-bytes into V1..V4 + ADD $64, R5 + VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order + VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order + + VACCCQ V1, V9, V0, V25 + VACQ V1, V9, V0, V17 + VZERO V9 + VACCCQ V2, V9, V25, V26 + VACQ V2, V9, V25, V18 + + VLM 0(R5), V5, V6 // 32-bytes into V5..V6 + ADD $32, R5 + + VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order + VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order + + VACCCQ V3, V9, V26, V27 + VACQ V3, V9, V26, V19 + VACCCQ V4, V9, V27, V28 + VACQ V4, V9, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V7..V8 + ADD $32, R5 + + VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order + VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order + + VACCCQ V5, V9, V28, V29 + VACQ V5, V9, V28, V21 + VACCCQ V6, V9, V29, V30 + VACQ V6, V9, V29, V22 + + VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order + VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order + + VACCCQ V7, V9, V30, V31 + VACQ V7, V9, V30, V23 + VACCCQ V8, V9, V31, V0 // V0 has carry-over + VACQ V8, V9, V31, V24 + + VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order + VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order + VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order + VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order + VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order + VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order + VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order + VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes 
into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10 + +A10: + ADD $12, R3 // n += 16 // s/JL/JMP/ below to disable the unrolled loop - BLT v10 // if n < 0 goto v10 - + BLT v10 // if n < 0 goto v10 -U4: // n >= 0 +U4: // n >= 0 // regular loop body unrolled 4x MOVD 0(R8)(R10*1), R5 MOVD 8(R8)(R10*1), R6 @@ -653,52 +665,52 @@ U4: // n >= 0 ADDE R0, R7 ADDE R0, R1 ADDE R0, R0 - MOVD R0, R4 // save CF + MOVD R0, R4 // save CF SUB R0, R0 MOVD R5, 0(R2)(R10*1) MOVD R6, 8(R2)(R10*1) MOVD R7, 16(R2)(R10*1) MOVD R1, 24(R2)(R10*1) - ADD $32, R10 // i += 4 -> i +=32 - SUB $4, R3 // n -= 4 - BGE U4 // if n >= 0 goto U4 - -v10: ADD $4, R3 // n += 4 - BLE E10 // if n <= 0 goto E4 + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 +v10: + ADD $4, R3 // n += 4 + BLE E10 // if n <= 0 goto E4 -L4: // n > 0 - MOVD 0(R8)(R10*1), R5 - ADDC R4, R5 - ADDE R0, R0 - MOVD R0, R4 // save CF - SUB R0, R0 - MOVD R5, 0(R2)(R10*1) +L4: // n > 0 + MOVD 0(R8)(R10*1), R5 + ADDC R4, R5 + ADDE R0, R0 + MOVD R0, R4 // save CF + SUB R0, R0 + MOVD R5, 0(R2)(R10*1) - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L4 // if n > 0 goto L4 + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 -E10: MOVD R4, c+56(FP) // return c +E10: + MOVD R4, c+56(FP) // return c RET - -TEXT ·addVW_novec(SB),NOSPLIT,$0 -//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) +TEXT ·addVW_novec(SB), NOSPLIT, $0 + // DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) MOVD z_len+8(FP), R3 MOVD x+24(FP), R8 - MOVD y+48(FP), R4 // c = y + MOVD y+48(FP), R4 // c = y MOVD z+0(FP), R2 - MOVD $0, R0 // make sure it's 0 - MOVD $0, R10 // i = 0 + MOVD $0, R0 // make sure it's 0 + MOVD $0, R10 // i = 0 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v4 // if n < 4 goto v4 + SUB $4, R3 // n -= 4 + BLT v4 // if n < 4 goto v4 -U4: // n >= 0 +U4: // n >= 0 // regular loop body unrolled 4x MOVD 0(R8)(R10*1), R5 MOVD 8(R8)(R10*1), R6 @@ -709,531 +721,566 @@ U4: // n >= 0 ADDE R0, R7 ADDE R0, R1 ADDE R0, R0 - MOVD R0, R4 // save CF + MOVD R0, R4 // save CF SUB R0, R0 MOVD R5, 0(R2)(R10*1) MOVD R6, 8(R2)(R10*1) MOVD R7, 16(R2)(R10*1) MOVD R1, 24(R2)(R10*1) - ADD $32, R10 // i += 4 -> i +=32 - SUB $4, R3 // n -= 4 - BGE U4 // if n >= 0 goto U4 + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 -v4: ADD $4, R3 // n += 4 - BLE E4 // if n <= 0 goto E4 +v4: + ADD $4, R3 // n += 4 + BLE E4 // if n <= 0 goto E4 -L4: // n > 0 +L4: // n > 0 MOVD 0(R8)(R10*1), R5 ADDC R4, R5 ADDE R0, R0 - MOVD R0, R4 // save CF + MOVD R0, R4 // save CF SUB R0, R0 MOVD R5, 0(R2)(R10*1) - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L4 // if n > 0 goto L4 + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 -E4: MOVD R4, c+56(FP) // return c +E4: + MOVD R4, c+56(FP) // return c RET -TEXT ·subVW(SB),NOSPLIT,$0 - MOVD subwvectorfacility+0x00(SB),R1 - BR (R1) - -TEXT ·subVW_check(SB),NOSPLIT,$0 - MOVB ·hasVX(SB), R1 - CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported - MOVD $subwvectorfacility+0x00(SB), R1 - MOVD $·subVW_novec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·subVW_novec(SB), 0(R1) - BR ·subVW_novec(SB) +TEXT ·subVW(SB), NOSPLIT, $0 + MOVD subwvectorfacility+0x00(SB), R1 + BR (R1) + +TEXT ·subVW_check(SB), NOSPLIT, $0 + MOVB ·hasVX(SB), R1 
+ CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $subwvectorfacility+0x00(SB), R1 + MOVD $·subVW_novec(SB), R2 + MOVD R2, 0(R1) + + // MOVD $·subVW_novec(SB), 0(R1) + BR ·subVW_novec(SB) + vectorimpl: - MOVD $subwvectorfacility+0x00(SB), R1 - MOVD $·subVW_vec(SB), R2 - MOVD R2, 0(R1) - //MOVD $·subVW_vec(SB), 0(R1) - BR ·subVW_vec(SB) + MOVD $subwvectorfacility+0x00(SB), R1 + MOVD $·subVW_vec(SB), R2 + MOVD R2, 0(R1) + + // MOVD $·subVW_vec(SB), 0(R1) + BR ·subVW_vec(SB) GLOBL subwvectorfacility+0x00(SB), NOPTR, $8 DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB) // func subVW(z, x []Word, y Word) (c Word) -TEXT ·subVW_vec(SB),NOSPLIT,$0 - MOVD z_len+8(FP), R3 - MOVD x+24(FP), R8 - MOVD y+48(FP), R4 // c = y - MOVD z+0(FP), R2 +TEXT ·subVW_vec(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R4 // c = y + MOVD z+0(FP), R2 - MOVD $0, R0 // make sure it's zero - MOVD $0, R10 // i = 0 - MOVD R8, R5 - MOVD R2, R7 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + MOVD R8, R5 + MOVD R2, R7 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v11 // if n < 0 goto v11 - SUB $12, R3 - BLT A11 + SUB $4, R3 // n -= 4 + BLT v11 // if n < 0 goto v11 + SUB $12, R3 + BLT A11 - VZERO V0 - MOVD $1, R6 // prepare V0 to be final carry register - VLVGG $1, R6, V0 // borrow is initially "no borrow" - VZERO V9 // to ensure upper half is zero - VLVGG $1, R4, V9 + VZERO V0 + MOVD $1, R6 // prepare V0 to be final carry register + VLVGG $1, R6, V0 // borrow is initially "no borrow" + VZERO V9 // to ensure upper half is zero + VLVGG $1, R4, V9 // n >= 0 // regular loop body unrolled 16x - -UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4 - ADD $64, R5 - VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order - VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order - - - VSBCBIQ V1, V9, V0, V25 - VSBIQ V1, V9, V0, V17 - VZERO V9 - VSBCBIQ V2, V9, V25, V26 - VSBIQ V2, V9, V25, V18 - - VLM 0(R5), V5, V6 // 32-bytes into V5..V6 - ADD $32, R5 - - VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order - VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order - - - VSBCBIQ V3, V9, V26, V27 - VSBIQ V3, V9, V26, V19 - VSBCBIQ V4, V9, V27, V28 - VSBIQ V4, V9, V27, V20 - - VLM 0(R5), V7, V8 // 32-bytes into V7..V8 - ADD $32, R5 - - VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order - VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order - - VSBCBIQ V5, V9, V28, V29 - VSBIQ V5, V9, V28, V21 - VSBCBIQ V6, V9, V29, V30 - VSBIQ V6, V9, V29, V22 - - VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order - VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order - - VSBCBIQ V7, V9, V30, V31 - VSBIQ V7, V9, V30, V23 - VSBCBIQ V8, V9, V31, V0 // V0 has carry-over - VSBIQ V8, V9, V31, V24 - - VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order - VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order - VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order - VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order - VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order - VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order - VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order - VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order - VSTM V17, V24, 0(R7) // 128-bytes into z - ADD $128, R7 - ADD $128, R10 // i += 16 - SUB $16, R3 // n -= 16 - BGE UU1 // if n >= 0 goto U1 - VLGVG $1, V0, 
R4 // put cf into R4 in case we branch to v10 - SUB $1, R4 // save cf - NEG R4, R4 -A11: ADD $12, R3 // n += 16 - - BLT v11 // if n < 0 goto v11 +UU1: + VLM 0(R5), V1, V4 // 64-bytes into V1..V4 + ADD $64, R5 + VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order + VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order + + VSBCBIQ V1, V9, V0, V25 + VSBIQ V1, V9, V0, V17 + VZERO V9 + VSBCBIQ V2, V9, V25, V26 + VSBIQ V2, V9, V25, V18 + + VLM 0(R5), V5, V6 // 32-bytes into V5..V6 + ADD $32, R5 + + VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order + VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order + + VSBCBIQ V3, V9, V26, V27 + VSBIQ V3, V9, V26, V19 + VSBCBIQ V4, V9, V27, V28 + VSBIQ V4, V9, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V7..V8 + ADD $32, R5 + + VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order + VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order + + VSBCBIQ V5, V9, V28, V29 + VSBIQ V5, V9, V28, V21 + VSBCBIQ V6, V9, V29, V30 + VSBIQ V6, V9, V29, V22 + + VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order + VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order + + VSBCBIQ V7, V9, V30, V31 + VSBIQ V7, V9, V30, V23 + VSBCBIQ V8, V9, V31, V0 // V0 has carry-over + VSBIQ V8, V9, V31, V24 + + VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order + VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order + VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order + VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order + VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order + VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order + VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order + VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10 + SUB $1, R4 // save cf + NEG R4, R4 + +A11: + ADD $12, R3 // n += 16 + + BLT v11 // if n < 0 goto v11 // n >= 0 // regular loop body unrolled 4x -U4: // n >= 0 +U4: // n >= 0 // regular loop body unrolled 4x MOVD 0(R8)(R10*1), R5 MOVD 8(R8)(R10*1), R6 MOVD 16(R8)(R10*1), R7 MOVD 24(R8)(R10*1), R1 - SUBC R4, R5 //SLGR -> SUBC - SUBE R0, R6 //SLBGR -> SUBE + SUBC R4, R5 // SLGR -> SUBC + SUBE R0, R6 // SLBGR -> SUBE SUBE R0, R7 SUBE R0, R1 - SUBE R4, R4 // save CF + SUBE R4, R4 // save CF NEG R4, R4 MOVD R5, 0(R2)(R10*1) MOVD R6, 8(R2)(R10*1) MOVD R7, 16(R2)(R10*1) MOVD R1, 24(R2)(R10*1) - ADD $32, R10 // i += 4 -> i +=32 - SUB $4, R3 // n -= 4 - BGE U4 // if n >= 0 goto U4 + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 -v11: ADD $4, R3 // n += 4 - BLE E11 // if n <= 0 goto E4 +v11: + ADD $4, R3 // n += 4 + BLE E11 // if n <= 0 goto E4 -L4: // n > 0 +L4: // n > 0 - MOVD 0(R8)(R10*1), R5 - SUBC R4, R5 - SUBE R4, R4 // save CF - NEG R4, R4 - MOVD R5, 0(R2)(R10*1) + MOVD 0(R8)(R10*1), R5 + SUBC R4, R5 + SUBE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L4 // if n > 0 goto L4 + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 -E11: MOVD R4, c+56(FP) // return c +E11: + MOVD R4, c+56(FP) // return c RET -//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) +// DI = 
R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) // func subVW(z, x []Word, y Word) (c Word) // (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names) -TEXT ·subVW_novec(SB),NOSPLIT,$0 +TEXT ·subVW_novec(SB), NOSPLIT, $0 MOVD z_len+8(FP), R3 MOVD x+24(FP), R8 - MOVD y+48(FP), R4 // c = y + MOVD y+48(FP), R4 // c = y MOVD z+0(FP), R2 - MOVD $0, R0 // make sure it's 0 - MOVD $0, R10 // i = 0 + MOVD $0, R0 // make sure it's 0 + MOVD $0, R10 // i = 0 // s/JL/JMP/ below to disable the unrolled loop - SUB $4, R3 // n -= 4 - BLT v4 // if n < 4 goto v4 + SUB $4, R3 // n -= 4 + BLT v4 // if n < 4 goto v4 -U4: // n >= 0 +U4: // n >= 0 // regular loop body unrolled 4x MOVD 0(R8)(R10*1), R5 MOVD 8(R8)(R10*1), R6 MOVD 16(R8)(R10*1), R7 MOVD 24(R8)(R10*1), R1 - SUBC R4, R5 //SLGR -> SUBC - SUBE R0, R6 //SLBGR -> SUBE + SUBC R4, R5 // SLGR -> SUBC + SUBE R0, R6 // SLBGR -> SUBE SUBE R0, R7 SUBE R0, R1 - SUBE R4, R4 // save CF + SUBE R4, R4 // save CF NEG R4, R4 MOVD R5, 0(R2)(R10*1) MOVD R6, 8(R2)(R10*1) MOVD R7, 16(R2)(R10*1) MOVD R1, 24(R2)(R10*1) - ADD $32, R10 // i += 4 -> i +=32 - SUB $4, R3 // n -= 4 - BGE U4 // if n >= 0 goto U4 + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 -v4: ADD $4, R3 // n += 4 - BLE E4 // if n <= 0 goto E4 +v4: + ADD $4, R3 // n += 4 + BLE E4 // if n <= 0 goto E4 -L4: // n > 0 +L4: // n > 0 MOVD 0(R8)(R10*1), R5 SUBC R4, R5 - SUBE R4, R4 // save CF + SUBE R4, R4 // save CF NEG R4, R4 MOVD R5, 0(R2)(R10*1) - ADD $8, R10 // i++ - SUB $1, R3 // n-- - BGT L4 // if n > 0 goto L4 + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 -E4: MOVD R4, c+56(FP) // return c +E4: + MOVD R4, c+56(FP) // return c RET // func shlVU(z, x []Word, s uint) (c Word) -TEXT ·shlVU(SB),NOSPLIT,$0 - MOVD z_len+8(FP), R5 - MOVD $0, R0 - SUB $1, R5 // n-- - BLT X8b // n < 0 (n <= 0) +TEXT ·shlVU(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R5 + MOVD $0, R0 + SUB $1, R5 // n-- + BLT X8b // n < 0 (n <= 0) // n > 0 - MOVD s+48(FP), R4 - CMPBEQ R0, R4, Z80 //handle 0 case beq - MOVD $64, R6 - CMPBEQ R6, R4, Z864 //handle 64 case beq - MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - SLD $3, R5 // n = n*8 - SUB R4, R6, R7 - MOVD (R8)(R5*1), R10 // w1 = x[i-1] - SRD R7, R10, R3 - MOVD R3, c+56(FP) - - MOVD $0, R1 // i = 0 - BR E8 + MOVD s+48(FP), R4 + CMPBEQ R0, R4, Z80 // handle 0 case beq + MOVD $64, R6 + CMPBEQ R6, R4, Z864 // handle 64 case beq + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + SUB R4, R6, R7 + MOVD (R8)(R5*1), R10 // w1 = x[i-1] + SRD R7, R10, R3 + MOVD R3, c+56(FP) + + MOVD $0, R1 // i = 0 + BR E8 // i < n-1 -L8: MOVD R10, R3 // w = w1 - MOVD -8(R8)(R5*1), R10 // w1 = x[i+1] +L8: + MOVD R10, R3 // w = w1 + MOVD -8(R8)(R5*1), R10 // w1 = x[i+1] - SLD R4, R3 // w<>ŝ - SRD R7, R10, R6 - OR R6, R3 - MOVD R3, (R2)(R5*1) // z[i] = w<>ŝ - SUB $8, R5 // i-- + SLD R4, R3 // w<>ŝ + SRD R7, R10, R6 + OR R6, R3 + MOVD R3, (R2)(R5*1) // z[i] = w<>ŝ + SUB $8, R5 // i-- -E8: CMPBGT R5, R0, L8 // i < n-1 +E8: + CMPBGT R5, R0, L8 // i < n-1 // i >= n-1 -X8a: SLD R4, R10 // w1<= n-1 - MOVD R10, (R2)(R5*1) + MOVD R10, (R2)(R5*1) RET -Z864: MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - SLD $3, R5 // n = n*8 - MOVD (R8)(R5*1), R3 // w1 = x[n-1] - MOVD R3, c+56(FP) // z[i] = x[n-1] +Z864: + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + MOVD (R8)(R5*1), R3 // w1 = x[n-1] + MOVD R3, c+56(FP) // z[i] = x[n-1] - BR E864 + BR E864 // i < n-1 -L864: MOVD -8(R8)(R5*1), R3 +L864: + MOVD 
-8(R8)(R5*1), R3 - MOVD R3, (R2)(R5*1) // z[i] = x[n-1] - SUB $8, R5 // i-- + MOVD R3, (R2)(R5*1) // z[i] = x[n-1] + SUB $8, R5 // i-- -E864: CMPBGT R5, R0, L864 // i < n-1 +E864: + CMPBGT R5, R0, L864 // i < n-1 - MOVD R0, (R2) // z[n-1] = 0 + MOVD R0, (R2) // z[n-1] = 0 RET - // CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6 // func shrVU(z, x []Word, s uint) (c Word) -TEXT ·shrVU(SB),NOSPLIT,$0 - MOVD z_len+8(FP), R5 - MOVD $0, R0 - SUB $1, R5 // n-- - BLT X9b // n < 0 (n <= 0) +TEXT ·shrVU(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R5 + MOVD $0, R0 + SUB $1, R5 // n-- + BLT X9b // n < 0 (n <= 0) // n > 0 - MOVD s+48(FP), R4 - CMPBEQ R0, R4, ZB0 //handle 0 case beq - MOVD $64, R6 - CMPBEQ R6, R4, ZB64 //handle 64 case beq - MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - SLD $3, R5 // n = n*8 - SUB R4, R6, R7 - MOVD (R8), R10 // w1 = x[0] - SLD R7, R10, R3 - MOVD R3, c+56(FP) - - MOVD $0, R1 // i = 0 - BR E9 + MOVD s+48(FP), R4 + CMPBEQ R0, R4, ZB0 // handle 0 case beq + MOVD $64, R6 + CMPBEQ R6, R4, ZB64 // handle 64 case beq + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + SUB R4, R6, R7 + MOVD (R8), R10 // w1 = x[0] + SLD R7, R10, R3 + MOVD R3, c+56(FP) + + MOVD $0, R1 // i = 0 + BR E9 // i < n-1 -L9: MOVD R10, R3 // w = w1 - MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] +L9: + MOVD R10, R3 // w = w1 + MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] - SRD R4, R3 // w>>s | w1<>s | w1<>s | w1<>s | w1<= n-1 -X9a: SRD R4, R10 // w1>>s - MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s +X9a: + SRD R4, R10 // w1>>s + MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s RET -X9b: MOVD R0, c+56(FP) +X9b: + MOVD R0, c+56(FP) RET -ZB0: MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - SLD $3, R5 // n = n*8 +ZB0: + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 - MOVD (R8), R10 // w1 = x[0] - MOVD $0, R3 // R10 << 64 - MOVD R3, c+56(FP) + MOVD (R8), R10 // w1 = x[0] + MOVD $0, R3 // R10 << 64 + MOVD R3, c+56(FP) - MOVD $0, R1 // i = 0 - BR E9Z + MOVD $0, R1 // i = 0 + BR E9Z // i < n-1 -L9Z: MOVD R10, R3 // w = w1 - MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] +L9Z: + MOVD R10, R3 // w = w1 + MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] - MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<>s | w1<= n-1 - MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s + MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s RET -ZB64: MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - SLD $3, R5 // n = n*8 - MOVD (R8), R3 // w1 = x[0] - MOVD R3, c+56(FP) +ZB64: + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + MOVD (R8), R3 // w1 = x[0] + MOVD R3, c+56(FP) - MOVD $0, R1 // i = 0 - BR E964 + MOVD $0, R1 // i = 0 + BR E964 // i < n-1 -L964: MOVD 8(R8)(R1*1), R3 // w1 = x[i+1] +L964: + MOVD 8(R8)(R1*1), R3 // w1 = x[i+1] - MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<>s | w1<= n-1 - MOVD $0, R10 // w1>>s - MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s + MOVD $0, R10 // w1>>s + MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s RET // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i // func mulAddVWW(z, x []Word, y, r Word) (c Word) -TEXT ·mulAddVWW(SB),NOSPLIT,$0 - MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD r+56(FP), R4 // c = r - MOVD z_len+8(FP), R5 - MOVD $0, R1 // i = 0 - MOVD $0, R7 // i*8 = 0 - MOVD $0, R0 // make sure it's zero - BR E5 - -L5: MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - ADDC R4, R11 //add to low order bits - ADDE R0, R6 - MOVD R11, (R2)(R1*1) - MOVD R6, R4 - ADD $8, R1 // i*8 + 8 - ADD $1, R7 // i++ - -E5: CMPBLT R7, R5, L5 // i < n - - MOVD R4, c+64(FP) +TEXT 
·mulAddVWW(SB), NOSPLIT, $0 + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD r+56(FP), R4 // c = r + MOVD z_len+8(FP), R5 + MOVD $0, R1 // i = 0 + MOVD $0, R7 // i*8 = 0 + MOVD $0, R0 // make sure it's zero + BR E5 + +L5: + MOVD (R8)(R1*1), R6 + MULHDU R9, R6 + ADDC R4, R11 // add to low order bits + ADDE R0, R6 + MOVD R11, (R2)(R1*1) + MOVD R6, R4 + ADD $8, R1 // i*8 + 8 + ADD $1, R7 // i++ + +E5: + CMPBLT R7, R5, L5 // i < n + + MOVD R4, c+64(FP) RET // func addMulVVW(z, x []Word, y Word) (c Word) // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i -TEXT ·addMulVVW(SB),NOSPLIT,$0 - MOVD z+0(FP), R2 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD z_len+8(FP), R5 - - MOVD $0, R1 // i*8 = 0 - MOVD $0, R7 // i = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R4 // c = 0 - - MOVD R5, R12 - AND $-2, R12 - CMPBGE R5, $2, A6 - BR E6 - -A6: MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (R2)(R1*1), R10 - ADDC R10, R11 //add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (R2)(R1*1) - - MOVD (8)(R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (8)(R2)(R1*1), R10 - ADDC R10, R11 //add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (8)(R2)(R1*1) - - ADD $16, R1 // i*8 + 8 - ADD $2, R7 // i++ - - CMPBLT R7, R12, A6 - BR E6 - -L6: MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (R2)(R1*1), R10 - ADDC R10, R11 //add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (R2)(R1*1) - - ADD $8, R1 // i*8 + 8 - ADD $1, R7 // i++ - -E6: CMPBLT R7, R5, L6 // i < n - - MOVD R4, c+56(FP) +TEXT ·addMulVVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z_len+8(FP), R5 + + MOVD $0, R1 // i*8 = 0 + MOVD $0, R7 // i = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R4 // c = 0 + + MOVD R5, R12 + AND $-2, R12 + CMPBGE R5, $2, A6 + BR E6 + +A6: + MOVD (R8)(R1*1), R6 + MULHDU R9, R6 + MOVD (R2)(R1*1), R10 + ADDC R10, R11 // add to low order bits + ADDE R0, R6 + ADDC R4, R11 + ADDE R0, R6 + MOVD R6, R4 + MOVD R11, (R2)(R1*1) + + MOVD (8)(R8)(R1*1), R6 + MULHDU R9, R6 + MOVD (8)(R2)(R1*1), R10 + ADDC R10, R11 // add to low order bits + ADDE R0, R6 + ADDC R4, R11 + ADDE R0, R6 + MOVD R6, R4 + MOVD R11, (8)(R2)(R1*1) + + ADD $16, R1 // i*8 + 8 + ADD $2, R7 // i++ + + CMPBLT R7, R12, A6 + BR E6 + +L6: + MOVD (R8)(R1*1), R6 + MULHDU R9, R6 + MOVD (R2)(R1*1), R10 + ADDC R10, R11 // add to low order bits + ADDE R0, R6 + ADDC R4, R11 + ADDE R0, R6 + MOVD R6, R4 + MOVD R11, (R2)(R1*1) + + ADD $8, R1 // i*8 + 8 + ADD $1, R7 // i++ + +E6: + CMPBLT R7, R5, L6 // i < n + + MOVD R4, c+56(FP) RET // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i -TEXT ·divWVW(SB),NOSPLIT,$0 - MOVD z+0(FP), R2 - MOVD xn+24(FP), R10 // r = xn - MOVD x+32(FP), R8 - MOVD y+56(FP), R9 - MOVD z_len+8(FP), R7 // i = z - SLD $3, R7, R1 // i*8 - MOVD $0, R0 // make sure it's zero - BR E7 - -L7: MOVD (R8)(R1*1), R11 - WORD $0xB98700A9 //DLGR R10,R9 - MOVD R11, (R2)(R1*1) - -E7: SUB $1, R7 // i-- - SUB $8, R1 - BGE L7 // i >= 0 - - MOVD R10, r+64(FP) +TEXT ·divWVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R2 + MOVD xn+24(FP), R10 // r = xn + MOVD x+32(FP), R8 + MOVD y+56(FP), R9 + MOVD z_len+8(FP), R7 // i = z + SLD $3, R7, R1 // i*8 + MOVD $0, R0 // make sure it's zero + BR E7 + +L7: + MOVD (R8)(R1*1), R11 + WORD 
$0xB98700A9 // DLGR R10,R9 + MOVD R11, (R2)(R1*1) + +E7: + SUB $1, R7 // i-- + SUB $8, R1 + BGE L7 // i >= 0 + + MOVD R10, r+64(FP) RET -- 2.48.1
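Note for readers skimming the diff: the routines above are math/big's low-level word-vector arithmetic for s390x, and the ADDC/ADDE pairs in the unrolled loops carry a flag from one word to the next ("restore CF" / "save CF"). Below is a minimal pure-Go sketch of what ·addVV computes (z = x + y over word vectors, returning the final carry). The function name addVVSketch and the local Word type are illustrative only; the package's real generic fallback lives in arith.go, as the file header comment in the diff notes.

```go
// Illustrative sketch only, not math/big's implementation.
package main

import (
	"fmt"
	"math/bits"
)

// Word stands in for math/big's machine-word element type.
type Word uint

// addVVSketch computes z = x + y element-wise, propagating a carry
// from each word to the next, and returns the final carry. This
// mirrors the ADDC/ADDE carry chain that the assembly loops unroll.
func addVVSketch(z, x, y []Word) (c Word) {
	for i := range z {
		sum, carry := bits.Add(uint(x[i]), uint(y[i]), uint(c))
		z[i] = Word(sum)
		c = Word(carry)
	}
	return c
}

func main() {
	x := []Word{^Word(0), 1} // low word is all ones, so adding 1 carries
	y := []Word{1, 0}
	z := make([]Word, len(x))
	c := addVVSketch(z, x, y)
	fmt.Println(z, c) // [0 2] 0
}
```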