From: bill_ofarrell Date: Wed, 31 Oct 2018 02:45:51 +0000 (-0400) Subject: crypto/elliptic: utilize faster z14 multiply/square instructions (when available) X-Git-Tag: go1.12beta1~167 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=897e0807c30a7b1860c15d4c05d68907fbba9262;p=gostls13.git crypto/elliptic: utilize faster z14 multiply/square instructions (when available) In the s390x assembly implementation of the NIST P-256 curve, utilize faster multiply/square instructions introduced in the z14. These new instructions are designed for crypto and are constant time. The algorithm is unchanged except for faster multiplication when run on a z14 or later. On z13, the original multiplication (also constant time) is used. P-256 performance is critical in many applications, such as blockchain. name old time new time delta BaseMultP256 24396 ns/op 21564 ns/op 1.13x ScalarMultP256 87546 ns/op 72813 ns/op 1.20x Change-Id: I7e6d8b420fac56d5f9cc13c9423e2080df854bac Reviewed-on: https://go-review.googlesource.com/c/146022 Reviewed-by: Michael Munday Reviewed-by: Brad Fitzpatrick Run-TryBot: Michael Munday --- diff --git a/src/crypto/elliptic/p256_asm_s390x.s b/src/crypto/elliptic/p256_asm_s390x.s index 2219b858b3..c5b55a04c3 100644 --- a/src/crypto/elliptic/p256_asm_s390x.s +++ b/src/crypto/elliptic/p256_asm_s390x.s @@ -3,6 +3,8 @@ // license that can be found in the LICENSE file. #include "textflag.h" +#include "go_asm.h" + DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 @@ -44,28 +46,23 @@ GLOBL p256ord<>(SB), 8, $32 GLOBL p256<>(SB), 8, $80 GLOBL p256mul<>(SB), 8, $160 -// func hasVectorFacility() bool -TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 - MOVD $x-24(SP), R1 - XC $24, 0(R1), 0(R1) // clear the storage - MOVD $2, R0 // R0 is the number of double words stored -1 - WORD $0xB2B01000 // STFLE 0(R1) - XOR R0, R0 // reset the value of R0 - MOVBZ z-8(SP), R1 - AND $0x40, R1 - BEQ novector - -vectorinstalled: - // check if the vector instruction has been enabled - VLEIB $0, $0xF, V16 - VLGVB $0, V16, R1 - CMPBNE R1, $0xF, novector - MOVB $1, ret+0(FP) // have vx - RET - -novector: - MOVB $0, ret+0(FP) // no vx - RET +DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718 +DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f +DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718 +DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011 +DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f +DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718 +DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011 +DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718 +DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a +DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011 +DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011 +DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a +DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203 +DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a +DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a +DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203 +GLOBL p256vmsl<>(SB), 8, $128 // --------------------------------------- // iff cond == 1 val <- -val @@ -890,7 +887,7 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0 #undef K0 // --------------------------------------- -// p256MulInternal +// p256MulInternalVX // V0-V3,V30,V31 - Not Modified // V4-V15 - Volatile @@ -1033,7 +1030,7 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0 * * Last 'group' needs to RED2||RED1 shifted less */ -TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 +TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0 VL 32(CPOOL), SEL1 VL 48(CPOOL), SEL2 VL 64(CPOOL), SEL3 @@ -1278,6 +1275,443
@@ TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 #undef CAR1 #undef CAR2 +// --------------------------------------- +// p256MulInternalVMSL +// V0-V3,V30,V31 - Not Modified +// V4-V14 - Volatile + +#define CPOOL R4 +#define SCRATCH R9 + +// Parameters +#define X0 V0 // Not modified +#define X1 V1 // Not modified +#define Y0 V2 // Not modified +#define Y1 V3 // Not modified +#define T0 V4 +#define T1 V5 +#define T2 V6 +#define P0 V30 // Not modified +#define P1 V31 // Not modified + +// input: d0 +// output: h0, h1 +// temp: TEMP, ZERO, BORROW +#define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \ + VZERO ZERO \ + VSLDB $4, d0, ZERO, h0 \ + VLR h0, BORROW \ + VSLDB $12, ZERO, h0, TEMP \ + VSQ TEMP, h0, h0 \ + VSLDB $12, d0, BORROW, h1 \ + VSLDB $8, ZERO, BORROW, TEMP \ + VAQ TEMP, h0, h0 \ + +#define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \ + VZERO ZERO \ + VSLDB $8, d2, ZERO, TEMP \ + VSLDB $8, d2, TEMP, h0 \ + VSLDB $12, ZERO, TEMP, h1 \ + VSQ h1, h0, h0 \ + +TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 + VSTM V16, V19, (SCRATCH) + + MOVD $p256vmsl<>+0x00(SB), CPOOL + + // Divide input1 into 5 limbs + VGBM $0x007f, V14 + VZERO V12 + VSLDB $2, X1, X0, V13 + VSLDB $2, Y1, Y0, V8 + VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb + VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb + + VN V14, X0, V5 // V5: first 7 bytes limb + VN V14, Y0, V10 // V10: first 7 bytes limb + VN V14, V13, V13 // v13: third 7 bytes limb + VN V14, V8, V8 // V8: third 7 bytes limb + + VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1) + VMSLG V8, V5, V12, V8 // v8: l8 x l5 + VMSLG V6, V13, V12, V13 // v13: l6 x l3 + VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9) + VMSLG V6, V5, V12, V6 // v6: l6 x l5 + + MOVD $p256vmsl<>+0x00(SB), CPOOL + VGBM $0x7f7f, V14 + + VL 0(CPOOL), V4 + VL 16(CPOOL), V7 + VL 32(CPOOL), V9 + VL 48(CPOOL), V5 + VLM 64(CPOOL), V16, V19 + + VPERM V12, X0, V4, V4 // v4: limb4 | limb5 + VPERM Y1, Y0, V7, V7 + VPERM V12, Y0, V9, V9 // v9: limb10 | limb9 + VPERM X1, X0, V5, V5 + VPERM X1, X0, V16, V16 + VPERM Y1, Y0, V17, V17 + VPERM X1, V12, V18, V18 // v18: limb1 | limb2 + VPERM Y1, V12, V19, V19 // v19: limb7 | limb6 + VN V14, V7, V7 // v7: limb9 | limb8 + VN V14, V5, V5 // v5: limb3 | limb4 + VN V14, V16, V16 // v16: limb2 | limb3 + VN V14, V17, V17 // v17: limb8 | limb7 + + VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) + VMSLG V9, V5, V8, V8 // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3) + VMSLG V9, V16, V12, V16 // v16: l10 x l9 + l2 x l3 + VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2 + VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 + VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4) + VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2 + VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6) + VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8) + VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) + VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) + + VSLDB $9, V12, V10, V4 + VSLDB $9, V12, V7, V5 + VAQ V4, V14, V14 + VAQ V5, V13, V13 + + VSLDB $9, V12, V14, V4 + VSLDB $9, V12, V13, V5 + VAQ V4, V8, V8 + VAQ V5, V19, V19 + + VSLDB $9, V12, V8, V4 + VSLDB $9, V12, V19, V5 + VAQ V4, V16, V16 + VAQ V5, V11, V11 + + VSLDB $9, V12, V16, V4 + VAQ V4, V9, V17 + + VGBM $0x007f, V4 + VGBM $0x00ff, V5 + + VN V10, V4, V10 + VN V14, V4, V14 + VN V8, V4, V8 + VN V16, V4, V16 + VN V17, V4, V9 + VN V7, V4, V7 
+ VN V13, V4, V13 + VN V19, V4, V19 + VN V11, V5, V11 + + VSLDB $7, V14, V14, V14 + VSLDB $14, V8, V12, V4 + VSLDB $14, V12, V8, V8 + VSLDB $5, V16, V16, V16 + VSLDB $12, V9, V12, V5 + + VO V14, V10, V10 + VO V8, V16, V16 + VO V4, V10, V10 // first rightmost 128bits of the multiplication result + VO V5, V16, V16 // second rightmost 128bits of the multiplication result + + // adjust v7, v13, v19, v11 + VSLDB $7, V13, V13, V13 + VSLDB $14, V19, V12, V4 + VSLDB $14, V12, V19, V19 + VSLDB $5, V11, V12, V5 + VO V13, V7, V7 + VO V4, V7, V7 + VO V19, V5, V11 + + VSLDB $9, V12, V17, V14 + VSLDB $12, V12, V9, V9 + VACCQ V7, V14, V13 + VAQ V7, V14, V7 + VAQ V11, V13, V11 + + // First reduction, 96 bits + VSLDB $4, V16, V10, T0 + VSLDB $4, V12, V16, T1 + VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result + VSLDB $3, V7, V12, V7 + OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2 + VO V7, V9, V7 // third rightmost 128bits of the multiplication result + VACCQ T0, T2, V9 + VAQ T0, T2, T2 + VACQ T1, V8, V9, V8 + + // Second reduction 96 bits + VSLDB $4, V8, T2, T0 + VSLDB $4, V12, V8, T1 + OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8 + VACCQ T0, V8, T2 + VAQ T0, V8, V8 + VACQ T1, V9, T2, V9 + + // Third reduction 64 bits + VSLDB $8, V9, V8, T0 + VSLDB $8, V12, V9, T1 + OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 + VACCQ T0, V13, V12 + VAQ T0, V13, V13 + VACQ T1, V14, V12, V14 + VACCQ V13, V7, V12 + VAQ V13, V7, T0 + VACCCQ V14, V11, V12, T2 + VACQ V14, V11, V12, T1 // results T2 | T1 | T0 + + // --------------------------------------------------- + MOVD $p256mul<>+0x00(SB), CPOOL + + VZERO V12 + VSCBIQ P0, T0, V8 + VSQ P0, T0, V7 + VSBCBIQ T1, P1, V8, V10 + VSBIQ T1, P1, V8, V9 + VSBIQ T2, V12, V10, T2 + + // what output to use, V9||V7 or T1||T0? 
+ VSEL T0, V7, T2, T0 + VSEL T1, V9, T2, T1 + + VLM (SCRATCH), V16, V19 + + RET + +// --------------------------------------- +// p256SqrInternalVMSL +// V0-V1,V30,V31 - Not Modified +// V4-V14 - Volatile + +TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 + VSTM V16, V18, (SCRATCH) + + MOVD $p256vmsl<>+0x00(SB), CPOOL + // Divide input into limbs + VGBM $0x007f, V14 + VZERO V12 + VSLDB $2, X1, X0, V13 + VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb + + VN V14, X0, V10 // V10: first 7 bytes limb + VN V14, V13, V13 // v13: third 7 bytes limb + + VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1) + VMSLG V13, V13, V12, V13 // v13: l8 x l3 + VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9) + + MOVD $p256vmsl<>+0x00(SB), CPOOL + VGBM $0x7f7f, V14 + + VL 0(CPOOL), V4 + VL 16(CPOOL), V7 + VL 32(CPOOL), V9 + VL 48(CPOOL), V5 + VLM 64(CPOOL), V16, V18 + VL 112(CPOOL), V8 + + VPERM V12, X0, V4, V4 // v4: limb4 | limb5 + VPERM X1, X0, V7, V7 + VPERM V12, X0, V9, V9 // v9: limb10 | limb9 + VPERM X1, X0, V5, V5 + VPERM X1, X0, V16, V16 + VPERM X1, X0, V17, V17 + VPERM X1, V12, V18, V18 // v18: limb1 | limb2 + VPERM X1, V12, V8, V8 // v8: limb7 | limb6 + VN V14, V7, V7 // v7: limb9 | limb8 + VN V14, V5, V5 // v5: limb3 | limb4 + VN V14, V16, V16 // v16: limb2 | limb3 + VN V14, V17, V17 // v17: limb8 | limb7 + + VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) + VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) + VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4) + VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6) + VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) + VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8) + VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3) + + VSLDB $9, V12, V10, V4 + VSLDB $9, V12, V7, V5 + VAQ V4, V14, V14 + VAQ V5, V13, V13 + + VSLDB $9, V12, V14, V4 + VSLDB $9, V12, V13, V5 + VAQ V4, V18, V18 + VAQ V5, V8, V8 + + VSLDB $9, V12, V18, V4 + VSLDB $9, V12, V8, V5 + VAQ V4, V16, V16 + VAQ V5, V11, V11 + + VSLDB $9, V12, V16, V4 + VAQ V4, V6, V17 + + VGBM $0x007f, V4 + VGBM $0x00ff, V5 + + VN V10, V4, V10 + VN V14, V4, V14 + VN V18, V4, V18 + VN V16, V4, V16 + VN V17, V4, V9 + VN V7, V4, V7 + VN V13, V4, V13 + VN V8, V4, V8 + VN V11, V5, V11 + + VSLDB $7, V14, V14, V14 + VSLDB $14, V18, V12, V4 + VSLDB $14, V12, V18, V18 + VSLDB $5, V16, V16, V16 + VSLDB $12, V9, V12, V5 + + VO V14, V10, V10 + VO V18, V16, V16 + VO V4, V10, V10 // first rightmost 128bits of the multiplication result + VO V5, V16, V16 // second rightmost 128bits of the multiplication result + + // adjust v7, v13, v8, v11 + VSLDB $7, V13, V13, V13 + VSLDB $14, V8, V12, V4 + VSLDB $14, V12, V8, V8 + VSLDB $5, V11, V12, V5 + VO V13, V7, V7 + VO V4, V7, V7 + VO V8, V5, V11 + + VSLDB $9, V12, V17, V14 + VSLDB $12, V12, V9, V9 + VACCQ V7, V14, V13 + VAQ V7, V14, V7 + VAQ V11, V13, V11 + + // First reduction, 96 bits + VSLDB $4, V16, V10, T0 + VSLDB $4, V12, V16, T1 + VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result + VSLDB $3, V7, V12, V7 + OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2 + VO V7, V9, V7 // third rightmost 128bits of the multiplication result + VACCQ T0, T2, V9 + VAQ T0, T2, T2 + VACQ T1, V8, V9, V8 + + // Second reduction 96 bits + VSLDB $4, V8, T2, T0 + VSLDB $4, V12, V8, T1 + OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8 + VACCQ T0, V8, T2 + VAQ T0, V8, V8 + VACQ T1, V9, T2, V9 + + // 
Third reduction 64 bits + VSLDB $8, V9, V8, T0 + VSLDB $8, V12, V9, T1 + OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 + VACCQ T0, V13, V12 + VAQ T0, V13, V13 + VACQ T1, V14, V12, V14 + VACCQ V13, V7, V12 + VAQ V13, V7, T0 + VACCCQ V14, V11, V12, T2 + VACQ V14, V11, V12, T1 // results T2 | T1 | T0 + + // --------------------------------------------------- + MOVD $p256mul<>+0x00(SB), CPOOL + + VZERO V12 + VSCBIQ P0, T0, V8 + VSQ P0, T0, V7 + VSBCBIQ T1, P1, V8, V10 + VSBIQ T1, P1, V8, V9 + VSBIQ T2, V12, V10, T2 + + // what output to use, V9||V7 or T1||T0? + VSEL T0, V7, T2, T0 + VSEL T1, V9, T2, T1 + + VLM (SCRATCH), V16, V18 + RET + + + +#undef CPOOL +#undef SCRATCH +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef T2 +#undef P0 +#undef P1 + +#define SCRATCH R9 + +TEXT p256MulInternal<>(SB),NOSPLIT,$64-0 + MOVD $scratch-64(SP), SCRATCH + MOVD ·p256MulInternalFacility+0x00(SB),R7 + CALL (R7) + RET + +TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 + MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 + MOVD $·p256MulInternalFacility+0x00(SB), R7 + MOVD $·p256MulInternalVX(SB), R8 + CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported + MOVD $·p256MulInternalVMSL(SB), R8 +novmsl: + MOVD R8, 0(R7) + BR (R8) + +GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8 +DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB) + +// Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 + +TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0 + VLR X0, Y0 + VLR X1, Y1 + BR ·p256MulInternalVX(SB) + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 + + +TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0 + MOVD $scratch-48(SP), SCRATCH + MOVD ·p256SqrInternalFacility+0x00(SB),R7 + CALL (R7) + RET + +TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 + MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 + MOVD $·p256SqrInternalFacility+0x00(SB), R7 + MOVD $·p256SqrInternalVX(SB), R8 + CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported + MOVD $·p256SqrInternalVMSL(SB), R8 +novmsl: + MOVD R8, 0(R7) + BR (R8) + + +GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8 +DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB) + +#undef SCRATCH + + #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ VZERO ZER \ VSCBIQ Y0, X0, CAR1 \ @@ -1385,6 +1819,52 @@ TEXT ·p256MulAsm(SB), NOSPLIT, $0 #undef P0 #undef P1 +// --------------------------------------- +// func p256SqrAsm(res, in1 []byte) +#define res_ptr R1 +#define x_ptr R2 +#define y_ptr R3 +#define CPOOL R4 + +// Parameters +#define X0 V0 +#define X1 V1 +#define T0 V4 +#define T1 V5 + +// Constants +#define P0 V30 +#define P1 V31 +TEXT ·p256SqrAsm(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in1+24(FP), x_ptr + + VL (1*16)(x_ptr), X0 + VL (0*16)(x_ptr), X1 + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), P0 + VL 0(CPOOL), P1 + + CALL p256SqrInternal<>(SB) + + VST T0, (1*16)(res_ptr) + VST T1, (0*16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef y_ptr +#undef CPOOL + +#undef X0 +#undef X1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 + + // Point add with P2 being affine point // If sign == 1 -> P2 = -P2 // If sel == 0 -> P3 = P1 @@ -1524,7 +2004,7 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 VL 80(P1ptr), X0 // Z1L VLR X0, Y0 VLR X1, Y1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 VLR T0, X0 @@ -1570,7 +2050,7 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 // X=Y; Y- ; 
MUL; X=T // T3 = T1*T1 T2 VLR Y0, X0 VLR Y1, X1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) VLR T0, X0 VLR T1, X1 @@ -1594,7 +2074,7 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 VLR T2H, X1 VLR T2L, Y0 VLR T2H, Y1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) // SUB(T(SB) + CALL p256SqrInternal<>(SB) // SUB(X(SB) + CALL p256SqrInternal<>(SB) // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 VLR T0, X0 @@ -1873,7 +2353,7 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 // X- ; Y=X ; MUL; T- // Y3 = Y3² VLR X0, Y0 VLR X1, Y1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) // HAL(Y3(SB) + CALL p256SqrInternal<>(SB) // ADD(T1(SB) + CALL p256SqrInternal<>(SB) // X- ; Y=T ; MUL; R=T // R = Z1*T1 VLR T0, Y0 @@ -2085,7 +2565,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VL 80(P2ptr), X0 // Z2L VLR X0, Y0 VLR X1, Y1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 VLR T0, Y0 @@ -2175,7 +2655,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VLR HH, X1 VLR HL, Y0 VLR HH, Y1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) // X- ; Y=T ; MUL; T2=T // T2 = H*T1 VLR T0, Y0 @@ -2196,7 +2676,7 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VLR RH, X1 VLR RL, Y0 VLR RH, Y1 - CALL p256MulInternal<>(SB) + CALL p256SqrInternal<>(SB) // SUB(T
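A note on the limb split used by the new VMSL code above: each 256-bit operand is divided into five limbs — four 7-byte (56-bit) limbs plus one 4-byte (32-bit) top limb, selected by the p256vmsl<> VPERM masks — and VMSLG then accumulates the nine schoolbook columns named in the comments (column 1 = l10 x l5 through column 9 = l6 x l1). The following is a plain Go sketch of that limb/column arithmetic for readers who want to check the math; it uses math/big rather than the constant-time vector code, and splitLimbs is a hypothetical helper, not part of the package.

package main

import (
	"fmt"
	"math/big"
)

// splitLimbs breaks a 256-bit value into five limbs, least significant first:
// four 56-bit limbs and one 32-bit top limb (the 7-byte/4-byte split that the
// p256vmsl permutation masks select).
func splitLimbs(x *big.Int) [5]*big.Int {
	mask56 := new(big.Int).SetUint64(1<<56 - 1)
	t := new(big.Int).Set(x)
	var l [5]*big.Int
	for i := 0; i < 4; i++ {
		l[i] = new(big.Int).And(t, mask56)
		t.Rsh(t, 56)
	}
	l[4] = t // remaining top 32 bits
	return l
}

func main() {
	x, _ := new(big.Int).SetString("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", 16)
	y, _ := new(big.Int).SetString("fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210", 16)
	a, b := splitLimbs(x), splitLimbs(y)

	// Column k collects every product whose limb indices sum to k, mirroring
	// the VMSLG comments, and each column carries weight 2^(56*k).
	prod := new(big.Int)
	for k := 0; k <= 8; k++ {
		col := new(big.Int)
		for i := 0; i <= k && i < 5; i++ {
			if j := k - i; j < 5 {
				col.Add(col, new(big.Int).Mul(a[i], b[j]))
			}
		}
		prod.Add(prod, col.Lsh(col, uint(56*k)))
	}
	fmt.Println(prod.Cmp(new(big.Int).Mul(x, y)) == 0) // true
}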
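The "First/Second reduction, 96 bits" and "Third reduction 64 bits" steps lean on two properties of p = 2^256 - 2^224 + 2^192 + 2^96 - 1: it is congruent to -1 modulo 2^96 (and modulo 2^64), so adding (t mod 2^96)*p to t clears the low 96 bits of t, and any such multiple of p can be assembled from shifted copies of that low word alone, which is what the OBSERVATION3/OBSERVATION3A macros do with VSLDB, VAQ and VSQ. A short math/big check of both facts (illustrative only; the variable names are mine):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p, _ := new(big.Int).SetString(
		"ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)

	// An arbitrary intermediate value t.
	t, _ := new(big.Int).SetString("00112233445566778899aabbccddeeff"+
		"00112233445566778899aabbccddeeff00112233445566778899aabbccddeeff", 16)

	mod96 := new(big.Int).Lsh(big.NewInt(1), 96)
	m := new(big.Int).Mod(t, mod96) // low 96 bits of t

	// p ≡ -1 (mod 2^96), so t + m*p ≡ t - m ≡ 0 (mod 2^96).
	folded := new(big.Int).Add(t, new(big.Int).Mul(m, p))
	fmt.Println(new(big.Int).Mod(folded, mod96).Sign() == 0) // true

	// m*p needs only shifted copies of m:
	// m*p = m<<256 - m<<224 + m<<192 + m<<96 - m.
	mp := new(big.Int).Lsh(m, 256)
	mp.Sub(mp, new(big.Int).Lsh(m, 224))
	mp.Add(mp, new(big.Int).Lsh(m, 192))
	mp.Add(mp, new(big.Int).Lsh(m, 96))
	mp.Sub(mp, m)
	fmt.Println(mp.Cmp(new(big.Int).Mul(m, p)) == 0) // true
}

Folding 96, 96 and then 64 bits in succession removes a full factor of 2^256, which lines up with the Montgomery form the surrounding multiplication code already works in.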
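Both VMSL routines finish with a branch-free tail: VSCBIQ/VSQ/VSBIQ/VSBCBIQ compute t - p together with the final borrow, and VSEL then picks the reduced or unreduced value — the answer to the "what output to use, V9||V7 or T1||T0?" comment — without a data-dependent branch. A scalar Go sketch of the same idea, assuming (as that step does) that the intermediate is below 2p; condSubP and the word layout are illustrative, not the package's API.

package main

import (
	"fmt"
	"math/bits"
)

// Little-endian 64-bit words of p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
var p256 = [4]uint64{
	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001,
}

// condSubP reduces t (with an extra carry word, t < 2p) modulo p without
// branching on secret data: subtract p with borrow propagation, then select
// the original or the subtracted value based on the final borrow.
func condSubP(t [4]uint64, tCarry uint64) [4]uint64 {
	var d [4]uint64
	var borrow uint64
	d[0], borrow = bits.Sub64(t[0], p256[0], 0)
	d[1], borrow = bits.Sub64(t[1], p256[1], borrow)
	d[2], borrow = bits.Sub64(t[2], p256[2], borrow)
	d[3], borrow = bits.Sub64(t[3], p256[3], borrow)
	_, borrow = bits.Sub64(tCarry, 0, borrow)

	keep := -borrow // all-ones mask when t < p, zero otherwise
	var r [4]uint64
	for i := range r {
		r[i] = t[i]&keep | d[i]&^keep
	}
	return r
}

func main() {
	// 2^256 - 1 is above p, so the subtracted value must be selected.
	t := [4]uint64{^uint64(0), ^uint64(0), ^uint64(0), ^uint64(0)}
	fmt.Printf("%x\n", condSubP(t, 0))
}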
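Dispatch between the two implementations happens through the p256MulInternalFacility and p256SqrInternalFacility slots: each slot initially holds the address of a trampoline, and on the first call the trampoline reads internal/cpu's vector-enhancements facility 1 flag (located via an offset constant exported through go_asm.h), stores the address of either the VMSL or the VX routine back into the slot, and jumps to it, so every later call goes straight to the chosen code. A rough Go-level sketch of that one-time, self-patching dispatch (all names here are illustrative; the real selection is done in assembly):

package main

import "fmt"

// hasVE1 stands in for the CPU feature bit the trampoline reads
// (vector-enhancements facility 1, which provides VMSL).
var hasVE1 = true

type mulFunc func(x, y [4]uint64) [4]uint64

// p256Mul starts out pointing at the setup routine; after the first call it
// points directly at the chosen implementation, so the check is paid once.
var p256Mul mulFunc

func init() { p256Mul = p256MulSetup }

func p256MulSetup(x, y [4]uint64) [4]uint64 {
	if hasVE1 {
		p256Mul = p256MulVMSL
	} else {
		p256Mul = p256MulVX
	}
	return p256Mul(x, y)
}

// Placeholders standing in for the two assembly implementations.
func p256MulVMSL(x, y [4]uint64) [4]uint64 { fmt.Println("VMSL path"); return [4]uint64{} }
func p256MulVX(x, y [4]uint64) [4]uint64   { fmt.Println("VX path"); return [4]uint64{} }

func main() {
	p256Mul([4]uint64{}, [4]uint64{}) // first call runs the setup
	p256Mul([4]uint64{}, [4]uint64{}) // later calls skip the feature check
}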