From d63865b5d19ec3ca57aa30b45b2e0b57b3d54087 Mon Sep 17 00:00:00 2001 From: pooja shyamsundar Date: Sun, 15 May 2022 21:07:27 -0700 Subject: [PATCH] crypto/internal/nistec: re-enable s390x asm for P-256 - formatting and optimized init functionality for precomputed table - updated formatting for comments - further optimized init functionality Fixes #52709 Change-Id: Ie96a8ee52f09821d5ac53115185cbc1ad8f954d2 Reviewed-on: https://go-review.googlesource.com/c/go/+/404058 Auto-Submit: Lynn Boger Reviewed-by: Lynn Boger Run-TryBot: Lynn Boger Reviewed-by: Bill O'Farrell TryBot-Result: Gopher Robot Reviewed-by: Dmitri Shuralyov Reviewed-by: Pooja Shyamsundar Reviewed-by: Ian Lance Taylor Reviewed-by: Filippo Valsorda --- src/crypto/internal/nistec/p256.go | 2 +- src/crypto/internal/nistec/p256_asm.go | 12 +- src/crypto/internal/nistec/p256_asm_s390x.s | 1014 ++++++----------- .../internal/nistec/p256_asm_table_test.go | 2 +- src/crypto/internal/nistec/p256_s390x.go | 570 --------- 5 files changed, 371 insertions(+), 1229 deletions(-) delete mode 100644 src/crypto/internal/nistec/p256_s390x.go diff --git a/src/crypto/internal/nistec/p256.go b/src/crypto/internal/nistec/p256.go index 08b2ba98f4..353b428c1d 100644 --- a/src/crypto/internal/nistec/p256.go +++ b/src/crypto/internal/nistec/p256.go @@ -4,7 +4,7 @@ // Code generated by generate.go. DO NOT EDIT. -//go:build !amd64 && !arm64 && !ppc64le +//go:build !amd64 && !arm64 && !ppc64le && !s390x package nistec diff --git a/src/crypto/internal/nistec/p256_asm.go b/src/crypto/internal/nistec/p256_asm.go index 64c9078c81..bc443ba323 100644 --- a/src/crypto/internal/nistec/p256_asm.go +++ b/src/crypto/internal/nistec/p256_asm.go @@ -10,14 +10,16 @@ // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x // https://eprint.iacr.org/2013/816.pdf -//go:build amd64 || arm64 || ppc64le +//go:build amd64 || arm64 || ppc64le || s390x package nistec import ( _ "embed" + "encoding/binary" "errors" "math/bits" + "runtime" "unsafe" ) @@ -323,6 +325,14 @@ var p256PrecomputedEmbed string func init() { p256PrecomputedPtr := (*unsafe.Pointer)(unsafe.Pointer(&p256PrecomputedEmbed)) + if runtime.GOARCH == "s390x" { + var newTable [43 * 32 * 2 * 4]uint64 + for i, x := range (*[43 * 32 * 2 * 4][8]byte)(*p256PrecomputedPtr) { + newTable[i] = binary.LittleEndian.Uint64(x[:]) + } + newTablePtr := unsafe.Pointer(&newTable) + p256PrecomputedPtr = &newTablePtr + } p256Precomputed = (*[43]p256AffineTable)(*p256PrecomputedPtr) } diff --git a/src/crypto/internal/nistec/p256_asm_s390x.s b/src/crypto/internal/nistec/p256_asm_s390x.s index 4154f0dadf..8da4f3f5b8 100644 --- a/src/crypto/internal/nistec/p256_asm_s390x.s +++ b/src/crypto/internal/nistec/p256_asm_s390x.s @@ -2,12 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-//go:build ignore - #include "textflag.h" #include "go_asm.h" - DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff @@ -23,6 +20,8 @@ DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0 DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0 DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask +DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256 DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256 DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256 @@ -45,30 +44,50 @@ DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256 DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256 GLOBL p256ordK0<>(SB), 8, $4 GLOBL p256ord<>(SB), 8, $32 -GLOBL p256<>(SB), 8, $80 +GLOBL p256<>(SB), 8, $96 GLOBL p256mul<>(SB), 8, $160 -DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718 -DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f -DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718 -DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011 -DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f -DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718 -DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011 -DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718 -DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a -DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011 -DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011 -DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a -DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203 -DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a -DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a -DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203 -GLOBL p256vmsl<>(SB), 8, $128 +// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement) +TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0 + JMP ·p256BigToLittle(SB) + +// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte) +TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0 + JMP ·p256BigToLittle(SB) + +// --------------------------------------- +// func p256LittleToBig(res *[32]byte, in *p256Element) +TEXT ·p256LittleToBig(SB), NOSPLIT, $0 + JMP ·p256BigToLittle(SB) + +// func p256BigToLittle(res *p256Element, in *[32]byte) +#define res_ptr R1 +#define in_ptr R2 +#define T1L V2 +#define T1H V3 + +TEXT ·p256BigToLittle(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in+8(FP), in_ptr + + VL 0(in_ptr), T1H + VL 16(in_ptr), T1L + + VPDI $0x4, T1L, T1L, T1L + VPDI $0x4, T1H, T1H, T1H + + VST T1L, 0(res_ptr) + VST T1H, 16(res_ptr) + RET + +#undef res_ptr +#undef in_ptr +#undef T1L +#undef T1H // --------------------------------------- // iff cond == 1 val <- -val -// func p256NegCond(val *p256Point, cond int) +// func p256NegCond(val *p256Element, cond int) #define P1ptr R1 #define CPOOL R4 @@ -90,8 +109,10 @@ TEXT ·p256NegCond(SB), NOSPLIT, $0 VL 16(CPOOL), PL VL 0(CPOOL), PH - VL 32(P1ptr), Y1H - VL 48(P1ptr), Y1L + VL 16(P1ptr), Y1H + VPDI $0x4, Y1H, Y1H, Y1H + VL 0(P1ptr), Y1L + VPDI $0x4, Y1L, Y1L, Y1L VLREPG cond+8(FP), SEL1 VZERO ZER @@ -104,8 +125,10 @@ TEXT ·p256NegCond(SB), NOSPLIT, $0 VSEL Y1L, T1L, SEL1, Y1L VSEL Y1H, T1H, SEL1, Y1H - VST Y1H, 32(P1ptr) - VST Y1L, 48(P1ptr) + VPDI $0x4, Y1H, Y1H, Y1H + VST Y1H, 16(P1ptr) + VPDI $0x4, Y1L, Y1L, Y1L + VST Y1L, 0(P1ptr) RET #undef P1ptr @@ -122,7 +145,7 @@ TEXT ·p256NegCond(SB), NOSPLIT, $0 // --------------------------------------- // 
if cond == 0 res <- b; else res <- a -// func p256MovCond(res, a, b *p256Point, cond int) +// func p256MovCond(res, a, b *P256Point, cond int) #define P3ptr R1 #define P1ptr R2 #define P2ptr R3 @@ -202,7 +225,7 @@ TEXT ·p256MovCond(SB), NOSPLIT, $0 // Constant time table access // Indexed from 1 to 15, with -1 offset // (index 0 is implicitly point at infinity) -// func p256Select(point *p256Point, table []p256Point, idx int) +// func p256Select(res *P256Point, table *p256Table, idx int) #define P3ptr R1 #define P1ptr R2 #define COUNT R4 @@ -225,9 +248,9 @@ TEXT ·p256MovCond(SB), NOSPLIT, $0 #define SEL1 V20 #define SEL2 V21 TEXT ·p256Select(SB), NOSPLIT, $0 - MOVD point+0(FP), P3ptr + MOVD res+0(FP), P3ptr MOVD table+8(FP), P1ptr - VLREPB idx+(32+7)(FP), IDX + VLREPB idx+(16+7)(FP), IDX VREPIB $1, ONE VREPIB $1, SEL2 MOVD $1, COUNT @@ -291,99 +314,8 @@ loop_select: #undef SEL2 // --------------------------------------- -// Constant time table access -// Indexed from 1 to 15, with -1 offset -// (index 0 is implicitly point at infinity) -// func p256SelectBase(point *p256Point, table []p256Point, idx int) -#define P3ptr R1 -#define P1ptr R2 -#define COUNT R4 - -#define X1L V0 -#define X1H V1 -#define Y1L V2 -#define Y1H V3 -#define Z1L V4 -#define Z1H V5 -#define X2L V6 -#define X2H V7 -#define Y2L V8 -#define Y2H V9 -#define Z2L V10 -#define Z2H V11 - -#define ONE V18 -#define IDX V19 -#define SEL1 V20 -#define SEL2 V21 -TEXT ·p256SelectBase(SB), NOSPLIT, $0 - MOVD point+0(FP), P3ptr - MOVD table+8(FP), P1ptr - VLREPB idx+(32+7)(FP), IDX - VREPIB $1, ONE - VREPIB $1, SEL2 - MOVD $1, COUNT - - VZERO X1H - VZERO X1L - VZERO Y1H - VZERO Y1L - VZERO Z1H - VZERO Z1L - -loop_select: - VL 0(P1ptr), X2H - VL 16(P1ptr), X2L - VL 32(P1ptr), Y2H - VL 48(P1ptr), Y2L - VL 64(P1ptr), Z2H - VL 80(P1ptr), Z2L - - VCEQG SEL2, IDX, SEL1 - VSEL X2L, X1L, SEL1, X1L - VSEL X2H, X1H, SEL1, X1H - VSEL Y2L, Y1L, SEL1, Y1L - VSEL Y2H, Y1H, SEL1, Y1H - VSEL Z2L, Z1L, SEL1, Z1L - VSEL Z2H, Z1H, SEL1, Z1H - - VAB SEL2, ONE, SEL2 - ADDW $1, COUNT - ADD $96, P1ptr - CMPW COUNT, $65 - BLT loop_select - - VST X1H, 0(P3ptr) - VST X1L, 16(P3ptr) - VST Y1H, 32(P3ptr) - VST Y1L, 48(P3ptr) - VST Z1H, 64(P3ptr) - VST Z1L, 80(P3ptr) - RET - -#undef P3ptr -#undef P1ptr -#undef COUNT -#undef X1L -#undef X1H -#undef Y1L -#undef Y1H -#undef Z1L -#undef Z1H -#undef X2L -#undef X2H -#undef Y2L -#undef Y2H -#undef Z2L -#undef Z2H -#undef ONE -#undef IDX -#undef SEL1 -#undef SEL2 - -// --------------------------------------- -// func p256FromMont(res, in []byte) +// func p256FromMont(res, in *p256Element) #define res_ptr R1 #define x_ptr R2 #define CPOOL R4 @@ -406,7 +338,7 @@ loop_select: TEXT ·p256FromMont(SB), NOSPLIT, $0 MOVD res+0(FP), res_ptr - MOVD in+24(FP), x_ptr + MOVD in+8(FP), x_ptr VZERO T2 VZERO ZER @@ -416,8 +348,10 @@ TEXT ·p256FromMont(SB), NOSPLIT, $0 VL 48(CPOOL), SEL2 VL 64(CPOOL), SEL1 - VL (1*16)(x_ptr), T0 - VL (0*16)(x_ptr), T1 + VL (0*16)(x_ptr), T0 + VPDI $0x4, T0, T0, T0 + VL (1*16)(x_ptr), T1 + VPDI $0x4, T1, T1, T1 // First round VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 @@ -487,8 +421,10 @@ TEXT ·p256FromMont(SB), NOSPLIT, $0 VSEL T0, TT0, T2, T0 VSEL T1, TT1, T2, T1 - VST T0, (1*16)(res_ptr) - VST T1, (0*16)(res_ptr) + VPDI $0x4, T0, T0, TT0 + VST TT0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, TT1 + VST TT1, (1*16)(res_ptr) RET #undef res_ptr @@ -509,8 +445,100 @@ TEXT ·p256FromMont(SB), NOSPLIT, $0 #undef PL #undef PH +// Constant time table access +// Indexed from 1 to 15, with -1 offset +// (index 0 is 
implicitly point at infinity) +// func p256SelectBase(point *p256Point, table []p256Point, idx int) +// new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int) + +#define P3ptr R1 +#define P1ptr R2 +#define COUNT R4 +#define CPOOL R5 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 +#define LE2BE V12 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL2 V21 + +TEXT ·p256SelectAffine(SB), NOSPLIT, $0 + MOVD res+0(FP), P3ptr + MOVD table+8(FP), P1ptr + MOVD $p256<>+0x00(SB), CPOOL + VLREPB idx+(16+7)(FP), IDX + VREPIB $1, ONE + VREPIB $1, SEL2 + MOVD $1, COUNT + VL 80(CPOOL), LE2BE + + VZERO X1H + VZERO X1L + VZERO Y1H + VZERO Y1L + +loop_select: + VL 0(P1ptr), X2H + VL 16(P1ptr), X2L + VL 32(P1ptr), Y2H + VL 48(P1ptr), Y2L + + VCEQG SEL2, IDX, SEL1 + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + + VAB SEL2, ONE, SEL2 + ADDW $1, COUNT + ADD $64, P1ptr + CMPW COUNT, $65 + BLT loop_select + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL2 +#undef CPOOL + // --------------------------------------- -// func p256OrdMul(res, in1, in2 []byte) + +// func p256OrdMul(res, in1, in2 *p256OrdElement) #define res_ptr R1 #define x_ptr R2 #define y_ptr R3 @@ -538,10 +566,10 @@ TEXT ·p256FromMont(SB), NOSPLIT, $0 #define MK0 V30 #define K0 V31 -TEXT ·p256OrdMul(SB), NOSPLIT, $0 +TEXT ·p256OrdMul<>(SB), NOSPLIT, $0 MOVD res+0(FP), res_ptr - MOVD in1+24(FP), x_ptr - MOVD in2+48(FP), y_ptr + MOVD in1+8(FP), x_ptr + MOVD in2+16(FP), y_ptr VZERO T2 MOVD $p256ordK0<>+0x00(SB), R4 @@ -554,10 +582,14 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0 VL 16(R4), M0 VL 0(R4), M1 - VL (1*16)(x_ptr), X0 - VL (0*16)(x_ptr), X1 - VL (1*16)(y_ptr), Y0 - VL (0*16)(y_ptr), Y1 + VL (0*16)(x_ptr), X0 + VPDI $0x4, X0, X0, X0 + VL (1*16)(x_ptr), X1 + VPDI $0x4, X1, X1, X1 + VL (0*16)(y_ptr), Y0 + VPDI $0x4, Y0, Y0, Y0 + VL (1*16)(y_ptr), Y1 + VPDI $0x4, Y1, Y1, Y1 // ---------------------------------------------------------------------------/ VREPF $3, Y0, YDIG @@ -856,8 +888,10 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0 VSEL T0, ADD1, T2, T0 VSEL T1, ADD2, T2, T1 - VST T0, (1*16)(res_ptr) - VST T1, (0*16)(res_ptr) + VPDI $0x4, T0, T0, T0 + VST T0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, T1 + VST T1, (1*16)(res_ptr) RET #undef res_ptr @@ -889,7 +923,7 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0 #undef K0 // --------------------------------------- -// p256MulInternalVX +// p256MulInternal // V0-V3,V30,V31 - Not Modified // V4-V15 - Volatile @@ -1032,7 +1066,7 @@ TEXT ·p256OrdMul(SB), NOSPLIT, $0 * * Last 'group' needs to RED2||RED1 shifted less */ -TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0 +TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 VL 32(CPOOL), SEL1 VL 48(CPOOL), SEL2 VL 64(CPOOL), SEL3 @@ -1278,401 +1312,6 @@ TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0 #undef CAR2 // --------------------------------------- -// p256MulInternalVMSL -// V0-V3,V30,V31 - Not Modified -// V4-V14 - Volatile - -#define CPOOL R4 -#define SCRATCH R9 - -// Parameters -#define X0 V0 // Not modified -#define X1 V1 // Not modified -#define Y0 V2 // Not modified 
-#define Y1 V3 // Not modified -#define T0 V4 -#define T1 V5 -#define T2 V6 -#define P0 V30 // Not modified -#define P1 V31 // Not modified - -// input: d0 -// output: h0, h1 -// temp: TEMP, ZERO, BORROW -#define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \ - VZERO ZERO \ - VSLDB $4, d0, ZERO, h0 \ - VLR h0, BORROW \ - VSLDB $12, ZERO, h0, TEMP \ - VSQ TEMP, h0, h0 \ - VSLDB $12, d0, BORROW, h1 \ - VSLDB $8, ZERO, BORROW, TEMP \ - VAQ TEMP, h0, h0 \ - -#define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \ - VZERO ZERO \ - VSLDB $8, d2, ZERO, TEMP \ - VSLDB $8, d2, TEMP, h0 \ - VSLDB $12, ZERO, TEMP, h1 \ - VSQ h1, h0, h0 \ - -TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 - VSTM V16, V19, (SCRATCH) - - MOVD $p256vmsl<>+0x00(SB), CPOOL - - // Divide input1 into 5 limbs - VGBM $0x007f, V14 - VZERO V12 - VSLDB $2, X1, X0, V13 - VSLDB $2, Y1, Y0, V8 - VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb - VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb - - VN V14, X0, V5 // V5: first 7 bytes limb - VN V14, Y0, V10 // V10: first 7 bytes limb - VN V14, V13, V13 // v13: third 7 bytes limb - VN V14, V8, V8 // V8: third 7 bytes limb - - VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1) - VMSLG V8, V5, V12, V8 // v8: l8 x l5 - VMSLG V6, V13, V12, V13 // v13: l6 x l3 - VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9) - VMSLG V6, V5, V12, V6 // v6: l6 x l5 - - MOVD $p256vmsl<>+0x00(SB), CPOOL - VGBM $0x7f7f, V14 - - VL 0(CPOOL), V4 - VL 16(CPOOL), V7 - VL 32(CPOOL), V9 - VL 48(CPOOL), V5 - VLM 64(CPOOL), V16, V19 - - VPERM V12, X0, V4, V4 // v4: limb4 | limb5 - VPERM Y1, Y0, V7, V7 - VPERM V12, Y0, V9, V9 // v9: limb10 | limb9 - VPERM X1, X0, V5, V5 - VPERM X1, X0, V16, V16 - VPERM Y1, Y0, V17, V17 - VPERM X1, V12, V18, V18 // v18: limb1 | limb2 - VPERM Y1, V12, V19, V19 // v19: limb7 | limb6 - VN V14, V7, V7 // v7: limb9 | limb8 - VN V14, V5, V5 // v5: limb3 | limb4 - VN V14, V16, V16 // v16: limb2 | limb3 - VN V14, V17, V17 // v17: limb8 | limb7 - - VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) - VMSLG V9, V5, V8, V8 // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3) - VMSLG V9, V16, V12, V16 // v16: l10 x l9 + l2 x l3 - VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2 - VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 - VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4) - VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 - VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2 - VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6) - VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8) - VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) - VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) - - VSLDB $9, V12, V10, V4 - VSLDB $9, V12, V7, V5 - VAQ V4, V14, V14 - VAQ V5, V13, V13 - - VSLDB $9, V12, V14, V4 - VSLDB $9, V12, V13, V5 - VAQ V4, V8, V8 - VAQ V5, V19, V19 - - VSLDB $9, V12, V8, V4 - VSLDB $9, V12, V19, V5 - VAQ V4, V16, V16 - VAQ V5, V11, V11 - - VSLDB $9, V12, V16, V4 - VAQ V4, V9, V17 - - VGBM $0x007f, V4 - VGBM $0x00ff, V5 - - VN V10, V4, V10 - VN V14, V4, V14 - VN V8, V4, V8 - VN V16, V4, V16 - VN V17, V4, V9 - VN V7, V4, V7 - VN V13, V4, V13 - VN V19, V4, V19 - VN V11, V5, V11 - - VSLDB $7, V14, V14, V14 - VSLDB $14, V8, V12, V4 - VSLDB $14, V12, V8, V8 - VSLDB $5, V16, V16, V16 - VSLDB $12, V9, V12, V5 - - VO V14, V10, V10 - VO V8, V16, V16 - VO V4, V10, V10 // first rightmost 128bits of the multiplication result - VO V5, V16, V16 // second rightmost 128bits 
of the multiplication result - - // adjust v7, v13, v19, v11 - VSLDB $7, V13, V13, V13 - VSLDB $14, V19, V12, V4 - VSLDB $14, V12, V19, V19 - VSLDB $5, V11, V12, V5 - VO V13, V7, V7 - VO V4, V7, V7 - VO V19, V5, V11 - - VSLDB $9, V12, V17, V14 - VSLDB $12, V12, V9, V9 - VACCQ V7, V14, V13 - VAQ V7, V14, V7 - VAQ V11, V13, V11 - - // First reduction, 96 bits - VSLDB $4, V16, V10, T0 - VSLDB $4, V12, V16, T1 - VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result - VSLDB $3, V7, V12, V7 - OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2 - VO V7, V9, V7 // third rightmost 128bits of the multiplication result - VACCQ T0, T2, V9 - VAQ T0, T2, T2 - VACQ T1, V8, V9, V8 - - // Second reduction 96 bits - VSLDB $4, V8, T2, T0 - VSLDB $4, V12, V8, T1 - OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8 - VACCQ T0, V8, T2 - VAQ T0, V8, V8 - VACQ T1, V9, T2, V9 - - // Third reduction 64 bits - VSLDB $8, V9, V8, T0 - VSLDB $8, V12, V9, T1 - OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 - VACCQ T0, V13, V12 - VAQ T0, V13, V13 - VACQ T1, V14, V12, V14 - VACCQ V13, V7, V12 - VAQ V13, V7, T0 - VACCCQ V14, V11, V12, T2 - VACQ V14, V11, V12, T1 // results T2 | T1 | T0 - - // --------------------------------------------------- - MOVD $p256mul<>+0x00(SB), CPOOL - - VZERO V12 - VSCBIQ P0, T0, V8 - VSQ P0, T0, V7 - VSBCBIQ T1, P1, V8, V10 - VSBIQ T1, P1, V8, V9 - VSBIQ T2, V12, V10, T2 - - // what output to use, V9||V7 or T1||T0? - VSEL T0, V7, T2, T0 - VSEL T1, V9, T2, T1 - - VLM (SCRATCH), V16, V19 - - RET - -// --------------------------------------- -// p256SqrInternalVMSL -// V0-V1,V30,V31 - Not Modified -// V4-V14 - Volatile - -TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 - VSTM V16, V18, (SCRATCH) - - MOVD $p256vmsl<>+0x00(SB), CPOOL - // Divide input into limbs - VGBM $0x007f, V14 - VZERO V12 - VSLDB $2, X1, X0, V13 - VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb - - VN V14, X0, V10 // V10: first 7 bytes limb - VN V14, V13, V13 // v13: third 7 bytes limb - - VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1) - VMSLG V13, V13, V12, V13 // v13: l8 x l3 - VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9) - - MOVD $p256vmsl<>+0x00(SB), CPOOL - VGBM $0x7f7f, V14 - - VL 0(CPOOL), V4 - VL 16(CPOOL), V7 - VL 32(CPOOL), V9 - VL 48(CPOOL), V5 - VLM 64(CPOOL), V16, V18 - VL 112(CPOOL), V8 - - VPERM V12, X0, V4, V4 // v4: limb4 | limb5 - VPERM X1, X0, V7, V7 - VPERM V12, X0, V9, V9 // v9: limb10 | limb9 - VPERM X1, X0, V5, V5 - VPERM X1, X0, V16, V16 - VPERM X1, X0, V17, V17 - VPERM X1, V12, V18, V18 // v18: limb1 | limb2 - VPERM X1, V12, V8, V8 // v8: limb7 | limb6 - VN V14, V7, V7 // v7: limb9 | limb8 - VN V14, V5, V5 // v5: limb3 | limb4 - VN V14, V16, V16 // v16: limb2 | limb3 - VN V14, V17, V17 // v17: limb8 | limb7 - - VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) - VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) - VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4) - VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6) - VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) - VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8) - VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3) - - VSLDB $9, V12, V10, V4 - VSLDB $9, V12, V7, V5 - VAQ V4, V14, V14 - VAQ V5, V13, V13 - - VSLDB $9, V12, V14, V4 - VSLDB $9, V12, V13, V5 - VAQ V4, V18, V18 - VAQ V5, V8, V8 - - VSLDB $9, V12, V18, V4 - VSLDB $9, V12, V8, 
V5 - VAQ V4, V16, V16 - VAQ V5, V11, V11 - - VSLDB $9, V12, V16, V4 - VAQ V4, V6, V17 - - VGBM $0x007f, V4 - VGBM $0x00ff, V5 - - VN V10, V4, V10 - VN V14, V4, V14 - VN V18, V4, V18 - VN V16, V4, V16 - VN V17, V4, V9 - VN V7, V4, V7 - VN V13, V4, V13 - VN V8, V4, V8 - VN V11, V5, V11 - - VSLDB $7, V14, V14, V14 - VSLDB $14, V18, V12, V4 - VSLDB $14, V12, V18, V18 - VSLDB $5, V16, V16, V16 - VSLDB $12, V9, V12, V5 - - VO V14, V10, V10 - VO V18, V16, V16 - VO V4, V10, V10 // first rightmost 128bits of the multiplication result - VO V5, V16, V16 // second rightmost 128bits of the multiplication result - - // adjust v7, v13, v8, v11 - VSLDB $7, V13, V13, V13 - VSLDB $14, V8, V12, V4 - VSLDB $14, V12, V8, V8 - VSLDB $5, V11, V12, V5 - VO V13, V7, V7 - VO V4, V7, V7 - VO V8, V5, V11 - - VSLDB $9, V12, V17, V14 - VSLDB $12, V12, V9, V9 - VACCQ V7, V14, V13 - VAQ V7, V14, V7 - VAQ V11, V13, V11 - - // First reduction, 96 bits - VSLDB $4, V16, V10, T0 - VSLDB $4, V12, V16, T1 - VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result - VSLDB $3, V7, V12, V7 - OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2 - VO V7, V9, V7 // third rightmost 128bits of the multiplication result - VACCQ T0, T2, V9 - VAQ T0, T2, T2 - VACQ T1, V8, V9, V8 - - // Second reduction 96 bits - VSLDB $4, V8, T2, T0 - VSLDB $4, V12, V8, T1 - OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8 - VACCQ T0, V8, T2 - VAQ T0, V8, V8 - VACQ T1, V9, T2, V9 - - // Third reduction 64 bits - VSLDB $8, V9, V8, T0 - VSLDB $8, V12, V9, T1 - OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 - VACCQ T0, V13, V12 - VAQ T0, V13, V13 - VACQ T1, V14, V12, V14 - VACCQ V13, V7, V12 - VAQ V13, V7, T0 - VACCCQ V14, V11, V12, T2 - VACQ V14, V11, V12, T1 // results T2 | T1 | T0 - - // --------------------------------------------------- - MOVD $p256mul<>+0x00(SB), CPOOL - - VZERO V12 - VSCBIQ P0, T0, V8 - VSQ P0, T0, V7 - VSBCBIQ T1, P1, V8, V10 - VSBIQ T1, P1, V8, V9 - VSBIQ T2, V12, V10, T2 - - // what output to use, V9||V7 or T1||T0? 
- VSEL T0, V7, T2, T0 - VSEL T1, V9, T2, T1 - - VLM (SCRATCH), V16, V18 - RET - - - -#undef CPOOL -#undef SCRATCH -#undef X0 -#undef X1 -#undef Y0 -#undef Y1 -#undef T0 -#undef T1 -#undef T2 -#undef P0 -#undef P1 - -#define SCRATCH R9 - -TEXT p256MulInternal<>(SB),NOSPLIT,$64-0 - MOVD $scratch-64(SP), SCRATCH - MOVD ·p256MulInternalFacility+0x00(SB),R7 - CALL (R7) - RET - -TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 - MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 - MOVD $·p256MulInternalFacility+0x00(SB), R7 - MOVD $·p256MulInternalVX(SB), R8 - CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported - MOVD $·p256MulInternalVMSL(SB), R8 -novmsl: - MOVD R8, 0(R7) - BR (R8) - -GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8 -DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB) // Parameters #define X0 V0 @@ -1680,40 +1319,16 @@ DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB) #define Y0 V2 #define Y1 V3 -TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0 +TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0 VLR X0, Y0 VLR X1, Y1 - BR ·p256MulInternalVX(SB) + BR p256MulInternal<>(SB) #undef X0 #undef X1 #undef Y0 #undef Y1 - -TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0 - MOVD $scratch-48(SP), SCRATCH - MOVD ·p256SqrInternalFacility+0x00(SB),R7 - CALL (R7) - RET - -TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 - MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 - MOVD $·p256SqrInternalFacility+0x00(SB), R7 - MOVD $·p256SqrInternalVX(SB), R8 - CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported - MOVD $·p256SqrInternalVMSL(SB), R8 -novmsl: - MOVD R8, 0(R7) - BR (R8) - - -GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8 -DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB) - -#undef SCRATCH - - #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ VZERO ZER \ VSCBIQ Y0, X0, CAR1 \ @@ -1770,7 +1385,7 @@ DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB) VO T1, TT1, T1 // --------------------------------------- -// func p256MulAsm(res, in1, in2 []byte) +// func p256Mul(res, in1, in2 *p256Element) #define res_ptr R1 #define x_ptr R2 #define y_ptr R3 @@ -1787,15 +1402,19 @@ DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB) // Constants #define P0 V30 #define P1 V31 -TEXT ·p256MulAsm(SB), NOSPLIT, $0 +TEXT ·p256Mul(SB), NOSPLIT, $0 MOVD res+0(FP), res_ptr - MOVD in1+24(FP), x_ptr - MOVD in2+48(FP), y_ptr - - VL (1*16)(x_ptr), X0 - VL (0*16)(x_ptr), X1 - VL (1*16)(y_ptr), Y0 - VL (0*16)(y_ptr), Y1 + MOVD in1+8(FP), x_ptr + MOVD in2+16(FP), y_ptr + + VL (0*16)(x_ptr), X0 + VPDI $0x4, X0, X0, X0 + VL (1*16)(x_ptr), X1 + VPDI $0x4, X1, X1, X1 + VL (0*16)(y_ptr), Y0 + VPDI $0x4, Y0, Y0, Y0 + VL (1*16)(y_ptr), Y1 + VPDI $0x4, Y1, Y1, Y1 MOVD $p256mul<>+0x00(SB), CPOOL VL 16(CPOOL), P0 @@ -1803,8 +1422,10 @@ TEXT ·p256MulAsm(SB), NOSPLIT, $0 CALL p256MulInternal<>(SB) - VST T0, (1*16)(res_ptr) - VST T1, (0*16)(res_ptr) + VPDI $0x4, T0, T0, T0 + VST T0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, T1 + VST T1, (1*16)(res_ptr) RET #undef res_ptr @@ -1822,11 +1443,13 @@ TEXT ·p256MulAsm(SB), NOSPLIT, $0 #undef P1 // --------------------------------------- -// func p256SqrAsm(res, in1 []byte) +// func p256Sqr(res, in *p256Element, n int) #define res_ptr R1 #define x_ptr R2 #define y_ptr R3 #define CPOOL R4 +#define COUNT R5 +#define N R6 // Parameters #define X0 V0 @@ -1837,27 +1460,41 @@ TEXT ·p256MulAsm(SB), NOSPLIT, $0 // Constants 
#define P0 V30 #define P1 V31 -TEXT ·p256SqrAsm(SB), NOSPLIT, $0 +TEXT ·p256Sqr(SB), NOSPLIT, $0 MOVD res+0(FP), res_ptr - MOVD in1+24(FP), x_ptr + MOVD in+8(FP), x_ptr - VL (1*16)(x_ptr), X0 - VL (0*16)(x_ptr), X1 + VL (0*16)(x_ptr), X0 + VPDI $0x4, X0, X0, X0 + VL (1*16)(x_ptr), X1 + VPDI $0x4, X1, X1, X1 MOVD $p256mul<>+0x00(SB), CPOOL + MOVD $0, COUNT + MOVD n+16(FP), N VL 16(CPOOL), P0 VL 0(CPOOL), P1 +loop: CALL p256SqrInternal<>(SB) + VLR T0, X0 + VLR T1, X1 + ADDW $1, COUNT + CMPW COUNT, N + BLT loop - VST T0, (1*16)(res_ptr) - VST T1, (0*16)(res_ptr) + VPDI $0x4, T0, T0, T0 + VST T0, (0*16)(res_ptr) + VPDI $0x4, T1, T1, T1 + VST T1, (1*16)(res_ptr) RET #undef res_ptr #undef x_ptr #undef y_ptr #undef CPOOL +#undef COUNT +#undef N #undef X0 #undef X1 @@ -1866,12 +1503,11 @@ TEXT ·p256SqrAsm(SB), NOSPLIT, $0 #undef P0 #undef P1 - // Point add with P2 being affine point // If sign == 1 -> P2 = -P2 // If sel == 0 -> P3 = P1 // if zero == 0 -> P3 = P2 -// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) +// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int) #define P3ptr R1 #define P1ptr R2 #define P2ptr R3 @@ -1971,9 +1607,9 @@ SUB(T+0x00(SB), CPOOL VL 16(CPOOL), PL @@ -1983,8 +1619,10 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2 // } - VL 32(P2ptr), Y2H - VL 48(P2ptr), Y2L + VL 48(P2ptr), Y2H + VPDI $0x4, Y2H, Y2H, Y2H + VL 32(P2ptr), Y2L + VPDI $0x4, Y2L, Y2L, Y2L VLREPG sign+24(FP), SEL1 VZERO ZER @@ -2002,8 +1640,10 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. */ // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 - VL 64(P1ptr), X1 // Z1H - VL 80(P1ptr), X0 // Z1L + VL 80(P1ptr), X1 // Z1H + VPDI $0x4, X1, X1, X1 + VL 64(P1ptr), X0 // Z1L + VPDI $0x4, X0, X0, X0 VLR X0, Y0 VLR X1, Y1 CALL p256SqrInternal<>(SB) @@ -2016,8 +1656,10 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 VLR T1, T2H // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 - VL 0(P2ptr), Y1 // X2H - VL 16(P2ptr), Y0 // X2L + VL 16(P2ptr), Y1 // X2H + VPDI $0x4, Y1, Y1, Y1 + VL 0(P2ptr), Y0 // X2L + VPDI $0x4, Y0, Y0, Y0 CALL p256MulInternal<>(SB) VLR T0, T1L VLR T1, T1H @@ -2030,18 +1672,24 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 CALL p256MulInternal<>(SB) // SUB(T2(SB) // VST T1, 64(P3ptr) @@ -2062,8 +1710,10 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 VLR T1, T4H // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 - VL 0(P1ptr), Y1 // X1H - VL 16(P1ptr), Y0 // X1L + VL 16(P1ptr), Y1 // X1H + VPDI $0x4, Y1, Y1, Y1 + VL 0(P1ptr), Y0 // X1L + VPDI $0x4, Y0, Y0, Y0 CALL p256MulInternal<>(SB) VLR T0, T3L VLR T1, T3H @@ -2097,8 +1747,10 @@ TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 VLR T4L, X0 VLR T4H, X1 - VL 32(P1ptr), Y1 // Y1H - VL 48(P1ptr), Y0 // Y1L + VL 48(P1ptr), Y1 // Y1H + VPDI $0x4, Y1, Y1, Y1 + VL 32(P1ptr), Y0 // Y1L + VPDI $0x4, Y0, Y0, Y0 CALL p256MulInternal<>(SB) // SUB(T+0x00(SB), CPOOL VL 16(CPOOL), PL VL 0(CPOOL), PH // X=Z1; Y=Z1; MUL; T- // T1 = Z1² - VL 64(P1ptr), X1 // Z1H - VL 80(P1ptr), X0 // Z1L + VL 80(P1ptr), X1 // Z1H + VPDI $0x4, X1, X1, X1 + VL 64(P1ptr), X0 // Z1L + VPDI $0x4, X0, X0, X0 VLR X0, Y0 VLR X1, Y1 CALL p256SqrInternal<>(SB) // SUB(X(SB) - VST T1, 64(P3ptr) - VST T0, 80(P3ptr) + VPDI $0x4, T1, T1, TT1 + VST TT1, 80(P3ptr) + VPDI $0x4, T0, T0, TT0 + VST TT0, 64(P3ptr) // X- ; Y=X ; MUL; T- // Y3 = Y3² VLR X0, Y0 @@ -2346,8 
+2020,10 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 VLR T0, X0 VLR T1, X1 - VL 0(P1ptr), Y1 - VL 16(P1ptr), Y0 + VL 16(P1ptr), Y1 + VPDI $0x4, Y1, Y1, Y1 + VL 0(P1ptr), Y0 + VPDI $0x4, Y0, Y0, Y0 CALL p256MulInternal<>(SB) VLR T0, T3L VLR T1, T3H @@ -2372,8 +2048,10 @@ TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 // SUB(X3+0x00(SB), CPOOL VL 16(CPOOL), PL VL 0(CPOOL), PH // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 - VL 64(P1ptr), X1 // Z1H - VL 80(P1ptr), X0 // Z1L + VL 80(P1ptr), X1 // Z1H + VPDI $0x4, X1, X1, X1 + VL 64(P1ptr), X0 // Z1L + VPDI $0x4, X0, X0, X0 VLR X0, Y0 VLR X1, Y1 CALL p256SqrInternal<>(SB) @@ -2556,15 +2238,19 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VLR T1, RH // X=X2; Y- ; MUL; H=T // H = X2*T1 - VL 0(P2ptr), X1 // X2H - VL 16(P2ptr), X0 // X2L + VL 16(P2ptr), X1 // X2H + VPDI $0x4, X1, X1, X1 + VL 0(P2ptr), X0 // X2L + VPDI $0x4, X0, X0, X0 CALL p256MulInternal<>(SB) VLR T0, HL VLR T1, HH // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 - VL 64(P2ptr), X1 // Z2H - VL 80(P2ptr), X0 // Z2L + VL 80(P2ptr), X1 // Z2H + VPDI $0x4, X1, X1, X1 + VL 64(P2ptr), X0 // Z2L + VPDI $0x4, X0, X0, X0 VLR X0, Y0 VLR X1, Y1 CALL p256SqrInternal<>(SB) @@ -2577,8 +2263,10 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VLR T1, S1H // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 - VL 0(P1ptr), X1 // X1H - VL 16(P1ptr), X0 // X1L + VL 16(P1ptr), X1 // X1H + VPDI $0x4, X1, X1, X1 + VL 0(P1ptr), X0 // X1L + VPDI $0x4, X0, X0, X0 CALL p256MulInternal<>(SB) VLR T0, U1L VLR T1, U1H @@ -2602,10 +2290,14 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 MOVD ISZERO, ret+24(FP) // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 - VL 64(P1ptr), X1 // Z1H - VL 80(P1ptr), X0 // Z1L - VL 64(P2ptr), Y1 // Z2H - VL 80(P2ptr), Y0 // Z2L + VL 80(P1ptr), X1 // Z1H + VPDI $0x4, X1, X1, X1 + VL 64(P1ptr), X0 // Z1L + VPDI $0x4, X0, X0, X0 + VL 80(P2ptr), Y1 // Z2H + VPDI $0x4, Y1, Y1, Y1 + VL 64(P2ptr), Y0 // Z2L + VPDI $0x4, Y0, Y0, Y0 CALL p256MulInternal<>(SB) // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H @@ -2614,12 +2306,16 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VLR HL, Y0 VLR HH, Y1 CALL p256MulInternal<>(SB) - VST T1, 64(P3ptr) - VST T0, 80(P3ptr) + VPDI $0x4, T1, T1, TT1 + VST TT1, 80(P3ptr) + VPDI $0x4, T0, T0, TT0 + VST TT0, 64(P3ptr) // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 - VL 32(P1ptr), X1 - VL 48(P1ptr), X0 + VL 48(P1ptr), X1 + VPDI $0x4, X1, X1, X1 + VL 32(P1ptr), X0 + VPDI $0x4, X0, X0, X0 VLR S1L, Y0 VLR S1H, Y1 CALL p256MulInternal<>(SB) @@ -2627,8 +2323,10 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 VLR T1, S1H // X=Y2; Y=R ; MUL; T- // R = Y2*R - VL 32(P2ptr), X1 - VL 48(P2ptr), X0 + VL 48(P2ptr), X1 + VPDI $0x4, X1, X1, X1 + VL 32(P2ptr), X0 + VPDI $0x4, X0, X0, X0 VLR RL, Y0 VLR RH, Y1 CALL p256MulInternal<>(SB) @@ -2688,8 +2386,10 @@ TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 // SUB(T P2 = -P2 -// If sel == 0 -> P3 = P1 -// if zero == 0 -> P3 = P2 -// -//go:noescape -func p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) - -// Point add -// -//go:noescape -func p256PointAddAsm(P3, P1, P2 *p256Point) int - -//go:noescape -func p256PointDoubleAsm(P3, P1 *p256Point) - -func (curve p256CurveFast) Inverse(k *big.Int) *big.Int { - if k.Cmp(p256Params.N) >= 0 { - // This should never happen. - reducedK := new(big.Int).Mod(k, p256Params.N) - k = reducedK - } - - // table will store precomputed powers of x. The 32 bytes at index - // i store x^(i+1). 
- var table [15][32]byte - - x := fromBig(k) - // This code operates in the Montgomery domain where R = 2^256 mod n - // and n is the order of the scalar field. (See initP256 for the - // value.) Elements in the Montgomery domain take the form a×R and - // multiplication of x and y in the calculates (x × y × R^-1) mod n. RR - // is R×R mod n thus the Montgomery multiplication x and RR gives x×R, - // i.e. converts x into the Montgomery domain. Stored in BigEndian form - RR := []byte{0x66, 0xe1, 0x2d, 0x94, 0xf3, 0xd9, 0x56, 0x20, 0x28, 0x45, 0xb2, 0x39, 0x2b, 0x6b, 0xec, 0x59, - 0x46, 0x99, 0x79, 0x9c, 0x49, 0xbd, 0x6f, 0xa6, 0x83, 0x24, 0x4c, 0x95, 0xbe, 0x79, 0xee, 0xa2} - - p256OrdMul(table[0][:], x, RR) - - // Prepare the table, no need in constant time access, because the - // power is not a secret. (Entry 0 is never used.) - for i := 2; i < 16; i += 2 { - p256OrdSqr(table[i-1][:], table[(i/2)-1][:], 1) - p256OrdMul(table[i][:], table[i-1][:], table[0][:]) - } - - copy(x, table[14][:]) // f - - p256OrdSqr(x[0:32], x[0:32], 4) - p256OrdMul(x[0:32], x[0:32], table[14][:]) // ff - t := make([]byte, 32) - copy(t, x) - - p256OrdSqr(x, x, 8) - p256OrdMul(x, x, t) // ffff - copy(t, x) - - p256OrdSqr(x, x, 16) - p256OrdMul(x, x, t) // ffffffff - copy(t, x) - - p256OrdSqr(x, x, 64) // ffffffff0000000000000000 - p256OrdMul(x, x, t) // ffffffff00000000ffffffff - p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000 - p256OrdMul(x, x, t) // ffffffff00000000ffffffffffffffff - - // Remaining 32 windows - expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, - 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf} - for i := 0; i < 32; i++ { - p256OrdSqr(x, x, 4) - p256OrdMul(x, x, table[expLo[i]-1][:]) - } - - // Multiplying by one in the Montgomery domain converts a Montgomery - // value out of the domain. - one := []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1} - p256OrdMul(x, x, one) - - return new(big.Int).SetBytes(x) -} - -// fromBig converts a *big.Int into a format used by this code. -func fromBig(big *big.Int) []byte { - // This could be done a lot more efficiently... - res := big.Bytes() - if 32 == len(res) { - return res - } - t := make([]byte, 32) - offset := 32 - len(res) - for i := len(res) - 1; i >= 0; i-- { - t[i+offset] = res[i] - } - return t -} - -// p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar -// is equal or greater than the order of the group, it's reduced modulo that order. -func p256GetMultiplier(in []byte) []byte { - n := new(big.Int).SetBytes(in) - - if n.Cmp(p256Params.N) >= 0 { - n.Mod(n, p256Params.N) - } - return fromBig(n) -} - -// p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the -// underlying field of the curve. (See initP256 for the value.) Thus rr here is -// R×R mod p. See comment in Inverse about how this is used. -var rr = []byte{0x00, 0x00, 0x00, 0x04, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, - 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03} - -// (This is one, in the Montgomery domain.) 
-var one = []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01} - -func maybeReduceModP(in *big.Int) *big.Int { - if in.Cmp(p256Params.P) < 0 { - return in - } - return new(big.Int).Mod(in, p256Params.P) -} - -func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) { - var r1, r2 p256Point - scalarReduced := p256GetMultiplier(baseScalar) - r1IsInfinity := scalarIsZero(scalarReduced) - r1.p256BaseMult(scalarReduced) - - copy(r2.x[:], fromBig(maybeReduceModP(bigX))) - copy(r2.y[:], fromBig(maybeReduceModP(bigY))) - copy(r2.z[:], one) - p256MulAsm(r2.x[:], r2.x[:], rr[:]) - p256MulAsm(r2.y[:], r2.y[:], rr[:]) - - scalarReduced = p256GetMultiplier(scalar) - r2IsInfinity := scalarIsZero(scalarReduced) - r2.p256ScalarMult(p256GetMultiplier(scalar)) - - var sum, double p256Point - pointsEqual := p256PointAddAsm(&sum, &r1, &r2) - p256PointDoubleAsm(&double, &r1) - p256MovCond(&sum, &double, &sum, pointsEqual) - p256MovCond(&sum, &r1, &sum, r2IsInfinity) - p256MovCond(&sum, &r2, &sum, r1IsInfinity) - return sum.p256PointToAffine() -} - -func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) { - var r p256Point - r.p256BaseMult(p256GetMultiplier(scalar)) - return r.p256PointToAffine() -} - -func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) { - var r p256Point - copy(r.x[:], fromBig(maybeReduceModP(bigX))) - copy(r.y[:], fromBig(maybeReduceModP(bigY))) - copy(r.z[:], one) - p256MulAsm(r.x[:], r.x[:], rr[:]) - p256MulAsm(r.y[:], r.y[:], rr[:]) - r.p256ScalarMult(p256GetMultiplier(scalar)) - return r.p256PointToAffine() -} - -// scalarIsZero returns 1 if scalar represents the zero value, and zero -// otherwise. -func scalarIsZero(scalar []byte) int { - b := byte(0) - for _, s := range scalar { - b |= s - } - return subtle.ConstantTimeByteEq(b, 0) -} - -func (p *p256Point) p256PointToAffine() (x, y *big.Int) { - zInv := make([]byte, 32) - zInvSq := make([]byte, 32) - - p256Inverse(zInv, p.z[:]) - p256Sqr(zInvSq, zInv) - p256MulAsm(zInv, zInv, zInvSq) - - p256MulAsm(zInvSq, p.x[:], zInvSq) - p256MulAsm(zInv, p.y[:], zInv) - - p256FromMont(zInvSq, zInvSq) - p256FromMont(zInv, zInv) - - return new(big.Int).SetBytes(zInvSq), new(big.Int).SetBytes(zInv) -} - -// p256Inverse sets out to in^-1 mod p. 
-func p256Inverse(out, in []byte) { - var stack [6 * 32]byte - p2 := stack[32*0 : 32*0+32] - p4 := stack[32*1 : 32*1+32] - p8 := stack[32*2 : 32*2+32] - p16 := stack[32*3 : 32*3+32] - p32 := stack[32*4 : 32*4+32] - - p256Sqr(out, in) - p256MulAsm(p2, out, in) // 3*p - - p256Sqr(out, p2) - p256Sqr(out, out) - p256MulAsm(p4, out, p2) // f*p - - p256Sqr(out, p4) - p256Sqr(out, out) - p256Sqr(out, out) - p256Sqr(out, out) - p256MulAsm(p8, out, p4) // ff*p - - p256Sqr(out, p8) - - for i := 0; i < 7; i++ { - p256Sqr(out, out) - } - p256MulAsm(p16, out, p8) // ffff*p - - p256Sqr(out, p16) - for i := 0; i < 15; i++ { - p256Sqr(out, out) - } - p256MulAsm(p32, out, p16) // ffffffff*p - - p256Sqr(out, p32) - - for i := 0; i < 31; i++ { - p256Sqr(out, out) - } - p256MulAsm(out, out, in) - - for i := 0; i < 32*4; i++ { - p256Sqr(out, out) - } - p256MulAsm(out, out, p32) - - for i := 0; i < 32; i++ { - p256Sqr(out, out) - } - p256MulAsm(out, out, p32) - - for i := 0; i < 16; i++ { - p256Sqr(out, out) - } - p256MulAsm(out, out, p16) - - for i := 0; i < 8; i++ { - p256Sqr(out, out) - } - p256MulAsm(out, out, p8) - - p256Sqr(out, out) - p256Sqr(out, out) - p256Sqr(out, out) - p256Sqr(out, out) - p256MulAsm(out, out, p4) - - p256Sqr(out, out) - p256Sqr(out, out) - p256MulAsm(out, out, p2) - - p256Sqr(out, out) - p256Sqr(out, out) - p256MulAsm(out, out, in) -} - -func boothW5(in uint) (int, int) { - var s uint = ^((in >> 5) - 1) - var d uint = (1 << 6) - in - 1 - d = (d & s) | (in & (^s)) - d = (d >> 1) + (d & 1) - return int(d), int(s & 1) -} - -func boothW7(in uint) (int, int) { - var s uint = ^((in >> 7) - 1) - var d uint = (1 << 8) - in - 1 - d = (d & s) | (in & (^s)) - d = (d >> 1) + (d & 1) - return int(d), int(s & 1) -} - -func initTable() { - p256PreFast = new([37][64]p256Point) //z coordinate not used - basePoint := p256Point{ - x: [32]byte{0x18, 0x90, 0x5f, 0x76, 0xa5, 0x37, 0x55, 0xc6, 0x79, 0xfb, 0x73, 0x2b, 0x77, 0x62, 0x25, 0x10, - 0x75, 0xba, 0x95, 0xfc, 0x5f, 0xed, 0xb6, 0x01, 0x79, 0xe7, 0x30, 0xd4, 0x18, 0xa9, 0x14, 0x3c}, //(p256.x*2^256)%p - y: [32]byte{0x85, 0x71, 0xff, 0x18, 0x25, 0x88, 0x5d, 0x85, 0xd2, 0xe8, 0x86, 0x88, 0xdd, 0x21, 0xf3, 0x25, - 0x8b, 0x4a, 0xb8, 0xe4, 0xba, 0x19, 0xe4, 0x5c, 0xdd, 0xf2, 0x53, 0x57, 0xce, 0x95, 0x56, 0x0a}, //(p256.y*2^256)%p - z: [32]byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}, //(p256.z*2^256)%p - } - - t1 := new(p256Point) - t2 := new(p256Point) - *t2 = basePoint - - zInv := make([]byte, 32) - zInvSq := make([]byte, 32) - for j := 0; j < 64; j++ { - *t1 = *t2 - for i := 0; i < 37; i++ { - // The window size is 7 so we need to double 7 times. - if i != 0 { - for k := 0; k < 7; k++ { - p256PointDoubleAsm(t1, t1) - } - } - // Convert the point to affine form. (Its values are - // still in Montgomery form however.) 
- p256Inverse(zInv, t1.z[:]) - p256Sqr(zInvSq, zInv) - p256MulAsm(zInv, zInv, zInvSq) - - p256MulAsm(t1.x[:], t1.x[:], zInvSq) - p256MulAsm(t1.y[:], t1.y[:], zInv) - - copy(t1.z[:], basePoint.z[:]) - // Update the table entry - copy(p256PreFast[i][j].x[:], t1.x[:]) - copy(p256PreFast[i][j].y[:], t1.y[:]) - } - if j == 0 { - p256PointDoubleAsm(t2, &basePoint) - } else { - p256PointAddAsm(t2, t2, &basePoint) - } - } -} - -func (p *p256Point) p256BaseMult(scalar []byte) { - wvalue := (uint(scalar[31]) << 1) & 0xff - sel, sign := boothW7(uint(wvalue)) - p256SelectBase(p, p256PreFast[0][:], sel) - p256NegCond(p, sign) - - copy(p.z[:], one[:]) - var t0 p256Point - - copy(t0.z[:], one[:]) - - index := uint(6) - zero := sel - - for i := 1; i < 37; i++ { - if index < 247 { - wvalue = ((uint(scalar[31-index/8]) >> (index % 8)) + (uint(scalar[31-index/8-1]) << (8 - (index % 8)))) & 0xff - } else { - wvalue = (uint(scalar[31-index/8]) >> (index % 8)) & 0xff - } - index += 7 - sel, sign = boothW7(uint(wvalue)) - p256SelectBase(&t0, p256PreFast[i][:], sel) - p256PointAddAffineAsm(p, p, &t0, sign, sel, zero) - zero |= sel - } -} - -func (p *p256Point) p256ScalarMult(scalar []byte) { - // precomp is a table of precomputed points that stores powers of p - // from p^1 to p^16. - var precomp [16]p256Point - var t0, t1, t2, t3 p256Point - - // Prepare the table - *&precomp[0] = *p - - p256PointDoubleAsm(&t0, p) - p256PointDoubleAsm(&t1, &t0) - p256PointDoubleAsm(&t2, &t1) - p256PointDoubleAsm(&t3, &t2) - *&precomp[1] = t0 // 2 - *&precomp[3] = t1 // 4 - *&precomp[7] = t2 // 8 - *&precomp[15] = t3 // 16 - - p256PointAddAsm(&t0, &t0, p) - p256PointAddAsm(&t1, &t1, p) - p256PointAddAsm(&t2, &t2, p) - *&precomp[2] = t0 // 3 - *&precomp[4] = t1 // 5 - *&precomp[8] = t2 // 9 - - p256PointDoubleAsm(&t0, &t0) - p256PointDoubleAsm(&t1, &t1) - *&precomp[5] = t0 // 6 - *&precomp[9] = t1 // 10 - - p256PointAddAsm(&t2, &t0, p) - p256PointAddAsm(&t1, &t1, p) - *&precomp[6] = t2 // 7 - *&precomp[10] = t1 // 11 - - p256PointDoubleAsm(&t0, &t0) - p256PointDoubleAsm(&t2, &t2) - *&precomp[11] = t0 // 12 - *&precomp[13] = t2 // 14 - - p256PointAddAsm(&t0, &t0, p) - p256PointAddAsm(&t2, &t2, p) - *&precomp[12] = t0 // 13 - *&precomp[14] = t2 // 15 - - // Start scanning the window from top bit - index := uint(254) - var sel, sign int - - wvalue := (uint(scalar[31-index/8]) >> (index % 8)) & 0x3f - sel, _ = boothW5(uint(wvalue)) - p256Select(p, precomp[:], sel) - zero := sel - - for index > 4 { - index -= 5 - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - - if index < 247 { - wvalue = ((uint(scalar[31-index/8]) >> (index % 8)) + (uint(scalar[31-index/8-1]) << (8 - (index % 8)))) & 0x3f - } else { - wvalue = (uint(scalar[31-index/8]) >> (index % 8)) & 0x3f - } - - sel, sign = boothW5(uint(wvalue)) - - p256Select(&t0, precomp[:], sel) - p256NegCond(&t0, sign) - p256PointAddAsm(&t1, p, &t0) - p256MovCond(&t1, &t1, p, sel) - p256MovCond(p, &t1, &t0, zero) - zero |= sel - } - - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - p256PointDoubleAsm(p, p) - - wvalue = (uint(scalar[31]) << 1) & 0x3f - sel, sign = boothW5(uint(wvalue)) - - p256Select(&t0, precomp[:], sel) - p256NegCond(&t0, sign) - p256PointAddAsm(&t1, p, &t0) - p256MovCond(&t1, &t1, p, sel) - p256MovCond(p, &t1, &t0, zero) -} -- 2.50.0
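
The init() hunk in p256_asm.go above exists because the embedded precomputed table stores each 64-bit limb in little-endian byte order, which matches the in-memory layout only on little-endian hosts; on s390x (big-endian) the blob has to be decoded limb by limb before the unsafe cast to *[43]p256AffineTable is meaningful. A minimal standalone sketch of that decode step, with illustrative names and a toy input in place of the real table:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// leLimbsToNative mirrors the patch's init() fix-up for s390x: the
// embedded table stores every 64-bit limb little-endian, so on a
// big-endian host each 8-byte group is decoded explicitly instead of
// being reinterpreted in place. Names and sizes here are illustrative.
func leLimbsToNative(raw []byte) []uint64 {
	limbs := make([]uint64, len(raw)/8)
	for i := range limbs {
		limbs[i] = binary.LittleEndian.Uint64(raw[8*i : 8*i+8])
	}
	return limbs
}

func main() {
	raw := []byte{1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0}
	fmt.Println(leLimbsToNative(raw)) // [1 2] on any host byte order
}
```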
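Throughout the new assembly, VPDI $0x4 swaps the two 64-bit doublewords of a vector register, converting between the byte order VL/VST see in memory on s390x and the limb order the field arithmetic expects. Because the swap is its own inverse, one routine serves both directions, which is why p256LittleToBig, p256OrdLittleToBig, and p256OrdBigToLittle all JMP to p256BigToLittle. A pure-Go model of the instruction's effect (a hypothetical helper, not part of the package):

```go
package main

import "fmt"

// vpdiSwap models VPDI $0x4 applied to one 16-byte vector register:
// the two 64-bit doublewords change places. Applying it twice is the
// identity, so the same code path converts in either direction.
func vpdiSwap(v [16]byte) [16]byte {
	var r [16]byte
	copy(r[:8], v[8:])
	copy(r[8:], v[:8])
	return r
}

func main() {
	var v [16]byte
	for i := range v {
		v[i] = byte(i)
	}
	fmt.Println(vpdiSwap(vpdiSwap(v)) == v) // true: the swap is an involution
}
```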
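The new p256Sqr signature takes an n parameter: the loop in the patch feeds T0/T1 back into X0/X1 with VLR and only byte-swaps and stores after n chained squarings, saving the per-squaring loads and stores that addition-chain callers would otherwise pay. Functionally the routine computes in^(2^n) in the Montgomery domain; a math/big stand-in for that contract, ignoring the Montgomery representation:

```go
package main

import (
	"fmt"
	"math/big"
)

// sqrN models the new p256Sqr(res, in, n) contract: n modular
// squarings chained without intermediate stores, i.e. in^(2^n) mod p.
// Plain math/big squaring stands in for the Montgomery arithmetic.
func sqrN(in, p *big.Int, n int) *big.Int {
	res := new(big.Int).Set(in)
	for i := 0; i < n; i++ {
		res.Mul(res, res)
		res.Mod(res, p)
	}
	return res
}

func main() {
	// The P-256 field prime.
	p, _ := new(big.Int).SetString(
		"ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)
	fmt.Println(sqrN(big.NewInt(3), p, 4)) // 3^(2^4) mod p = 43046721
}
```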