// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build ignore
-
#include "textflag.h"
// This is a port of the s390x asm implementation to ppc64le.
// Big endian (ppc64) is not supported; the changes noted below
// are most likely needed to make it work there.
// - The string used with VPERM to swap the byte order
// for loads and stores.
-// - The EXTRACT_HI and EXTRACT_LO strings.
// - The constants that are loaded from CPOOL.
//
-// Permute string used by VPERM to reorder bytes
-// loaded or stored using LXVD2X or STXVD2X
-// on little endian.
-DATA byteswap<>+0(SB)/8, $0x08090a0b0c0d0e0f
-DATA byteswap<>+8(SB)/8, $0x0001020304050607
-
// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
-// The following are used with VPERM to extract the high and low
-// values from the intermediate results of a vector multiply.
-// They are used in the VMULTxxx macros. These have been tested
-// only on little endian, I think they would have to be different
-// for big endian.
-DATA p256permhilo<>+0x00(SB)/8, $0x0405060714151617 // least significant
-DATA p256permhilo<>+0x08(SB)/8, $0x0c0d0e0f1c1d1e1f
-DATA p256permhilo<>+0x10(SB)/8, $0x0001020310111213 // most significant
-DATA p256permhilo<>+0x18(SB)/8, $0x08090a0b18191A1B
-
// External declarations for constants
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160
-GLOBL p256permhilo<>(SB), 8, $32
-GLOBL byteswap<>+0(SB), RODATA, $16
// The following macros are used to implement the ppc64le
// equivalents of the corresponding s390x functions.
// An implementation for big endian would have to be
// investigated; it would likely differ.
//
-// Vector multiply low word
-//
-// VMLF x0, x1, out_low
-#define VMULT_LOW(x1, x2, out_low) \
- VMULUWM x1, x2, out_low
-
-//
-// Vector multiply high word
-//
-// VMLHF x0, x1, out_hi
-#define VMULT_HI(x1, x2, out_hi) \
- VMULEUW x1, x2, TMP1; \
- VMULOUW x1, x2, TMP2; \
- VPERM TMP1, TMP2, EXTRACT_HI, out_hi
-
//
// Vector multiply word
//
// VMLF x0, x1, out_low
// VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
- VMULEUW x1, x2, TMP1; \
- VMULOUW x1, x2, TMP2; \
- VPERM TMP1, TMP2, EXTRACT_LO, out_low; \
- VPERM TMP1, TMP2, EXTRACT_HI, out_hi
+ VMULEUW x1, x2, TMP1; \
+ VMULOUW x1, x2, TMP2; \
+ VMRGEW TMP1, TMP2, out_hi; \
+ VMRGOW TMP1, TMP2, out_low
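+// A sketch of the lane arithmetic above: VMULEUW/VMULOUW form the
+// 64-bit products of the even/odd word lanes of x1 and x2, and
+// VMRGEW/VMRGOW then gather the high and low 32-bit halves of those
+// products, so that for each word lane i:
+//   out_low[i] = (x1[i]*x2[i]) & 0xffffffff
+//   out_hi[i]  = (x1[i]*x2[i]) >> 32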
//
// Vector multiply add word
//
// VMALF x0, x1, y, out_low
// VMALHF x0, x1, y, out_hi
-#define VMULT_ADD(x1, x2, y, out_low, out_hi) \
- VSPLTISW $1, TMP1; \
- VMULEUW y, TMP1, TMP2; \
- VMULOUW y, TMP1, TMP1; \
- VMULEUW x1, x2, out_low; \
- VMULOUW x1, x2, out_hi; \
- VADDUDM TMP1, out_hi, TMP1; \
- VADDUDM TMP2, out_low, TMP2; \
- VPERM TMP2, TMP1, EXTRACT_LO, out_low; \
- VPERM TMP2, TMP1, EXTRACT_HI, out_hi
-
-//
-// Vector multiply add high word
-//
-// VMALF x0, x1, y, out_low
-// VMALHF x0, x1, y, out_hi
-#define VMULT_ADD_HI(x1, x2, y, out_low, out_hi) \
- VSPLTISW $1, TMP1; \
- VMULOUW y, TMP1, TMP2; \
- VMULEUW y, TMP1, TMP1; \
- VMULEUW x1, x2, out_hi; \
- VMULOUW x1, x2, out_low; \
- VADDUDM TMP1, out_hi, TMP1; \
- VADDUDM TMP2, out_low, TMP2; \
- VPERM TMP2, TMP1, EXTRACT_HI, out_hi
-
-//
-// Vector multiply add low word
-//
-// VMALF s0, x1, y, out_low
-#define VMULT_ADD_LOW(x1, x2, y, out_low) \
- VMULUWM x1, x2, out_low; \
- VADDUWM out_low, y, out_low
+#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
+ VMULEUW y, one, TMP2; \
+ VMULOUW y, one, TMP1; \
+ VMULEUW x1, x2, out_low; \
+ VMULOUW x1, x2, out_hi; \
+ VADDUDM TMP2, out_low, TMP2; \
+ VADDUDM TMP1, out_hi, TMP1; \
+ VMRGOW TMP2, TMP1, out_low; \
+ VMRGEW TMP2, TMP1, out_hi
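+// A sketch of the intent, assuming the one argument holds 1 in every
+// word lane (the callers set it up with VSPLTISW $1): multiplying y
+// by 1 widens its 32-bit lanes into 64-bit products that can be added
+// to the x1*x2 products before the halves are re-merged, i.e.
+//   out_low[i] = (x1[i]*x2[i] + y[i]) & 0xffffffff
+//   out_hi[i]  = (x1[i]*x2[i] + y[i]) >> 32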
#define res_ptr R3
#define a_ptr R4
#undef res_ptr
#undef a_ptr
-// func p256NegCond(val *p256Point, cond int)
#define P1ptr R3
#define CPOOL R7
#define Y1L V0
-#define Y1L_ VS32
#define Y1H V1
-#define Y1H_ VS33
#define T1L V2
-#define T1L_ VS34
#define T1H V3
-#define T1H_ VS35
-
-#define SWAP V28
-#define SWAP_ VS60
#define PL V30
-#define PL_ VS62
#define PH V31
-#define PH_ VS63
-#define SEL1 V5
-#define SEL1_ VS37
#define CAR1 V6
-//
-// iff cond == 1 val <- -val
-//
+// func p256NegCond(val *p256Element, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
MOVD val+0(FP), P1ptr
MOVD $16, R16
- MOVD $32, R17
- MOVD $48, R18
- MOVD $40, R19
MOVD cond+8(FP), R6
CMP $0, R6
BEQ neg0done // nothing to negate if cond == 0
MOVD $p256mul<>+0x00(SB), CPOOL
- MOVD $byteswap<>+0x00(SB), R8
- LXVD2X (R8)(R0), SWAP_
-
- LXVD2X (P1ptr)(R17), Y1L_
- LXVD2X (P1ptr)(R18), Y1H_
+ LXVD2X (P1ptr)(R0), Y1L
+ LXVD2X (P1ptr)(R16), Y1H
- VPERM Y1H, Y1H, SWAP, Y1H
- VPERM Y1L, Y1L, SWAP, Y1L
+ XXPERMDI Y1H, Y1H, $2, Y1H
+ XXPERMDI Y1L, Y1L, $2, Y1L
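+ // XXPERMDI with selector $2 swaps the two doublewords of the
+ // register, the same fixup the old byteswap<> VPERM performed
+ // after an LXVD2X load on little endian.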
- LXVD2X (CPOOL)(R0), PL_
- LXVD2X (CPOOL)(R16), PH_
+ LXVD2X (CPOOL)(R0), PL
+ LXVD2X (CPOOL)(R16), PH
VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry
VSUBUQM PL, Y1L, T1L // subtract part2 giving result
VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2
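+ // The three subtracts above chain through CAR1 to compute a single
+ // 256-bit T1H:T1L = PH:PL - Y1H:Y1L across the two 128-bit halves.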
- VPERM T1H, T1H, SWAP, T1H
- VPERM T1L, T1L, SWAP, T1L
+ XXPERMDI T1H, T1H, $2, T1H
+ XXPERMDI T1L, T1L, $2, T1L
- STXVD2X T1L_, (R17+P1ptr)
- STXVD2X T1H_, (R18+P1ptr)
+ STXVD2X T1L, (R0+P1ptr)
+ STXVD2X T1H, (R16+P1ptr)
neg0done:
RET
#undef P1ptr
#undef CPOOL
#undef Y1L
-#undef Y1L_
#undef Y1H
-#undef Y1H_
#undef T1L
-#undef T1L_
#undef T1H
-#undef T1H_
#undef PL
-#undef PL_
#undef PH
-#undef PH_
-#undef SEL1
-#undef SEL1_
#undef CAR1
-//
-// if cond == 0 res <-b else res <-a
-//
-// func p256MovCond(res, a, b *p256Point, cond int)
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
-#define FROMptr R7
#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
-#define X1L_ VS32
-#define X1H_ VS33
-#define Y1L_ VS34
-#define Y1H_ VS35
-#define Z1L_ VS36
-#define Z1H_ VS37
+#define X2L V6
+#define X2H V7
+#define Y2L V8
+#define Y2H V9
+#define Z2L V10
+#define Z2H V11
+#define SEL V12
+#define ZER V13
// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
+// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
MOVD res+0(FP), P3ptr
MOVD a+8(FP), P1ptr
MOVD b+16(FP), P2ptr
- MOVD cond+24(FP), R6
MOVD $16, R16
MOVD $32, R17
MOVD $48, R18
MOVD $56, R21
MOVD $64, R19
MOVD $80, R20
-
- // Check the condition
- CMP $0, R6
-
- // If 0, use b as the source
- BEQ FROMB
-
- // Not 0, use a as the source
- MOVD P1ptr, FROMptr
- BR LOADVALS
-
-FROMB:
- MOVD P2ptr, FROMptr
-
-LOADVALS:
- // Load from a or b depending on the setting
- // of FROMptr
- LXVW4X (FROMptr+R0), X1H_
- LXVW4X (FROMptr+R16), X1L_
- LXVW4X (FROMptr+R17), Y1H_
- LXVW4X (FROMptr+R18), Y1L_
- LXVW4X (FROMptr+R19), Z1H_
- LXVW4X (FROMptr+R20), Z1L_
-
- STXVW4X X1H_, (P3ptr+R0)
- STXVW4X X1L_, (P3ptr+R16)
- STXVW4X Y1H_, (P3ptr+R17)
- STXVW4X Y1L_, (P3ptr+R18)
- STXVW4X Z1H_, (P3ptr+R19)
- STXVW4X Z1L_, (P3ptr+R20)
+ // cond is R1 + 24 (cond offset) + 32
+ LXVDSX (R1)(R21), SEL
+ VSPLTISB $0, ZER
+ // SEL controls whether to store a or b
+ VCMPEQUD SEL, ZER, SEL
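+ // SEL is now all-ones in each doubleword where cond == 0, so the
+ // VSELs below pick b when cond == 0 and a otherwise, with no
+ // branch on cond.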
+
+ LXVD2X (P1ptr+R0), X1H
+ LXVD2X (P1ptr+R16), X1L
+ LXVD2X (P1ptr+R17), Y1H
+ LXVD2X (P1ptr+R18), Y1L
+ LXVD2X (P1ptr+R19), Z1H
+ LXVD2X (P1ptr+R20), Z1L
+
+ LXVD2X (P2ptr+R0), X2H
+ LXVD2X (P2ptr+R16), X2L
+ LXVD2X (P2ptr+R17), Y2H
+ LXVD2X (P2ptr+R18), Y2L
+ LXVD2X (P2ptr+R19), Z2H
+ LXVD2X (P2ptr+R20), Z2L
+
+ VSEL X1H, X2H, SEL, X1H
+ VSEL X1L, X2L, SEL, X1L
+ VSEL Y1H, Y2H, SEL, Y1H
+ VSEL Y1L, Y2L, SEL, Y1L
+ VSEL Z1H, Z2H, SEL, Z1H
+ VSEL Z1L, Z2L, SEL, Z1L
+
+ STXVD2X X1H, (P3ptr+R0)
+ STXVD2X X1L, (P3ptr+R16)
+ STXVD2X Y1H, (P3ptr+R17)
+ STXVD2X Y1L, (P3ptr+R18)
+ STXVD2X Z1H, (P3ptr+R19)
+ STXVD2X Z1L, (P3ptr+R20)
RET
#undef P3ptr
#undef P1ptr
#undef P2ptr
-#undef FROMptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
-#undef X1L_
-#undef X1H_
-#undef Y1L_
-#undef Y1H_
-#undef Z1L_
-#undef Z1H_
-//
-// Select the point from the table for idx
-//
-// func p256Select(point *p256Point, table []p256Point, idx int)
+#undef X2L
+#undef X2H
+#undef Y2L
+#undef Y2H
+#undef Z2L
+#undef Z2H
+#undef SEL
+#undef ZER
+
#define P3ptr R3
#define P1ptr R4
#define COUNT R5
#define Y1H V3
#define Z1L V4
#define Z1H V5
-#define X1L_ VS32
-#define X1H_ VS33
-#define Y1L_ VS34
-#define Y1H_ VS35
-#define Z1L_ VS36
-#define Z1H_ VS37
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
-#define X2L_ VS38
-#define X2H_ VS39
-#define Y2L_ VS40
-#define Y2H_ VS41
-#define Z2L_ VS42
-#define Z2H_ VS43
#define ONE V18
#define IDX V19
#define SEL1 V20
-#define SEL1_ VS52
#define SEL2 V21
-//
-TEXT ·p256Select(SB), NOSPLIT, $0-40
- MOVD point+0(FP), P3ptr
+// func p256Select(res *P256Point, table *p256Table, idx int)
+TEXT ·p256Select(SB), NOSPLIT, $0-24
+ MOVD res+0(FP), P3ptr
MOVD table+8(FP), P1ptr
MOVD $16, R16
MOVD $32, R17
MOVD $48, R18
MOVD $64, R19
MOVD $80, R20
- LXVDSX (R1)(R19), SEL1_ // VLREPG idx+32(FP), SEL1
+ LXVDSX (R1)(R18), SEL1 // VLREPG idx+16(FP), SEL1
VSPLTB $7, SEL1, IDX // splat byte
VSPLTISB $1, ONE // VREPIB $1, ONE
VSPLTISB $1, SEL2 // VREPIB $1, SEL2
// LXVD2X is used here since data alignment doesn't
// matter.
- LXVD2X (P1ptr+R0), X2H_
- LXVD2X (P1ptr+R16), X2L_
- LXVD2X (P1ptr+R17), Y2H_
- LXVD2X (P1ptr+R18), Y2L_
- LXVD2X (P1ptr+R19), Z2H_
- LXVD2X (P1ptr+R20), Z2L_
+ LXVD2X (P1ptr+R0), X2H
+ LXVD2X (P1ptr+R16), X2L
+ LXVD2X (P1ptr+R17), Y2H
+ LXVD2X (P1ptr+R18), Y2L
+ LXVD2X (P1ptr+R19), Z2H
+ LXVD2X (P1ptr+R20), Z2L
VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK
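+ // SEL1 is all-ones only where the per-byte counter in SEL2 equals
+ // idx, so each iteration folds in at most one table entry and the
+ // memory access pattern does not depend on idx.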
// Add 1 to all bytes in SEL2
VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
ADD $96, P1ptr
- BC 16, 0, loop_select
+ BDNZ loop_select
// STXVD2X is used here so that alignment doesn't
// need to be verified. Since values were loaded
// using LXVD2X this is OK.
- STXVD2X X1H_, (P3ptr+R0)
- STXVD2X X1L_, (P3ptr+R16)
- STXVD2X Y1H_, (P3ptr+R17)
- STXVD2X Y1L_, (P3ptr+R18)
- STXVD2X Z1H_, (P3ptr+R19)
- STXVD2X Z1L_, (P3ptr+R20)
+ STXVD2X X1H, (P3ptr+R0)
+ STXVD2X X1L, (P3ptr+R16)
+ STXVD2X Y1H, (P3ptr+R17)
+ STXVD2X Y1L, (P3ptr+R18)
+ STXVD2X Z1H, (P3ptr+R19)
+ STXVD2X Z1L, (P3ptr+R20)
RET
#undef P3ptr
#undef Y2H
#undef Z2L
#undef Z2H
-#undef X2L_
-#undef X2H_
-#undef Y2L_
-#undef Y2H_
-#undef Z2L_
-#undef Z2H_
#undef ONE
#undef IDX
#undef SEL1
-#undef SEL1_
#undef SEL2
-// func p256SelectBase(point, table []uint64, idx int)
+// The following functions all reverse the byte order.
+
+//func p256BigToLittle(res *p256Element, in *[32]byte)
+TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
+ MOVD res+0(FP), R3
+ MOVD in+8(FP), R4
+ BR p256InternalEndianSwap<>(SB)
+
+//func p256LittleToBig(res *[32]byte, in *p256Element)
+TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
+ MOVD res+0(FP), R3
+ MOVD in+8(FP), R4
+ BR p256InternalEndianSwap<>(SB)
+
+//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
+TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
+ MOVD res+0(FP), R3
+ MOVD in+8(FP), R4
+ BR p256InternalEndianSwap<>(SB)
+
+//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
+TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
+ MOVD res+0(FP), R3
+ MOVD in+8(FP), R4
+ BR p256InternalEndianSwap<>(SB)
+
+TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
+ // Index registers needed for the byte-reversed moves (MOVDBR)
+ MOVD $8, R9
+ MOVD $16, R10
+ MOVD $24, R14
+
+ MOVDBR (R0)(R4), R5
+ MOVDBR (R9)(R4), R6
+ MOVDBR (R10)(R4), R7
+ MOVDBR (R14)(R4), R8
+
+ MOVD R8, 0(R3)
+ MOVD R7, 8(R3)
+ MOVD R6, 16(R3)
+ MOVD R5, 24(R3)
+
+ RET
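+// A sketch of the swap: each MOVDBR load byte-reverses one 64-bit
+// doubleword, and the stores write the four doublewords back in the
+// opposite order, so the whole 32-byte value is reversed end to end.
+// The exported helpers above all tail-call (BR) into this shared
+// body with res in R3 and in in R4.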
+
#define P3ptr R3
#define P1ptr R4
#define COUNT R5
#define Y2H V9
#define Z2L V10
#define Z2H V11
-#define X2L_ VS38
-#define X2H_ VS39
-#define Y2L_ VS40
-#define Y2H_ VS41
-#define Z2L_ VS42
-#define Z2H_ VS43
#define ONE V18
#define IDX V19
#define SEL1 V20
-#define SEL1_ VS52
#define SEL2 V21
-TEXT ·p256SelectBase(SB), NOSPLIT, $0-40
- MOVD point+0(FP), P3ptr
+
+// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
+TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
+ MOVD res+0(FP), P3ptr
MOVD table+8(FP), P1ptr
MOVD $16, R16
MOVD $32, R17
MOVD $48, R18
- MOVD $64, R19
- MOVD $80, R20
- MOVD $56, R21
- LXVDSX (R1)(R19), SEL1_
+ LXVDSX (R1)(R18), SEL1
VSPLTB $7, SEL1, IDX // splat byte
VSPLTISB $1, ONE // Vector with byte 1s
VSPLTISB $1, SEL2 // Vector with byte 1s
- MOVD $65, COUNT
+ MOVD $32, COUNT // number of entries in p256AffineTable
MOVD COUNT, CTR // loop count
VSPLTISB $0, X1H // VZERO X1H
VSPLTISB $0, X1L // VZERO X1L
VSPLTISB $0, Y1H // VZERO Y1H
VSPLTISB $0, Y1L // VZERO Y1L
- VSPLTISB $0, Z1H // VZERO Z1H
- VSPLTISB $0, Z1L // VZERO Z1L
loop_select:
- LXVD2X (P1ptr+R0), X2H_
- LXVD2X (P1ptr+R16), X2L_
- LXVD2X (P1ptr+R17), Y2H_
- LXVD2X (P1ptr+R18), Y2L_
- LXVD2X (P1ptr+R19), Z2H_
- LXVD2X (P1ptr+R20), Z2L_
+ LXVD2X (P1ptr+R0), X2H
+ LXVD2X (P1ptr+R16), X2L
+ LXVD2X (P1ptr+R17), Y2H
+ LXVD2X (P1ptr+R18), Y2L
VCMPEQUD SEL2, IDX, SEL1 // Compare against idx
VSEL X1H, X2H, SEL1, X1H
VSEL X1L, X2L, SEL1, X1L
VSEL Y1L, Y2L, SEL1, Y1L
VSEL Y1H, Y2H, SEL1, Y1H
- VSEL Z1L, Z2L, SEL1, Z1L
- VSEL Z1H, Z2H, SEL1, Z1H
VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
- ADD $96, P1ptr // Next chunk
- BC 16, 0, loop_select
-
- STXVD2X X1H_, (P3ptr+R0)
- STXVD2X X1L_, (P3ptr+R16)
- STXVD2X Y1H_, (P3ptr+R17)
- STXVD2X Y1L_, (P3ptr+R18)
- STXVD2X Z1H_, (P3ptr+R19)
- STXVD2X Z1L_, (P3ptr+R20)
+ ADD $64, P1ptr // Next chunk
+ BDNZ loop_select
+
+ STXVD2X X1H, (P3ptr+R0)
+ STXVD2X X1L, (P3ptr+R16)
+ STXVD2X Y1H, (P3ptr+R17)
+ STXVD2X Y1L, (P3ptr+R18)
RET
#undef P3ptr
#undef Y2H
#undef Z2L
#undef Z2H
-#undef X1L_
-#undef X1H_
-#undef X2L_
-#undef X2H_
-#undef Y1L_
-#undef Y1H_
-#undef Y2L_
-#undef Y2H_
-#undef Z1L_
-#undef Z1H_
-#undef Z2L_
-#undef Z2H_
#undef ONE
#undef IDX
#undef SEL1
-#undef SEL1_
#undef SEL2
-#undef SWAP
-#undef SWAP_
-// ---------------------------------------
-// func p256FromMont(res, in []byte)
#define res_ptr R3
#define x_ptr R4
#define CPOOL R7
#define T0 V0
-#define T0_ VS32
#define T1 V1
-#define T1_ VS33
#define T2 V2
#define TT0 V3
#define TT1 V4
-#define TT0_ VS35
-#define TT1_ VS36
#define ZER V6
#define SEL1 V7
-#define SEL1_ VS39
#define SEL2 V8
-#define SEL2_ VS40
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V13
-#define PL_ VS45
#define PH V14
-#define PH_ VS46
-#define SWAP V28
-#define SWAP_ VS57
-TEXT ·p256FromMont(SB), NOSPLIT, $0-48
+// func p256FromMont(res, in *p256Element)
+TEXT ·p256FromMont(SB), NOSPLIT, $0-16
MOVD res+0(FP), res_ptr
- MOVD in+24(FP), x_ptr
+ MOVD in+8(FP), x_ptr
MOVD $16, R16
MOVD $32, R17
MOVD $48, R18
MOVD $64, R19
MOVD $p256<>+0x00(SB), CPOOL
- MOVD $byteswap<>+0x00(SB), R15
VSPLTISB $0, T2 // VZERO T2
VSPLTISB $0, ZER // VZERO ZER
// Constants are defined so that the LXVD2X is correct
- LXVD2X (CPOOL+R0), PH_
- LXVD2X (CPOOL+R16), PL_
+ LXVD2X (CPOOL+R0), PH
+ LXVD2X (CPOOL+R16), PL
// VPERM byte selections
- LXVD2X (CPOOL+R18), SEL2_
- LXVD2X (CPOOL+R19), SEL1_
-
- LXVD2X (R15)(R0), SWAP_
+ LXVD2X (CPOOL+R18), SEL2
+ LXVD2X (CPOOL+R19), SEL1
- LXVD2X (R16)(x_ptr), T1_
- LXVD2X (R0)(x_ptr), T0_
+ LXVD2X (R16)(x_ptr), T1
+ LXVD2X (R0)(x_ptr), T0
// Put in true little endian order
- VPERM T0, T0, SWAP, T0
- VPERM T1, T1, SWAP, T1
+ XXPERMDI T0, T0, $2, T0
+ XXPERMDI T1, T1, $2, T1
// First round
VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
// Reorder the bytes so STXVD2X can be used.
- // TT0, TT1 used for VPERM result in case
+ // TT0, TT1 hold the doubleword-swapped result in case
// the caller expects T0, T1 to be good.
- VPERM T0, T0, SWAP, TT0
- VPERM T1, T1, SWAP, TT1
+ XXPERMDI T0, T0, $2, TT0
+ XXPERMDI T1, T1, $2, TT1
- STXVD2X TT0_, (R0)(res_ptr)
- STXVD2X TT1_, (R16)(res_ptr)
+ STXVD2X TT0, (R0)(res_ptr)
+ STXVD2X TT1, (R16)(res_ptr)
RET
#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
-#undef T0_
#undef T1
-#undef T1_
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
-#undef SEL1_
#undef SEL2
-#undef SEL2_
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
-#undef PL_
#undef PH
-#undef PH_
-#undef SWAP
-#undef SWAP_
// ---------------------------------------
// p256MulInternal
#define SEL4 V6 // Overloaded with YDIG,CAR1
#define SEL5 V9 // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3
-#define SEL1_ VS45
-#define SEL2_ VS41
-#define SEL3_ VS42
-#define SEL4_ VS38
-#define SEL5_ VS41
-#define SEL6_ VS42
-
-// TMP1, TMP2, EXTRACT_LO, EXTRACT_HI used in
+
+// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13 // Overloaded with RED3
#define TMP2 V27
-#define EVENODD R5
-#define EXTRACT_LO V28
-#define EXTRACT_LO_ VS60
-#define EXTRACT_HI V29
-#define EXTRACT_HI_ VS61
+#define ONE V29 // 1s splatted by word
/* *
* To follow the flow of bits, for your own sanity a stiff drink, need you shall.
MOVD $96, R21
MOVD $112, R22
- MOVD $p256permhilo<>+0x00(SB), EVENODD
-
- // These values are used by the VMULTxxx macros to
- // extract the high and low portions of the intermediate
- // result.
- LXVD2X (R0)(EVENODD), EXTRACT_LO_
- LXVD2X (R16)(EVENODD), EXTRACT_HI_
-
// ---------------------------------------------------
VSPLTW $3, Y0, YDIG // VREPF Y0 is input
VMULT(X0, YDIG, ADD1, ADD1H)
VMULT(X1, YDIG, ADD2, ADD2H)
+ VSPLTISW $1, ONE
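+ // ONE holds the constant 1 in each word lane; VMULT_ADD uses it
+ // to widen the 32-bit lanes of its addend into 64-bit products
+ // (VMULEUW/VMULOUW by 1) before the 64-bit adds.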
VSPLTW $2, Y0, YDIG // VREPF
// VMALF X0, YDIG, ADD1H, ADD3
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
- VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
- VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)
+ VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
+ VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
- LXVD2X (R17)(CPOOL), SEL1_
+ LXVD2X (R17)(CPOOL), SEL1
VSPLTISB $0, ZER // VZERO ZER
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ
- LXVD2X (R18)(CPOOL), SEL2_
- LXVD2X (R19)(CPOOL), SEL3_
- LXVD2X (R20)(CPOOL), SEL4_
+ LXVD2X (R18)(CPOOL), SEL2
+ LXVD2X (R19)(CPOOL), SEL3
+ LXVD2X (R20)(CPOOL), SEL4
VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
// ---------------------------------------------------
VSPLTW $1, Y0, YDIG // VREPF
- LXVD2X (R0)(EVENODD), EXTRACT_LO_
- LXVD2X (R16)(EVENODD), EXTRACT_HI_
// VMALHF X0, YDIG, T0, ADD1H
// VMALHF X1, YDIG, T1, ADD2H
// VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
// VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
- VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
- VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)
+ VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
+ VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
VSPLTW $0, Y0, YDIG // VREPF
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
- VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
- VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)
+ VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
+ VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
VSPLTISB $0, ZER // VZERO ZER
- LXVD2X (R17)(CPOOL), SEL1_
+ LXVD2X (R17)(CPOOL), SEL1
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
VADDEUQM T1, ADD4, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ
- LXVD2X (R18)(CPOOL), SEL2_
- LXVD2X (R19)(CPOOL), SEL3_
- LXVD2X (R20)(CPOOL), SEL4_
+ LXVD2X (R18)(CPOOL), SEL2
+ LXVD2X (R19)(CPOOL), SEL3
+ LXVD2X (R20)(CPOOL), SEL4
VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
// ---------------------------------------------------
VSPLTW $3, Y1, YDIG // VREPF
- LXVD2X (R0)(EVENODD), EXTRACT_LO_
- LXVD2X (R16)(EVENODD), EXTRACT_HI_
// VMALHF X0, YDIG, T0, ADD1H
// VMALHF X1, YDIG, T1, ADD2H
// VMALF X0, YDIG, T0, ADD1
// VMALF X1, YDIG, T1, ADD2
- VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
- VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)
+ VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
+ VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
VSPLTW $2, Y1, YDIG // VREPF
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
- VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
- VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)
+ VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
+ VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
- LXVD2X (R17)(CPOOL), SEL1_
+ LXVD2X (R17)(CPOOL), SEL1
VSPLTISB $0, ZER // VZERO ZER
- LXVD2X (R17)(CPOOL), SEL1_
+ LXVD2X (R17)(CPOOL), SEL1
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
VADDEUQM T1, ADD4, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ
- LXVD2X (R18)(CPOOL), SEL2_
- LXVD2X (R19)(CPOOL), SEL3_
- LXVD2X (R20)(CPOOL), SEL4_
+ LXVD2X (R18)(CPOOL), SEL2
+ LXVD2X (R19)(CPOOL), SEL3
+ LXVD2X (R20)(CPOOL), SEL4
VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
// ---------------------------------------------------
VSPLTW $1, Y1, YDIG // VREPF
- LXVD2X (R0)(EVENODD), EXTRACT_LO_
- LXVD2X (R16)(EVENODD), EXTRACT_HI_
// VMALHF X0, YDIG, T0, ADD1H
// VMALHF X1, YDIG, T1, ADD2H
// VMALF X0, YDIG, T0, ADD1
// VMALF X1, YDIG, T1, ADD2
- VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
- VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)
+ VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
+ VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
VSPLTW $0, Y1, YDIG // VREPF
// VMALF X1, YDIG, ADD2H, ADD4
// VMALHF X0, YDIG, ADD1H, ADD3H
// VMALHF X1, YDIG, ADD2H, ADD4H
- VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
- VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)
+ VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
+ VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
VSPLTISB $0, ZER // VZERO ZER
- LXVD2X (R17)(CPOOL), SEL1_
+ LXVD2X (R17)(CPOOL), SEL1
VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
VSLDOI $12, ADD2, ADD1, T0 // VSLDB
VADDEUQM T1, ADD4, CAR1, T1 // VACQ
VADDUQM T2, CAR2, T2 // VAQ
- LXVD2X (R21)(CPOOL), SEL5_
- LXVD2X (R22)(CPOOL), SEL6_
+ LXVD2X (R21)(CPOOL), SEL5
+ LXVD2X (R22)(CPOOL), SEL6
VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ
#undef SEL4
#undef SEL5
#undef SEL6
-#undef SEL1_
-#undef SEL2_
-#undef SEL3_
-#undef SEL4_
-#undef SEL5_
-#undef SEL6_
#undef YDIG
#undef ADD1H
#undef TMP1
#undef TMP2
-#undef EVENODD
-#undef EXTRACT_HI
-#undef EXTRACT_HI_
-#undef EXTRACT_LO
-#undef EXTRACT_LO_
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
VSPLTISB $0, ZER \ // VZERO
VOR T0, TT0, T0 \
VOR T1, TT1, T1
-// ---------------------------------------
-// func p256MulAsm(res, in1, in2 []byte)
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define TEMP R8
+#define N R9
// Parameters
#define X0 V0
#define Y1 V3
#define T0 V4
#define T1 V5
-#define X0_ VS32
-#define X1_ VS33
-#define Y0_ VS34
-#define Y1_ VS35
-#define T0_ VS36
-#define T1_ VS37
-#define SWAP V28
-#define SWAP_ VS60
// Constants
#define P0 V30
#define P1 V31
-#define P0_ VS62
-#define P1_ VS63
-//
-// Montgomery multiplication modulo P256
-//
-TEXT ·p256MulAsm(SB), NOSPLIT, $0-72
+// func p256Mul(res, in1, in2 *p256Element)
+TEXT ·p256Mul(SB), NOSPLIT, $0-24
MOVD res+0(FP), res_ptr
- MOVD in1+24(FP), x_ptr
- MOVD in2+48(FP), y_ptr
+ MOVD in1+8(FP), x_ptr
+ MOVD in2+16(FP), y_ptr
MOVD $16, R16
MOVD $32, R17
MOVD $p256mul<>+0x00(SB), CPOOL
- MOVD $byteswap<>+0x00(SB), R8
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R0)(x_ptr), X0_
- LXVD2X (R16)(x_ptr), X1_
+ LXVD2X (R0)(x_ptr), X0
+ LXVD2X (R16)(x_ptr), X1
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
- LXVD2X (R0)(y_ptr), Y0_
- LXVD2X (R16)(y_ptr), Y1_
+ LXVD2X (R0)(y_ptr), Y0
+ LXVD2X (R16)(y_ptr), Y1
- VPERM Y0, Y0, SWAP, Y0
- VPERM Y1, Y1, SWAP, Y1
+ XXPERMDI Y0, Y0, $2, Y0
+ XXPERMDI Y1, Y1, $2, Y1
- LXVD2X (R16)(CPOOL), P1_
- LXVD2X (R0)(CPOOL), P0_
+ LXVD2X (R16)(CPOOL), P1
+ LXVD2X (R0)(CPOOL), P0
CALL p256MulInternal<>(SB)
MOVD $p256mul<>+0x00(SB), CPOOL
- MOVD $byteswap<>+0x00(SB), R8
- LXVD2X (R8)(R0), SWAP_
+ XXPERMDI T0, T0, $2, T0
+ XXPERMDI T1, T1, $2, T1
+ STXVD2X T0, (R0)(res_ptr)
+ STXVD2X T1, (R16)(res_ptr)
+ RET
+
+// func p256Sqr(res, in *p256Element, n int)
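+// n is the number of repeated squarings to perform, so the result is
+// the Montgomery form of in^(2^n).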
+TEXT ·p256Sqr(SB), NOSPLIT, $0-24
+ MOVD res+0(FP), res_ptr
+ MOVD in+8(FP), x_ptr
+ MOVD $16, R16
+ MOVD $32, R17
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+
+ LXVD2X (R0)(x_ptr), X0
+ LXVD2X (R16)(x_ptr), X1
+
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
+
+sqrLoop:
+ // Sqr uses the same value for both inputs
- VPERM T0, T0, SWAP, T0
- VPERM T1, T1, SWAP, T1
- STXVD2X T0_, (R0)(res_ptr)
- STXVD2X T1_, (R16)(res_ptr)
+ VOR X0, X0, Y0
+ VOR X1, X1, Y1
+
+ LXVD2X (R16)(CPOOL), P1
+ LXVD2X (R0)(CPOOL), P0
+
+ CALL p256MulInternal<>(SB)
+
+ MOVD n+16(FP), N
+ ADD $-1, N
+ CMP $0, N
+ BEQ done
+ MOVD N, n+16(FP) // Save counter to avoid clobber
+ VOR T0, T0, X0
+ VOR T1, T1, X1
+ BR sqrLoop
+
+done:
+ MOVD $p256mul<>+0x00(SB), CPOOL
+
+ XXPERMDI T0, T0, $2, T0
+ XXPERMDI T1, T1, $2, T1
+ STXVD2X T0, (R0)(res_ptr)
+ STXVD2X T1, (R16)(res_ptr)
RET
#undef res_ptr
#undef T1
#undef P0
#undef P1
-#undef X0_
-#undef X1_
-#undef Y0_
-#undef Y1_
-#undef T0_
-#undef T1_
-#undef P0_
-#undef P1_
-
-// Point add with P2 being affine point
-// If sign == 1 -> P2 = -P2
-// If sel == 0 -> P3 = P1
-// if zero == 0 -> P3 = P2
-// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
+
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
// Temporaries in REGs
#define Y2L V15
#define Y2H V16
-#define Y2L_ VS47
-#define Y2H_ VS48
#define T1L V17
#define T1H V18
#define T2L V19
// p256MulAsm Parameters
#define X0 V0
#define X1 V1
-#define X0_ VS32
-#define X1_ VS33
#define Y0 V2
#define Y1 V3
-#define Y0_ VS34
-#define Y1_ VS35
#define T0 V4
#define T1 V5
#define PL V30
#define PH V31
-#define PL_ VS62
-#define PH_ VS63
// Names for zero/sel selects
#define X1L V0
#define X1H V1
-#define X1L_ VS32
-#define X1H_ VS33
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
-#define Y1L_ VS34
-#define Y1H_ VS35
#define Z1L V4
#define Z1H V5
-#define Z1L_ VS36
-#define Z1H_ VS37
#define X2L V0
#define X2H V1
-#define X2L_ VS32
-#define X2H_ VS33
#define Z2L V4
#define Z2H V5
-#define Z2L_ VS36
-#define Z2H_ VS37
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26
-#define X3L_ VS49
-#define X3H_ VS50
-#define Y3L_ VS53
-#define Y3H_ VS54
-#define Z3L_ VS57
-#define Z3H_ VS58
#define ZER V6
#define SEL1 V7
-#define SEL1_ VS39
#define CAR1 V8
#define CAR2 V9
/* *
// V27 is clobbered by p256MulInternal so must be
// saved in a temp.
//
+// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
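+// If sign == 1, the y coordinate of in2 is negated first; if sel == 0
+// the result is set to in1, and if zero == 0 the result is set to
+// in2. The selections are done branchlessly with vector selects.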
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
MOVD res+0(FP), P3ptr
MOVD in1+8(FP), P1ptr
MOVD $160, R25
MOVD $104, R26 // offset of sign+24(FP)
- MOVD $byteswap<>+0x00(SB), R8
- LXVD2X (R16)(CPOOL), PH_
- LXVD2X (R0)(CPOOL), PL_
-
- // if (sign == 1) {
- // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
- // }
+ LXVD2X (R16)(CPOOL), PH
+ LXVD2X (R0)(CPOOL), PL
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R17)(P2ptr), Y2L_
- LXVD2X (R18)(P2ptr), Y2H_
- VPERM Y2H, Y2H, SWAP, Y2H
- VPERM Y2L, Y2L, SWAP, Y2L
+ LXVD2X (R17)(P2ptr), Y2L
+ LXVD2X (R18)(P2ptr), Y2H
+ XXPERMDI Y2H, Y2H, $2, Y2H
+ XXPERMDI Y2L, Y2L, $2, Y2L
// Equivalent of VLREPG sign+24(FP), SEL1
- LXVDSX (R1)(R26), SEL1_
+ LXVDSX (R1)(R26), SEL1
VSPLTISB $0, ZER
VCMPEQUD SEL1, ZER, SEL1
* Source: 2004 Hankerson–Menezes–Vanstone, page 91.
*/
// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R19)(P1ptr), X0_ // Z1H
- LXVD2X (R20)(P1ptr), X1_ // Z1L
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R19)(P1ptr), X0 // Z1H
+ LXVD2X (R20)(P1ptr), X1 // Z1L
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
VOR X0, X0, Y0
VOR X1, X1, Y1
CALL p256MulInternal<>(SB)
// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
MOVD in2+16(FP), P2ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R0)(P2ptr), Y0_ // X2H
- LXVD2X (R16)(P2ptr), Y1_ // X2L
- VPERM Y0, Y0, SWAP, Y0
- VPERM Y1, Y1, SWAP, Y1
+ LXVD2X (R0)(P2ptr), Y0 // X2H
+ LXVD2X (R16)(P2ptr), Y1 // X2L
+ XXPERMDI Y0, Y0, $2, Y0
+ XXPERMDI Y1, Y1, $2, Y1
CALL p256MulInternal<>(SB)
VOR T0, T0, T1L
VOR T1, T1, T1H
// SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
MOVD in1+8(FP), P1ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R17)(P1ptr), Y1L_
- LXVD2X (R18)(P1ptr), Y1H_
- VPERM Y1H, Y1H, SWAP, Y1H
- VPERM Y1L, Y1L, SWAP, Y1L
+ LXVD2X (R17)(P1ptr), Y1L
+ LXVD2X (R18)(P1ptr), Y1H
+ XXPERMDI Y1H, Y1H, $2, Y1H
+ XXPERMDI Y1L, Y1L, $2, Y1L
p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
// SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
- LXVD2X (R0)(P1ptr), X1L_
- LXVD2X (R16)(P1ptr), X1H_
- VPERM X1H, X1H, SWAP, X1H
- VPERM X1L, X1L, SWAP, X1L
+ LXVD2X (R0)(P1ptr), X1L
+ LXVD2X (R16)(P1ptr), X1H
+ XXPERMDI X1H, X1H, $2, X1H
+ XXPERMDI X1L, X1L, $2, X1L
p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
// X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
- LXVD2X (R19)(P1ptr), X0_ // Z1H
- LXVD2X (R20)(P1ptr), X1_ // Z1L
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R19)(P1ptr), X0 // Z1H
+ LXVD2X (R20)(P1ptr), X1 // Z1L
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
CALL p256MulInternal<>(SB)
VOR T0, T0, Z3L
// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
MOVD in1+8(FP), P1ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R0)(P1ptr), Y0_ // X1H
- LXVD2X (R16)(P1ptr), Y1_ // X1L
- VPERM Y1, Y1, SWAP, Y1
- VPERM Y0, Y0, SWAP, Y0
+ LXVD2X (R0)(P1ptr), Y0 // X1H
+ LXVD2X (R16)(P1ptr), Y1 // X1L
+ XXPERMDI Y1, Y1, $2, Y1
+ XXPERMDI Y0, Y0, $2, Y0
CALL p256MulInternal<>(SB)
VOR T0, T0, T3L
VOR T1, T1, T3H
VOR T4L, T4L, X0
VOR T4H, T4H, X1
MOVD in1+8(FP), P1ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R17)(P1ptr), Y0_ // Y1H
- LXVD2X (R18)(P1ptr), Y1_ // Y1L
- VPERM Y0, Y0, SWAP, Y0
- VPERM Y1, Y1, SWAP, Y1
+ LXVD2X (R17)(P1ptr), Y0 // Y1H
+ LXVD2X (R18)(P1ptr), Y1 // Y1L
+ XXPERMDI Y0, Y0, $2, Y0
+ XXPERMDI Y1, Y1, $2, Y1
CALL p256MulInternal<>(SB)
// SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
// copy(P3.z[:], Z1)
// }
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R0)(P1ptr), X1L_
- LXVD2X (R16)(P1ptr), X1H_
- VPERM X1H, X1H, SWAP, X1H
- VPERM X1L, X1L, SWAP, X1L
+ LXVD2X (R0)(P1ptr), X1L
+ LXVD2X (R16)(P1ptr), X1H
+ XXPERMDI X1H, X1H, $2, X1H
+ XXPERMDI X1L, X1L, $2, X1L
// Y1 already loaded, left over from addition
- LXVD2X (R19)(P1ptr), Z1L_
- LXVD2X (R20)(P1ptr), Z1H_
- VPERM Z1H, Z1H, SWAP, Z1H
- VPERM Z1L, Z1L, SWAP, Z1L
+ LXVD2X (R19)(P1ptr), Z1L
+ LXVD2X (R20)(P1ptr), Z1H
+ XXPERMDI Z1H, Z1H, $2, Z1H
+ XXPERMDI Z1L, Z1L, $2, Z1L
MOVD $112, R26 // Get offset to sel+32
- LXVDSX (R1)(R26), SEL1_
+ LXVDSX (R1)(R26), SEL1
VSPLTISB $0, ZER
VCMPEQUD SEL1, ZER, SEL1
VSEL Z3L, Z1L, SEL1, Z3L
VSEL Z3H, Z1H, SEL1, Z3H
- // if (zero == 0) {
- // copy(P3.x[:], X2)
- // copy(P3.y[:], Y2)
- // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
- // }
MOVD in2+16(FP), P2ptr
- LXVD2X (R0)(P2ptr), X2L_
- LXVD2X (R16)(P2ptr), X2H_
- VPERM X2H, X2H, SWAP, X2H
- VPERM X2L, X2L, SWAP, X2L
+ LXVD2X (R0)(P2ptr), X2L
+ LXVD2X (R16)(P2ptr), X2H
+ XXPERMDI X2H, X2H, $2, X2H
+ XXPERMDI X2L, X2L, $2, X2L
// Y2 already loaded
- LXVD2X (R23)(CPOOL), Z2L_
- LXVD2X (R24)(CPOOL), Z2H_
+ LXVD2X (R23)(CPOOL), Z2L
+ LXVD2X (R24)(CPOOL), Z2H
MOVD $120, R26 // Get the value from zero+40(FP)
- LXVDSX (R1)(R26), SEL1_
+ LXVDSX (R1)(R26), SEL1
VSPLTISB $0, ZER
VCMPEQUD SEL1, ZER, SEL1
// Reorder the bytes so they can be stored using STXVD2X.
MOVD res+0(FP), P3ptr
- VPERM X3H, X3H, SWAP, X3H
- VPERM X3L, X3L, SWAP, X3L
- VPERM Y3H, Y3H, SWAP, Y3H
- VPERM Y3L, Y3L, SWAP, Y3L
- VPERM Z3H, Z3H, SWAP, Z3H
- VPERM Z3L, Z3L, SWAP, Z3L
- STXVD2X X3L_, (R0)(P3ptr)
- STXVD2X X3H_, (R16)(P3ptr)
- STXVD2X Y3L_, (R17)(P3ptr)
- STXVD2X Y3H_, (R18)(P3ptr)
- STXVD2X Z3L_, (R19)(P3ptr)
- STXVD2X Z3H_, (R20)(P3ptr)
+ XXPERMDI X3H, X3H, $2, X3H
+ XXPERMDI X3L, X3L, $2, X3L
+ XXPERMDI Y3H, Y3H, $2, Y3H
+ XXPERMDI Y3L, Y3L, $2, Y3L
+ XXPERMDI Z3H, Z3H, $2, Z3H
+ XXPERMDI Z3L, Z3L, $2, Z3L
+ STXVD2X X3L, (R0)(P3ptr)
+ STXVD2X X3H, (R16)(P3ptr)
+ STXVD2X Y3L, (R17)(P3ptr)
+ STXVD2X Y3H, (R18)(P3ptr)
+ STXVD2X Z3L, (R19)(P3ptr)
+ STXVD2X Z3H, (R20)(P3ptr)
RET
#undef P1ptr
#undef P2ptr
#undef CPOOL
-#undef SWAP
-#undef SWAP_
#undef Y2L
#undef Y2H
-#undef Y2L_
-#undef Y2H_
#undef T1L
#undef T1H
#undef T2L
#undef TT0
#undef TT1
-#undef TT0_
-#undef TT1_
#undef T2
#undef X0
#undef X1
-#undef X0_
-#undef X1_
#undef Y0
#undef Y1
-#undef Y0_
-#undef Y1_
#undef T0
#undef T1
#undef PL
#undef PH
-#undef PL_
-#undef PH_
#undef X1L
#undef X1H
-#undef X1L_
-#undef X1H_
#undef Y1L
#undef Y1H
-#undef Y1L_
-#undef Y1H_
#undef Z1L
#undef Z1H
-#undef Z1L_
-#undef Z1H_
#undef X2L
#undef X2H
-#undef X2L_
-#undef X2H_
#undef Z2L
#undef Z2H
-#undef Z2L_
-#undef Z2H_
#undef X3L
#undef X3H
-#undef X3L_
-#undef X3H_
#undef Y3L
#undef Y3H
-#undef Y3L_
-#undef Y3H_
#undef Z3L
#undef Z3H
-#undef Z3L_
-#undef Z3H_
#undef ZER
#undef SEL1
-#undef SEL1_
#undef CAR1
#undef CAR2
-// p256PointDoubleAsm(P3, P1 *p256Point)
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
// Temporaries in REGs
#define X3L V15
#define X3H V16
-#define X3L_ VS47
-#define X3H_ VS48
#define Y3L V17
#define Y3H V18
-#define Y3L_ VS49
-#define Y3H_ VS50
#define T1L V19
#define T1H V20
#define T2L V21
#define X1L V6
#define X1H V7
-#define X1L_ VS38
-#define X1H_ VS39
#define Y1L V8
#define Y1H V9
-#define Y1L_ VS40
-#define Y1H_ VS41
#define Z1L V10
#define Z1H V11
// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
-#define TT0_ VS43
-#define TT1_ VS44
#define T2 V13
// p256MulAsm Parameters
#define X0 V0
#define X1 V1
-#define X0_ VS32
-#define X1_ VS33
#define Y0 V2
#define Y1 V3
-#define Y0_ VS34
-#define Y1_ VS35
#define T0 V4
#define T1 V5
-#define T0_ VS36
-#define T1_ VS37
#define PL V30
#define PH V31
-#define PL_ VS62
-#define PH_ VS63
#define Z3L V23
#define Z3H V24
-#define SWAP V25
-#define SWAP_ VS57
#define ZER V26
#define SEL1 V27
#define CAR1 V28
* T1 = T1*T2
* Y3 = T1-Y3
*/
-
+// p256PointDoubleAsm(res, in *P256Point)
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
MOVD res+0(FP), P3ptr
MOVD in+8(FP), P1ptr
MOVD $p256mul<>+0x00(SB), CPOOL
- MOVD $byteswap<>+0x00(SB), R15
MOVD $16, R16
MOVD $32, R17
MOVD $64, R19
MOVD $80, R20
- LXVD2X (R16)(CPOOL), PH_
- LXVD2X (R0)(CPOOL), PL_
-
- LXVD2X (R15)(R0), SWAP_
+ LXVD2X (R16)(CPOOL), PH
+ LXVD2X (R0)(CPOOL), PL
// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
- LXVD2X (R19)(P1ptr), X0_ // Z1H
- LXVD2X (R20)(P1ptr), X1_ // Z1L
+ LXVD2X (R19)(P1ptr), X0 // Z1H
+ LXVD2X (R20)(P1ptr), X1 // Z1L
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
VOR X0, X0, Y0
VOR X1, X1, Y1
CALL p256MulInternal<>(SB)
// SUB(X<X1-T) // T2 = X1-T1
- LXVD2X (R0)(P1ptr), X1L_
- LXVD2X (R16)(P1ptr), X1H_
- VPERM X1L, X1L, SWAP, X1L
- VPERM X1H, X1H, SWAP, X1H
+ LXVD2X (R0)(P1ptr), X1L
+ LXVD2X (R16)(P1ptr), X1H
+ XXPERMDI X1L, X1L, $2, X1L
+ XXPERMDI X1H, X1H, $2, X1H
p256SubInternal(X1,X0,X1H,X1L,T1,T0)
p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
// ADD(X<Y1+Y1) // Y3 = 2*Y1
- LXVD2X (R15)(R0), SWAP_
- LXVD2X (R17)(P1ptr), Y1L_
- LXVD2X (R18)(P1ptr), Y1H_
- VPERM Y1L, Y1L, SWAP, Y1L
- VPERM Y1H, Y1H, SWAP, Y1H
+ LXVD2X (R17)(P1ptr), Y1L
+ LXVD2X (R18)(P1ptr), Y1H
+ XXPERMDI Y1L, Y1L, $2, Y1L
+ XXPERMDI Y1H, Y1H, $2, Y1H
p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
- LXVD2X (R15)(R0), SWAP_
- LXVD2X (R19)(P1ptr), Y0_
- LXVD2X (R20)(P1ptr), Y1_
- VPERM Y0, Y0, SWAP, Y0
- VPERM Y1, Y1, SWAP, Y1
+ LXVD2X (R19)(P1ptr), Y0
+ LXVD2X (R20)(P1ptr), Y1
+ XXPERMDI Y0, Y0, $2, Y0
+ XXPERMDI Y1, Y1, $2, Y1
CALL p256MulInternal<>(SB)
- LXVD2X (R15)(R0), SWAP_
-
// Leave T0, T1 as is.
- VPERM T0, T0, SWAP, TT0
- VPERM T1, T1, SWAP, TT1
- STXVD2X TT0_, (R19)(P3ptr)
- STXVD2X TT1_, (R20)(P3ptr)
+ XXPERMDI T0, T0, $2, TT0
+ XXPERMDI T1, T1, $2, TT1
+ STXVD2X TT0, (R19)(P3ptr)
+ STXVD2X TT1, (R20)(P3ptr)
// X- ; Y=X ; MUL; T- // Y3 = Y3²
VOR X0, X0, Y0
// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
VOR T0, T0, X0
VOR T1, T1, X1
- LXVD2X (R15)(R0), SWAP_
- LXVD2X (R0)(P1ptr), Y0_
- LXVD2X (R16)(P1ptr), Y1_
- VPERM Y0, Y0, SWAP, Y0
- VPERM Y1, Y1, SWAP, Y1
+ LXVD2X (R0)(P1ptr), Y0
+ LXVD2X (R16)(P1ptr), Y1
+ XXPERMDI Y0, Y0, $2, Y0
+ XXPERMDI Y1, Y1, $2, Y1
CALL p256MulInternal<>(SB)
VOR T0, T0, T3L
VOR T1, T1, T3H
// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
- LXVD2X (R15)(R0), SWAP_
- VPERM X3L, X3L, SWAP, TT0
- VPERM X3H, X3H, SWAP, TT1
- STXVD2X TT0_, (R0)(P3ptr)
- STXVD2X TT1_, (R16)(P3ptr)
+ XXPERMDI X3L, X3L, $2, TT0
+ XXPERMDI X3H, X3H, $2, TT1
+ STXVD2X TT0, (R0)(P3ptr)
+ STXVD2X TT1, (R16)(P3ptr)
// SUB(X<T3-X3) // T1 = T3-X3
p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
// SUB(Y3<T-Y3) // Y3 = T1-Y3
p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
- LXVD2X (R15)(R0), SWAP_
- VPERM Y3L, Y3L, SWAP, Y3L
- VPERM Y3H, Y3H, SWAP, Y3H
- STXVD2X Y3L_, (R17)(P3ptr)
- STXVD2X Y3H_, (R18)(P3ptr)
+ XXPERMDI Y3L, Y3L, $2, Y3L
+ XXPERMDI Y3H, Y3H, $2, Y3H
+ STXVD2X Y3L, (R17)(P3ptr)
+ STXVD2X Y3H, (R18)(P3ptr)
RET
#undef P3ptr
#undef CPOOL
#undef X3L
#undef X3H
-#undef X3L_
-#undef X3H_
#undef Y3L
#undef Y3H
-#undef Y3L_
-#undef Y3H_
#undef T1L
#undef T1H
#undef T2L
#undef T3H
#undef X1L
#undef X1H
-#undef X1L_
-#undef X1H_
#undef Y1L
#undef Y1H
-#undef Y1L_
-#undef Y1H_
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
-#undef TT0_
-#undef TT1_
#undef T2
#undef X0
#undef X1
-#undef X0_
-#undef X1_
#undef Y0
#undef Y1
-#undef Y0_
-#undef Y1_
#undef T0
#undef T1
-#undef T0_
-#undef T1_
#undef PL
#undef PH
-#undef PL_
-#undef PH_
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
-#undef SWAP
-#undef SWAP_
-// p256PointAddAsm(P3, P1, P2 *p256Point)
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define HH V25
#define RL V26
#define RH V27
-#define RH_ VS59
// Temps for Sub and Add
#define ZER V6
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
-#define TT0_ VS43
#define TT1 V12
-#define TT1_ VS44
#define T2 V13
-#define SWAP V28
-#define SWAP_ VS60
-
// p256MulAsm Parameters
#define X0 V0
#define X1 V1
-#define X0_ VS32
-#define X1_ VS33
#define Y0 V2
#define Y1 V3
-#define Y0_ VS34
-#define Y1_ VS35
#define T0 V4
#define T1 V5
-#define T0_ VS36
-#define T1_ VS37
#define PL V30
#define PH V31
-#define PL_ VS62
-#define PH_ VS63
/*
* https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
*
// X=S1; Y=T2; MUL; T- // T2 = S1*T2
// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
*/
+// p256PointAddAsm(res, in1, in2 *P256Point) int
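+// The int return is nonzero when in1 == in2; the caller is expected
+// to fall back to the doubling path in that case.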
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
MOVD res+0(FP), P3ptr
MOVD in1+8(FP), P1ptr
MOVD $64, R19
MOVD $80, R20
- MOVD $byteswap<>+0x00(SB), R8
- LXVD2X (R16)(CPOOL), PH_
- LXVD2X (R0)(CPOOL), PL_
+ LXVD2X (R16)(CPOOL), PH
+ LXVD2X (R0)(CPOOL), PL
// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R19)(P1ptr), X0_ // Z1L
- LXVD2X (R20)(P1ptr), X1_ // Z1H
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R19)(P1ptr), X0 // Z1L
+ LXVD2X (R20)(P1ptr), X1 // Z1H
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
VOR X0, X0, Y0
VOR X1, X1, Y1
CALL p256MulInternal<>(SB)
VOR T0, T0, RL // SAVE: RL
VOR T1, T1, RH // SAVE: RH
- STXVD2X RH_, (R1)(R17) // V27 has to be saved
+ STXVD2X RH, (R1)(R17) // V27 has to be saved
// X=X2; Y- ; MUL; H=T // H = X2*T1
MOVD in2+16(FP), P2ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R0)(P2ptr), X0_ // X2L
- LXVD2X (R16)(P2ptr), X1_ // X2H
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R0)(P2ptr), X0 // X2L
+ LXVD2X (R16)(P2ptr), X1 // X2H
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
CALL p256MulInternal<>(SB)
VOR T0, T0, HL // SAVE: HL
VOR T1, T1, HH // SAVE: HH
// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
MOVD in2+16(FP), P2ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R19)(P2ptr), X0_ // Z2L
- LXVD2X (R20)(P2ptr), X1_ // Z2H
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R19)(P2ptr), X0 // Z2L
+ LXVD2X (R20)(P2ptr), X1 // Z2H
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
VOR X0, X0, Y0
VOR X1, X1, Y1
CALL p256MulInternal<>(SB)
// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
MOVD in1+8(FP), P1ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R0)(P1ptr), X0_ // X1L
- LXVD2X (R16)(P1ptr), X1_ // X1H
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R0)(P1ptr), X0 // X1L
+ LXVD2X (R16)(P1ptr), X1 // X1H
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
CALL p256MulInternal<>(SB)
VOR T0, T0, U1L // SAVE: U1L
VOR T1, T1, U1H // SAVE: U1H
MOVD RES1, ret+24(FP)
// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
- MOVD $byteswap<>+0x00(SB), R8
MOVD in1+8(FP), P1ptr
MOVD in2+16(FP), P2ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R19)(P1ptr), X0_ // Z1L
- LXVD2X (R20)(P1ptr), X1_ // Z1H
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
- LXVD2X (R19)(P2ptr), Y0_ // Z2L
- LXVD2X (R20)(P2ptr), Y1_ // Z2H
- VPERM Y0, Y0, SWAP, Y0
- VPERM Y1, Y1, SWAP, Y1
+ LXVD2X (R19)(P1ptr), X0 // Z1L
+ LXVD2X (R20)(P1ptr), X1 // Z1H
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
+ LXVD2X (R19)(P2ptr), Y0 // Z2L
+ LXVD2X (R20)(P2ptr), Y1 // Z2H
+ XXPERMDI Y0, Y0, $2, Y0
+ XXPERMDI Y1, Y1, $2, Y1
CALL p256MulInternal<>(SB)
// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
VOR HH, HH, Y1
CALL p256MulInternal<>(SB)
MOVD res+0(FP), P3ptr
- LXVD2X (R8)(R0), SWAP_
- VPERM T1, T1, SWAP, TT1
- VPERM T0, T0, SWAP, TT0
- STXVD2X TT0_, (R19)(P3ptr)
- STXVD2X TT1_, (R20)(P3ptr)
+ XXPERMDI T1, T1, $2, TT1
+ XXPERMDI T0, T0, $2, TT0
+ STXVD2X TT0, (R19)(P3ptr)
+ STXVD2X TT1, (R20)(P3ptr)
// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
MOVD in1+8(FP), P1ptr
- LXVD2X (R17)(P1ptr), X0_
- LXVD2X (R18)(P1ptr), X1_
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R17)(P1ptr), X0
+ LXVD2X (R18)(P1ptr), X1
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
VOR S1L, S1L, Y0
VOR S1H, S1H, Y1
CALL p256MulInternal<>(SB)
// X=Y2; Y=R ; MUL; T- // R = Y2*R
MOVD in2+16(FP), P2ptr
- LXVD2X (R8)(R0), SWAP_
- LXVD2X (R17)(P2ptr), X0_
- LXVD2X (R18)(P2ptr), X1_
- VPERM X0, X0, SWAP, X0
- VPERM X1, X1, SWAP, X1
+ LXVD2X (R17)(P2ptr), X0
+ LXVD2X (R18)(P2ptr), X1
+ XXPERMDI X0, X0, $2, X0
+ XXPERMDI X1, X1, $2, X1
VOR RL, RL, Y0
// VOR RH, RH, Y1 RH was saved above in D2X format
- LXVD2X (R1)(R17), Y1_
+ LXVD2X (R1)(R17), Y1
CALL p256MulInternal<>(SB)
// SUB(R<T-S1) // R = T-S1
p256SubInternal(RH,RL,T1,T0,S1H,S1L)
- STXVD2X RH_, (R1)(R17) // Save RH
+ STXVD2X RH, (R1)(R17) // Save RH
// if R == 0 or R^P == 0 then ret=ret else ret=0
// clobbers T1H and T1L
VOR RL, RL, Y0
// RH was saved above using STXVD2X
- LXVD2X (R1)(R17), X1_
+ LXVD2X (R1)(R17), X1
VOR X1, X1, Y1
// VOR RH, RH, Y1
// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
p256SubInternal(T1,T0,T1,T0,X1,X0)
MOVD res+0(FP), P3ptr
- LXVD2X (R8)(R0), SWAP_
- VPERM T1, T1, SWAP, TT1
- VPERM T0, T0, SWAP, TT0
- STXVD2X TT0_, (R0)(P3ptr)
- STXVD2X TT1_, (R16)(P3ptr)
+ XXPERMDI T1, T1, $2, TT1
+ XXPERMDI T0, T0, $2, TT0
+ STXVD2X TT0, (R0)(P3ptr)
+ STXVD2X TT1, (R16)(P3ptr)
// SUB(Y<U1-T) // Y3 = U1-X3
p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
VOR RL, RL, X0
// VOR RH, RH, X1
- LXVD2X (R1)(R17), X1_
+ LXVD2X (R1)(R17), X1
CALL p256MulInternal<>(SB)
VOR T0, T0, U1L
VOR T1, T1, U1H
// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
p256SubInternal(T1,T0,U1H,U1L,T1,T0)
MOVD res+0(FP), P3ptr
- LXVD2X (R8)(R0), SWAP_
- VPERM T1, T1, SWAP, TT1
- VPERM T0, T0, SWAP, TT0
- STXVD2X TT0_, (R17)(P3ptr)
- STXVD2X TT1_, (R18)(P3ptr)
+ XXPERMDI T1, T1, $2, TT1
+ XXPERMDI T0, T0, $2, TT0
+ STXVD2X TT0, (R17)(P3ptr)
+ STXVD2X TT1, (R18)(P3ptr)
RET
+++ /dev/null
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build ignore
-
-package elliptic
-
-import (
- "crypto/subtle"
- "encoding/binary"
- "math/big"
-)
-
-// This was ported from the s390x implementation for ppc64le.
-// Some hints are included here for changes that should be
-// in the big endian ppc64 implementation, however more
-// investigation and testing is needed for the ppc64 big
-// endian version to work.
-type p256CurveFast struct {
- *CurveParams
-}
-
-type p256Point struct {
- x [32]byte
- y [32]byte
- z [32]byte
-}
-
-var p256PreFast *[37][64]p256Point
-
-func init() {
- initP256Arch = func() {
- p256 = p256CurveFast{&p256Params}
- initTable()
- }
-}
-
-func (curve p256CurveFast) Params() *CurveParams {
- return curve.CurveParams
-}
-
-// Functions implemented in p256_asm_ppc64le.s
-// Montgomery multiplication modulo P256
-//
-//go:noescape
-func p256MulAsm(res, in1, in2 []byte)
-
-// Montgomery square modulo P256
-func p256Sqr(res, in []byte) {
- p256MulAsm(res, in, in)
-}
-
-// Montgomery multiplication by 1
-//
-//go:noescape
-func p256FromMont(res, in []byte)
-
-// iff cond == 1 val <- -val
-//
-//go:noescape
-func p256NegCond(val *p256Point, cond int)
-
-// if cond == 0 res <- b; else res <- a
-//
-//go:noescape
-func p256MovCond(res, a, b *p256Point, cond int)
-
-// Constant time table access
-//
-//go:noescape
-func p256Select(point *p256Point, table []p256Point, idx int)
-
-//go:noescape
-func p256SelectBase(point *p256Point, table []p256Point, idx int)
-
-// Point add with P2 being affine point
-// If sign == 1 -> P2 = -P2
-// If sel == 0 -> P3 = P1
-// if zero == 0 -> P3 = P2
-//
-//go:noescape
-func p256PointAddAffineAsm(res, in1, in2 *p256Point, sign, sel, zero int)
-
-//go:noescape
-func p256PointAddAsm(res, in1, in2 *p256Point) int
-
-//go:noescape
-func p256PointDoubleAsm(res, in *p256Point)
-
-// The result should be a slice in LE order, but the slice
-// from big.Bytes is in BE order.
-// TODO: For big endian implementation, do not reverse bytes.
-func fromBig(big *big.Int) []byte {
- // This could be done a lot more efficiently...
- res := big.Bytes()
- t := make([]byte, 32)
- if len(res) < 32 {
- copy(t[32-len(res):], res)
- } else if len(res) == 32 {
- copy(t, res)
- } else {
- copy(t, res[len(res)-32:])
- }
- p256ReverseBytes(t, t)
- return t
-}
-
-// p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar
-// is equal or greater than the order of the group, it's reduced modulo that order.
-func p256GetMultiplier(in []byte) []byte {
- n := new(big.Int).SetBytes(in)
-
- if n.Cmp(p256Params.N) >= 0 {
- n.Mod(n, p256Params.N)
- }
- return fromBig(n)
-}
-
-// p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the
-// underlying field of the curve. (See initP256 for the value.) Thus rr here is
-// R×R mod p. See comment in Inverse about how this is used.
-// TODO: For big endian implementation, the bytes in these slices should be in reverse order,
-// as found in the s390x implementation.
-var rr = []byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0, 0xff, 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00}
-
-// (This is one, in the Montgomery domain.)
-var one = []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}
-
-func maybeReduceModP(in *big.Int) *big.Int {
- if in.Cmp(p256Params.P) < 0 {
- return in
- }
- return new(big.Int).Mod(in, p256Params.P)
-}
-
-// p256ReverseBytes copies the first 32 bytes from in to res in reverse order.
-func p256ReverseBytes(res, in []byte) {
- // remove bounds check
- in = in[:32]
- res = res[:32]
-
- // Load in reverse order
- a := binary.BigEndian.Uint64(in[0:])
- b := binary.BigEndian.Uint64(in[8:])
- c := binary.BigEndian.Uint64(in[16:])
- d := binary.BigEndian.Uint64(in[24:])
-
- // Store in normal order
- binary.LittleEndian.PutUint64(res[0:], d)
- binary.LittleEndian.PutUint64(res[8:], c)
- binary.LittleEndian.PutUint64(res[16:], b)
- binary.LittleEndian.PutUint64(res[24:], a)
-}
-
-func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
- var r1, r2 p256Point
-
- scalarReduced := p256GetMultiplier(baseScalar)
- r1IsInfinity := scalarIsZero(scalarReduced)
- r1.p256BaseMult(scalarReduced)
-
- copy(r2.x[:], fromBig(maybeReduceModP(bigX)))
- copy(r2.y[:], fromBig(maybeReduceModP(bigY)))
- copy(r2.z[:], one)
- p256MulAsm(r2.x[:], r2.x[:], rr[:])
- p256MulAsm(r2.y[:], r2.y[:], rr[:])
-
- scalarReduced = p256GetMultiplier(scalar)
- r2IsInfinity := scalarIsZero(scalarReduced)
- r2.p256ScalarMult(scalarReduced)
-
- var sum, double p256Point
- pointsEqual := p256PointAddAsm(&sum, &r1, &r2)
- p256PointDoubleAsm(&double, &r1)
- p256MovCond(&sum, &double, &sum, pointsEqual)
- p256MovCond(&sum, &r1, &sum, r2IsInfinity)
- p256MovCond(&sum, &r2, &sum, r1IsInfinity)
- return sum.p256PointToAffine()
-}
-
-func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
- var r p256Point
- reducedScalar := p256GetMultiplier(scalar)
- r.p256BaseMult(reducedScalar)
- return r.p256PointToAffine()
-}
-
-func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
- scalarReduced := p256GetMultiplier(scalar)
- var r p256Point
- copy(r.x[:], fromBig(maybeReduceModP(bigX)))
- copy(r.y[:], fromBig(maybeReduceModP(bigY)))
- copy(r.z[:], one)
- p256MulAsm(r.x[:], r.x[:], rr[:])
- p256MulAsm(r.y[:], r.y[:], rr[:])
- r.p256ScalarMult(scalarReduced)
- return r.p256PointToAffine()
-}
-
-func scalarIsZero(scalar []byte) int {
- // If any byte is not zero, return 0.
- // Check for -0.... since that appears to compare to 0.
- b := byte(0)
- for _, s := range scalar {
- b |= s
- }
- return subtle.ConstantTimeByteEq(b, 0)
-}
-
-func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
- zInv := make([]byte, 32)
- zInvSq := make([]byte, 32)
-
- p256Inverse(zInv, p.z[:])
- p256Sqr(zInvSq, zInv)
- p256MulAsm(zInv, zInv, zInvSq)
-
- p256MulAsm(zInvSq, p.x[:], zInvSq)
- p256MulAsm(zInv, p.y[:], zInv)
-
- p256FromMont(zInvSq, zInvSq)
- p256FromMont(zInv, zInv)
-
- // SetBytes expects a slice in big endian order,
- // since ppc64le is little endian, reverse the bytes.
- // TODO: For big endian, bytes don't need to be reversed.
- p256ReverseBytes(zInvSq, zInvSq)
- p256ReverseBytes(zInv, zInv)
- rx := new(big.Int).SetBytes(zInvSq)
- ry := new(big.Int).SetBytes(zInv)
- return rx, ry
-}
-
-// p256Inverse sets out to in^-1 mod p.
-func p256Inverse(out, in []byte) {
- var stack [6 * 32]byte
- p2 := stack[32*0 : 32*0+32]
- p4 := stack[32*1 : 32*1+32]
- p8 := stack[32*2 : 32*2+32]
- p16 := stack[32*3 : 32*3+32]
- p32 := stack[32*4 : 32*4+32]
-
- p256Sqr(out, in)
- p256MulAsm(p2, out, in) // 3*p
-
- p256Sqr(out, p2)
- p256Sqr(out, out)
- p256MulAsm(p4, out, p2) // f*p
-
- p256Sqr(out, p4)
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256MulAsm(p8, out, p4) // ff*p
-
- p256Sqr(out, p8)
-
- for i := 0; i < 7; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(p16, out, p8) // ffff*p
-
- p256Sqr(out, p16)
- for i := 0; i < 15; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(p32, out, p16) // ffffffff*p
-
- p256Sqr(out, p32)
-
- for i := 0; i < 31; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(out, out, in)
-
- for i := 0; i < 32*4; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(out, out, p32)
-
- for i := 0; i < 32; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(out, out, p32)
-
- for i := 0; i < 16; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(out, out, p16)
-
- for i := 0; i < 8; i++ {
- p256Sqr(out, out)
- }
- p256MulAsm(out, out, p8)
-
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256MulAsm(out, out, p4)
-
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256MulAsm(out, out, p2)
-
- p256Sqr(out, out)
- p256Sqr(out, out)
- p256MulAsm(out, out, in)
-}
-
-func boothW5(in uint) (int, int) {
- var s uint = ^((in >> 5) - 1)
- var d uint = (1 << 6) - in - 1
- d = (d & s) | (in & (^s))
- d = (d >> 1) + (d & 1)
- return int(d), int(s & 1)
-}
-
-func boothW6(in uint) (int, int) {
- var s uint = ^((in >> 6) - 1)
- var d uint = (1 << 7) - in - 1
- d = (d & s) | (in & (^s))
- d = (d >> 1) + (d & 1)
- return int(d), int(s & 1)
-}
-
-func boothW7(in uint) (int, int) {
- var s uint = ^((in >> 7) - 1)
- var d uint = (1 << 8) - in - 1
- d = (d & s) | (in & (^s))
- d = (d >> 1) + (d & 1)
- return int(d), int(s & 1)
-}
-
-func initTable() {
- p256PreFast = new([37][64]p256Point)
-
- // TODO: For big endian, these slices should be in reverse byte order,
- // as found in the s390x implementation.
- basePoint := p256Point{
- x: [32]byte{0x3c, 0x14, 0xa9, 0x18, 0xd4, 0x30, 0xe7, 0x79, 0x01, 0xb6, 0xed, 0x5f, 0xfc, 0x95, 0xba, 0x75,
- 0x10, 0x25, 0x62, 0x77, 0x2b, 0x73, 0xfb, 0x79, 0xc6, 0x55, 0x37, 0xa5, 0x76, 0x5f, 0x90, 0x18}, //(p256.x*2^256)%p
- y: [32]byte{0x0a, 0x56, 0x95, 0xce, 0x57, 0x53, 0xf2, 0xdd, 0x5c, 0xe4, 0x19, 0xba, 0xe4, 0xb8, 0x4a, 0x8b,
- 0x25, 0xf3, 0x21, 0xdd, 0x88, 0x86, 0xe8, 0xd2, 0x85, 0x5d, 0x88, 0x25, 0x18, 0xff, 0x71, 0x85}, //(p256.y*2^256)%p
- z: [32]byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, //(p256.z*2^256)%p
- }
-
- t1 := new(p256Point)
- t2 := new(p256Point)
- *t2 = basePoint
-
- zInv := make([]byte, 32)
- zInvSq := make([]byte, 32)
- for j := 0; j < 64; j++ {
- *t1 = *t2
- for i := 0; i < 37; i++ {
- // The window size is 7 so we need to double 7 times.
- if i != 0 {
- for k := 0; k < 7; k++ {
- p256PointDoubleAsm(t1, t1)
- }
- }
- // Convert the point to affine form. (Its values are
- // still in Montgomery form however.)
- p256Inverse(zInv, t1.z[:])
- p256Sqr(zInvSq, zInv)
- p256MulAsm(zInv, zInv, zInvSq)
-
- p256MulAsm(t1.x[:], t1.x[:], zInvSq)
- p256MulAsm(t1.y[:], t1.y[:], zInv)
-
- copy(t1.z[:], basePoint.z[:])
- // Update the table entry
- copy(p256PreFast[i][j].x[:], t1.x[:])
- copy(p256PreFast[i][j].y[:], t1.y[:])
- }
- if j == 0 {
- p256PointDoubleAsm(t2, &basePoint)
- } else {
- p256PointAddAsm(t2, t2, &basePoint)
- }
- }
-}
-
-func (p *p256Point) p256BaseMult(scalar []byte) {
- // TODO: For big endian, the index should be 31 not 0.
- wvalue := (uint(scalar[0]) << 1) & 0xff
- sel, sign := boothW7(uint(wvalue))
- p256SelectBase(p, p256PreFast[0][:], sel)
- p256NegCond(p, sign)
-
- copy(p.z[:], one[:])
- var t0 p256Point
-
- copy(t0.z[:], one[:])
-
- index := uint(6)
- zero := sel
- for i := 1; i < 37; i++ {
- // TODO: For big endian, use the same index values as found
- // in the s390x implementation.
- if index < 247 {
- wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0xff
- } else {
- wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0xff
- }
- index += 7
- sel, sign = boothW7(uint(wvalue))
- p256SelectBase(&t0, p256PreFast[i][:], sel)
- p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
- zero |= sel
- }
-}
-
-func (p *p256Point) p256ScalarMult(scalar []byte) {
- // precomp is a table of precomputed points that stores powers of p
- // from p^1 to p^16.
- var precomp [16]p256Point
- var t0, t1, t2, t3 p256Point
-
- *&precomp[0] = *p
- p256PointDoubleAsm(&t0, p)
- p256PointDoubleAsm(&t1, &t0)
- p256PointDoubleAsm(&t2, &t1)
- p256PointDoubleAsm(&t3, &t2)
- *&precomp[1] = t0
- *&precomp[3] = t1
- *&precomp[7] = t2
- *&precomp[15] = t3
-
- p256PointAddAsm(&t0, &t0, p)
- p256PointAddAsm(&t1, &t1, p)
- p256PointAddAsm(&t2, &t2, p)
-
- *&precomp[2] = t0
- *&precomp[4] = t1
- *&precomp[8] = t2
-
- p256PointDoubleAsm(&t0, &t0)
- p256PointDoubleAsm(&t1, &t1)
- *&precomp[5] = t0
- *&precomp[9] = t1
-
- p256PointAddAsm(&t2, &t0, p)
- p256PointAddAsm(&t1, &t1, p)
- *&precomp[6] = t2
- *&precomp[10] = t1
-
- p256PointDoubleAsm(&t0, &t0)
- p256PointDoubleAsm(&t2, &t2)
- *&precomp[11] = t0
- *&precomp[13] = t2
-
- p256PointAddAsm(&t0, &t0, p)
- p256PointAddAsm(&t2, &t2, p)
- *&precomp[12] = t0
- *&precomp[14] = t2
-
- // Start scanning the window from top bit
- index := uint(254)
- var sel, sign int
-
- // TODO: For big endian, use index found in s390x implementation.
- wvalue := (uint(scalar[index/8]) >> (index % 8)) & 0x3f
- sel, _ = boothW5(uint(wvalue))
- p256Select(p, precomp[:], sel)
- zero := sel
-
- for index > 4 {
- index -= 5
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
-
- // TODO: For big endian, use index values as found in s390x implementation.
- if index < 247 {
- wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0x3f
- } else {
- wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0x3f
- }
-
- sel, sign = boothW5(uint(wvalue))
-
- p256Select(&t0, precomp[:], sel)
- p256NegCond(&t0, sign)
- p256PointAddAsm(&t1, p, &t0)
- p256MovCond(&t1, &t1, p, sel)
- p256MovCond(p, &t1, &t0, zero)
- zero |= sel
- }
-
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
- p256PointDoubleAsm(p, p)
-
- // TODO: Use index for big endian as found in s390x implementation.
- wvalue = (uint(scalar[0]) << 1) & 0x3f
- sel, sign = boothW5(uint(wvalue))
-
- p256Select(&t0, precomp[:], sel)
- p256NegCond(&t0, sign)
- p256PointAddAsm(&t1, p, &t0)
- p256MovCond(&t1, &t1, p, sel)
- p256MovCond(p, &t1, &t0, zero)
-}