--- /dev/null
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains a constant-time, 64-bit assembly implementation of
+// P256. The optimizations performed here are described in detail in:
+// S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
+// 256-bit primes"
+// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
+// https://eprint.iacr.org/2013/816.pdf
+
+package main
+
+import (
+ "os"
+ "strings"
+
+ . "github.com/mmcloughlin/avo/build"
+ "github.com/mmcloughlin/avo/ir"
+ . "github.com/mmcloughlin/avo/operand"
+ . "github.com/mmcloughlin/avo/reg"
+)
+
+//go:generate go run . -out ../p256_asm_amd64.s -pkg nistec
+
+var (
+ res_ptr GPPhysical = RDI
+ x_ptr = RSI
+ y_ptr = RCX
+)
+
+// These variables have been versioned because they are redefined in the reference implementation.
+// This is done to produce a minimal semantic diff.
+var (
+ acc0_v1 GPPhysical = R8
+ acc1_v1 = R9
+ acc2_v1 = R10
+ acc3_v1 = R11
+ acc4_v1 = R12
+ acc5_v1 = R13
+ t0_v1 = R14
+ t1_v1 = R15
+)
+
+func main() {
+ Package("crypto/internal/nistec")
+ ConstraintExpr("!purego")
+ p256OrdLittleToBig()
+ p256OrdBigToLittle()
+ p256LittleToBig()
+ p256BigToLittle()
+ p256MovCond()
+ p256NegCond()
+ p256Sqr()
+ p256Mul()
+ p256FromMont()
+ p256Select()
+ p256SelectAffine()
+ p256OrdMul()
+ p256OrdSqr()
+ p256SubInternal()
+ p256MulInternal()
+ p256SqrInternal()
+ p256PointAddAffineAsm()
+ p256IsZero()
+ p256PointAddAsm()
+ p256PointDoubleAsm()
+ Generate()
+
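+	// The internal routines listed below are declared with Function() and have
+	// no Go prototypes, so avo emits their TEXT symbols with a leading middle
+	// dot (e.g. ·p256SubInternal). removePeskyUnicodeDot post-processes the
+	// generated file to strip that dot, presumably so the symbols assemble
+	// cleanly without corresponding Go declarations.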
+ internalFunctions := []string{
+ "·p256SubInternal",
+ "·p256MulInternal",
+ "·p256SqrInternal",
+ "·p256IsZero",
+ }
+ removePeskyUnicodeDot(internalFunctions, "../p256_asm_amd64.s")
+}
+
+// Implements:
+//
+// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
+func p256OrdLittleToBig() {
+ Implement("p256OrdLittleToBig")
+ Attributes(NOSPLIT)
+ // Hack to get Avo to output:
+ // JMP ·p256BigToLittle(SB)
+ Instruction(&ir.Instruction{
+ Opcode: "JMP",
+ Operands: []Op{
+ LabelRef("·p256BigToLittle(SB)"),
+ },
+ })
+}
+
+// Implements:
+//
+// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
+func p256OrdBigToLittle() {
+ Implement("p256OrdBigToLittle")
+ Attributes(NOSPLIT)
+ // Hack to get Avo to output:
+ // JMP ·p256BigToLittle(SB)
+ Instruction(&ir.Instruction{
+ Opcode: "JMP",
+ Operands: []Op{
+ LabelRef("·p256BigToLittle(SB)"),
+ },
+ })
+}
+
+// Implements:
+//
+// func p256LittleToBig(res *[32]byte, in *p256Element)
+func p256LittleToBig() {
+ Implement("p256LittleToBig")
+ Attributes(NOSPLIT)
+ // Hack to get Avo to output:
+ // JMP ·p256BigToLittle(SB)
+ Instruction(&ir.Instruction{
+ Opcode: "JMP",
+ Operands: []Op{
+ LabelRef("·p256BigToLittle(SB)"),
+ },
+ })
+}
+
+// Implements:
+//
+// func p256BigToLittle(res *p256Element, in *[32]byte)
+func p256BigToLittle() {
+ Implement("p256BigToLittle")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("in"), x_ptr)
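+
+	// Convert between the 32-byte big-endian encoding and the four
+	// little-endian 64-bit limbs: byte-swap each word and reverse the
+	// word order.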
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), acc0_v1)
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), acc1_v1)
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), acc2_v1)
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), acc3_v1)
+
+ BSWAPQ(acc0_v1)
+ BSWAPQ(acc1_v1)
+ BSWAPQ(acc2_v1)
+ BSWAPQ(acc3_v1)
+
+ MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*3))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256MovCond(res, a, b *P256Point, cond int)
+func p256MovCond() {
+ Implement("p256MovCond")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("a"), x_ptr)
+ Load(Param("b"), y_ptr)
+ Load(Param("cond"), X12)
+
+ PXOR(X13, X13)
+ PSHUFD(Imm(0), X12, X12)
+ PCMPEQL(X13, X12)
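+
+	// X12 is now an all-ones mask when cond == 0 and all-zeros otherwise, so
+	// the PANDN/PAND/PXOR sequence below selects a when cond != 0 and b when
+	// cond == 0, without branching.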
+
+ MOVOU(X12, X0)
+ MOVOU(Mem{Base: x_ptr}.Offset(16*0), X6)
+ PANDN(X6, X0)
+ MOVOU(X12, X1)
+ MOVOU(Mem{Base: x_ptr}.Offset(16*1), X7)
+ PANDN(X7, X1)
+ MOVOU(X12, X2)
+ MOVOU(Mem{Base: x_ptr}.Offset(16*2), X8)
+ PANDN(X8, X2)
+ MOVOU(X12, X3)
+ MOVOU(Mem{Base: x_ptr}.Offset(16*3), X9)
+ PANDN(X9, X3)
+ MOVOU(X12, X4)
+ MOVOU(Mem{Base: x_ptr}.Offset(16*4), X10)
+ PANDN(X10, X4)
+ MOVOU(X12, X5)
+ MOVOU(Mem{Base: x_ptr}.Offset(16*5), X11)
+ PANDN(X11, X5)
+
+ MOVOU(Mem{Base: y_ptr}.Offset(16*0), X6)
+ MOVOU(Mem{Base: y_ptr}.Offset(16*1), X7)
+ MOVOU(Mem{Base: y_ptr}.Offset(16*2), X8)
+ MOVOU(Mem{Base: y_ptr}.Offset(16*3), X9)
+ MOVOU(Mem{Base: y_ptr}.Offset(16*4), X10)
+ MOVOU(Mem{Base: y_ptr}.Offset(16*5), X11)
+
+ PAND(X12, X6)
+ PAND(X12, X7)
+ PAND(X12, X8)
+ PAND(X12, X9)
+ PAND(X12, X10)
+ PAND(X12, X11)
+
+ PXOR(X6, X0)
+ PXOR(X7, X1)
+ PXOR(X8, X2)
+ PXOR(X9, X3)
+ PXOR(X10, X4)
+ PXOR(X11, X5)
+
+ MOVOU(X0, Mem{Base: res_ptr}.Offset(16*0))
+ MOVOU(X1, Mem{Base: res_ptr}.Offset(16*1))
+ MOVOU(X2, Mem{Base: res_ptr}.Offset(16*2))
+ MOVOU(X3, Mem{Base: res_ptr}.Offset(16*3))
+ MOVOU(X4, Mem{Base: res_ptr}.Offset(16*4))
+ MOVOU(X5, Mem{Base: res_ptr}.Offset(16*5))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256NegCond(val *p256Element, cond int)
+func p256NegCond() {
+ Implement("p256NegCond")
+ Attributes(NOSPLIT)
+
+ Load(Param("val"), res_ptr)
+ Load(Param("cond"), t0_v1)
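+
+	// If cond != 0, val is replaced with p - val (the negation modulo p);
+	// otherwise it is left unchanged. The subtraction is always performed and
+	// the original value is restored with conditional moves.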
+
+ Comment("acc = poly")
+ MOVQ(I32(-1), acc0_v1)
+ p256const0 := p256const0_DATA()
+ MOVQ(p256const0, acc1_v1)
+ MOVQ(I32(0), acc2_v1)
+ p256const1 := p256const1_DATA()
+ MOVQ(p256const1, acc3_v1)
+
+ Comment("Load the original value")
+ MOVQ(Mem{Base: res_ptr}.Offset(8*0), acc5_v1)
+ MOVQ(Mem{Base: res_ptr}.Offset(8*1), x_ptr)
+ MOVQ(Mem{Base: res_ptr}.Offset(8*2), y_ptr)
+ MOVQ(Mem{Base: res_ptr}.Offset(8*3), t1_v1)
+
+ Comment("Speculatively subtract")
+ SUBQ(acc5_v1, acc0_v1)
+ SBBQ(x_ptr, acc1_v1)
+ SBBQ(y_ptr, acc2_v1)
+ SBBQ(t1_v1, acc3_v1)
+
+ Comment("If condition is 0, keep original value")
+ TESTQ(t0_v1, t0_v1)
+ CMOVQEQ(acc5_v1, acc0_v1)
+ CMOVQEQ(x_ptr, acc1_v1)
+ CMOVQEQ(y_ptr, acc2_v1)
+ CMOVQEQ(t1_v1, acc3_v1)
+
+ Comment("Store result")
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256Sqr(res, in *p256Element, n int)
+func p256Sqr() {
+ Implement("p256Sqr")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("in"), x_ptr)
+ Load(Param("n"), RBX)
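+
+	// Each iteration computes one Montgomery squaring: a schoolbook square of
+	// the four limbs, four reduction steps, and a final conditional
+	// subtraction of p.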
+
+ Label("sqrLoop")
+
+ Comment("y[1:] * y[0]")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ MOVQ(RAX, acc1_v1)
+ MOVQ(RDX, acc2_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc3_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc4_v1)
+
+ Comment("y[2:] * y[1]")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc5_v1)
+
+ Comment("y[3] * y[2]")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, y_ptr)
+ XORQ(t1_v1, t1_v1)
+
+ Comment("*2")
+ ADDQ(acc1_v1, acc1_v1)
+ ADCQ(acc2_v1, acc2_v1)
+ ADCQ(acc3_v1, acc3_v1)
+ ADCQ(acc4_v1, acc4_v1)
+ ADCQ(acc5_v1, acc5_v1)
+ ADCQ(y_ptr, y_ptr)
+ ADCQ(Imm(0), t1_v1)
+
+ Comment("Missing products")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(RAX)
+ MOVQ(RAX, acc0_v1)
+ MOVQ(RDX, t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(RAX)
+ ADDQ(t0_v1, acc1_v1)
+ ADCQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(RAX)
+ ADDQ(t0_v1, acc3_v1)
+ ADCQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(RAX)
+ ADDQ(t0_v1, acc5_v1)
+ ADCQ(RAX, y_ptr)
+ ADCQ(RDX, t1_v1)
+ MOVQ(t1_v1, x_ptr)
+
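+	// Montgomery reduction: p ≡ -1 (mod 2^64), so the per-step reduction
+	// multiplier is the low limb itself, and its product with p is folded in
+	// using only 32-bit shifts and a single MULQ by p256const1
+	// (0xffffffff00000001).
+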
+ Comment("First reduction step")
+ MOVQ(acc0_v1, RAX)
+ MOVQ(acc0_v1, t1_v1)
+ SHLQ(Imm(32), acc0_v1)
+
+ p256const1 := p256const1_DATA()
+ MULQ(p256const1)
+
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc0_v1, acc1_v1)
+ ADCQ(t1_v1, acc2_v1)
+ ADCQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc0_v1)
+
+ Comment("Second reduction step")
+ MOVQ(acc1_v1, RAX)
+ MOVQ(acc1_v1, t1_v1)
+ SHLQ(Imm(32), acc1_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc1_v1, acc2_v1)
+ ADCQ(t1_v1, acc3_v1)
+ ADCQ(RAX, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc1_v1)
+
+ Comment("Third reduction step")
+ MOVQ(acc2_v1, RAX)
+ MOVQ(acc2_v1, t1_v1)
+ SHLQ(Imm(32), acc2_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc2_v1, acc3_v1)
+ ADCQ(t1_v1, acc0_v1)
+ ADCQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc2_v1)
+
+ Comment("Last reduction step")
+ XORQ(t0_v1, t0_v1)
+ MOVQ(acc3_v1, RAX)
+ MOVQ(acc3_v1, t1_v1)
+ SHLQ(Imm(32), acc3_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc3_v1, acc0_v1)
+ ADCQ(t1_v1, acc1_v1)
+ ADCQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc3_v1)
+
+ Comment("Add bits [511:256] of the sqr result")
+ ADCQ(acc4_v1, acc0_v1)
+ ADCQ(acc5_v1, acc1_v1)
+ ADCQ(y_ptr, acc2_v1)
+ ADCQ(x_ptr, acc3_v1)
+ ADCQ(Imm(0), t0_v1)
+
+ MOVQ(acc0_v1, acc4_v1)
+ MOVQ(acc1_v1, acc5_v1)
+ MOVQ(acc2_v1, y_ptr)
+ MOVQ(acc3_v1, t1_v1)
+
+ Comment("Subtract p256")
+ SUBQ(I8(-1), acc0_v1)
+
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, acc1_v1)
+ SBBQ(Imm(0), acc2_v1)
+ SBBQ(p256const1, acc3_v1)
+ SBBQ(Imm(0), t0_v1)
+
+ CMOVQCS(acc4_v1, acc0_v1)
+ CMOVQCS(acc5_v1, acc1_v1)
+ CMOVQCS(y_ptr, acc2_v1)
+ CMOVQCS(t1_v1, acc3_v1)
+
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3))
+ MOVQ(res_ptr, x_ptr)
+ DECQ(RBX)
+ JNE(LabelRef("sqrLoop"))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256Mul(res, in1, in2 *p256Element)
+func p256Mul() {
+ Implement("p256Mul")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("in1"), x_ptr)
+ Load(Param("in2"), y_ptr)
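+
+	// Montgomery product: the four word-by-word multiplications are
+	// interleaved with a reduction step after each, followed by a final
+	// conditional subtraction of p.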
+
+ Comment("x * y[0]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*0), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ MOVQ(RAX, acc0_v1)
+ MOVQ(RDX, acc1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc2_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc3_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc4_v1)
+ XORQ(acc5_v1, acc5_v1)
+
+ Comment("First reduction step")
+ MOVQ(acc0_v1, RAX)
+ MOVQ(acc0_v1, t1_v1)
+ SHLQ(Imm(32), acc0_v1)
+ p256const1 := p256const1_DATA()
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc0_v1, acc1_v1)
+ ADCQ(t1_v1, acc2_v1)
+ ADCQ(RAX, acc3_v1)
+ ADCQ(RDX, acc4_v1)
+ ADCQ(Imm(0), acc5_v1)
+ XORQ(acc0_v1, acc0_v1)
+
+ Comment("x * y[1]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*1), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(RDX, acc5_v1)
+ ADCQ(Imm(0), acc0_v1)
+
+ Comment("Second reduction step")
+ MOVQ(acc1_v1, RAX)
+ MOVQ(acc1_v1, t1_v1)
+ SHLQ(Imm(32), acc1_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc1_v1, acc2_v1)
+ ADCQ(t1_v1, acc3_v1)
+ ADCQ(RAX, acc4_v1)
+ ADCQ(RDX, acc5_v1)
+ ADCQ(Imm(0), acc0_v1)
+ XORQ(acc1_v1, acc1_v1)
+
+ Comment("x * y[2]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*2), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(RDX, acc0_v1)
+ ADCQ(Imm(0), acc1_v1)
+
+ Comment("Third reduction step")
+ MOVQ(acc2_v1, RAX)
+ MOVQ(acc2_v1, t1_v1)
+ SHLQ(Imm(32), acc2_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc2_v1, acc3_v1)
+ ADCQ(t1_v1, acc4_v1)
+ ADCQ(RAX, acc5_v1)
+ ADCQ(RDX, acc0_v1)
+ ADCQ(Imm(0), acc1_v1)
+ XORQ(acc2_v1, acc2_v1)
+ Comment("x * y[3]")
+
+ MOVQ(Mem{Base: y_ptr}.Offset(8*3), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc0_v1)
+ ADCQ(RDX, acc1_v1)
+ ADCQ(Imm(0), acc2_v1)
+
+ Comment("Last reduction step")
+ MOVQ(acc3_v1, RAX)
+ MOVQ(acc3_v1, t1_v1)
+ SHLQ(Imm(32), acc3_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc3_v1, acc4_v1)
+ ADCQ(t1_v1, acc5_v1)
+ ADCQ(RAX, acc0_v1)
+ ADCQ(RDX, acc1_v1)
+ ADCQ(Imm(0), acc2_v1)
+
+ Comment("Copy result [255:0]")
+ MOVQ(acc4_v1, x_ptr)
+ MOVQ(acc5_v1, acc3_v1)
+ MOVQ(acc0_v1, t0_v1)
+ MOVQ(acc1_v1, t1_v1)
+
+ Comment("Subtract p256")
+ SUBQ(I8(-1), acc4_v1)
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, acc5_v1)
+ SBBQ(Imm(0), acc0_v1)
+ // SBBQ p256const1<>(SB), acc1_v1
+ SBBQ(p256const1, acc1_v1)
+ SBBQ(Imm(0), acc2_v1)
+
+ CMOVQCS(x_ptr, acc4_v1)
+ CMOVQCS(acc3_v1, acc5_v1)
+ CMOVQCS(t0_v1, acc0_v1)
+ CMOVQCS(t1_v1, acc1_v1)
+
+ MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256FromMont(res, in *p256Element)
+func p256FromMont() {
+ Implement("p256FromMont")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("in"), x_ptr)
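+
+	// Montgomery reduction of the bare input (equivalent to multiplying by 1),
+	// which takes the value out of the Montgomery domain.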
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), acc0_v1)
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), acc1_v1)
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), acc2_v1)
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), acc3_v1)
+ XORQ(acc4_v1, acc4_v1)
+
+ Comment("Only reduce, no multiplications are needed")
+ Comment("First stage")
+ MOVQ(acc0_v1, RAX)
+ MOVQ(acc0_v1, t1_v1)
+ SHLQ(Imm(32), acc0_v1)
+ p256const1 := p256const1_DATA()
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc0_v1, acc1_v1)
+ ADCQ(t1_v1, acc2_v1)
+ ADCQ(RAX, acc3_v1)
+ ADCQ(RDX, acc4_v1)
+ XORQ(acc5_v1, acc5_v1)
+
+ Comment("Second stage")
+ MOVQ(acc1_v1, RAX)
+ MOVQ(acc1_v1, t1_v1)
+ SHLQ(Imm(32), acc1_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc1_v1, acc2_v1)
+ ADCQ(t1_v1, acc3_v1)
+ ADCQ(RAX, acc4_v1)
+ ADCQ(RDX, acc5_v1)
+ XORQ(acc0_v1, acc0_v1)
+
+ Comment("Third stage")
+ MOVQ(acc2_v1, RAX)
+ MOVQ(acc2_v1, t1_v1)
+ SHLQ(Imm(32), acc2_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc2_v1, acc3_v1)
+ ADCQ(t1_v1, acc4_v1)
+ ADCQ(RAX, acc5_v1)
+ ADCQ(RDX, acc0_v1)
+ XORQ(acc1_v1, acc1_v1)
+
+ Comment("Last stage")
+ MOVQ(acc3_v1, RAX)
+ MOVQ(acc3_v1, t1_v1)
+ SHLQ(Imm(32), acc3_v1)
+ MULQ(p256const1)
+ SHRQ(Imm(32), t1_v1)
+ ADDQ(acc3_v1, acc4_v1)
+ ADCQ(t1_v1, acc5_v1)
+ ADCQ(RAX, acc0_v1)
+ ADCQ(RDX, acc1_v1)
+
+ MOVQ(acc4_v1, x_ptr)
+ MOVQ(acc5_v1, acc3_v1)
+ MOVQ(acc0_v1, t0_v1)
+ MOVQ(acc1_v1, t1_v1)
+
+ SUBQ(I8(-1), acc4_v1)
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, acc5_v1)
+ SBBQ(Imm(0), acc0_v1)
+ SBBQ(p256const1, acc1_v1)
+
+ CMOVQCS(x_ptr, acc4_v1)
+ CMOVQCS(acc3_v1, acc5_v1)
+ CMOVQCS(t0_v1, acc0_v1)
+ CMOVQCS(t1_v1, acc1_v1)
+
+ MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256Select(res *P256Point, table *p256Table, idx int)
+func p256Select() {
+ Implement("p256Select")
+ Attributes(NOSPLIT)
+
+ Load(Param("idx"), RAX)
+ Load(Param("table"), RDI)
+ Load(Param("res"), RDX)
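+
+	// Constant-time table lookup: all 16 entries are read and masked, so the
+	// memory access pattern does not depend on idx.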
+
+ PXOR(X15, X15) // X15 = 0
+ PCMPEQL(X14, X14) // X14 = -1
+ PSUBL(X14, X15) // X15 = 1
+ // Force Avo to emit:
+ // MOVL AX, X14
+ Instruction(&ir.Instruction{
+ Opcode: "MOVL",
+ Operands: []Op{
+ EAX, X14,
+ },
+ })
+ PSHUFD(Imm(0), X14, X14)
+
+ PXOR(X0, X0)
+ PXOR(X1, X1)
+ PXOR(X2, X2)
+ PXOR(X3, X3)
+ PXOR(X4, X4)
+ PXOR(X5, X5)
+ MOVQ(U32(16), RAX)
+
+ MOVOU(X15, X13)
+
+ Label("loop_select")
+
+ MOVOU(X13, X12)
+ PADDL(X15, X13)
+ PCMPEQL(X14, X12)
+
+ MOVOU(Mem{Base: DI}.Offset(16*0), X6)
+ MOVOU(Mem{Base: DI}.Offset(16*1), X7)
+ MOVOU(Mem{Base: DI}.Offset(16*2), X8)
+ MOVOU(Mem{Base: DI}.Offset(16*3), X9)
+ MOVOU(Mem{Base: DI}.Offset(16*4), X10)
+ MOVOU(Mem{Base: DI}.Offset(16*5), X11)
+ ADDQ(U8(16*6), RDI)
+
+ PAND(X12, X6)
+ PAND(X12, X7)
+ PAND(X12, X8)
+ PAND(X12, X9)
+ PAND(X12, X10)
+ PAND(X12, X11)
+
+ PXOR(X6, X0)
+ PXOR(X7, X1)
+ PXOR(X8, X2)
+ PXOR(X9, X3)
+ PXOR(X10, X4)
+ PXOR(X11, X5)
+
+ DECQ(RAX)
+ JNE(LabelRef("loop_select"))
+
+ MOVOU(X0, Mem{Base: DX}.Offset(16*0))
+ MOVOU(X1, Mem{Base: DX}.Offset(16*1))
+ MOVOU(X2, Mem{Base: DX}.Offset(16*2))
+ MOVOU(X3, Mem{Base: DX}.Offset(16*3))
+ MOVOU(X4, Mem{Base: DX}.Offset(16*4))
+ MOVOU(X5, Mem{Base: DX}.Offset(16*5))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
+func p256SelectAffine() {
+ Implement("p256SelectAffine")
+ Attributes(NOSPLIT)
+
+ Load(Param("idx"), RAX)
+ Load(Param("table"), RDI)
+ Load(Param("res"), RDX)
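+
+	// Constant-time lookup over the 32-entry affine table: every iteration
+	// reads two consecutive 64-byte entries and masks each with its own
+	// comparison result, so all entries are touched regardless of idx.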
+
+ PXOR(X15, X15) // X15 = 0
+ PCMPEQL(X14, X14) // X14 = -1
+ PSUBL(X14, X15) // X15 = 1
+
+ // Hack to get Avo to emit:
+ // MOVL AX, X14
+ Instruction(&ir.Instruction{Opcode: "MOVL", Operands: []Op{RAX, X14}})
+
+ PSHUFD(Imm(0), X14, X14)
+
+ PXOR(X0, X0)
+ PXOR(X1, X1)
+ PXOR(X2, X2)
+ PXOR(X3, X3)
+ MOVQ(U32(16), RAX)
+
+ MOVOU(X15, X13)
+
+ Label("loop_select_base")
+
+ MOVOU(X13, X12)
+ PADDL(X15, X13)
+ PCMPEQL(X14, X12)
+
+ MOVOU(Mem{Base: DI}.Offset(16*0), X4)
+ MOVOU(Mem{Base: DI}.Offset(16*1), X5)
+ MOVOU(Mem{Base: DI}.Offset(16*2), X6)
+ MOVOU(Mem{Base: DI}.Offset(16*3), X7)
+
+ MOVOU(Mem{Base: DI}.Offset(16*4), X8)
+ MOVOU(Mem{Base: DI}.Offset(16*5), X9)
+ MOVOU(Mem{Base: DI}.Offset(16*6), X10)
+ MOVOU(Mem{Base: DI}.Offset(16*7), X11)
+
+ ADDQ(Imm(16*8), RDI)
+
+ PAND(X12, X4)
+ PAND(X12, X5)
+ PAND(X12, X6)
+ PAND(X12, X7)
+
+ MOVOU(X13, X12)
+ PADDL(X15, X13)
+ PCMPEQL(X14, X12)
+
+ PAND(X12, X8)
+ PAND(X12, X9)
+ PAND(X12, X10)
+ PAND(X12, X11)
+
+ PXOR(X4, X0)
+ PXOR(X5, X1)
+ PXOR(X6, X2)
+ PXOR(X7, X3)
+
+ PXOR(X8, X0)
+ PXOR(X9, X1)
+ PXOR(X10, X2)
+ PXOR(X11, X3)
+
+ DECQ(RAX)
+ JNE(LabelRef("loop_select_base"))
+
+ MOVOU(X0, Mem{Base: DX}.Offset(16*0))
+ MOVOU(X1, Mem{Base: DX}.Offset(16*1))
+ MOVOU(X2, Mem{Base: DX}.Offset(16*2))
+ MOVOU(X3, Mem{Base: DX}.Offset(16*3))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256OrdMul(res, in1, in2 *p256OrdElement)
+func p256OrdMul() {
+ Implement("p256OrdMul")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("in1"), x_ptr)
+ Load(Param("in2"), y_ptr)
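+
+	// Montgomery multiplication modulo the group order n: each reduction step
+	// first multiplies by p256ordK0 = -n^-1 mod 2^64 to obtain the reduction
+	// multiplier, then accumulates multiplier*n using the stored limbs of
+	// p256ord.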
+
+ Comment("x * y[0]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*0), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ MOVQ(RAX, acc0_v1)
+ MOVQ(RDX, acc1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc2_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc3_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc4_v1)
+ XORQ(acc5_v1, acc5_v1)
+
+ Comment("First reduction step")
+ MOVQ(acc0_v1, RAX)
+ p256ordK0 := p256ordK0_DATA()
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ p256ord := p256ord_DATA()
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x10), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x18), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(RDX, acc4_v1)
+ ADCQ(Imm(0), acc5_v1)
+
+ Comment("x * y[1]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*1), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(RDX, acc5_v1)
+ ADCQ(Imm(0), acc0_v1)
+
+ Comment("Second reduction step")
+ MOVQ(acc1_v1, RAX)
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x10), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x18), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(RDX, acc5_v1)
+ ADCQ(Imm(0), acc0_v1)
+
+ Comment("x * y[2]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*2), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(RDX, acc0_v1)
+ ADCQ(Imm(0), acc1_v1)
+
+ Comment("Third reduction step")
+ MOVQ(acc2_v1, RAX)
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x10), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x18), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(RDX, acc0_v1)
+ ADCQ(Imm(0), acc1_v1)
+
+ Comment("x * y[3]")
+ MOVQ(Mem{Base: y_ptr}.Offset(8*3), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc0_v1)
+ ADCQ(RDX, acc1_v1)
+ ADCQ(Imm(0), acc2_v1)
+
+ Comment("Last reduction step")
+ MOVQ(acc3_v1, RAX)
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x10), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x18), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc0_v1)
+ ADCQ(RDX, acc1_v1)
+ ADCQ(Imm(0), acc2_v1)
+
+ Comment("Copy result [255:0]")
+ MOVQ(acc4_v1, x_ptr)
+ MOVQ(acc5_v1, acc3_v1)
+ MOVQ(acc0_v1, t0_v1)
+ MOVQ(acc1_v1, t1_v1)
+
+ Comment("Subtract p256")
+ SUBQ(p256ord.Offset(0x00), acc4_v1)
+ SBBQ(p256ord.Offset(0x08), acc5_v1)
+ SBBQ(p256ord.Offset(0x10), acc0_v1)
+ SBBQ(p256ord.Offset(0x18), acc1_v1)
+ SBBQ(Imm(0), acc2_v1)
+
+ CMOVQCS(x_ptr, acc4_v1)
+ CMOVQCS(acc3_v1, acc5_v1)
+ CMOVQCS(t0_v1, acc0_v1)
+ CMOVQCS(t1_v1, acc1_v1)
+
+ MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3))
+
+ RET()
+}
+
+// Implements:
+//
+// func p256OrdSqr(res, in *p256OrdElement, n int)
+func p256OrdSqr() {
+ Implement("p256OrdSqr")
+ Attributes(NOSPLIT)
+
+ Load(Param("res"), res_ptr)
+ Load(Param("in"), x_ptr)
+ Load(Param("n"), RBX)
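+
+	// Montgomery squaring modulo the group order, repeated n times. Each
+	// iteration performs a schoolbook square followed by four reduction steps
+	// driven by p256ordK0; the shift-based adjustments exploit the form of
+	// the two upper limbs of the order.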
+
+ Label("ordSqrLoop")
+
+ Comment("y[1:] * y[0]")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(t0_v1)
+ MOVQ(RAX, acc1_v1)
+ MOVQ(RDX, acc2_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc3_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc4_v1)
+
+ Comment("y[2:] * y[1]")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc5_v1)
+
+ Comment("y[3] * y[2]")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc5_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, y_ptr)
+ XORQ(t1_v1, t1_v1)
+
+ Comment("*2")
+ ADDQ(acc1_v1, acc1_v1)
+ ADCQ(acc2_v1, acc2_v1)
+ ADCQ(acc3_v1, acc3_v1)
+ ADCQ(acc4_v1, acc4_v1)
+ ADCQ(acc5_v1, acc5_v1)
+ ADCQ(y_ptr, y_ptr)
+ ADCQ(Imm(0), t1_v1)
+
+ Comment("Missing products")
+ MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
+ MULQ(RAX)
+ MOVQ(RAX, acc0_v1)
+ MOVQ(RDX, t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
+ MULQ(RAX)
+ ADDQ(t0_v1, acc1_v1)
+ ADCQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
+ MULQ(RAX)
+ ADDQ(t0_v1, acc3_v1)
+ ADCQ(RAX, acc4_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t0_v1)
+
+ MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
+ MULQ(RAX)
+ ADDQ(t0_v1, acc5_v1)
+ ADCQ(RAX, y_ptr)
+ ADCQ(RDX, t1_v1)
+ MOVQ(t1_v1, x_ptr)
+
+ Comment("First reduction step")
+ MOVQ(acc0_v1, RAX)
+ p256ordK0 := p256ordK0_DATA()
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ p256ord := p256ord_DATA()
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc1_v1)
+
+ MOVQ(t0_v1, t1_v1)
+ ADCQ(RDX, acc2_v1)
+ ADCQ(Imm(0), t1_v1)
+ SUBQ(t0_v1, acc2_v1)
+ SBBQ(Imm(0), t1_v1)
+
+ MOVQ(t0_v1, RAX)
+ MOVQ(t0_v1, RDX)
+ MOVQ(t0_v1, acc0_v1)
+ SHLQ(Imm(32), RAX)
+ SHRQ(Imm(32), RDX)
+
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), acc0_v1)
+ SUBQ(RAX, acc3_v1)
+ SBBQ(RDX, acc0_v1)
+
+ Comment("Second reduction step")
+ MOVQ(acc1_v1, RAX)
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc1_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc2_v1)
+
+ MOVQ(t0_v1, t1_v1)
+ ADCQ(RDX, acc3_v1)
+ ADCQ(Imm(0), t1_v1)
+ SUBQ(t0_v1, acc3_v1)
+ SBBQ(Imm(0), t1_v1)
+
+ MOVQ(t0_v1, RAX)
+ MOVQ(t0_v1, RDX)
+ MOVQ(t0_v1, acc1_v1)
+ SHLQ(Imm(32), RAX)
+ SHRQ(Imm(32), RDX)
+
+ ADDQ(t1_v1, acc0_v1)
+ ADCQ(Imm(0), acc1_v1)
+ SUBQ(RAX, acc0_v1)
+ SBBQ(RDX, acc1_v1)
+
+ Comment("Third reduction step")
+ MOVQ(acc2_v1, RAX)
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc2_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc3_v1)
+
+ MOVQ(t0_v1, t1_v1)
+ ADCQ(RDX, acc0_v1)
+ ADCQ(Imm(0), t1_v1)
+ SUBQ(t0_v1, acc0_v1)
+ SBBQ(Imm(0), t1_v1)
+
+ MOVQ(t0_v1, RAX)
+ MOVQ(t0_v1, RDX)
+ MOVQ(t0_v1, acc2_v1)
+ SHLQ(Imm(32), RAX)
+ SHRQ(Imm(32), RDX)
+
+ ADDQ(t1_v1, acc1_v1)
+ ADCQ(Imm(0), acc2_v1)
+ SUBQ(RAX, acc1_v1)
+ SBBQ(RDX, acc2_v1)
+
+ Comment("Last reduction step")
+ MOVQ(acc3_v1, RAX)
+ MULQ(p256ordK0)
+ MOVQ(RAX, t0_v1)
+
+ MOVQ(p256ord.Offset(0x00), RAX)
+ MULQ(t0_v1)
+ ADDQ(RAX, acc3_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(p256ord.Offset(0x08), RAX)
+ MULQ(t0_v1)
+ ADDQ(t1_v1, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ ADDQ(RAX, acc0_v1)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, t1_v1)
+
+ MOVQ(t0_v1, t1_v1)
+ ADCQ(RDX, acc1_v1)
+ ADCQ(Imm(0), t1_v1)
+ SUBQ(t0_v1, acc1_v1)
+ SBBQ(Imm(0), t1_v1)
+
+ MOVQ(t0_v1, RAX)
+ MOVQ(t0_v1, RDX)
+ MOVQ(t0_v1, acc3_v1)
+ SHLQ(Imm(32), RAX)
+ SHRQ(Imm(32), RDX)
+
+ ADDQ(t1_v1, acc2_v1)
+ ADCQ(Imm(0), acc3_v1)
+ SUBQ(RAX, acc2_v1)
+ SBBQ(RDX, acc3_v1)
+ XORQ(t0_v1, t0_v1)
+
+ Comment("Add bits [511:256] of the sqr result")
+ ADCQ(acc4_v1, acc0_v1)
+ ADCQ(acc5_v1, acc1_v1)
+ ADCQ(y_ptr, acc2_v1)
+ ADCQ(x_ptr, acc3_v1)
+ ADCQ(Imm(0), t0_v1)
+
+ MOVQ(acc0_v1, acc4_v1)
+ MOVQ(acc1_v1, acc5_v1)
+ MOVQ(acc2_v1, y_ptr)
+ MOVQ(acc3_v1, t1_v1)
+
+ Comment("Subtract p256")
+ SUBQ(p256ord.Offset(0x00), acc0_v1)
+ SBBQ(p256ord.Offset(0x08), acc1_v1)
+ SBBQ(p256ord.Offset(0x10), acc2_v1)
+ SBBQ(p256ord.Offset(0x18), acc3_v1)
+ SBBQ(Imm(0), t0_v1)
+
+ CMOVQCS(acc4_v1, acc0_v1)
+ CMOVQCS(acc5_v1, acc1_v1)
+ CMOVQCS(y_ptr, acc2_v1)
+ CMOVQCS(t1_v1, acc3_v1)
+
+ MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0))
+ MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1))
+ MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2))
+ MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3))
+ MOVQ(res_ptr, x_ptr)
+ DECQ(RBX)
+ JNE(LabelRef("ordSqrLoop"))
+
+ RET()
+}
+
+// These variables have been versioned because they are redefined in the reference implementation.
+// This is done to produce a minimal semantic diff.
+var (
+ mul0_v2 = RAX
+ mul1_v2 = RDX
+ acc0_v2 = RBX
+ acc1_v2 = RCX
+ acc2_v2 = R8
+ acc3_v2 = R9
+ acc4_v2 = R10
+ acc5_v2 = R11
+ acc6_v2 = R12
+ acc7_v2 = R13
+ t0_v2 = R14
+ t1_v2 = R15
+ t2_v2 = RDI
+ t3_v2 = RSI
+ hlp_v2 = RBP
+)
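+
+// The p256*Internal routines share a fixed register convention: inputs arrive
+// in acc4..acc7 (and, for two-operand routines, t0..t3), and the result is
+// returned in acc4..acc7.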
+
+func p256SubInternal() {
+ Function("p256SubInternal")
+ Attributes(NOSPLIT)
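+
+	// [acc4..acc7] = [acc4..acc7] - [t0..t3] mod p: subtract, then add p back
+	// and keep whichever result did not underflow.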
+
+ XORQ(mul0_v2, mul0_v2)
+ SUBQ(t0_v2, acc4_v2)
+ SBBQ(t1_v2, acc5_v2)
+ SBBQ(t2_v2, acc6_v2)
+ SBBQ(t3_v2, acc7_v2)
+ SBBQ(Imm(0), mul0_v2)
+
+ MOVQ(acc4_v2, acc0_v2)
+ MOVQ(acc5_v2, acc1_v2)
+ MOVQ(acc6_v2, acc2_v2)
+ MOVQ(acc7_v2, acc3_v2)
+
+ ADDQ(I8(-1), acc4_v2)
+ p256const0 := p256const0_DATA()
+ ADCQ(p256const0, acc5_v2)
+ ADCQ(Imm(0), acc6_v2)
+ p256const1 := p256const1_DATA()
+ ADCQ(p256const1, acc7_v2)
+ ANDQ(Imm(1), mul0_v2)
+
+ CMOVQEQ(acc0_v2, acc4_v2)
+ CMOVQEQ(acc1_v2, acc5_v2)
+ CMOVQEQ(acc2_v2, acc6_v2)
+ CMOVQEQ(acc3_v2, acc7_v2)
+
+ RET()
+}
+
+func p256MulInternal() {
+ Function("p256MulInternal")
+ Attributes(NOSPLIT)
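+
+	// Montgomery product: [acc4..acc7] = [acc4..acc7] * [t0..t3] * 2^-256
+	// mod p. t0..t3 are read but not modified.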
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(t0_v2)
+ MOVQ(mul0_v2, acc0_v2)
+ MOVQ(mul1_v2, acc1_v2)
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(t1_v2)
+ ADDQ(mul0_v2, acc1_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc2_v2)
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(t2_v2)
+ ADDQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc3_v2)
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(t3_v2)
+ ADDQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc4_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(t0_v2)
+ ADDQ(mul0_v2, acc1_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(t1_v2)
+ ADDQ(hlp_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(t2_v2)
+ ADDQ(hlp_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(t3_v2)
+ ADDQ(hlp_v2, acc4_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc4_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc5_v2)
+
+ MOVQ(acc6_v2, mul0_v2)
+ MULQ(t0_v2)
+ ADDQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc6_v2, mul0_v2)
+ MULQ(t1_v2)
+ ADDQ(hlp_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc6_v2, mul0_v2)
+ MULQ(t2_v2)
+ ADDQ(hlp_v2, acc4_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc4_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc6_v2, mul0_v2)
+ MULQ(t3_v2)
+ ADDQ(hlp_v2, acc5_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc5_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc6_v2)
+
+ MOVQ(acc7_v2, mul0_v2)
+ MULQ(t0_v2)
+ ADDQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc7_v2, mul0_v2)
+ MULQ(t1_v2)
+ ADDQ(hlp_v2, acc4_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc4_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc7_v2, mul0_v2)
+ MULQ(t2_v2)
+ ADDQ(hlp_v2, acc5_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc5_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc7_v2, mul0_v2)
+ MULQ(t3_v2)
+ ADDQ(hlp_v2, acc6_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, acc6_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc7_v2)
+
+ Comment("First reduction step")
+ MOVQ(acc0_v2, mul0_v2)
+ MOVQ(acc0_v2, hlp_v2)
+ SHLQ(Imm(32), acc0_v2)
+ p256const1 := p256const1_DATA()
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc0_v2, acc1_v2)
+ ADCQ(hlp_v2, acc2_v2)
+ ADCQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc0_v2)
+
+ Comment("Second reduction step")
+ MOVQ(acc1_v2, mul0_v2)
+ MOVQ(acc1_v2, hlp_v2)
+ SHLQ(Imm(32), acc1_v2)
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc1_v2, acc2_v2)
+ ADCQ(hlp_v2, acc3_v2)
+ ADCQ(mul0_v2, acc0_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc1_v2)
+
+ Comment("Third reduction step")
+ MOVQ(acc2_v2, mul0_v2)
+ MOVQ(acc2_v2, hlp_v2)
+ SHLQ(Imm(32), acc2_v2)
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc2_v2, acc3_v2)
+ ADCQ(hlp_v2, acc0_v2)
+ ADCQ(mul0_v2, acc1_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc2_v2)
+
+ Comment("Last reduction step")
+ MOVQ(acc3_v2, mul0_v2)
+ MOVQ(acc3_v2, hlp_v2)
+ SHLQ(Imm(32), acc3_v2)
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc3_v2, acc0_v2)
+ ADCQ(hlp_v2, acc1_v2)
+ ADCQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc3_v2)
+ MOVQ(U32(0), RBP)
+
+ Comment("Add bits [511:256] of the result")
+ ADCQ(acc0_v2, acc4_v2)
+ ADCQ(acc1_v2, acc5_v2)
+ ADCQ(acc2_v2, acc6_v2)
+ ADCQ(acc3_v2, acc7_v2)
+ ADCQ(Imm(0), hlp_v2)
+
+ Comment("Copy result")
+ MOVQ(acc4_v2, acc0_v2)
+ MOVQ(acc5_v2, acc1_v2)
+ MOVQ(acc6_v2, acc2_v2)
+ MOVQ(acc7_v2, acc3_v2)
+
+ Comment("Subtract p256")
+ SUBQ(I8(-1), acc4_v2)
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, acc5_v2)
+ SBBQ(Imm(0), acc6_v2)
+ SBBQ(p256const1, acc7_v2)
+ SBBQ(Imm(0), hlp_v2)
+
+ Comment("If the result of the subtraction is negative, restore the previous result")
+ CMOVQCS(acc0_v2, acc4_v2)
+ CMOVQCS(acc1_v2, acc5_v2)
+ CMOVQCS(acc2_v2, acc6_v2)
+ CMOVQCS(acc3_v2, acc7_v2)
+
+ RET()
+}
+
+func p256SqrInternal() {
+ Function("p256SqrInternal")
+ Attributes(NOSPLIT)
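+
+	// Montgomery square: [acc4..acc7] = [acc4..acc7]^2 * 2^-256 mod p.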
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(acc5_v2)
+ MOVQ(mul0_v2, acc1_v2)
+ MOVQ(mul1_v2, acc2_v2)
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(acc6_v2)
+ ADDQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc3_v2)
+
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(acc7_v2)
+ ADDQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, t0_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(acc6_v2)
+ ADDQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, hlp_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(acc7_v2)
+ ADDQ(hlp_v2, t0_v2)
+ ADCQ(Imm(0), mul1_v2)
+ ADDQ(mul0_v2, t0_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, t1_v2)
+
+ MOVQ(acc6_v2, mul0_v2)
+ MULQ(acc7_v2)
+ ADDQ(mul0_v2, t1_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, t2_v2)
+ XORQ(t3_v2, t3_v2)
+
+ Comment("*2")
+ ADDQ(acc1_v2, acc1_v2)
+ ADCQ(acc2_v2, acc2_v2)
+ ADCQ(acc3_v2, acc3_v2)
+ ADCQ(t0_v2, t0_v2)
+ ADCQ(t1_v2, t1_v2)
+ ADCQ(t2_v2, t2_v2)
+ ADCQ(Imm(0), t3_v2)
+
+ Comment("Missing products")
+ MOVQ(acc4_v2, mul0_v2)
+ MULQ(mul0_v2)
+ MOVQ(mul0_v2, acc0_v2)
+ MOVQ(RDX, acc4_v2)
+
+ MOVQ(acc5_v2, mul0_v2)
+ MULQ(mul0_v2)
+ ADDQ(acc4_v2, acc1_v2)
+ ADCQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc4_v2)
+
+ MOVQ(acc6_v2, mul0_v2)
+ MULQ(mul0_v2)
+ ADDQ(acc4_v2, acc3_v2)
+ ADCQ(mul0_v2, t0_v2)
+ ADCQ(Imm(0), RDX)
+ MOVQ(RDX, acc4_v2)
+
+ MOVQ(acc7_v2, mul0_v2)
+ MULQ(mul0_v2)
+ ADDQ(acc4_v2, t1_v2)
+ ADCQ(mul0_v2, t2_v2)
+ ADCQ(RDX, t3_v2)
+
+ Comment("First reduction step")
+ MOVQ(acc0_v2, mul0_v2)
+ MOVQ(acc0_v2, hlp_v2)
+ SHLQ(Imm(32), acc0_v2)
+ p256const1 := p256const1_DATA()
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc0_v2, acc1_v2)
+ ADCQ(hlp_v2, acc2_v2)
+ ADCQ(mul0_v2, acc3_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc0_v2)
+
+ Comment("Second reduction step")
+ MOVQ(acc1_v2, mul0_v2)
+ MOVQ(acc1_v2, hlp_v2)
+ SHLQ(Imm(32), acc1_v2)
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc1_v2, acc2_v2)
+ ADCQ(hlp_v2, acc3_v2)
+ ADCQ(mul0_v2, acc0_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc1_v2)
+
+ Comment("Third reduction step")
+ MOVQ(acc2_v2, mul0_v2)
+ MOVQ(acc2_v2, hlp_v2)
+ SHLQ(Imm(32), acc2_v2)
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc2_v2, acc3_v2)
+ ADCQ(hlp_v2, acc0_v2)
+ ADCQ(mul0_v2, acc1_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc2_v2)
+
+ Comment("Last reduction step")
+ MOVQ(acc3_v2, mul0_v2)
+ MOVQ(acc3_v2, hlp_v2)
+ SHLQ(Imm(32), acc3_v2)
+ MULQ(p256const1)
+ SHRQ(Imm(32), hlp_v2)
+ ADDQ(acc3_v2, acc0_v2)
+ ADCQ(hlp_v2, acc1_v2)
+ ADCQ(mul0_v2, acc2_v2)
+ ADCQ(Imm(0), mul1_v2)
+ MOVQ(mul1_v2, acc3_v2)
+ MOVQ(U32(0), RBP)
+
+ Comment("Add bits [511:256] of the result")
+ ADCQ(acc0_v2, t0_v2)
+ ADCQ(acc1_v2, t1_v2)
+ ADCQ(acc2_v2, t2_v2)
+ ADCQ(acc3_v2, t3_v2)
+ ADCQ(Imm(0), hlp_v2)
+
+ Comment("Copy result")
+ MOVQ(t0_v2, acc4_v2)
+ MOVQ(t1_v2, acc5_v2)
+ MOVQ(t2_v2, acc6_v2)
+ MOVQ(t3_v2, acc7_v2)
+
+ Comment("Subtract p256")
+ SUBQ(I8(-1), acc4_v2)
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, acc5_v2)
+ SBBQ(Imm(0), acc6_v2)
+ SBBQ(p256const1, acc7_v2)
+ SBBQ(Imm(0), hlp_v2)
+
+ Comment("If the result of the subtraction is negative, restore the previous result")
+ CMOVQCS(t0_v2, acc4_v2)
+ CMOVQCS(t1_v2, acc5_v2)
+ CMOVQCS(t2_v2, acc6_v2)
+ CMOVQCS(t3_v2, acc7_v2)
+
+ RET()
+}
+
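+// p256MulBy2Inline emits [t0..t3] = 2*[acc4..acc7] mod p at the call site.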
+func p256MulBy2Inline() {
+ XORQ(mul0_v2, mul0_v2)
+ ADDQ(acc4_v2, acc4_v2)
+ ADCQ(acc5_v2, acc5_v2)
+ ADCQ(acc6_v2, acc6_v2)
+ ADCQ(acc7_v2, acc7_v2)
+ ADCQ(I8(0), mul0_v2)
+ MOVQ(acc4_v2, t0_v2)
+ MOVQ(acc5_v2, t1_v2)
+ MOVQ(acc6_v2, t2_v2)
+ MOVQ(acc7_v2, t3_v2)
+ SUBQ(I8(-1), t0_v2)
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, t1_v2)
+ SBBQ(I8(0), t2_v2)
+ p256const1 := p256const1_DATA()
+ SBBQ(p256const1, t3_v2)
+ SBBQ(I8(0), mul0_v2)
+ CMOVQCS(acc4_v2, t0_v2)
+ CMOVQCS(acc5_v2, t1_v2)
+ CMOVQCS(acc6_v2, t2_v2)
+ CMOVQCS(acc7_v2, t3_v2)
+}
+
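+// p256AddInline emits [t0..t3] = [acc4..acc7] + [t0..t3] mod p at the call site.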
+func p256AddInline() {
+ XORQ(mul0_v2, mul0_v2)
+ ADDQ(t0_v2, acc4_v2)
+ ADCQ(t1_v2, acc5_v2)
+ ADCQ(t2_v2, acc6_v2)
+ ADCQ(t3_v2, acc7_v2)
+ ADCQ(I8(0), mul0_v2)
+ MOVQ(acc4_v2, t0_v2)
+ MOVQ(acc5_v2, t1_v2)
+ MOVQ(acc6_v2, t2_v2)
+ MOVQ(acc7_v2, t3_v2)
+ SUBQ(I8(-1), t0_v2)
+ p256const0 := p256const0_DATA()
+ SBBQ(p256const0, t1_v2)
+ SBBQ(I8(0), t2_v2)
+ p256const1 := p256const1_DATA()
+ SBBQ(p256const1, t3_v2)
+ SBBQ(I8(0), mul0_v2)
+ CMOVQCS(acc4_v2, t0_v2)
+ CMOVQCS(acc5_v2, t1_v2)
+ CMOVQCS(acc6_v2, t2_v2)
+ CMOVQCS(acc7_v2, t3_v2)
+}
+
+/* ---------------------------------------*/
+
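+// A MemFunc addresses one 32-byte (four-limb) temporary on the stack.
+// LDacc/LDt load such a temporary into the acc4..acc7 / t0..t3 register sets,
+// ST/STt store them back, and acc2t/t2acc copy between the two sets.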
+type MemFunc func(off int) Mem
+
+func LDacc(src MemFunc) {
+ MOVQ(src(8*0), acc4_v2)
+ MOVQ(src(8*1), acc5_v2)
+ MOVQ(src(8*2), acc6_v2)
+ MOVQ(src(8*3), acc7_v2)
+}
+
+func LDt(src MemFunc) {
+ MOVQ(src(8*0), t0_v2)
+ MOVQ(src(8*1), t1_v2)
+ MOVQ(src(8*2), t2_v2)
+ MOVQ(src(8*3), t3_v2)
+}
+
+func ST(dst MemFunc) {
+ MOVQ(acc4_v2, dst(8*0))
+ MOVQ(acc5_v2, dst(8*1))
+ MOVQ(acc6_v2, dst(8*2))
+ MOVQ(acc7_v2, dst(8*3))
+}
+
+func STt(dst MemFunc) {
+ MOVQ(t0_v2, dst(8*0))
+ MOVQ(t1_v2, dst(8*1))
+ MOVQ(t2_v2, dst(8*2))
+ MOVQ(t3_v2, dst(8*3))
+}
+
+func acc2t() {
+ MOVQ(acc4_v2, t0_v2)
+ MOVQ(acc5_v2, t1_v2)
+ MOVQ(acc6_v2, t2_v2)
+ MOVQ(acc7_v2, t3_v2)
+}
+
+func t2acc() {
+ MOVQ(t0_v2, acc4_v2)
+ MOVQ(t1_v2, acc5_v2)
+ MOVQ(t2_v2, acc6_v2)
+ MOVQ(t3_v2, acc7_v2)
+}
+
+/* ---------------------------------------*/
+
+// These functions exist as #define macros in the reference implementation.
+//
+// In the reference assembly, these macros are later undefined and redefined.
+// They are implemented here as versioned functions.
+
+func x1in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*0 + off) }
+func y1in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*1 + off) }
+func z1in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*2 + off) }
+func x2in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*3 + off) }
+func y2in_v1(off int) Mem { return Mem{Base: SP}.Offset(32*4 + off) }
+func xout_v1(off int) Mem { return Mem{Base: SP}.Offset(32*5 + off) }
+func yout_v1(off int) Mem { return Mem{Base: SP}.Offset(32*6 + off) }
+func zout_v1(off int) Mem { return Mem{Base: SP}.Offset(32*7 + off) }
+func s2_v1(off int) Mem { return Mem{Base: SP}.Offset(32*8 + off) }
+func z1sqr_v1(off int) Mem { return Mem{Base: SP}.Offset(32*9 + off) }
+func h_v1(off int) Mem { return Mem{Base: SP}.Offset(32*10 + off) }
+func r_v1(off int) Mem { return Mem{Base: SP}.Offset(32*11 + off) }
+func hsqr_v1(off int) Mem { return Mem{Base: SP}.Offset(32*12 + off) }
+func rsqr_v1(off int) Mem { return Mem{Base: SP}.Offset(32*13 + off) }
+func hcub_v1(off int) Mem { return Mem{Base: SP}.Offset(32*14 + off) }
+
+var (
+ rptr_v1 Mem = Mem{Base: SP}.Offset(32*15 + 0)
+ sel_save_v1 = Mem{Base: SP}.Offset(32*15 + 8)
+ zero_save_v1 = Mem{Base: SP}.Offset(32*15 + 8 + 4)
+)
+
+// Implements:
+//
+// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
+func p256PointAddAffineAsm() {
+ Implement("p256PointAddAffineAsm")
+ AllocLocal(512)
+
+ Load(Param("res"), RAX)
+ Load(Param("in1"), RBX)
+ Load(Param("in2"), RCX)
+ Load(Param("sign"), RDX)
+ Load(Param("sel"), t1_v2)
+ Load(Param("zero"), t2_v2)
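+
+	// Mixed Jacobian-affine addition of in1 and in2 (negated when sign is
+	// nonzero). The final mask-based selection replaces the sum with in1 when
+	// sel == 0 and with in2 (with z = 1) when zero == 0, keeping the whole
+	// routine constant time.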
+
+ MOVOU(Mem{Base: BX}.Offset(16*0), X0)
+ MOVOU(Mem{Base: BX}.Offset(16*1), X1)
+ MOVOU(Mem{Base: BX}.Offset(16*2), X2)
+ MOVOU(Mem{Base: BX}.Offset(16*3), X3)
+ MOVOU(Mem{Base: BX}.Offset(16*4), X4)
+ MOVOU(Mem{Base: BX}.Offset(16*5), X5)
+
+ MOVOU(X0, x1in_v1(16*0))
+ MOVOU(X1, x1in_v1(16*1))
+ MOVOU(X2, y1in_v1(16*0))
+ MOVOU(X3, y1in_v1(16*1))
+ MOVOU(X4, z1in_v1(16*0))
+ MOVOU(X5, z1in_v1(16*1))
+
+ MOVOU(Mem{Base: CX}.Offset(16*0), X0)
+ MOVOU(Mem{Base: CX}.Offset(16*1), X1)
+
+ MOVOU(X0, x2in_v1(16*0))
+ MOVOU(X1, x2in_v1(16*1))
+
+ Comment("Store pointer to result")
+ MOVQ(mul0_v2, rptr_v1)
+
+ // Hack to get Avo to emit:
+ // MOVL t1, sel_save_v1
+ Instruction(&ir.Instruction{
+ Opcode: "MOVL",
+ Operands: []Op{t1_v2, sel_save_v1},
+ })
+
+ // Hack to get Avo to emit:
+ // MOVL t2_v2, zero_save_v1
+ Instruction(&ir.Instruction{
+ Opcode: "MOVL",
+ Operands: []Op{t2_v2, zero_save_v1},
+ })
+
+ Comment("Negate y2in based on sign")
+ MOVQ(Mem{Base: CX}.Offset(16*2+8*0), acc4_v2)
+ MOVQ(Mem{Base: CX}.Offset(16*2+8*1), acc5_v2)
+ MOVQ(Mem{Base: CX}.Offset(16*2+8*2), acc6_v2)
+ MOVQ(Mem{Base: CX}.Offset(16*2+8*3), acc7_v2)
+ MOVQ(I32(-1), acc0_v2)
+ p256const0 := p256const0_DATA()
+ MOVQ(p256const0, acc1_v2)
+ MOVQ(U32(0), acc2_v2)
+ p256const1 := p256const1_DATA()
+ MOVQ(p256const1, acc3_v2)
+ XORQ(mul0_v2, mul0_v2)
+
+ Comment("Speculatively subtract")
+ SUBQ(acc4_v2, acc0_v2)
+ SBBQ(acc5_v2, acc1_v2)
+ SBBQ(acc6_v2, acc2_v2)
+ SBBQ(acc7_v2, acc3_v2)
+ SBBQ(Imm(0), mul0_v2)
+ MOVQ(acc0_v2, t0_v2)
+ MOVQ(acc1_v2, t1_v2)
+ MOVQ(acc2_v2, t2_v2)
+ MOVQ(acc3_v2, t3_v2)
+
+ Comment("Add in case the operand was > p256")
+ ADDQ(I8(-1), acc0_v2)
+ ADCQ(p256const0, acc1_v2)
+ ADCQ(Imm(0), acc2_v2)
+ ADCQ(p256const1, acc3_v2)
+ ADCQ(Imm(0), mul0_v2)
+ CMOVQNE(t0_v2, acc0_v2)
+ CMOVQNE(t1_v2, acc1_v2)
+ CMOVQNE(t2_v2, acc2_v2)
+ CMOVQNE(t3_v2, acc3_v2)
+
+ Comment("If condition is 0, keep original value")
+ TESTQ(RDX, RDX)
+ CMOVQEQ(acc4_v2, acc0_v2)
+ CMOVQEQ(acc5_v2, acc1_v2)
+ CMOVQEQ(acc6_v2, acc2_v2)
+ CMOVQEQ(acc7_v2, acc3_v2)
+
+ Comment("Store result")
+ MOVQ(acc0_v2, y2in_v1(8*0))
+ MOVQ(acc1_v2, y2in_v1(8*1))
+ MOVQ(acc2_v2, y2in_v1(8*2))
+ MOVQ(acc3_v2, y2in_v1(8*3))
+
+ Comment("Begin point add")
+ LDacc(z1in_v1)
+ CALL(LabelRef("p256SqrInternal(SB)")) // z1ˆ2
+ ST(z1sqr_v1)
+
+ LDt(x2in_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // x2 * z1ˆ2
+
+ LDt(x1in_v1)
+	CALL(LabelRef("p256SubInternal(SB)")) // h = u2 - u1
+ ST(h_v1)
+
+ LDt(z1in_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // z3 = h * z1
+ ST(zout_v1)
+
+ LDacc(z1sqr_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // z1ˆ3
+
+ LDt(y2in_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // s2 = y2 * z1ˆ3
+ ST(s2_v1)
+
+ LDt(y1in_v1)
+	CALL(LabelRef("p256SubInternal(SB)")) // r = s2 - s1
+ ST(r_v1)
+
+ CALL(LabelRef("p256SqrInternal(SB)")) // rsqr = rˆ2
+ ST(rsqr_v1)
+
+ LDacc(h_v1)
+ CALL(LabelRef("p256SqrInternal(SB)")) // hsqr = hˆ2
+ ST(hsqr_v1)
+
+ LDt(h_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // hcub = hˆ3
+ ST(hcub_v1)
+
+ LDt(y1in_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // y1 * hˆ3
+ ST(s2_v1)
+
+ LDacc(x1in_v1)
+ LDt(hsqr_v1)
+ CALL(LabelRef("p256MulInternal(SB)")) // u1 * hˆ2
+ ST(h_v1)
+
+ p256MulBy2Inline() // u1 * hˆ2 * 2, inline
+ LDacc(rsqr_v1)
+	CALL(LabelRef("p256SubInternal(SB)")) // rˆ2 - u1 * hˆ2 * 2
+
+ LDt(hcub_v1)
+ CALL(LabelRef("p256SubInternal(SB)"))
+ ST(xout_v1)
+
+ MOVQ(acc4_v2, t0_v2)
+ MOVQ(acc5_v2, t1_v2)
+ MOVQ(acc6_v2, t2_v2)
+ MOVQ(acc7_v2, t3_v2)
+ LDacc(h_v1)
+ CALL(LabelRef("p256SubInternal(SB)"))
+
+ LDt(r_v1)
+ CALL(LabelRef("p256MulInternal(SB)"))
+
+ LDt(s2_v1)
+ CALL(LabelRef("p256SubInternal(SB)"))
+ ST(yout_v1)
+
+ Comment("Load stored values from stack")
+ MOVQ(rptr_v1, RAX)
+ MOVL(sel_save_v1, EBX)
+ MOVL(zero_save_v1, ECX)
+
+ Comment("The result is not valid if (sel == 0), conditional choose")
+ MOVOU(xout_v1(16*0), X0)
+ MOVOU(xout_v1(16*1), X1)
+ MOVOU(yout_v1(16*0), X2)
+ MOVOU(yout_v1(16*1), X3)
+ MOVOU(zout_v1(16*0), X4)
+ MOVOU(zout_v1(16*1), X5)
+
+ // Hack to get Avo to emit:
+ // MOVL BX, X6
+ Instruction(&ir.Instruction{
+ Opcode: "MOVL",
+ Operands: []Op{EBX, X6},
+ })
+
+ // Hack to get Avo to emit:
+ // MOVL CX, X7
+ Instruction(&ir.Instruction{
+ Opcode: "MOVL",
+ Operands: []Op{ECX, X7},
+ })
+
+ PXOR(X8, X8)
+ PCMPEQL(X9, X9)
+
+ PSHUFD(Imm(0), X6, X6)
+ PSHUFD(Imm(0), X7, X7)
+
+ PCMPEQL(X8, X6)
+ PCMPEQL(X8, X7)
+
+ MOVOU(X6, X15)
+ PANDN(X9, X15)
+
+ MOVOU(x1in_v1(16*0), X9)
+ MOVOU(x1in_v1(16*1), X10)
+ MOVOU(y1in_v1(16*0), X11)
+ MOVOU(y1in_v1(16*1), X12)
+ MOVOU(z1in_v1(16*0), X13)
+ MOVOU(z1in_v1(16*1), X14)
+
+ PAND(X15, X0)
+ PAND(X15, X1)
+ PAND(X15, X2)
+ PAND(X15, X3)
+ PAND(X15, X4)
+ PAND(X15, X5)
+
+ PAND(X6, X9)
+ PAND(X6, X10)
+ PAND(X6, X11)
+ PAND(X6, X12)
+ PAND(X6, X13)
+ PAND(X6, X14)
+
+ PXOR(X9, X0)
+ PXOR(X10, X1)
+ PXOR(X11, X2)
+ PXOR(X12, X3)
+ PXOR(X13, X4)
+ PXOR(X14, X5)
+
+ Comment("Similarly if zero == 0")
+ PCMPEQL(X9, X9)
+ MOVOU(X7, X15)
+ PANDN(X9, X15)
+
+ MOVOU(x2in_v1(16*0), X9)
+ MOVOU(x2in_v1(16*1), X10)
+ MOVOU(y2in_v1(16*0), X11)
+ MOVOU(y2in_v1(16*1), X12)
+ p256one := p256one_DATA()
+ MOVOU(p256one.Offset(0x00), X13)
+ MOVOU(p256one.Offset(0x10), X14)
+
+ PAND(X15, X0)
+ PAND(X15, X1)
+ PAND(X15, X2)
+ PAND(X15, X3)
+ PAND(X15, X4)
+ PAND(X15, X5)
+
+ PAND(X7, X9)
+ PAND(X7, X10)
+ PAND(X7, X11)
+ PAND(X7, X12)
+ PAND(X7, X13)
+ PAND(X7, X14)
+
+ PXOR(X9, X0)
+ PXOR(X10, X1)
+ PXOR(X11, X2)
+ PXOR(X12, X3)
+ PXOR(X13, X4)
+ PXOR(X14, X5)
+
+ Comment("Finally output the result")
+ MOVOU(X0, Mem{Base: AX}.Offset(16*0))
+ MOVOU(X1, Mem{Base: AX}.Offset(16*1))
+ MOVOU(X2, Mem{Base: AX}.Offset(16*2))
+ MOVOU(X3, Mem{Base: AX}.Offset(16*3))
+ MOVOU(X4, Mem{Base: AX}.Offset(16*4))
+ MOVOU(X5, Mem{Base: AX}.Offset(16*5))
+ MOVQ(U32(0), rptr_v1)
+
+ RET()
+}
+
+// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
+// otherwise. It writes to [acc4..acc7], t0 and t1.
+func p256IsZero() {
+ Function("p256IsZero")
+ Attributes(NOSPLIT)
+
+ Comment("AX contains a flag that is set if the input is zero.")
+ XORQ(RAX, RAX)
+ MOVQ(U32(1), t1_v2)
+
+ Comment("Check whether [acc4..acc7] are all zero.")
+ MOVQ(acc4_v2, t0_v2)
+ ORQ(acc5_v2, t0_v2)
+ ORQ(acc6_v2, t0_v2)
+ ORQ(acc7_v2, t0_v2)
+
+ Comment("Set the zero flag if so. (CMOV of a constant to a register doesn't")
+ Comment("appear to be supported in Go. Thus t1 = 1.)")
+ CMOVQEQ(t1_v2, RAX)
+
+ Comment("XOR [acc4..acc7] with P and compare with zero again.")
+ XORQ(I8(-1), acc4_v2)
+ p256const0 := p256const0_DATA()
+ XORQ(p256const0, acc5_v2)
+ p256const1 := p256const1_DATA()
+ XORQ(p256const1, acc7_v2)
+ ORQ(acc5_v2, acc4_v2)
+ ORQ(acc6_v2, acc4_v2)
+ ORQ(acc7_v2, acc4_v2)
+
+ Comment("Set the zero flag if so.")
+ CMOVQEQ(t1_v2, RAX)
+ RET()
+}
+
+func x1in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*0 + off) }
+func y1in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*1 + off) }
+func z1in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*2 + off) }
+func x2in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*3 + off) }
+func y2in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*4 + off) }
+func z2in_v2(off int) Mem { return Mem{Base: SP}.Offset(32*5 + off) }
+
+func xout_v2(off int) Mem { return Mem{Base: SP}.Offset(32*6 + off) }
+func yout_v2(off int) Mem { return Mem{Base: SP}.Offset(32*7 + off) }
+func zout_v2(off int) Mem { return Mem{Base: SP}.Offset(32*8 + off) }
+
+func u1_v2(off int) Mem { return Mem{Base: SP}.Offset(32*9 + off) }
+func u2_v2(off int) Mem { return Mem{Base: SP}.Offset(32*10 + off) }
+func s1_v2(off int) Mem { return Mem{Base: SP}.Offset(32*11 + off) }
+func s2_v2(off int) Mem { return Mem{Base: SP}.Offset(32*12 + off) }
+func z1sqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*13 + off) }
+func z2sqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*14 + off) }
+func h_v2(off int) Mem { return Mem{Base: SP}.Offset(32*15 + off) }
+func r_v2(off int) Mem { return Mem{Base: SP}.Offset(32*16 + off) }
+func hsqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*17 + off) }
+func rsqr_v2(off int) Mem { return Mem{Base: SP}.Offset(32*18 + off) }
+func hcub_v2(off int) Mem { return Mem{Base: SP}.Offset(32*19 + off) }
+
+var (
+ rptr_v2 Mem = Mem{Base: SP}.Offset(32 * 20)
+ points_eq_v2 = Mem{Base: SP}.Offset(32*20 + 8)
+)
+
+// Implements:
+//
+// func p256PointAddAsm(res, in1, in2 *P256Point) int
+//
+// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+func p256PointAddAsm() {
+ Implement("p256PointAddAsm")
+ AllocLocal(680)
+
+ Comment("Move input to stack in order to free registers")
+ Load(Param("res"), RAX)
+ Load(Param("in1"), RBX)
+ Load(Param("in2"), RCX)
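+
+	// Jacobian addition following the add-2007-bl formulas linked above. The
+	// return value is 1 when r == 0 and h == 0, i.e. when the two inputs were
+	// the same point; the caller must handle that case with a doubling
+	// instead.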
+
+ MOVOU(Mem{Base: BX}.Offset(16*0), X0)
+ MOVOU(Mem{Base: BX}.Offset(16*1), X1)
+ MOVOU(Mem{Base: BX}.Offset(16*2), X2)
+ MOVOU(Mem{Base: BX}.Offset(16*3), X3)
+ MOVOU(Mem{Base: BX}.Offset(16*4), X4)
+ MOVOU(Mem{Base: BX}.Offset(16*5), X5)
+
+ MOVOU(X0, x1in_v2(16*0))
+ MOVOU(X1, x1in_v2(16*1))
+ MOVOU(X2, y1in_v2(16*0))
+ MOVOU(X3, y1in_v2(16*1))
+ MOVOU(X4, z1in_v2(16*0))
+ MOVOU(X5, z1in_v2(16*1))
+
+ MOVOU(Mem{Base: CX}.Offset(16*0), X0)
+ MOVOU(Mem{Base: CX}.Offset(16*1), X1)
+ MOVOU(Mem{Base: CX}.Offset(16*2), X2)
+ MOVOU(Mem{Base: CX}.Offset(16*3), X3)
+ MOVOU(Mem{Base: CX}.Offset(16*4), X4)
+ MOVOU(Mem{Base: CX}.Offset(16*5), X5)
+
+ MOVOU(X0, x2in_v2(16*0))
+ MOVOU(X1, x2in_v2(16*1))
+ MOVOU(X2, y2in_v2(16*0))
+ MOVOU(X3, y2in_v2(16*1))
+ MOVOU(X4, z2in_v2(16*0))
+ MOVOU(X5, z2in_v2(16*1))
+
+ Comment("Store pointer to result")
+ MOVQ(RAX, rptr_v2)
+
+ Comment("Begin point add")
+ LDacc(z2in_v2)
+ CALL(LabelRef("p256SqrInternal(SB)")) // z2ˆ2
+ ST(z2sqr_v2)
+ LDt(z2in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // z2ˆ3
+ LDt(y1in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // s1 = z2ˆ3*y1
+ ST(s1_v2)
+
+ LDacc(z1in_v2)
+ CALL(LabelRef("p256SqrInternal(SB)")) // z1ˆ2
+ ST(z1sqr_v2)
+ LDt(z1in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // z1ˆ3
+ LDt(y2in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // s2 = z1ˆ3*y2
+ ST(s2_v2)
+
+ LDt(s1_v2)
+ CALL(LabelRef("p256SubInternal(SB)")) // r = s2 - s1
+ ST(r_v2)
+ CALL(LabelRef("p256IsZero(SB)"))
+ MOVQ(RAX, points_eq_v2)
+
+ LDacc(z2sqr_v2)
+ LDt(x1in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // u1 = x1 * z2ˆ2
+ ST(u1_v2)
+ LDacc(z1sqr_v2)
+ LDt(x2in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // u2 = x2 * z1ˆ2
+ ST(u2_v2)
+
+ LDt(u1_v2)
+ CALL(LabelRef("p256SubInternal(SB)")) // h = u2 - u1
+ ST(h_v2)
+ CALL(LabelRef("p256IsZero(SB)"))
+ ANDQ(points_eq_v2, RAX)
+ MOVQ(RAX, points_eq_v2)
+
+ LDacc(r_v2)
+ CALL(LabelRef("p256SqrInternal(SB)")) // rsqr = rˆ2
+ ST(rsqr_v2)
+
+ LDacc(h_v2)
+ CALL(LabelRef("p256SqrInternal(SB)")) // hsqr = hˆ2
+ ST(hsqr_v2)
+
+ LDt(h_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // hcub = hˆ3
+ ST(hcub_v2)
+
+ LDt(s1_v2)
+ CALL(LabelRef("p256MulInternal(SB)"))
+ ST(s2_v2)
+
+ LDacc(z1in_v2)
+ LDt(z2in_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // z1 * z2
+ LDt(h_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // z1 * z2 * h
+ ST(zout_v2)
+
+ LDacc(hsqr_v2)
+ LDt(u1_v2)
+ CALL(LabelRef("p256MulInternal(SB)")) // hˆ2 * u1
+ ST(u2_v2)
+
+ p256MulBy2Inline() // u1 * hˆ2 * 2, inline
+ LDacc(rsqr_v2)
+ CALL(LabelRef("p256SubInternal(SB)")) // rˆ2 - u1 * hˆ2 * 2
+
+ LDt(hcub_v2)
+ CALL(LabelRef("p256SubInternal(SB)"))
+ ST(xout_v2)
+
+ MOVQ(acc4_v2, t0_v2)
+ MOVQ(acc5_v2, t1_v2)
+ MOVQ(acc6_v2, t2_v2)
+ MOVQ(acc7_v2, t3_v2)
+ LDacc(u2_v2)
+ CALL(LabelRef("p256SubInternal(SB)"))
+
+ LDt(r_v2)
+ CALL(LabelRef("p256MulInternal(SB)"))
+
+ LDt(s2_v2)
+ CALL(LabelRef("p256SubInternal(SB)"))
+ ST(yout_v2)
+
+ MOVOU(xout_v2(16*0), X0)
+ MOVOU(xout_v2(16*1), X1)
+ MOVOU(yout_v2(16*0), X2)
+ MOVOU(yout_v2(16*1), X3)
+ MOVOU(zout_v2(16*0), X4)
+ MOVOU(zout_v2(16*1), X5)
+
+ Comment("Finally output the result")
+ MOVQ(rptr_v2, RAX)
+ MOVQ(U32(0), rptr_v2)
+ MOVOU(X0, Mem{Base: AX}.Offset(16*0))
+ MOVOU(X1, Mem{Base: AX}.Offset(16*1))
+ MOVOU(X2, Mem{Base: AX}.Offset(16*2))
+ MOVOU(X3, Mem{Base: AX}.Offset(16*3))
+ MOVOU(X4, Mem{Base: AX}.Offset(16*4))
+ MOVOU(X5, Mem{Base: AX}.Offset(16*5))
+
+ MOVQ(points_eq_v2, RAX)
+ ret := NewParamAddr("ret", 24)
+ MOVQ(RAX, ret)
+
+ RET()
+}
+
+func x(off int) Mem { return Mem{Base: SP}.Offset(32*0 + off) }
+func y(off int) Mem { return Mem{Base: SP}.Offset(32*1 + off) }
+func z(off int) Mem { return Mem{Base: SP}.Offset(32*2 + off) }
+
+func s(off int) Mem { return Mem{Base: SP}.Offset(32*3 + off) }
+func m(off int) Mem { return Mem{Base: SP}.Offset(32*4 + off) }
+func zsqr(off int) Mem { return Mem{Base: SP}.Offset(32*5 + off) }
+func tmp(off int) Mem { return Mem{Base: SP}.Offset(32*6 + off) }
+
+var rptr_v3 = Mem{Base: SP}.Offset(32 * 7)
+
+// Implements:
+//
+// func p256PointDoubleAsm(res, in *P256Point)
+func p256PointDoubleAsm() {
+ Implement("p256PointDoubleAsm")
+ Attributes(NOSPLIT)
+ AllocLocal(256)
+
+ Load(Param("res"), RAX)
+ Load(Param("in"), RBX)
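+
+	// Jacobian doubling specialized for a = -3:
+	//   M  = 3*(X - Z^2)*(X + Z^2)
+	//   S  = 4*X*Y^2
+	//   X' = M^2 - 2*S
+	//   Y' = M*(S - X') - 8*Y^4
+	//   Z' = 2*Y*Z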
+
+ MOVOU(Mem{Base: BX}.Offset(16*0), X0)
+ MOVOU(Mem{Base: BX}.Offset(16*1), X1)
+ MOVOU(Mem{Base: BX}.Offset(16*2), X2)
+ MOVOU(Mem{Base: BX}.Offset(16*3), X3)
+ MOVOU(Mem{Base: BX}.Offset(16*4), X4)
+ MOVOU(Mem{Base: BX}.Offset(16*5), X5)
+
+ MOVOU(X0, x(16*0))
+ MOVOU(X1, x(16*1))
+ MOVOU(X2, y(16*0))
+ MOVOU(X3, y(16*1))
+ MOVOU(X4, z(16*0))
+ MOVOU(X5, z(16*1))
+
+ Comment("Store pointer to result")
+ MOVQ(RAX, rptr_v3)
+
+ Comment("Begin point double")
+ LDacc(z)
+ CALL(LabelRef("p256SqrInternal(SB)"))
+ ST(zsqr)
+
+ LDt(x)
+ p256AddInline() // x + zˆ2
+ STt(m)
+
+ LDacc(z)
+ LDt(y)
+ CALL(LabelRef("p256MulInternal(SB)"))
+ p256MulBy2Inline()
+ MOVQ(rptr_v3, RAX)
+
+ Comment("Store z")
+ MOVQ(t0_v2, Mem{Base: AX}.Offset(16*4+8*0))
+ MOVQ(t1_v2, Mem{Base: AX}.Offset(16*4+8*1))
+ MOVQ(t2_v2, Mem{Base: AX}.Offset(16*4+8*2))
+ MOVQ(t3_v2, Mem{Base: AX}.Offset(16*4+8*3))
+
+ LDacc(x)
+ LDt(zsqr)
+ CALL(LabelRef("p256SubInternal(SB)")) // x - zˆ2
+ LDt(m)
+ CALL(LabelRef("p256MulInternal(SB)")) // m = (x - zˆ2) * (x + zˆ2)
+ ST(m)
+
+ Comment("Multiply by 3")
+ p256MulBy2Inline()
+ LDacc(m)
+ p256AddInline()
+ STt(m)
+ Comment("////////////////////////")
+ LDacc(y)
+ p256MulBy2Inline() // 2*y
+ t2acc()
+ CALL(LabelRef("p256SqrInternal(SB)")) // 4*yˆ2
+ ST(s)
+ CALL(LabelRef("p256SqrInternal(SB)")) // 16*yˆ4
+
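+ // The halving below is constant time: p is speculatively added so that an odd value
+ // becomes even, the original limbs are kept (via CMOVQEQ) when the value was already
+ // even, and the carry plus the four limbs are then shifted right by one bit.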
+ Comment("Divide by 2")
+ XORQ(mul0_v2, mul0_v2)
+ MOVQ(acc4_v2, t0_v2)
+ MOVQ(acc5_v2, t1_v2)
+ MOVQ(acc6_v2, t2_v2)
+ MOVQ(acc7_v2, t3_v2)
+
+ ADDQ(I8(-1), acc4_v2)
+ p256const0 := p256const0_DATA()
+ ADCQ(p256const0, acc5_v2)
+ ADCQ(Imm(0), acc6_v2)
+ p256const1 := p256const1_DATA()
+ ADCQ(p256const1, acc7_v2)
+ ADCQ(Imm(0), mul0_v2)
+ TESTQ(U32(1), t0_v2)
+
+ CMOVQEQ(t0_v2, acc4_v2)
+ CMOVQEQ(t1_v2, acc5_v2)
+ CMOVQEQ(t2_v2, acc6_v2)
+ CMOVQEQ(t3_v2, acc7_v2)
+ ANDQ(t0_v2, mul0_v2)
+
+ SHRQ(Imm(1), acc5_v2, acc4_v2)
+ SHRQ(Imm(1), acc6_v2, acc5_v2)
+ SHRQ(Imm(1), acc7_v2, acc6_v2)
+ SHRQ(Imm(1), mul0_v2, acc7_v2)
+ ST(y)
+ Comment("/////////////////////////")
+ LDacc(x)
+ LDt(s)
+ CALL(LabelRef("p256MulInternal(SB)")) // x * 4*yˆ2
+ ST(s)
+ p256MulBy2Inline() // 8*x*yˆ2
+ STt(tmp)
+
+ LDacc(m)
+ CALL(LabelRef("p256SqrInternal(SB)")) // mˆ2
+ LDt(tmp)
+ CALL(LabelRef("p256SubInternal(SB)")) // xout = mˆ2 - 8*x*yˆ2
+
+ MOVQ(rptr_v3, RAX)
+
+ Comment("Store x")
+ MOVQ(acc4_v2, Mem{Base: AX}.Offset(16*0+8*0))
+ MOVQ(acc5_v2, Mem{Base: AX}.Offset(16*0+8*1))
+ MOVQ(acc6_v2, Mem{Base: AX}.Offset(16*0+8*2))
+ MOVQ(acc7_v2, Mem{Base: AX}.Offset(16*0+8*3))
+
+ acc2t()
+ LDacc(s)
+ CALL(LabelRef("p256SubInternal(SB)")) // s - xout
+
+ LDt(m)
+ CALL(LabelRef("p256MulInternal(SB)")) // m * (s - xout)
+
+ LDt(y)
+ CALL(LabelRef("p256SubInternal(SB)")) // yout = m * (s - xout) - 8*yˆ4
+ MOVQ(rptr_v3, RAX)
+
+ Comment("Store y")
+ MOVQ(acc4_v2, Mem{Base: AX}.Offset(16*2+8*0))
+ MOVQ(acc5_v2, Mem{Base: AX}.Offset(16*2+8*1))
+ MOVQ(acc6_v2, Mem{Base: AX}.Offset(16*2+8*2))
+ MOVQ(acc7_v2, Mem{Base: AX}.Offset(16*2+8*3))
+ Comment("///////////////////////")
+ MOVQ(U32(0), rptr_v3)
+
+ RET()
+}
+
+// #----------------------------DATA SECTION-----------------------------------#
+
+// Pointers for memoizing DATA section symbols, so each GLOBL/DATA block is emitted
+// only once even though several functions reference the same constant.
+var p256const0_ptr, p256const1_ptr, p256ordK0_ptr, p256ord_ptr, p256one_ptr *Mem
+
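+// p256const0 and p256const1 hold the second and fourth 64-bit limbs of the field
+// prime p; the remaining limbs (all ones and zero) are handled with immediates.
+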
+func p256const0_DATA() Mem {
+ if p256const0_ptr != nil {
+ return *p256const0_ptr
+ }
+
+ p256const0 := GLOBL("p256const0", 8)
+ p256const0_ptr = &p256const0
+ DATA(0, U64(0x00000000ffffffff))
+ return p256const0
+}
+
+func p256const1_DATA() Mem {
+ if p256const1_ptr != nil {
+ return *p256const1_ptr
+ }
+
+ p256const1 := GLOBL("p256const1", 8)
+ p256const1_ptr = &p256const1
+ DATA(0, U64(0xffffffff00000001))
+ return p256const1
+}
+
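+// p256ordK0 is the Montgomery reduction constant used in the reduction steps of
+// p256OrdMul and p256OrdSqr.
+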
+func p256ordK0_DATA() Mem {
+ if p256ordK0_ptr != nil {
+ return *p256ordK0_ptr
+ }
+
+ p256ordK0 := GLOBL("p256ordK0", 8)
+ p256ordK0_ptr = &p256ordK0
+ DATA(0, U64(0xccd1c8aaee00bc4f))
+ return p256ordK0
+}
+
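+// p256ordConstants is the order of the P-256 base point as little-endian 64-bit limbs.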
+var p256ordConstants = [4]uint64{
+ 0xf3b9cac2fc632551,
+ 0xbce6faada7179e84,
+ 0xffffffffffffffff,
+ 0xffffffff00000000,
+}
+
+func p256ord_DATA() Mem {
+ if p256ord_ptr != nil {
+ return *p256ord_ptr
+ }
+
+ p256ord := GLOBL("p256ord", 8)
+ p256ord_ptr = &p256ord
+
+ for i, k := range p256ordConstants {
+ DATA(i*8, U64(k))
+ }
+
+ return p256ord
+}
+
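+// p256oneConstants is 1 in the Montgomery domain (2^256 mod p) as little-endian 64-bit limbs.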
+var p256oneConstants = [4]uint64{
+ 0x0000000000000001,
+ 0xffffffff00000000,
+ 0xffffffffffffffff,
+ 0x00000000fffffffe,
+}
+
+func p256one_DATA() Mem {
+ if p256one_ptr != nil {
+ return *p256one_ptr
+ }
+
+ p256one := GLOBL("p256one", 8)
+ p256one_ptr = &p256one
+
+ for i, k := range p256oneConstants {
+ DATA(i*8, U64(k))
+ }
+
+ return p256one
+}
+
+const ThatPeskyUnicodeDot = "\u00b7"
+
+// removePeskyUnicodeDot strips the dot from the relevant TEXT directives so that they
+// can exist as internal assembly functions.
+//
+// Avo v0.6.0 does not support the generation of internal assembly functions. Go's unicode
+// dot tells the compiler to link a TEXT symbol to a function in the current Go package
+// (or another package if specified). Avo unconditionally prepends the unicode dot to all
+// TEXT symbols, making it impossible to emit an internal function without this hack.
+//
+// There is a pending PR to add internal functions to Avo:
+// https://github.com/mmcloughlin/avo/pull/443
+//
+// If merged, it should allow the use of InternalFunction("NAME") for the specified functions.
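+//
+// For example, after this rewrite the generated file contains
+// "TEXT p256SubInternal(SB), NOSPLIT, $0" instead of "TEXT ·p256SubInternal(SB), NOSPLIT, $0".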
+func removePeskyUnicodeDot(internalFunctions []string, target string) {
+ bytes, err := os.ReadFile(target)
+ if err != nil {
+ panic(err)
+ }
+
+ content := string(bytes)
+
+ for _, from := range internalFunctions {
+ to := strings.ReplaceAll(from, ThatPeskyUnicodeDot, "")
+ content = strings.ReplaceAll(content, from, to)
+ }
+
+ err = os.WriteFile(target, []byte(content), 0644)
+ if err != nil {
+ panic(err)
+ }
+}
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run p256_asm_amd64.go -out ../p256_asm_amd64.s -pkg nistec. DO NOT EDIT.
//go:build !purego
-// This file contains constant-time, 64-bit assembly implementation of
-// P256. The optimizations performed here are described in detail in:
-// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
-// 256-bit primes"
-// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
-// https://eprint.iacr.org/2013/816.pdf
-
#include "textflag.h"
-#define res_ptr DI
-#define x_ptr SI
-#define y_ptr CX
-
-#define acc0 R8
-#define acc1 R9
-#define acc2 R10
-#define acc3 R11
-#define acc4 R12
-#define acc5 R13
-#define t0 R14
-#define t1 R15
-
-DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
-DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
-DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
-DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
-DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
-DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
-DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
-DATA p256one<>+0x00(SB)/8, $0x0000000000000001
-DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
-DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
-DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
-GLOBL p256const0<>(SB), 8, $8
-GLOBL p256const1<>(SB), 8, $8
-GLOBL p256ordK0<>(SB), 8, $8
-GLOBL p256ord<>(SB), 8, $32
-GLOBL p256one<>(SB), 8, $32
-
-/* ---------------------------------------*/
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
-TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
+TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
JMP ·p256BigToLittle(SB)
-/* ---------------------------------------*/
+
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
-TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
+TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
JMP ·p256BigToLittle(SB)
-/* ---------------------------------------*/
+
// func p256LittleToBig(res *[32]byte, in *p256Element)
-TEXT ·p256LittleToBig(SB),NOSPLIT,$0
+TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
JMP ·p256BigToLittle(SB)
-/* ---------------------------------------*/
-// func p256BigToLittle(res *p256Element, in *[32]byte)
-TEXT ·p256BigToLittle(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ in+8(FP), x_ptr
-
- MOVQ (8*0)(x_ptr), acc0
- MOVQ (8*1)(x_ptr), acc1
- MOVQ (8*2)(x_ptr), acc2
- MOVQ (8*3)(x_ptr), acc3
-
- BSWAPQ acc0
- BSWAPQ acc1
- BSWAPQ acc2
- BSWAPQ acc3
-
- MOVQ acc3, (8*0)(res_ptr)
- MOVQ acc2, (8*1)(res_ptr)
- MOVQ acc1, (8*2)(res_ptr)
- MOVQ acc0, (8*3)(res_ptr)
+// func p256BigToLittle(res *p256Element, in *[32]byte)
+TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
+ MOVQ res+0(FP), DI
+ MOVQ in+8(FP), SI
+ MOVQ (SI), R8
+ MOVQ 8(SI), R9
+ MOVQ 16(SI), R10
+ MOVQ 24(SI), R11
+ BSWAPQ R8
+ BSWAPQ R9
+ BSWAPQ R10
+ BSWAPQ R11
+ MOVQ R11, (DI)
+ MOVQ R10, 8(DI)
+ MOVQ R9, 16(DI)
+ MOVQ R8, 24(DI)
RET
-/* ---------------------------------------*/
-// func p256MovCond(res, a, b *P256Point, cond int)
-TEXT ·p256MovCond(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ a+8(FP), x_ptr
- MOVQ b+16(FP), y_ptr
- MOVQ cond+24(FP), X12
-
- PXOR X13, X13
- PSHUFD $0, X12, X12
- PCMPEQL X13, X12
-
- MOVOU X12, X0
- MOVOU (16*0)(x_ptr), X6
- PANDN X6, X0
- MOVOU X12, X1
- MOVOU (16*1)(x_ptr), X7
- PANDN X7, X1
- MOVOU X12, X2
- MOVOU (16*2)(x_ptr), X8
- PANDN X8, X2
- MOVOU X12, X3
- MOVOU (16*3)(x_ptr), X9
- PANDN X9, X3
- MOVOU X12, X4
- MOVOU (16*4)(x_ptr), X10
- PANDN X10, X4
- MOVOU X12, X5
- MOVOU (16*5)(x_ptr), X11
- PANDN X11, X5
-
- MOVOU (16*0)(y_ptr), X6
- MOVOU (16*1)(y_ptr), X7
- MOVOU (16*2)(y_ptr), X8
- MOVOU (16*3)(y_ptr), X9
- MOVOU (16*4)(y_ptr), X10
- MOVOU (16*5)(y_ptr), X11
-
- PAND X12, X6
- PAND X12, X7
- PAND X12, X8
- PAND X12, X9
- PAND X12, X10
- PAND X12, X11
-
- PXOR X6, X0
- PXOR X7, X1
- PXOR X8, X2
- PXOR X9, X3
- PXOR X10, X4
- PXOR X11, X5
-
- MOVOU X0, (16*0)(res_ptr)
- MOVOU X1, (16*1)(res_ptr)
- MOVOU X2, (16*2)(res_ptr)
- MOVOU X3, (16*3)(res_ptr)
- MOVOU X4, (16*4)(res_ptr)
- MOVOU X5, (16*5)(res_ptr)
+// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
+// Requires: SSE2
+TEXT ·p256MovCond(SB), NOSPLIT, $0-32
+ MOVQ res+0(FP), DI
+ MOVQ a+8(FP), SI
+ MOVQ b+16(FP), CX
+ MOVQ cond+24(FP), X12
+ PXOR X13, X13
+ PSHUFD $0x00, X12, X12
+ PCMPEQL X13, X12
+ MOVOU X12, X0
+ MOVOU (SI), X6
+ PANDN X6, X0
+ MOVOU X12, X1
+ MOVOU 16(SI), X7
+ PANDN X7, X1
+ MOVOU X12, X2
+ MOVOU 32(SI), X8
+ PANDN X8, X2
+ MOVOU X12, X3
+ MOVOU 48(SI), X9
+ PANDN X9, X3
+ MOVOU X12, X4
+ MOVOU 64(SI), X10
+ PANDN X10, X4
+ MOVOU X12, X5
+ MOVOU 80(SI), X11
+ PANDN X11, X5
+ MOVOU (CX), X6
+ MOVOU 16(CX), X7
+ MOVOU 32(CX), X8
+ MOVOU 48(CX), X9
+ MOVOU 64(CX), X10
+ MOVOU 80(CX), X11
+ PAND X12, X6
+ PAND X12, X7
+ PAND X12, X8
+ PAND X12, X9
+ PAND X12, X10
+ PAND X12, X11
+ PXOR X6, X0
+ PXOR X7, X1
+ PXOR X8, X2
+ PXOR X9, X3
+ PXOR X10, X4
+ PXOR X11, X5
+ MOVOU X0, (DI)
+ MOVOU X1, 16(DI)
+ MOVOU X2, 32(DI)
+ MOVOU X3, 48(DI)
+ MOVOU X4, 64(DI)
+ MOVOU X5, 80(DI)
RET
-/* ---------------------------------------*/
+
// func p256NegCond(val *p256Element, cond int)
-TEXT ·p256NegCond(SB),NOSPLIT,$0
- MOVQ val+0(FP), res_ptr
- MOVQ cond+8(FP), t0
+// Requires: CMOV
+TEXT ·p256NegCond(SB), NOSPLIT, $0-16
+ MOVQ val+0(FP), DI
+ MOVQ cond+8(FP), R14
+
// acc = poly
- MOVQ $-1, acc0
- MOVQ p256const0<>(SB), acc1
- MOVQ $0, acc2
- MOVQ p256const1<>(SB), acc3
+ MOVQ $-1, R8
+ MOVQ p256const0<>+0(SB), R9
+ MOVQ $+0, R10
+ MOVQ p256const1<>+0(SB), R11
+
// Load the original value
- MOVQ (8*0)(res_ptr), acc5
- MOVQ (8*1)(res_ptr), x_ptr
- MOVQ (8*2)(res_ptr), y_ptr
- MOVQ (8*3)(res_ptr), t1
+ MOVQ (DI), R13
+ MOVQ 8(DI), SI
+ MOVQ 16(DI), CX
+ MOVQ 24(DI), R15
+
// Speculatively subtract
- SUBQ acc5, acc0
- SBBQ x_ptr, acc1
- SBBQ y_ptr, acc2
- SBBQ t1, acc3
+ SUBQ R13, R8
+ SBBQ SI, R9
+ SBBQ CX, R10
+ SBBQ R15, R11
+
// If condition is 0, keep original value
- TESTQ t0, t0
- CMOVQEQ acc5, acc0
- CMOVQEQ x_ptr, acc1
- CMOVQEQ y_ptr, acc2
- CMOVQEQ t1, acc3
- // Store result
- MOVQ acc0, (8*0)(res_ptr)
- MOVQ acc1, (8*1)(res_ptr)
- MOVQ acc2, (8*2)(res_ptr)
- MOVQ acc3, (8*3)(res_ptr)
+ TESTQ R14, R14
+ CMOVQEQ R13, R8
+ CMOVQEQ SI, R9
+ CMOVQEQ CX, R10
+ CMOVQEQ R15, R11
+ // Store result
+ MOVQ R8, (DI)
+ MOVQ R9, 8(DI)
+ MOVQ R10, 16(DI)
+ MOVQ R11, 24(DI)
RET
-/* ---------------------------------------*/
-// func p256Sqr(res, in *p256Element, n int)
-TEXT ·p256Sqr(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ in+8(FP), x_ptr
+
+DATA p256const0<>+0(SB)/8, $0x00000000ffffffff
+GLOBL p256const0<>(SB), RODATA, $8
+
+DATA p256const1<>+0(SB)/8, $0xffffffff00000001
+GLOBL p256const1<>(SB), RODATA, $8
+
+// func p256Sqr(res *p256Element, in *p256Element, n int)
+// Requires: CMOV
+TEXT ·p256Sqr(SB), NOSPLIT, $0-24
+ MOVQ res+0(FP), DI
+ MOVQ in+8(FP), SI
MOVQ n+16(FP), BX
sqrLoop:
-
// y[1:] * y[0]
- MOVQ (8*0)(x_ptr), t0
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- MOVQ AX, acc1
- MOVQ DX, acc2
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, acc3
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, acc4
+ MOVQ (SI), R14
+ MOVQ 8(SI), AX
+ MULQ R14
+ MOVQ AX, R9
+ MOVQ DX, R10
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R11
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R12
+
// y[2:] * y[1]
- MOVQ (8*1)(x_ptr), t0
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, acc5
+ MOVQ 8(SI), R14
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R13
+
// y[3] * y[2]
- MOVQ (8*2)(x_ptr), t0
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc5
- ADCQ $0, DX
- MOVQ DX, y_ptr
- XORQ t1, t1
+ MOVQ 16(SI), R14
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ AX, R13
+ ADCQ $0x00, DX
+ MOVQ DX, CX
+ XORQ R15, R15
+
// *2
- ADDQ acc1, acc1
- ADCQ acc2, acc2
- ADCQ acc3, acc3
- ADCQ acc4, acc4
- ADCQ acc5, acc5
- ADCQ y_ptr, y_ptr
- ADCQ $0, t1
+ ADDQ R9, R9
+ ADCQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ CX, CX
+ ADCQ $0x00, R15
+
// Missing products
- MOVQ (8*0)(x_ptr), AX
+ MOVQ (SI), AX
MULQ AX
- MOVQ AX, acc0
- MOVQ DX, t0
-
- MOVQ (8*1)(x_ptr), AX
+ MOVQ AX, R8
+ MOVQ DX, R14
+ MOVQ 8(SI), AX
MULQ AX
- ADDQ t0, acc1
- ADCQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t0
-
- MOVQ (8*2)(x_ptr), AX
+ ADDQ R14, R9
+ ADCQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R14
+ MOVQ 16(SI), AX
MULQ AX
- ADDQ t0, acc3
- ADCQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t0
-
- MOVQ (8*3)(x_ptr), AX
+ ADDQ R14, R11
+ ADCQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R14
+ MOVQ 24(SI), AX
MULQ AX
- ADDQ t0, acc5
- ADCQ AX, y_ptr
- ADCQ DX, t1
- MOVQ t1, x_ptr
+ ADDQ R14, R13
+ ADCQ AX, CX
+ ADCQ DX, R15
+ MOVQ R15, SI
+
// First reduction step
- MOVQ acc0, AX
- MOVQ acc0, t1
- SHLQ $32, acc0
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc0, acc1
- ADCQ t1, acc2
- ADCQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, acc0
+ MOVQ R8, AX
+ MOVQ R8, R15
+ SHLQ $0x20, R8
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R8, R9
+ ADCQ R15, R10
+ ADCQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+
// Second reduction step
- MOVQ acc1, AX
- MOVQ acc1, t1
- SHLQ $32, acc1
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc1, acc2
- ADCQ t1, acc3
- ADCQ AX, acc0
- ADCQ $0, DX
- MOVQ DX, acc1
+ MOVQ R9, AX
+ MOVQ R9, R15
+ SHLQ $0x20, R9
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R9, R10
+ ADCQ R15, R11
+ ADCQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R9
+
// Third reduction step
- MOVQ acc2, AX
- MOVQ acc2, t1
- SHLQ $32, acc2
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc2, acc3
- ADCQ t1, acc0
- ADCQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, acc2
+ MOVQ R10, AX
+ MOVQ R10, R15
+ SHLQ $0x20, R10
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R10, R11
+ ADCQ R15, R8
+ ADCQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+
// Last reduction step
- XORQ t0, t0
- MOVQ acc3, AX
- MOVQ acc3, t1
- SHLQ $32, acc3
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc3, acc0
- ADCQ t1, acc1
- ADCQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, acc3
+ XORQ R14, R14
+ MOVQ R11, AX
+ MOVQ R11, R15
+ SHLQ $0x20, R11
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R11, R8
+ ADCQ R15, R9
+ ADCQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R11
+
// Add bits [511:256] of the sqr result
- ADCQ acc4, acc0
- ADCQ acc5, acc1
- ADCQ y_ptr, acc2
- ADCQ x_ptr, acc3
- ADCQ $0, t0
-
- MOVQ acc0, acc4
- MOVQ acc1, acc5
- MOVQ acc2, y_ptr
- MOVQ acc3, t1
- // Subtract p256
- SUBQ $-1, acc0
- SBBQ p256const0<>(SB) ,acc1
- SBBQ $0, acc2
- SBBQ p256const1<>(SB), acc3
- SBBQ $0, t0
-
- CMOVQCS acc4, acc0
- CMOVQCS acc5, acc1
- CMOVQCS y_ptr, acc2
- CMOVQCS t1, acc3
-
- MOVQ acc0, (8*0)(res_ptr)
- MOVQ acc1, (8*1)(res_ptr)
- MOVQ acc2, (8*2)(res_ptr)
- MOVQ acc3, (8*3)(res_ptr)
- MOVQ res_ptr, x_ptr
- DECQ BX
- JNE sqrLoop
+ ADCQ R12, R8
+ ADCQ R13, R9
+ ADCQ CX, R10
+ ADCQ SI, R11
+ ADCQ $0x00, R14
+ MOVQ R8, R12
+ MOVQ R9, R13
+ MOVQ R10, CX
+ MOVQ R11, R15
+ // Subtract p256
+ SUBQ $-1, R8
+ SBBQ p256const0<>+0(SB), R9
+ SBBQ $0x00, R10
+ SBBQ p256const1<>+0(SB), R11
+ SBBQ $0x00, R14
+ CMOVQCS R12, R8
+ CMOVQCS R13, R9
+ CMOVQCS CX, R10
+ CMOVQCS R15, R11
+ MOVQ R8, (DI)
+ MOVQ R9, 8(DI)
+ MOVQ R10, 16(DI)
+ MOVQ R11, 24(DI)
+ MOVQ DI, SI
+ DECQ BX
+ JNE sqrLoop
RET
-/* ---------------------------------------*/
-// func p256Mul(res, in1, in2 *p256Element)
-TEXT ·p256Mul(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ in1+8(FP), x_ptr
- MOVQ in2+16(FP), y_ptr
+
+// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
+// Requires: CMOV
+TEXT ·p256Mul(SB), NOSPLIT, $0-24
+ MOVQ res+0(FP), DI
+ MOVQ in1+8(FP), SI
+ MOVQ in2+16(FP), CX
+
// x * y[0]
- MOVQ (8*0)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- MOVQ AX, acc0
- MOVQ DX, acc1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, acc2
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, acc3
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, acc4
- XORQ acc5, acc5
+ MOVQ (CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ MOVQ AX, R8
+ MOVQ DX, R9
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R11
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R12
+ XORQ R13, R13
+
// First reduction step
- MOVQ acc0, AX
- MOVQ acc0, t1
- SHLQ $32, acc0
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc0, acc1
- ADCQ t1, acc2
- ADCQ AX, acc3
- ADCQ DX, acc4
- ADCQ $0, acc5
- XORQ acc0, acc0
+ MOVQ R8, AX
+ MOVQ R8, R15
+ SHLQ $0x20, R8
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R8, R9
+ ADCQ R15, R10
+ ADCQ AX, R11
+ ADCQ DX, R12
+ ADCQ $0x00, R13
+ XORQ R8, R8
+
// x * y[1]
- MOVQ (8*1)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc2
- ADCQ $0, DX
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ DX, acc5
- ADCQ $0, acc0
+ MOVQ 8(CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ R15, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ DX, R13
+ ADCQ $0x00, R8
+
// Second reduction step
- MOVQ acc1, AX
- MOVQ acc1, t1
- SHLQ $32, acc1
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc1, acc2
- ADCQ t1, acc3
- ADCQ AX, acc4
- ADCQ DX, acc5
- ADCQ $0, acc0
- XORQ acc1, acc1
+ MOVQ R9, AX
+ MOVQ R9, R15
+ SHLQ $0x20, R9
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R9, R10
+ ADCQ R15, R11
+ ADCQ AX, R12
+ ADCQ DX, R13
+ ADCQ $0x00, R8
+ XORQ R9, R9
+
// x * y[2]
- MOVQ (8*2)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc5
- ADCQ $0, DX
- ADDQ AX, acc5
- ADCQ DX, acc0
- ADCQ $0, acc1
+ MOVQ 16(CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R13
+ ADCQ $0x00, DX
+ ADDQ AX, R13
+ ADCQ DX, R8
+ ADCQ $0x00, R9
+
// Third reduction step
- MOVQ acc2, AX
- MOVQ acc2, t1
- SHLQ $32, acc2
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc2, acc3
- ADCQ t1, acc4
- ADCQ AX, acc5
- ADCQ DX, acc0
- ADCQ $0, acc1
- XORQ acc2, acc2
+ MOVQ R10, AX
+ MOVQ R10, R15
+ SHLQ $0x20, R10
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R10, R11
+ ADCQ R15, R12
+ ADCQ AX, R13
+ ADCQ DX, R8
+ ADCQ $0x00, R9
+ XORQ R10, R10
+
// x * y[3]
- MOVQ (8*3)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc5
- ADCQ $0, DX
- ADDQ AX, acc5
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc0
- ADCQ $0, DX
- ADDQ AX, acc0
- ADCQ DX, acc1
- ADCQ $0, acc2
+ MOVQ 24(CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ R15, R13
+ ADCQ $0x00, DX
+ ADDQ AX, R13
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R8
+ ADCQ $0x00, DX
+ ADDQ AX, R8
+ ADCQ DX, R9
+ ADCQ $0x00, R10
+
// Last reduction step
- MOVQ acc3, AX
- MOVQ acc3, t1
- SHLQ $32, acc3
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc3, acc4
- ADCQ t1, acc5
- ADCQ AX, acc0
- ADCQ DX, acc1
- ADCQ $0, acc2
+ MOVQ R11, AX
+ MOVQ R11, R15
+ SHLQ $0x20, R11
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R11, R12
+ ADCQ R15, R13
+ ADCQ AX, R8
+ ADCQ DX, R9
+ ADCQ $0x00, R10
+
// Copy result [255:0]
- MOVQ acc4, x_ptr
- MOVQ acc5, acc3
- MOVQ acc0, t0
- MOVQ acc1, t1
- // Subtract p256
- SUBQ $-1, acc4
- SBBQ p256const0<>(SB) ,acc5
- SBBQ $0, acc0
- SBBQ p256const1<>(SB), acc1
- SBBQ $0, acc2
-
- CMOVQCS x_ptr, acc4
- CMOVQCS acc3, acc5
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
-
- MOVQ acc4, (8*0)(res_ptr)
- MOVQ acc5, (8*1)(res_ptr)
- MOVQ acc0, (8*2)(res_ptr)
- MOVQ acc1, (8*3)(res_ptr)
+ MOVQ R12, SI
+ MOVQ R13, R11
+ MOVQ R8, R14
+ MOVQ R9, R15
+ // Subtract p256
+ SUBQ $-1, R12
+ SBBQ p256const0<>+0(SB), R13
+ SBBQ $0x00, R8
+ SBBQ p256const1<>+0(SB), R9
+ SBBQ $0x00, R10
+ CMOVQCS SI, R12
+ CMOVQCS R11, R13
+ CMOVQCS R14, R8
+ CMOVQCS R15, R9
+ MOVQ R12, (DI)
+ MOVQ R13, 8(DI)
+ MOVQ R8, 16(DI)
+ MOVQ R9, 24(DI)
RET
-/* ---------------------------------------*/
-// func p256FromMont(res, in *p256Element)
-TEXT ·p256FromMont(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ in+8(FP), x_ptr
-
- MOVQ (8*0)(x_ptr), acc0
- MOVQ (8*1)(x_ptr), acc1
- MOVQ (8*2)(x_ptr), acc2
- MOVQ (8*3)(x_ptr), acc3
- XORQ acc4, acc4
+
+// func p256FromMont(res *p256Element, in *p256Element)
+// Requires: CMOV
+TEXT ·p256FromMont(SB), NOSPLIT, $0-16
+ MOVQ res+0(FP), DI
+ MOVQ in+8(FP), SI
+ MOVQ (SI), R8
+ MOVQ 8(SI), R9
+ MOVQ 16(SI), R10
+ MOVQ 24(SI), R11
+ XORQ R12, R12
// Only reduce, no multiplications are needed
// First stage
- MOVQ acc0, AX
- MOVQ acc0, t1
- SHLQ $32, acc0
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc0, acc1
- ADCQ t1, acc2
- ADCQ AX, acc3
- ADCQ DX, acc4
- XORQ acc5, acc5
+ MOVQ R8, AX
+ MOVQ R8, R15
+ SHLQ $0x20, R8
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R8, R9
+ ADCQ R15, R10
+ ADCQ AX, R11
+ ADCQ DX, R12
+ XORQ R13, R13
+
// Second stage
- MOVQ acc1, AX
- MOVQ acc1, t1
- SHLQ $32, acc1
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc1, acc2
- ADCQ t1, acc3
- ADCQ AX, acc4
- ADCQ DX, acc5
- XORQ acc0, acc0
+ MOVQ R9, AX
+ MOVQ R9, R15
+ SHLQ $0x20, R9
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R9, R10
+ ADCQ R15, R11
+ ADCQ AX, R12
+ ADCQ DX, R13
+ XORQ R8, R8
+
// Third stage
- MOVQ acc2, AX
- MOVQ acc2, t1
- SHLQ $32, acc2
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc2, acc3
- ADCQ t1, acc4
- ADCQ AX, acc5
- ADCQ DX, acc0
- XORQ acc1, acc1
- // Last stage
- MOVQ acc3, AX
- MOVQ acc3, t1
- SHLQ $32, acc3
- MULQ p256const1<>(SB)
- SHRQ $32, t1
- ADDQ acc3, acc4
- ADCQ t1, acc5
- ADCQ AX, acc0
- ADCQ DX, acc1
-
- MOVQ acc4, x_ptr
- MOVQ acc5, acc3
- MOVQ acc0, t0
- MOVQ acc1, t1
-
- SUBQ $-1, acc4
- SBBQ p256const0<>(SB), acc5
- SBBQ $0, acc0
- SBBQ p256const1<>(SB), acc1
-
- CMOVQCS x_ptr, acc4
- CMOVQCS acc3, acc5
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
-
- MOVQ acc4, (8*0)(res_ptr)
- MOVQ acc5, (8*1)(res_ptr)
- MOVQ acc0, (8*2)(res_ptr)
- MOVQ acc1, (8*3)(res_ptr)
+ MOVQ R10, AX
+ MOVQ R10, R15
+ SHLQ $0x20, R10
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R10, R11
+ ADCQ R15, R12
+ ADCQ AX, R13
+ ADCQ DX, R8
+ XORQ R9, R9
+ // Last stage
+ MOVQ R11, AX
+ MOVQ R11, R15
+ SHLQ $0x20, R11
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, R15
+ ADDQ R11, R12
+ ADCQ R15, R13
+ ADCQ AX, R8
+ ADCQ DX, R9
+ MOVQ R12, SI
+ MOVQ R13, R11
+ MOVQ R8, R14
+ MOVQ R9, R15
+ SUBQ $-1, R12
+ SBBQ p256const0<>+0(SB), R13
+ SBBQ $0x00, R8
+ SBBQ p256const1<>+0(SB), R9
+ CMOVQCS SI, R12
+ CMOVQCS R11, R13
+ CMOVQCS R14, R8
+ CMOVQCS R15, R9
+ MOVQ R12, (DI)
+ MOVQ R13, 8(DI)
+ MOVQ R8, 16(DI)
+ MOVQ R9, 24(DI)
RET
-/* ---------------------------------------*/
+
// func p256Select(res *P256Point, table *p256Table, idx int)
-TEXT ·p256Select(SB),NOSPLIT,$0
- MOVQ idx+16(FP),AX
- MOVQ table+8(FP),DI
- MOVQ res+0(FP),DX
-
- PXOR X15, X15 // X15 = 0
- PCMPEQL X14, X14 // X14 = -1
- PSUBL X14, X15 // X15 = 1
- MOVL AX, X14
- PSHUFD $0, X14, X14
-
- PXOR X0, X0
- PXOR X1, X1
- PXOR X2, X2
- PXOR X3, X3
- PXOR X4, X4
- PXOR X5, X5
- MOVQ $16, AX
-
- MOVOU X15, X13
+// Requires: SSE2
+TEXT ·p256Select(SB), NOSPLIT, $0-24
+ MOVQ idx+16(FP), AX
+ MOVQ table+8(FP), DI
+ MOVQ res+0(FP), DX
+ PXOR X15, X15
+ PCMPEQL X14, X14
+ PSUBL X14, X15
+ MOVL AX, X14
+ PSHUFD $0x00, X14, X14
+ PXOR X0, X0
+ PXOR X1, X1
+ PXOR X2, X2
+ PXOR X3, X3
+ PXOR X4, X4
+ PXOR X5, X5
+ MOVQ $0x00000010, AX
+ MOVOU X15, X13
loop_select:
-
- MOVOU X13, X12
- PADDL X15, X13
- PCMPEQL X14, X12
-
- MOVOU (16*0)(DI), X6
- MOVOU (16*1)(DI), X7
- MOVOU (16*2)(DI), X8
- MOVOU (16*3)(DI), X9
- MOVOU (16*4)(DI), X10
- MOVOU (16*5)(DI), X11
- ADDQ $(16*6), DI
-
- PAND X12, X6
- PAND X12, X7
- PAND X12, X8
- PAND X12, X9
- PAND X12, X10
- PAND X12, X11
-
- PXOR X6, X0
- PXOR X7, X1
- PXOR X8, X2
- PXOR X9, X3
- PXOR X10, X4
- PXOR X11, X5
-
- DECQ AX
- JNE loop_select
-
- MOVOU X0, (16*0)(DX)
- MOVOU X1, (16*1)(DX)
- MOVOU X2, (16*2)(DX)
- MOVOU X3, (16*3)(DX)
- MOVOU X4, (16*4)(DX)
- MOVOU X5, (16*5)(DX)
-
+ MOVOU X13, X12
+ PADDL X15, X13
+ PCMPEQL X14, X12
+ MOVOU (DI), X6
+ MOVOU 16(DI), X7
+ MOVOU 32(DI), X8
+ MOVOU 48(DI), X9
+ MOVOU 64(DI), X10
+ MOVOU 80(DI), X11
+ ADDQ $0x60, DI
+ PAND X12, X6
+ PAND X12, X7
+ PAND X12, X8
+ PAND X12, X9
+ PAND X12, X10
+ PAND X12, X11
+ PXOR X6, X0
+ PXOR X7, X1
+ PXOR X8, X2
+ PXOR X9, X3
+ PXOR X10, X4
+ PXOR X11, X5
+ DECQ AX
+ JNE loop_select
+ MOVOU X0, (DX)
+ MOVOU X1, 16(DX)
+ MOVOU X2, 32(DX)
+ MOVOU X3, 48(DX)
+ MOVOU X4, 64(DX)
+ MOVOU X5, 80(DX)
RET
-/* ---------------------------------------*/
-// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
-TEXT ·p256SelectAffine(SB),NOSPLIT,$0
- MOVQ idx+16(FP),AX
- MOVQ table+8(FP),DI
- MOVQ res+0(FP),DX
-
- PXOR X15, X15 // X15 = 0
- PCMPEQL X14, X14 // X14 = -1
- PSUBL X14, X15 // X15 = 1
- MOVL AX, X14
- PSHUFD $0, X14, X14
- PXOR X0, X0
- PXOR X1, X1
- PXOR X2, X2
- PXOR X3, X3
- MOVQ $16, AX
-
- MOVOU X15, X13
+// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
+// Requires: SSE2
+TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
+ MOVQ idx+16(FP), AX
+ MOVQ table+8(FP), DI
+ MOVQ res+0(FP), DX
+ PXOR X15, X15
+ PCMPEQL X14, X14
+ PSUBL X14, X15
+ MOVL AX, X14
+ PSHUFD $0x00, X14, X14
+ PXOR X0, X0
+ PXOR X1, X1
+ PXOR X2, X2
+ PXOR X3, X3
+ MOVQ $0x00000010, AX
+ MOVOU X15, X13
loop_select_base:
+ MOVOU X13, X12
+ PADDL X15, X13
+ PCMPEQL X14, X12
+ MOVOU (DI), X4
+ MOVOU 16(DI), X5
+ MOVOU 32(DI), X6
+ MOVOU 48(DI), X7
+ MOVOU 64(DI), X8
+ MOVOU 80(DI), X9
+ MOVOU 96(DI), X10
+ MOVOU 112(DI), X11
+ ADDQ $0x80, DI
+ PAND X12, X4
+ PAND X12, X5
+ PAND X12, X6
+ PAND X12, X7
+ MOVOU X13, X12
+ PADDL X15, X13
+ PCMPEQL X14, X12
+ PAND X12, X8
+ PAND X12, X9
+ PAND X12, X10
+ PAND X12, X11
+ PXOR X4, X0
+ PXOR X5, X1
+ PXOR X6, X2
+ PXOR X7, X3
+ PXOR X8, X0
+ PXOR X9, X1
+ PXOR X10, X2
+ PXOR X11, X3
+ DECQ AX
+ JNE loop_select_base
+ MOVOU X0, (DX)
+ MOVOU X1, 16(DX)
+ MOVOU X2, 32(DX)
+ MOVOU X3, 48(DX)
+ RET
- MOVOU X13, X12
- PADDL X15, X13
- PCMPEQL X14, X12
-
- MOVOU (16*0)(DI), X4
- MOVOU (16*1)(DI), X5
- MOVOU (16*2)(DI), X6
- MOVOU (16*3)(DI), X7
-
- MOVOU (16*4)(DI), X8
- MOVOU (16*5)(DI), X9
- MOVOU (16*6)(DI), X10
- MOVOU (16*7)(DI), X11
-
- ADDQ $(16*8), DI
-
- PAND X12, X4
- PAND X12, X5
- PAND X12, X6
- PAND X12, X7
-
- MOVOU X13, X12
- PADDL X15, X13
- PCMPEQL X14, X12
-
- PAND X12, X8
- PAND X12, X9
- PAND X12, X10
- PAND X12, X11
-
- PXOR X4, X0
- PXOR X5, X1
- PXOR X6, X2
- PXOR X7, X3
-
- PXOR X8, X0
- PXOR X9, X1
- PXOR X10, X2
- PXOR X11, X3
-
- DECQ AX
- JNE loop_select_base
-
- MOVOU X0, (16*0)(DX)
- MOVOU X1, (16*1)(DX)
- MOVOU X2, (16*2)(DX)
- MOVOU X3, (16*3)(DX)
+// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
+// Requires: CMOV
+TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
+ MOVQ res+0(FP), DI
+ MOVQ in1+8(FP), SI
+ MOVQ in2+16(FP), CX
- RET
-/* ---------------------------------------*/
-// func p256OrdMul(res, in1, in2 *p256OrdElement)
-TEXT ·p256OrdMul(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ in1+8(FP), x_ptr
- MOVQ in2+16(FP), y_ptr
// x * y[0]
- MOVQ (8*0)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- MOVQ AX, acc0
- MOVQ DX, acc1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, acc2
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, acc3
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, acc4
- XORQ acc5, acc5
+ MOVQ (CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ MOVQ AX, R8
+ MOVQ DX, R9
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R11
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R12
+ XORQ R13, R13
+
// First reduction step
- MOVQ acc0, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc0
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc1
- ADCQ $0, DX
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x10(SB), AX
- MULQ t0
- ADDQ t1, acc2
- ADCQ $0, DX
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x18(SB), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ DX, acc4
- ADCQ $0, acc5
+ MOVQ R8, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R9
+ ADCQ $0x00, DX
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+16(SB), AX
+ MULQ R14
+ ADDQ R15, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+24(SB), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ DX, R12
+ ADCQ $0x00, R13
+
// x * y[1]
- MOVQ (8*1)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc2
- ADCQ $0, DX
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ DX, acc5
- ADCQ $0, acc0
+ MOVQ 8(CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ R15, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ DX, R13
+ ADCQ $0x00, R8
+
// Second reduction step
- MOVQ acc1, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc2
- ADCQ $0, DX
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x10(SB), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x18(SB), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ DX, acc5
- ADCQ $0, acc0
+ MOVQ R9, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+16(SB), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+24(SB), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ DX, R13
+ ADCQ $0x00, R8
+
// x * y[2]
- MOVQ (8*2)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc5
- ADCQ $0, DX
- ADDQ AX, acc5
- ADCQ DX, acc0
- ADCQ $0, acc1
+ MOVQ 16(CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R13
+ ADCQ $0x00, DX
+ ADDQ AX, R13
+ ADCQ DX, R8
+ ADCQ $0x00, R9
+
// Third reduction step
- MOVQ acc2, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x10(SB), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x18(SB), AX
- MULQ t0
- ADDQ t1, acc5
- ADCQ $0, DX
- ADDQ AX, acc5
- ADCQ DX, acc0
- ADCQ $0, acc1
+ MOVQ R10, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+16(SB), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+24(SB), AX
+ MULQ R14
+ ADDQ R15, R13
+ ADCQ $0x00, DX
+ ADDQ AX, R13
+ ADCQ DX, R8
+ ADCQ $0x00, R9
+
// x * y[3]
- MOVQ (8*3)(y_ptr), t0
-
- MOVQ (8*0)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc5
- ADCQ $0, DX
- ADDQ AX, acc5
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc0
- ADCQ $0, DX
- ADDQ AX, acc0
- ADCQ DX, acc1
- ADCQ $0, acc2
+ MOVQ 24(CX), R14
+ MOVQ (SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 8(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ R15, R13
+ ADCQ $0x00, DX
+ ADDQ AX, R13
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R8
+ ADCQ $0x00, DX
+ ADDQ AX, R8
+ ADCQ DX, R9
+ ADCQ $0x00, R10
+
// Last reduction step
- MOVQ acc3, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x10(SB), AX
- MULQ t0
- ADDQ t1, acc5
- ADCQ $0, DX
- ADDQ AX, acc5
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x18(SB), AX
- MULQ t0
- ADDQ t1, acc0
- ADCQ $0, DX
- ADDQ AX, acc0
- ADCQ DX, acc1
- ADCQ $0, acc2
+ MOVQ R11, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+16(SB), AX
+ MULQ R14
+ ADDQ R15, R13
+ ADCQ $0x00, DX
+ ADDQ AX, R13
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+24(SB), AX
+ MULQ R14
+ ADDQ R15, R8
+ ADCQ $0x00, DX
+ ADDQ AX, R8
+ ADCQ DX, R9
+ ADCQ $0x00, R10
+
// Copy result [255:0]
- MOVQ acc4, x_ptr
- MOVQ acc5, acc3
- MOVQ acc0, t0
- MOVQ acc1, t1
- // Subtract p256
- SUBQ p256ord<>+0x00(SB), acc4
- SBBQ p256ord<>+0x08(SB) ,acc5
- SBBQ p256ord<>+0x10(SB), acc0
- SBBQ p256ord<>+0x18(SB), acc1
- SBBQ $0, acc2
-
- CMOVQCS x_ptr, acc4
- CMOVQCS acc3, acc5
- CMOVQCS t0, acc0
- CMOVQCS t1, acc1
-
- MOVQ acc4, (8*0)(res_ptr)
- MOVQ acc5, (8*1)(res_ptr)
- MOVQ acc0, (8*2)(res_ptr)
- MOVQ acc1, (8*3)(res_ptr)
+ MOVQ R12, SI
+ MOVQ R13, R11
+ MOVQ R8, R14
+ MOVQ R9, R15
+ // Subtract p256
+ SUBQ p256ord<>+0(SB), R12
+ SBBQ p256ord<>+8(SB), R13
+ SBBQ p256ord<>+16(SB), R8
+ SBBQ p256ord<>+24(SB), R9
+ SBBQ $0x00, R10
+ CMOVQCS SI, R12
+ CMOVQCS R11, R13
+ CMOVQCS R14, R8
+ CMOVQCS R15, R9
+ MOVQ R12, (DI)
+ MOVQ R13, 8(DI)
+ MOVQ R8, 16(DI)
+ MOVQ R9, 24(DI)
RET
-/* ---------------------------------------*/
-// func p256OrdSqr(res, in *p256OrdElement, n int)
-TEXT ·p256OrdSqr(SB),NOSPLIT,$0
- MOVQ res+0(FP), res_ptr
- MOVQ in+8(FP), x_ptr
+
+DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
+GLOBL p256ordK0<>(SB), RODATA, $8
+
+DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
+DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
+DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
+DATA p256ord<>+24(SB)/8, $0xffffffff00000000
+GLOBL p256ord<>(SB), RODATA, $32
+
+// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
+// Requires: CMOV
+TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
+ MOVQ res+0(FP), DI
+ MOVQ in+8(FP), SI
MOVQ n+16(FP), BX
ordSqrLoop:
-
// y[1:] * y[0]
- MOVQ (8*0)(x_ptr), t0
-
- MOVQ (8*1)(x_ptr), AX
- MULQ t0
- MOVQ AX, acc1
- MOVQ DX, acc2
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, acc3
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, acc4
+ MOVQ (SI), R14
+ MOVQ 8(SI), AX
+ MULQ R14
+ MOVQ AX, R9
+ MOVQ DX, R10
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R11
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R12
+
// y[2:] * y[1]
- MOVQ (8*1)(x_ptr), t0
-
- MOVQ (8*2)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ t1, acc4
- ADCQ $0, DX
- ADDQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, acc5
+ MOVQ 8(SI), R14
+ MOVQ 16(SI), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ R15, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R13
+
// y[3] * y[2]
- MOVQ (8*2)(x_ptr), t0
-
- MOVQ (8*3)(x_ptr), AX
- MULQ t0
- ADDQ AX, acc5
- ADCQ $0, DX
- MOVQ DX, y_ptr
- XORQ t1, t1
+ MOVQ 16(SI), R14
+ MOVQ 24(SI), AX
+ MULQ R14
+ ADDQ AX, R13
+ ADCQ $0x00, DX
+ MOVQ DX, CX
+ XORQ R15, R15
+
// *2
- ADDQ acc1, acc1
- ADCQ acc2, acc2
- ADCQ acc3, acc3
- ADCQ acc4, acc4
- ADCQ acc5, acc5
- ADCQ y_ptr, y_ptr
- ADCQ $0, t1
+ ADDQ R9, R9
+ ADCQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ CX, CX
+ ADCQ $0x00, R15
+
// Missing products
- MOVQ (8*0)(x_ptr), AX
+ MOVQ (SI), AX
MULQ AX
- MOVQ AX, acc0
- MOVQ DX, t0
-
- MOVQ (8*1)(x_ptr), AX
+ MOVQ AX, R8
+ MOVQ DX, R14
+ MOVQ 8(SI), AX
MULQ AX
- ADDQ t0, acc1
- ADCQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t0
-
- MOVQ (8*2)(x_ptr), AX
+ ADDQ R14, R9
+ ADCQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R14
+ MOVQ 16(SI), AX
MULQ AX
- ADDQ t0, acc3
- ADCQ AX, acc4
- ADCQ $0, DX
- MOVQ DX, t0
-
- MOVQ (8*3)(x_ptr), AX
+ ADDQ R14, R11
+ ADCQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R14
+ MOVQ 24(SI), AX
MULQ AX
- ADDQ t0, acc5
- ADCQ AX, y_ptr
- ADCQ DX, t1
- MOVQ t1, x_ptr
+ ADDQ R14, R13
+ ADCQ AX, CX
+ ADCQ DX, R15
+ MOVQ R15, SI
+
// First reduction step
- MOVQ acc0, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc0
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc1
- ADCQ $0, DX
- ADDQ AX, acc1
-
- MOVQ t0, t1
- ADCQ DX, acc2
- ADCQ $0, t1
- SUBQ t0, acc2
- SBBQ $0, t1
-
- MOVQ t0, AX
- MOVQ t0, DX
- MOVQ t0, acc0
- SHLQ $32, AX
- SHRQ $32, DX
-
- ADDQ t1, acc3
- ADCQ $0, acc0
- SUBQ AX, acc3
- SBBQ DX, acc0
+ MOVQ R8, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R9
+ ADCQ $0x00, DX
+ ADDQ AX, R9
+ MOVQ R14, R15
+ ADCQ DX, R10
+ ADCQ $0x00, R15
+ SUBQ R14, R10
+ SBBQ $0x00, R15
+ MOVQ R14, AX
+ MOVQ R14, DX
+ MOVQ R14, R8
+ SHLQ $0x20, AX
+ SHRQ $0x20, DX
+ ADDQ R15, R11
+ ADCQ $0x00, R8
+ SUBQ AX, R11
+ SBBQ DX, R8
+
// Second reduction step
- MOVQ acc1, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc1
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc2
- ADCQ $0, DX
- ADDQ AX, acc2
-
- MOVQ t0, t1
- ADCQ DX, acc3
- ADCQ $0, t1
- SUBQ t0, acc3
- SBBQ $0, t1
-
- MOVQ t0, AX
- MOVQ t0, DX
- MOVQ t0, acc1
- SHLQ $32, AX
- SHRQ $32, DX
-
- ADDQ t1, acc0
- ADCQ $0, acc1
- SUBQ AX, acc0
- SBBQ DX, acc1
+ MOVQ R9, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ MOVQ R14, R15
+ ADCQ DX, R11
+ ADCQ $0x00, R15
+ SUBQ R14, R11
+ SBBQ $0x00, R15
+ MOVQ R14, AX
+ MOVQ R14, DX
+ MOVQ R14, R9
+ SHLQ $0x20, AX
+ SHRQ $0x20, DX
+ ADDQ R15, R8
+ ADCQ $0x00, R9
+ SUBQ AX, R8
+ SBBQ DX, R9
+
// Third reduction step
- MOVQ acc2, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc2
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc3
- ADCQ $0, DX
- ADDQ AX, acc3
-
- MOVQ t0, t1
- ADCQ DX, acc0
- ADCQ $0, t1
- SUBQ t0, acc0
- SBBQ $0, t1
-
- MOVQ t0, AX
- MOVQ t0, DX
- MOVQ t0, acc2
- SHLQ $32, AX
- SHRQ $32, DX
-
- ADDQ t1, acc1
- ADCQ $0, acc2
- SUBQ AX, acc1
- SBBQ DX, acc2
+ MOVQ R10, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ MOVQ R14, R15
+ ADCQ DX, R8
+ ADCQ $0x00, R15
+ SUBQ R14, R8
+ SBBQ $0x00, R15
+ MOVQ R14, AX
+ MOVQ R14, DX
+ MOVQ R14, R10
+ SHLQ $0x20, AX
+ SHRQ $0x20, DX
+ ADDQ R15, R9
+ ADCQ $0x00, R10
+ SUBQ AX, R9
+ SBBQ DX, R10
+
// Last reduction step
- MOVQ acc3, AX
- MULQ p256ordK0<>(SB)
- MOVQ AX, t0
-
- MOVQ p256ord<>+0x00(SB), AX
- MULQ t0
- ADDQ AX, acc3
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ p256ord<>+0x08(SB), AX
- MULQ t0
- ADDQ t1, acc0
- ADCQ $0, DX
- ADDQ AX, acc0
- ADCQ $0, DX
- MOVQ DX, t1
-
- MOVQ t0, t1
- ADCQ DX, acc1
- ADCQ $0, t1
- SUBQ t0, acc1
- SBBQ $0, t1
-
- MOVQ t0, AX
- MOVQ t0, DX
- MOVQ t0, acc3
- SHLQ $32, AX
- SHRQ $32, DX
-
- ADDQ t1, acc2
- ADCQ $0, acc3
- SUBQ AX, acc2
- SBBQ DX, acc3
- XORQ t0, t0
+ MOVQ R11, AX
+ MULQ p256ordK0<>+0(SB)
+ MOVQ AX, R14
+ MOVQ p256ord<>+0(SB), AX
+ MULQ R14
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ p256ord<>+8(SB), AX
+ MULQ R14
+ ADDQ R15, R8
+ ADCQ $0x00, DX
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ R14, R15
+ ADCQ DX, R9
+ ADCQ $0x00, R15
+ SUBQ R14, R9
+ SBBQ $0x00, R15
+ MOVQ R14, AX
+ MOVQ R14, DX
+ MOVQ R14, R11
+ SHLQ $0x20, AX
+ SHRQ $0x20, DX
+ ADDQ R15, R10
+ ADCQ $0x00, R11
+ SUBQ AX, R10
+ SBBQ DX, R11
+ XORQ R14, R14
+
// Add bits [511:256] of the sqr result
- ADCQ acc4, acc0
- ADCQ acc5, acc1
- ADCQ y_ptr, acc2
- ADCQ x_ptr, acc3
- ADCQ $0, t0
-
- MOVQ acc0, acc4
- MOVQ acc1, acc5
- MOVQ acc2, y_ptr
- MOVQ acc3, t1
- // Subtract p256
- SUBQ p256ord<>+0x00(SB), acc0
- SBBQ p256ord<>+0x08(SB) ,acc1
- SBBQ p256ord<>+0x10(SB), acc2
- SBBQ p256ord<>+0x18(SB), acc3
- SBBQ $0, t0
-
- CMOVQCS acc4, acc0
- CMOVQCS acc5, acc1
- CMOVQCS y_ptr, acc2
- CMOVQCS t1, acc3
-
- MOVQ acc0, (8*0)(res_ptr)
- MOVQ acc1, (8*1)(res_ptr)
- MOVQ acc2, (8*2)(res_ptr)
- MOVQ acc3, (8*3)(res_ptr)
- MOVQ res_ptr, x_ptr
- DECQ BX
- JNE ordSqrLoop
+ ADCQ R12, R8
+ ADCQ R13, R9
+ ADCQ CX, R10
+ ADCQ SI, R11
+ ADCQ $0x00, R14
+ MOVQ R8, R12
+ MOVQ R9, R13
+ MOVQ R10, CX
+ MOVQ R11, R15
+ // Subtract p256
+ SUBQ p256ord<>+0(SB), R8
+ SBBQ p256ord<>+8(SB), R9
+ SBBQ p256ord<>+16(SB), R10
+ SBBQ p256ord<>+24(SB), R11
+ SBBQ $0x00, R14
+ CMOVQCS R12, R8
+ CMOVQCS R13, R9
+ CMOVQCS CX, R10
+ CMOVQCS R15, R11
+ MOVQ R8, (DI)
+ MOVQ R9, 8(DI)
+ MOVQ R10, 16(DI)
+ MOVQ R11, 24(DI)
+ MOVQ DI, SI
+ DECQ BX
+ JNE ordSqrLoop
RET
-/* ---------------------------------------*/
-#undef res_ptr
-#undef x_ptr
-#undef y_ptr
-
-#undef acc0
-#undef acc1
-#undef acc2
-#undef acc3
-#undef acc4
-#undef acc5
-#undef t0
-#undef t1
-/* ---------------------------------------*/
-#define mul0 AX
-#define mul1 DX
-#define acc0 BX
-#define acc1 CX
-#define acc2 R8
-#define acc3 R9
-#define acc4 R10
-#define acc5 R11
-#define acc6 R12
-#define acc7 R13
-#define t0 R14
-#define t1 R15
-#define t2 DI
-#define t3 SI
-#define hlp BP
-/* ---------------------------------------*/
-TEXT p256SubInternal(SB),NOSPLIT,$0
- XORQ mul0, mul0
- SUBQ t0, acc4
- SBBQ t1, acc5
- SBBQ t2, acc6
- SBBQ t3, acc7
- SBBQ $0, mul0
-
- MOVQ acc4, acc0
- MOVQ acc5, acc1
- MOVQ acc6, acc2
- MOVQ acc7, acc3
-
- ADDQ $-1, acc4
- ADCQ p256const0<>(SB), acc5
- ADCQ $0, acc6
- ADCQ p256const1<>(SB), acc7
- ANDQ $1, mul0
-
- CMOVQEQ acc0, acc4
- CMOVQEQ acc1, acc5
- CMOVQEQ acc2, acc6
- CMOVQEQ acc3, acc7
+// func p256SubInternal()
+// Requires: CMOV
+TEXT p256SubInternal(SB), NOSPLIT, $0
+ XORQ AX, AX
+ SUBQ R14, R10
+ SBBQ R15, R11
+ SBBQ DI, R12
+ SBBQ SI, R13
+ SBBQ $0x00, AX
+ MOVQ R10, BX
+ MOVQ R11, CX
+ MOVQ R12, R8
+ MOVQ R13, R9
+ ADDQ $-1, R10
+ ADCQ p256const0<>+0(SB), R11
+ ADCQ $0x00, R12
+ ADCQ p256const1<>+0(SB), R13
+ ANDQ $0x01, AX
+ CMOVQEQ BX, R10
+ CMOVQEQ CX, R11
+ CMOVQEQ R8, R12
+ CMOVQEQ R9, R13
RET
-/* ---------------------------------------*/
-TEXT p256MulInternal(SB),NOSPLIT,$8
- MOVQ acc4, mul0
- MULQ t0
- MOVQ mul0, acc0
- MOVQ mul1, acc1
-
- MOVQ acc4, mul0
- MULQ t1
- ADDQ mul0, acc1
- ADCQ $0, mul1
- MOVQ mul1, acc2
-
- MOVQ acc4, mul0
- MULQ t2
- ADDQ mul0, acc2
- ADCQ $0, mul1
- MOVQ mul1, acc3
-
- MOVQ acc4, mul0
- MULQ t3
- ADDQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, acc4
-
- MOVQ acc5, mul0
- MULQ t0
- ADDQ mul0, acc1
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc5, mul0
- MULQ t1
- ADDQ hlp, acc2
- ADCQ $0, mul1
- ADDQ mul0, acc2
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc5, mul0
- MULQ t2
- ADDQ hlp, acc3
- ADCQ $0, mul1
- ADDQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc5, mul0
- MULQ t3
- ADDQ hlp, acc4
- ADCQ $0, mul1
- ADDQ mul0, acc4
- ADCQ $0, mul1
- MOVQ mul1, acc5
-
- MOVQ acc6, mul0
- MULQ t0
- ADDQ mul0, acc2
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc6, mul0
- MULQ t1
- ADDQ hlp, acc3
- ADCQ $0, mul1
- ADDQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc6, mul0
- MULQ t2
- ADDQ hlp, acc4
- ADCQ $0, mul1
- ADDQ mul0, acc4
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc6, mul0
- MULQ t3
- ADDQ hlp, acc5
- ADCQ $0, mul1
- ADDQ mul0, acc5
- ADCQ $0, mul1
- MOVQ mul1, acc6
-
- MOVQ acc7, mul0
- MULQ t0
- ADDQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc7, mul0
- MULQ t1
- ADDQ hlp, acc4
- ADCQ $0, mul1
- ADDQ mul0, acc4
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc7, mul0
- MULQ t2
- ADDQ hlp, acc5
- ADCQ $0, mul1
- ADDQ mul0, acc5
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc7, mul0
- MULQ t3
- ADDQ hlp, acc6
- ADCQ $0, mul1
- ADDQ mul0, acc6
- ADCQ $0, mul1
- MOVQ mul1, acc7
+
+// func p256MulInternal()
+// Requires: CMOV
+TEXT p256MulInternal(SB), NOSPLIT, $8
+ MOVQ R10, AX
+ MULQ R14
+ MOVQ AX, BX
+ MOVQ DX, CX
+ MOVQ R10, AX
+ MULQ R15
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+ MOVQ R10, AX
+ MULQ DI
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R9
+ MOVQ R10, AX
+ MULQ SI
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ R11, AX
+ MULQ R14
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R11, AX
+ MULQ R15
+ ADDQ BP, R8
+ ADCQ $0x00, DX
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R11, AX
+ MULQ DI
+ ADDQ BP, R9
+ ADCQ $0x00, DX
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R11, AX
+ MULQ SI
+ ADDQ BP, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, R11
+ MOVQ R12, AX
+ MULQ R14
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R12, AX
+ MULQ R15
+ ADDQ BP, R9
+ ADCQ $0x00, DX
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R12, AX
+ MULQ DI
+ ADDQ BP, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R12, AX
+ MULQ SI
+ ADDQ BP, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, R12
+ MOVQ R13, AX
+ MULQ R14
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R13, AX
+ MULQ R15
+ ADDQ BP, R10
+ ADCQ $0x00, DX
+ ADDQ AX, R10
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R13, AX
+ MULQ DI
+ ADDQ BP, R11
+ ADCQ $0x00, DX
+ ADDQ AX, R11
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R13, AX
+ MULQ SI
+ ADDQ BP, R12
+ ADCQ $0x00, DX
+ ADDQ AX, R12
+ ADCQ $0x00, DX
+ MOVQ DX, R13
+
// First reduction step
- MOVQ acc0, mul0
- MOVQ acc0, hlp
- SHLQ $32, acc0
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc0, acc1
- ADCQ hlp, acc2
- ADCQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, acc0
+ MOVQ BX, AX
+ MOVQ BX, BP
+ SHLQ $0x20, BX
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ BX, CX
+ ADCQ BP, R8
+ ADCQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, BX
+
// Second reduction step
- MOVQ acc1, mul0
- MOVQ acc1, hlp
- SHLQ $32, acc1
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc1, acc2
- ADCQ hlp, acc3
- ADCQ mul0, acc0
- ADCQ $0, mul1
- MOVQ mul1, acc1
+ MOVQ CX, AX
+ MOVQ CX, BP
+ SHLQ $0x20, CX
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ CX, R8
+ ADCQ BP, R9
+ ADCQ AX, BX
+ ADCQ $0x00, DX
+ MOVQ DX, CX
+
// Third reduction step
- MOVQ acc2, mul0
- MOVQ acc2, hlp
- SHLQ $32, acc2
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc2, acc3
- ADCQ hlp, acc0
- ADCQ mul0, acc1
- ADCQ $0, mul1
- MOVQ mul1, acc2
+ MOVQ R8, AX
+ MOVQ R8, BP
+ SHLQ $0x20, R8
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ R8, R9
+ ADCQ BP, BX
+ ADCQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+
// Last reduction step
- MOVQ acc3, mul0
- MOVQ acc3, hlp
- SHLQ $32, acc3
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc3, acc0
- ADCQ hlp, acc1
- ADCQ mul0, acc2
- ADCQ $0, mul1
- MOVQ mul1, acc3
- MOVQ $0, BP
+ MOVQ R9, AX
+ MOVQ R9, BP
+ SHLQ $0x20, R9
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ R9, BX
+ ADCQ BP, CX
+ ADCQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R9
+ MOVQ $0x00000000, BP
+
// Add bits [511:256] of the result
- ADCQ acc0, acc4
- ADCQ acc1, acc5
- ADCQ acc2, acc6
- ADCQ acc3, acc7
- ADCQ $0, hlp
+ ADCQ BX, R10
+ ADCQ CX, R11
+ ADCQ R8, R12
+ ADCQ R9, R13
+ ADCQ $0x00, BP
+
// Copy result
- MOVQ acc4, acc0
- MOVQ acc5, acc1
- MOVQ acc6, acc2
- MOVQ acc7, acc3
+ MOVQ R10, BX
+ MOVQ R11, CX
+ MOVQ R12, R8
+ MOVQ R13, R9
+
// Subtract p256
- SUBQ $-1, acc4
- SBBQ p256const0<>(SB) ,acc5
- SBBQ $0, acc6
- SBBQ p256const1<>(SB), acc7
- SBBQ $0, hlp
- // If the result of the subtraction is negative, restore the previous result
- CMOVQCS acc0, acc4
- CMOVQCS acc1, acc5
- CMOVQCS acc2, acc6
- CMOVQCS acc3, acc7
+ SUBQ $-1, R10
+ SBBQ p256const0<>+0(SB), R11
+ SBBQ $0x00, R12
+ SBBQ p256const1<>+0(SB), R13
+ SBBQ $0x00, BP
+ // If the result of the subtraction is negative, restore the previous result
+ CMOVQCS BX, R10
+ CMOVQCS CX, R11
+ CMOVQCS R8, R12
+ CMOVQCS R9, R13
RET
-/* ---------------------------------------*/
-TEXT p256SqrInternal(SB),NOSPLIT,$8
-
- MOVQ acc4, mul0
- MULQ acc5
- MOVQ mul0, acc1
- MOVQ mul1, acc2
-
- MOVQ acc4, mul0
- MULQ acc6
- ADDQ mul0, acc2
- ADCQ $0, mul1
- MOVQ mul1, acc3
-
- MOVQ acc4, mul0
- MULQ acc7
- ADDQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, t0
-
- MOVQ acc5, mul0
- MULQ acc6
- ADDQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, hlp
-
- MOVQ acc5, mul0
- MULQ acc7
- ADDQ hlp, t0
- ADCQ $0, mul1
- ADDQ mul0, t0
- ADCQ $0, mul1
- MOVQ mul1, t1
-
- MOVQ acc6, mul0
- MULQ acc7
- ADDQ mul0, t1
- ADCQ $0, mul1
- MOVQ mul1, t2
- XORQ t3, t3
+
+// func p256SqrInternal()
+// Requires: CMOV
+TEXT p256SqrInternal(SB), NOSPLIT, $8
+ MOVQ R10, AX
+ MULQ R11
+ MOVQ AX, CX
+ MOVQ DX, R8
+ MOVQ R10, AX
+ MULQ R12
+ ADDQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R9
+ MOVQ R10, AX
+ MULQ R13
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, R14
+ MOVQ R11, AX
+ MULQ R12
+ ADDQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, BP
+ MOVQ R11, AX
+ MULQ R13
+ ADDQ BP, R14
+ ADCQ $0x00, DX
+ ADDQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R15
+ MOVQ R12, AX
+ MULQ R13
+ ADDQ AX, R15
+ ADCQ $0x00, DX
+ MOVQ DX, DI
+ XORQ SI, SI
+
// *2
- ADDQ acc1, acc1
- ADCQ acc2, acc2
- ADCQ acc3, acc3
- ADCQ t0, t0
- ADCQ t1, t1
- ADCQ t2, t2
- ADCQ $0, t3
+ ADDQ CX, CX
+ ADCQ R8, R8
+ ADCQ R9, R9
+ ADCQ R14, R14
+ ADCQ R15, R15
+ ADCQ DI, DI
+ ADCQ $0x00, SI
+
// Missing products
- MOVQ acc4, mul0
- MULQ mul0
- MOVQ mul0, acc0
- MOVQ DX, acc4
-
- MOVQ acc5, mul0
- MULQ mul0
- ADDQ acc4, acc1
- ADCQ mul0, acc2
- ADCQ $0, DX
- MOVQ DX, acc4
-
- MOVQ acc6, mul0
- MULQ mul0
- ADDQ acc4, acc3
- ADCQ mul0, t0
- ADCQ $0, DX
- MOVQ DX, acc4
-
- MOVQ acc7, mul0
- MULQ mul0
- ADDQ acc4, t1
- ADCQ mul0, t2
- ADCQ DX, t3
+ MOVQ R10, AX
+ MULQ AX
+ MOVQ AX, BX
+ MOVQ DX, R10
+ MOVQ R11, AX
+ MULQ AX
+ ADDQ R10, CX
+ ADCQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ R12, AX
+ MULQ AX
+ ADDQ R10, R9
+ ADCQ AX, R14
+ ADCQ $0x00, DX
+ MOVQ DX, R10
+ MOVQ R13, AX
+ MULQ AX
+ ADDQ R10, R15
+ ADCQ AX, DI
+ ADCQ DX, SI
+
// First reduction step
- MOVQ acc0, mul0
- MOVQ acc0, hlp
- SHLQ $32, acc0
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc0, acc1
- ADCQ hlp, acc2
- ADCQ mul0, acc3
- ADCQ $0, mul1
- MOVQ mul1, acc0
+ MOVQ BX, AX
+ MOVQ BX, BP
+ SHLQ $0x20, BX
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ BX, CX
+ ADCQ BP, R8
+ ADCQ AX, R9
+ ADCQ $0x00, DX
+ MOVQ DX, BX
+
// Second reduction step
- MOVQ acc1, mul0
- MOVQ acc1, hlp
- SHLQ $32, acc1
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc1, acc2
- ADCQ hlp, acc3
- ADCQ mul0, acc0
- ADCQ $0, mul1
- MOVQ mul1, acc1
+ MOVQ CX, AX
+ MOVQ CX, BP
+ SHLQ $0x20, CX
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ CX, R8
+ ADCQ BP, R9
+ ADCQ AX, BX
+ ADCQ $0x00, DX
+ MOVQ DX, CX
+
// Third reduction step
- MOVQ acc2, mul0
- MOVQ acc2, hlp
- SHLQ $32, acc2
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc2, acc3
- ADCQ hlp, acc0
- ADCQ mul0, acc1
- ADCQ $0, mul1
- MOVQ mul1, acc2
+ MOVQ R8, AX
+ MOVQ R8, BP
+ SHLQ $0x20, R8
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ R8, R9
+ ADCQ BP, BX
+ ADCQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+
// Last reduction step
- MOVQ acc3, mul0
- MOVQ acc3, hlp
- SHLQ $32, acc3
- MULQ p256const1<>(SB)
- SHRQ $32, hlp
- ADDQ acc3, acc0
- ADCQ hlp, acc1
- ADCQ mul0, acc2
- ADCQ $0, mul1
- MOVQ mul1, acc3
- MOVQ $0, BP
+ MOVQ R9, AX
+ MOVQ R9, BP
+ SHLQ $0x20, R9
+ MULQ p256const1<>+0(SB)
+ SHRQ $0x20, BP
+ ADDQ R9, BX
+ ADCQ BP, CX
+ ADCQ AX, R8
+ ADCQ $0x00, DX
+ MOVQ DX, R9
+ MOVQ $0x00000000, BP
+
// Add bits [511:256] of the result
- ADCQ acc0, t0
- ADCQ acc1, t1
- ADCQ acc2, t2
- ADCQ acc3, t3
- ADCQ $0, hlp
+ ADCQ BX, R14
+ ADCQ CX, R15
+ ADCQ R8, DI
+ ADCQ R9, SI
+ ADCQ $0x00, BP
+
// Copy result
- MOVQ t0, acc4
- MOVQ t1, acc5
- MOVQ t2, acc6
- MOVQ t3, acc7
+ MOVQ R14, R10
+ MOVQ R15, R11
+ MOVQ DI, R12
+ MOVQ SI, R13
+
// Subtract p256
- SUBQ $-1, acc4
- SBBQ p256const0<>(SB) ,acc5
- SBBQ $0, acc6
- SBBQ p256const1<>(SB), acc7
- SBBQ $0, hlp
- // If the result of the subtraction is negative, restore the previous result
- CMOVQCS t0, acc4
- CMOVQCS t1, acc5
- CMOVQCS t2, acc6
- CMOVQCS t3, acc7
+ SUBQ $-1, R10
+ SBBQ p256const0<>+0(SB), R11
+ SBBQ $0x00, R12
+ SBBQ p256const1<>+0(SB), R13
+ SBBQ $0x00, BP
+ // If the result of the subtraction is negative, restore the previous result
+ CMOVQCS R14, R10
+ CMOVQCS R15, R11
+ CMOVQCS DI, R12
+ CMOVQCS SI, R13
RET
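
In the regenerated internal routines the old register macros read as follows: acc4..acc7 are R10..R13, t0..t3 are R14, R15, DI and SI, acc0..acc3 are BX, CX, R8 and R9, mul0/mul1 are AX/DX, and hlp is BP. Each reduction step above folds the lowest accumulator limb back into the running product; because of the special shape of the P-256 prime only one widening multiplication per step is needed, by the top limb of p held in p256const1<>, with the two 32-bit halves handled by the SHLQ/SHRQ pair. As a rough sketch (not the generator's actual code), the first reduction step could be written with avo as:

	MOVQ(RBX, RAX)
	MOVQ(RBX, RBP)
	SHLQ(Imm(32), RBX)
	MULQ(p256const1) // p256const1: a Mem operand for p256const1<>, assumed defined elsewhere
	SHRQ(Imm(32), RBP)
	ADDQ(RBX, RCX)
	ADCQ(RBP, R8)
	ADCQ(RAX, R9)
	ADCQ(Imm(0), RDX)
	MOVQ(RDX, RBX)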
-/* ---------------------------------------*/
-#define p256MulBy2Inline\
- XORQ mul0, mul0;\
- ADDQ acc4, acc4;\
- ADCQ acc5, acc5;\
- ADCQ acc6, acc6;\
- ADCQ acc7, acc7;\
- ADCQ $0, mul0;\
- MOVQ acc4, t0;\
- MOVQ acc5, t1;\
- MOVQ acc6, t2;\
- MOVQ acc7, t3;\
- SUBQ $-1, t0;\
- SBBQ p256const0<>(SB), t1;\
- SBBQ $0, t2;\
- SBBQ p256const1<>(SB), t3;\
- SBBQ $0, mul0;\
- CMOVQCS acc4, t0;\
- CMOVQCS acc5, t1;\
- CMOVQCS acc6, t2;\
- CMOVQCS acc7, t3;
-/* ---------------------------------------*/
-#define p256AddInline \
- XORQ mul0, mul0;\
- ADDQ t0, acc4;\
- ADCQ t1, acc5;\
- ADCQ t2, acc6;\
- ADCQ t3, acc7;\
- ADCQ $0, mul0;\
- MOVQ acc4, t0;\
- MOVQ acc5, t1;\
- MOVQ acc6, t2;\
- MOVQ acc7, t3;\
- SUBQ $-1, t0;\
- SBBQ p256const0<>(SB), t1;\
- SBBQ $0, t2;\
- SBBQ p256const1<>(SB), t3;\
- SBBQ $0, mul0;\
- CMOVQCS acc4, t0;\
- CMOVQCS acc5, t1;\
- CMOVQCS acc6, t2;\
- CMOVQCS acc7, t3;
-/* ---------------------------------------*/
-#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
-#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
-#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
-#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
-#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
-#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
-/* ---------------------------------------*/
-#define x1in(off) (32*0 + off)(SP)
-#define y1in(off) (32*1 + off)(SP)
-#define z1in(off) (32*2 + off)(SP)
-#define x2in(off) (32*3 + off)(SP)
-#define y2in(off) (32*4 + off)(SP)
-#define xout(off) (32*5 + off)(SP)
-#define yout(off) (32*6 + off)(SP)
-#define zout(off) (32*7 + off)(SP)
-#define s2(off) (32*8 + off)(SP)
-#define z1sqr(off) (32*9 + off)(SP)
-#define h(off) (32*10 + off)(SP)
-#define r(off) (32*11 + off)(SP)
-#define hsqr(off) (32*12 + off)(SP)
-#define rsqr(off) (32*13 + off)(SP)
-#define hcub(off) (32*14 + off)(SP)
-#define rptr (32*15)(SP)
-#define sel_save (32*15 + 8)(SP)
-#define zero_save (32*15 + 8 + 4)(SP)
-
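
The stack-slot macros deleted above turn into bare SP offsets in the generated body: x1in/y1in/z1in at 0/32/64(SP), x2in/y2in at 96/128(SP), xout/yout/zout at 160/192/224(SP), s2 at 256(SP), z1sqr at 288(SP), h at 320(SP), r at 352(SP), hsqr at 384(SP), rsqr at 416(SP), hcub at 448(SP), rptr at 480(SP), sel_save at 488(SP) and zero_save at 492(SP). In the avo generator such slots can be carved out of the 512-byte frame with something along these lines (illustrative only, the slot helper is made up; the emitted assembly below simply uses the resulting offsets):

	frame := AllocLocal(512)
	slot := func(index, off int) Mem { return frame.Offset(32*index + off) }
	MOVOU(X0, slot(0, 0*16)) // x1in, 0(SP)
	MOVOU(X1, slot(0, 1*16)) // x1in+16, 16(SP)
	MOVOU(X2, slot(1, 0*16)) // y1in, 32(SP)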
-// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
-TEXT ·p256PointAddAffineAsm(SB),0,$512-48
- // Move input to stack in order to free registers
- MOVQ res+0(FP), AX
- MOVQ in1+8(FP), BX
- MOVQ in2+16(FP), CX
- MOVQ sign+24(FP), DX
- MOVQ sel+32(FP), t1
- MOVQ zero+40(FP), t2
-
- MOVOU (16*0)(BX), X0
- MOVOU (16*1)(BX), X1
- MOVOU (16*2)(BX), X2
- MOVOU (16*3)(BX), X3
- MOVOU (16*4)(BX), X4
- MOVOU (16*5)(BX), X5
-
- MOVOU X0, x1in(16*0)
- MOVOU X1, x1in(16*1)
- MOVOU X2, y1in(16*0)
- MOVOU X3, y1in(16*1)
- MOVOU X4, z1in(16*0)
- MOVOU X5, z1in(16*1)
-
- MOVOU (16*0)(CX), X0
- MOVOU (16*1)(CX), X1
-
- MOVOU X0, x2in(16*0)
- MOVOU X1, x2in(16*1)
- // Store pointer to result
- MOVQ mul0, rptr
- MOVL t1, sel_save
- MOVL t2, zero_save
- // Negate y2in based on sign
- MOVQ (16*2 + 8*0)(CX), acc4
- MOVQ (16*2 + 8*1)(CX), acc5
- MOVQ (16*2 + 8*2)(CX), acc6
- MOVQ (16*2 + 8*3)(CX), acc7
- MOVQ $-1, acc0
- MOVQ p256const0<>(SB), acc1
- MOVQ $0, acc2
- MOVQ p256const1<>(SB), acc3
- XORQ mul0, mul0
- // Speculatively subtract
- SUBQ acc4, acc0
- SBBQ acc5, acc1
- SBBQ acc6, acc2
- SBBQ acc7, acc3
- SBBQ $0, mul0
- MOVQ acc0, t0
- MOVQ acc1, t1
- MOVQ acc2, t2
- MOVQ acc3, t3
- // Add in case the operand was > p256
- ADDQ $-1, acc0
- ADCQ p256const0<>(SB), acc1
- ADCQ $0, acc2
- ADCQ p256const1<>(SB), acc3
- ADCQ $0, mul0
- CMOVQNE t0, acc0
- CMOVQNE t1, acc1
- CMOVQNE t2, acc2
- CMOVQNE t3, acc3
- // If condition is 0, keep original value
- TESTQ DX, DX
- CMOVQEQ acc4, acc0
- CMOVQEQ acc5, acc1
- CMOVQEQ acc6, acc2
- CMOVQEQ acc7, acc3
- // Store result
- MOVQ acc0, y2in(8*0)
- MOVQ acc1, y2in(8*1)
- MOVQ acc2, y2in(8*2)
- MOVQ acc3, y2in(8*3)
- // Begin point add
- LDacc (z1in)
- CALL p256SqrInternal(SB) // z1ˆ2
- ST (z1sqr)
-
- LDt (x2in)
- CALL p256MulInternal(SB) // x2 * z1ˆ2
-
- LDt (x1in)
- CALL p256SubInternal(SB) // h = u2 - u1
- ST (h)
-
- LDt (z1in)
- CALL p256MulInternal(SB) // z3 = h * z1
- ST (zout)
-
- LDacc (z1sqr)
- CALL p256MulInternal(SB) // z1ˆ3
- LDt (y2in)
- CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3
- ST (s2)
+// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
+// Requires: CMOV, SSE2
+TEXT ·p256PointAddAffineAsm(SB), $512-48
+ MOVQ res+0(FP), AX
+ MOVQ in1+8(FP), BX
+ MOVQ in2+16(FP), CX
+ MOVQ sign+24(FP), DX
+ MOVQ sel+32(FP), R15
+ MOVQ zero+40(FP), DI
+ MOVOU (BX), X0
+ MOVOU 16(BX), X1
+ MOVOU 32(BX), X2
+ MOVOU 48(BX), X3
+ MOVOU 64(BX), X4
+ MOVOU 80(BX), X5
+ MOVOU X0, (SP)
+ MOVOU X1, 16(SP)
+ MOVOU X2, 32(SP)
+ MOVOU X3, 48(SP)
+ MOVOU X4, 64(SP)
+ MOVOU X5, 80(SP)
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU X0, 96(SP)
+ MOVOU X1, 112(SP)
- LDt (y1in)
- CALL p256SubInternal(SB) // r = s2 - s1
- ST (r)
-
- CALL p256SqrInternal(SB) // rsqr = rˆ2
- ST (rsqr)
-
- LDacc (h)
- CALL p256SqrInternal(SB) // hsqr = hˆ2
- ST (hsqr)
-
- LDt (h)
- CALL p256MulInternal(SB) // hcub = hˆ3
- ST (hcub)
+ // Store pointer to result
+ MOVQ AX, 480(SP)
+ MOVL R15, 488(SP)
+ MOVL DI, 492(SP)
- LDt (y1in)
- CALL p256MulInternal(SB) // y1 * hˆ3
- ST (s2)
+ // Negate y2in based on sign
+ MOVQ 32(CX), R10
+ MOVQ 40(CX), R11
+ MOVQ 48(CX), R12
+ MOVQ 56(CX), R13
+ MOVQ $-1, BX
+ MOVQ p256const0<>+0(SB), CX
+ MOVQ $0x00000000, R8
+ MOVQ p256const1<>+0(SB), R9
+ XORQ AX, AX
- LDacc (x1in)
- LDt (hsqr)
- CALL p256MulInternal(SB) // u1 * hˆ2
- ST (h)
+ // Speculatively subtract
+ SUBQ R10, BX
+ SBBQ R11, CX
+ SBBQ R12, R8
+ SBBQ R13, R9
+ SBBQ $0x00, AX
+ MOVQ BX, R14
+ MOVQ CX, R15
+ MOVQ R8, DI
+ MOVQ R9, SI
- p256MulBy2Inline // u1 * hˆ2 * 2, inline
- LDacc (rsqr)
- CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
+ // Add in case the operand was > p256
+ ADDQ $-1, BX
+ ADCQ p256const0<>+0(SB), CX
+ ADCQ $0x00, R8
+ ADCQ p256const1<>+0(SB), R9
+ ADCQ $0x00, AX
+ CMOVQNE R14, BX
+ CMOVQNE R15, CX
+ CMOVQNE DI, R8
+ CMOVQNE SI, R9
- LDt (hcub)
- CALL p256SubInternal(SB)
- ST (xout)
+ // If condition is 0, keep original value
+ TESTQ DX, DX
+ CMOVQEQ R10, BX
+ CMOVQEQ R11, CX
+ CMOVQEQ R12, R8
+ CMOVQEQ R13, R9
- MOVQ acc4, t0
- MOVQ acc5, t1
- MOVQ acc6, t2
- MOVQ acc7, t3
- LDacc (h)
- CALL p256SubInternal(SB)
+ // Store result
+ MOVQ BX, 128(SP)
+ MOVQ CX, 136(SP)
+ MOVQ R8, 144(SP)
+ MOVQ R9, 152(SP)
- LDt (r)
- CALL p256MulInternal(SB)
+ // Begin point add
+ MOVQ 64(SP), R10
+ MOVQ 72(SP), R11
+ MOVQ 80(SP), R12
+ MOVQ 88(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 288(SP)
+ MOVQ R11, 296(SP)
+ MOVQ R12, 304(SP)
+ MOVQ R13, 312(SP)
+ MOVQ 96(SP), R14
+ MOVQ 104(SP), R15
+ MOVQ 112(SP), DI
+ MOVQ 120(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ (SP), R14
+ MOVQ 8(SP), R15
+ MOVQ 16(SP), DI
+ MOVQ 24(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 320(SP)
+ MOVQ R11, 328(SP)
+ MOVQ R12, 336(SP)
+ MOVQ R13, 344(SP)
+ MOVQ 64(SP), R14
+ MOVQ 72(SP), R15
+ MOVQ 80(SP), DI
+ MOVQ 88(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 224(SP)
+ MOVQ R11, 232(SP)
+ MOVQ R12, 240(SP)
+ MOVQ R13, 248(SP)
+ MOVQ 288(SP), R10
+ MOVQ 296(SP), R11
+ MOVQ 304(SP), R12
+ MOVQ 312(SP), R13
+ CALL p256MulInternal(SB)
+ MOVQ 128(SP), R14
+ MOVQ 136(SP), R15
+ MOVQ 144(SP), DI
+ MOVQ 152(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 256(SP)
+ MOVQ R11, 264(SP)
+ MOVQ R12, 272(SP)
+ MOVQ R13, 280(SP)
+ MOVQ 32(SP), R14
+ MOVQ 40(SP), R15
+ MOVQ 48(SP), DI
+ MOVQ 56(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 352(SP)
+ MOVQ R11, 360(SP)
+ MOVQ R12, 368(SP)
+ MOVQ R13, 376(SP)
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 416(SP)
+ MOVQ R11, 424(SP)
+ MOVQ R12, 432(SP)
+ MOVQ R13, 440(SP)
+ MOVQ 320(SP), R10
+ MOVQ 328(SP), R11
+ MOVQ 336(SP), R12
+ MOVQ 344(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 384(SP)
+ MOVQ R11, 392(SP)
+ MOVQ R12, 400(SP)
+ MOVQ R13, 408(SP)
+ MOVQ 320(SP), R14
+ MOVQ 328(SP), R15
+ MOVQ 336(SP), DI
+ MOVQ 344(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 448(SP)
+ MOVQ R11, 456(SP)
+ MOVQ R12, 464(SP)
+ MOVQ R13, 472(SP)
+ MOVQ 32(SP), R14
+ MOVQ 40(SP), R15
+ MOVQ 48(SP), DI
+ MOVQ 56(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 256(SP)
+ MOVQ R11, 264(SP)
+ MOVQ R12, 272(SP)
+ MOVQ R13, 280(SP)
+ MOVQ (SP), R10
+ MOVQ 8(SP), R11
+ MOVQ 16(SP), R12
+ MOVQ 24(SP), R13
+ MOVQ 384(SP), R14
+ MOVQ 392(SP), R15
+ MOVQ 400(SP), DI
+ MOVQ 408(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 320(SP)
+ MOVQ R11, 328(SP)
+ MOVQ R12, 336(SP)
+ MOVQ R13, 344(SP)
+ XORQ AX, AX
+ ADDQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ 416(SP), R10
+ MOVQ 424(SP), R11
+ MOVQ 432(SP), R12
+ MOVQ 440(SP), R13
+ CALL p256SubInternal(SB)
+ MOVQ 448(SP), R14
+ MOVQ 456(SP), R15
+ MOVQ 464(SP), DI
+ MOVQ 472(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 160(SP)
+ MOVQ R11, 168(SP)
+ MOVQ R12, 176(SP)
+ MOVQ R13, 184(SP)
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ MOVQ 320(SP), R10
+ MOVQ 328(SP), R11
+ MOVQ 336(SP), R12
+ MOVQ 344(SP), R13
+ CALL p256SubInternal(SB)
+ MOVQ 352(SP), R14
+ MOVQ 360(SP), R15
+ MOVQ 368(SP), DI
+ MOVQ 376(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ 256(SP), R14
+ MOVQ 264(SP), R15
+ MOVQ 272(SP), DI
+ MOVQ 280(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 192(SP)
+ MOVQ R11, 200(SP)
+ MOVQ R12, 208(SP)
+ MOVQ R13, 216(SP)
- LDt (s2)
- CALL p256SubInternal(SB)
- ST (yout)
// Load stored values from stack
- MOVQ rptr, AX
- MOVL sel_save, BX
- MOVL zero_save, CX
- // The result is not valid if (sel == 0), conditional choose
- MOVOU xout(16*0), X0
- MOVOU xout(16*1), X1
- MOVOU yout(16*0), X2
- MOVOU yout(16*1), X3
- MOVOU zout(16*0), X4
- MOVOU zout(16*1), X5
-
- MOVL BX, X6
- MOVL CX, X7
+ MOVQ 480(SP), AX
+ MOVL 488(SP), BX
+ MOVL 492(SP), CX
- PXOR X8, X8
+ // The result is not valid if (sel == 0), conditional choose
+ MOVOU 160(SP), X0
+ MOVOU 176(SP), X1
+ MOVOU 192(SP), X2
+ MOVOU 208(SP), X3
+ MOVOU 224(SP), X4
+ MOVOU 240(SP), X5
+ MOVL BX, X6
+ MOVL CX, X7
+ PXOR X8, X8
PCMPEQL X9, X9
-
- PSHUFD $0, X6, X6
- PSHUFD $0, X7, X7
-
+ PSHUFD $0x00, X6, X6
+ PSHUFD $0x00, X7, X7
PCMPEQL X8, X6
PCMPEQL X8, X7
+ MOVOU X6, X15
+ PANDN X9, X15
+ MOVOU (SP), X9
+ MOVOU 16(SP), X10
+ MOVOU 32(SP), X11
+ MOVOU 48(SP), X12
+ MOVOU 64(SP), X13
+ MOVOU 80(SP), X14
+ PAND X15, X0
+ PAND X15, X1
+ PAND X15, X2
+ PAND X15, X3
+ PAND X15, X4
+ PAND X15, X5
+ PAND X6, X9
+ PAND X6, X10
+ PAND X6, X11
+ PAND X6, X12
+ PAND X6, X13
+ PAND X6, X14
+ PXOR X9, X0
+ PXOR X10, X1
+ PXOR X11, X2
+ PXOR X12, X3
+ PXOR X13, X4
+ PXOR X14, X5
- MOVOU X6, X15
- PANDN X9, X15
-
- MOVOU x1in(16*0), X9
- MOVOU x1in(16*1), X10
- MOVOU y1in(16*0), X11
- MOVOU y1in(16*1), X12
- MOVOU z1in(16*0), X13
- MOVOU z1in(16*1), X14
-
- PAND X15, X0
- PAND X15, X1
- PAND X15, X2
- PAND X15, X3
- PAND X15, X4
- PAND X15, X5
-
- PAND X6, X9
- PAND X6, X10
- PAND X6, X11
- PAND X6, X12
- PAND X6, X13
- PAND X6, X14
-
- PXOR X9, X0
- PXOR X10, X1
- PXOR X11, X2
- PXOR X12, X3
- PXOR X13, X4
- PXOR X14, X5
// Similarly if zero == 0
PCMPEQL X9, X9
- MOVOU X7, X15
- PANDN X9, X15
-
- MOVOU x2in(16*0), X9
- MOVOU x2in(16*1), X10
- MOVOU y2in(16*0), X11
- MOVOU y2in(16*1), X12
- MOVOU p256one<>+0x00(SB), X13
- MOVOU p256one<>+0x10(SB), X14
-
- PAND X15, X0
- PAND X15, X1
- PAND X15, X2
- PAND X15, X3
- PAND X15, X4
- PAND X15, X5
-
- PAND X7, X9
- PAND X7, X10
- PAND X7, X11
- PAND X7, X12
- PAND X7, X13
- PAND X7, X14
-
- PXOR X9, X0
- PXOR X10, X1
- PXOR X11, X2
- PXOR X12, X3
- PXOR X13, X4
- PXOR X14, X5
- // Finally output the result
- MOVOU X0, (16*0)(AX)
- MOVOU X1, (16*1)(AX)
- MOVOU X2, (16*2)(AX)
- MOVOU X3, (16*3)(AX)
- MOVOU X4, (16*4)(AX)
- MOVOU X5, (16*5)(AX)
- MOVQ $0, rptr
+ MOVOU X7, X15
+ PANDN X9, X15
+ MOVOU 96(SP), X9
+ MOVOU 112(SP), X10
+ MOVOU 128(SP), X11
+ MOVOU 144(SP), X12
+ MOVOU p256one<>+0(SB), X13
+ MOVOU p256one<>+16(SB), X14
+ PAND X15, X0
+ PAND X15, X1
+ PAND X15, X2
+ PAND X15, X3
+ PAND X15, X4
+ PAND X15, X5
+ PAND X7, X9
+ PAND X7, X10
+ PAND X7, X11
+ PAND X7, X12
+ PAND X7, X13
+ PAND X7, X14
+ PXOR X9, X0
+ PXOR X10, X1
+ PXOR X11, X2
+ PXOR X12, X3
+ PXOR X13, X4
+ PXOR X14, X5
+ // Finally output the result
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, 32(AX)
+ MOVOU X3, 48(AX)
+ MOVOU X4, 64(AX)
+ MOVOU X5, 80(AX)
+ MOVQ $0x00000000, 480(SP)
RET
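
The epilogue implements the two conditional selections with SSE mask arithmetic rather than branches: PCMPEQL against a zeroed register turns the 32-bit sel (and then zero) flag into an all-ones or all-zero mask, PANDN produces its complement, and PAND/PXOR blend the candidates. With sel == 0 the stored result is replaced by in1; with zero == 0 it is replaced by (x2in, y2in, p256one), the affine input with Z = 1. Per 64-bit lane the blend is equivalent to this small model (hypothetical helper, shown only to make the mask logic explicit):

	// blend models one lane of the PAND/PANDN/PXOR selection: mask is all ones
	// when the 32-bit flag compared equal to zero.
	func blend(mask, ifNonZero, ifZero uint64) uint64 {
		return (ifNonZero &^ mask) | (ifZero & mask)
	}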
-#undef x1in
-#undef y1in
-#undef z1in
-#undef x2in
-#undef y2in
-#undef xout
-#undef yout
-#undef zout
-#undef s2
-#undef z1sqr
-#undef h
-#undef r
-#undef hsqr
-#undef rsqr
-#undef hcub
-#undef rptr
-#undef sel_save
-#undef zero_save
-
-// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
-// otherwise. It writes to [acc4..acc7], t0 and t1.
-TEXT p256IsZero(SB),NOSPLIT,$0
+
+DATA p256one<>+0(SB)/8, $0x0000000000000001
+DATA p256one<>+8(SB)/8, $0xffffffff00000000
+DATA p256one<>+16(SB)/8, $0xffffffffffffffff
+DATA p256one<>+24(SB)/8, $0x00000000fffffffe
+GLOBL p256one<>(SB), RODATA, $32
+
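The p256one<> table, the Montgomery-domain representation of 1 that the selection epilogue above loads for the Z coordinate, is emitted by the generator next to the code that uses it. With avo, a read-only table like this is normally declared through the GLOBL/DATA builders, roughly as follows (a sketch, not the generator's exact code):

	GLOBL("p256one", RODATA) // avo emits this as the static symbol p256one<>(SB)
	DATA(0, U64(0x0000000000000001))
	DATA(8, U64(0xffffffff00000000))
	DATA(16, U64(0xffffffffffffffff))
	DATA(24, U64(0x00000000fffffffe))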
+// func p256IsZero()
+// Requires: CMOV
+TEXT p256IsZero(SB), NOSPLIT, $0
// AX contains a flag that is set if the input is zero.
XORQ AX, AX
- MOVQ $1, t1
+ MOVQ $0x00000001, R15
// Check whether [acc4..acc7] are all zero.
- MOVQ acc4, t0
- ORQ acc5, t0
- ORQ acc6, t0
- ORQ acc7, t0
+ MOVQ R10, R14
+ ORQ R11, R14
+ ORQ R12, R14
+ ORQ R13, R14
// Set the zero flag if so. (CMOV of a constant to a register doesn't
// appear to be supported in Go. Thus t1 = 1.)
- CMOVQEQ t1, AX
+ CMOVQEQ R15, AX
// XOR [acc4..acc7] with P and compare with zero again.
- XORQ $-1, acc4
- XORQ p256const0<>(SB), acc5
- XORQ p256const1<>(SB), acc7
- ORQ acc5, acc4
- ORQ acc6, acc4
- ORQ acc7, acc4
+ XORQ $-1, R10
+ XORQ p256const0<>+0(SB), R11
+ XORQ p256const1<>+0(SB), R13
+ ORQ R11, R10
+ ORQ R12, R10
+ ORQ R13, R10
// Set the zero flag if so.
- CMOVQEQ t1, AX
+ CMOVQEQ R15, AX
RET
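
p256IsZero keeps its old contract: AX is set to 1 when the four limbs in the old acc4..acc7 registers, now R10..R13, hold either zero or p (both represent zero modulo p), and the routine clobbers R10..R13, R14 and R15. The first half of the body could be produced by avo calls along these lines (a sketch using the physical registers printed above):

	XORQ(RAX, RAX)
	MOVQ(Imm(1), R15)
	MOVQ(R10, R14)
	ORQ(R11, R14)
	ORQ(R12, R14)
	ORQ(R13, R14)
	CMOVQEQ(R15, RAX) // CMOV cannot take an immediate, so the constant 1 is staged in R15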
-/* ---------------------------------------*/
-#define x1in(off) (32*0 + off)(SP)
-#define y1in(off) (32*1 + off)(SP)
-#define z1in(off) (32*2 + off)(SP)
-#define x2in(off) (32*3 + off)(SP)
-#define y2in(off) (32*4 + off)(SP)
-#define z2in(off) (32*5 + off)(SP)
-
-#define xout(off) (32*6 + off)(SP)
-#define yout(off) (32*7 + off)(SP)
-#define zout(off) (32*8 + off)(SP)
-
-#define u1(off) (32*9 + off)(SP)
-#define u2(off) (32*10 + off)(SP)
-#define s1(off) (32*11 + off)(SP)
-#define s2(off) (32*12 + off)(SP)
-#define z1sqr(off) (32*13 + off)(SP)
-#define z2sqr(off) (32*14 + off)(SP)
-#define h(off) (32*15 + off)(SP)
-#define r(off) (32*16 + off)(SP)
-#define hsqr(off) (32*17 + off)(SP)
-#define rsqr(off) (32*18 + off)(SP)
-#define hcub(off) (32*19 + off)(SP)
-#define rptr (32*20)(SP)
-#define points_eq (32*20+8)(SP)
-
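
The deleted macros map to these offsets in the generated body: x1in/y1in/z1in at 0/32/64(SP), x2in/y2in/z2in at 96/128/160(SP), xout/yout/zout at 192/224/256(SP), u1/u2/s1/s2 at 288/320/352/384(SP), z1sqr/z2sqr at 416/448(SP), h at 480(SP), r at 512(SP), hsqr/rsqr/hcub at 544/576/608(SP), rptr at 640(SP) and points_eq at 648(SP). For readability, here is a plain math/big model of the add-2007-bl sequence the routine performs; it is only a sketch with made-up helper names, whereas the real code works on 4x64-bit limbs in Montgomery form, runs in constant time, and also reports via points_eq whether the inputs were equal so the caller can fall back to doubling:

	package p256sketch

	import "math/big"

	// p is the P-256 prime.
	var p, _ = new(big.Int).SetString(
		"ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)

	func mul(a, b *big.Int) *big.Int { return new(big.Int).Mod(new(big.Int).Mul(a, b), p) }
	func sub(a, b *big.Int) *big.Int { return new(big.Int).Mod(new(big.Int).Sub(a, b), p) }
	func dbl(a *big.Int) *big.Int    { return new(big.Int).Mod(new(big.Int).Lsh(a, 1), p) }

	// addJacobian follows the same order of squarings, multiplications and
	// subtractions as the assembly below.
	func addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (x3, y3, z3 *big.Int) {
		z2sqr := mul(z2, z2)          // z2^2
		s1 := mul(mul(z2sqr, z2), y1) // s1 = y1 * z2^3
		z1sqr := mul(z1, z1)          // z1^2
		s2 := mul(mul(z1sqr, z1), y2) // s2 = y2 * z1^3
		r := sub(s2, s1)              // r = s2 - s1
		u1 := mul(x1, z2sqr)          // u1 = x1 * z2^2
		u2 := mul(x2, z1sqr)          // u2 = x2 * z1^2
		h := sub(u2, u1)              // h = u2 - u1
		rsqr := mul(r, r)
		hsqr := mul(h, h)
		hcub := mul(hsqr, h)
		s1hcub := mul(s1, hcub)  // s1 * h^3
		z3 = mul(mul(z1, z2), h) // z3 = z1 * z2 * h
		u1hsqr := mul(u1, hsqr)  // u1 * h^2
		x3 = sub(sub(rsqr, dbl(u1hsqr)), hcub)
		y3 = sub(mul(sub(u1hsqr, x3), r), s1hcub)
		return
	}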
-//func p256PointAddAsm(res, in1, in2 *P256Point) int
-TEXT ·p256PointAddAsm(SB),0,$680-32
- // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
+// Requires: CMOV, SSE2
+TEXT ·p256PointAddAsm(SB), $680-32
// Move input to stack in order to free registers
- MOVQ res+0(FP), AX
- MOVQ in1+8(FP), BX
- MOVQ in2+16(FP), CX
+ MOVQ res+0(FP), AX
+ MOVQ in1+8(FP), BX
+ MOVQ in2+16(FP), CX
+ MOVOU (BX), X0
+ MOVOU 16(BX), X1
+ MOVOU 32(BX), X2
+ MOVOU 48(BX), X3
+ MOVOU 64(BX), X4
+ MOVOU 80(BX), X5
+ MOVOU X0, (SP)
+ MOVOU X1, 16(SP)
+ MOVOU X2, 32(SP)
+ MOVOU X3, 48(SP)
+ MOVOU X4, 64(SP)
+ MOVOU X5, 80(SP)
+ MOVOU (CX), X0
+ MOVOU 16(CX), X1
+ MOVOU 32(CX), X2
+ MOVOU 48(CX), X3
+ MOVOU 64(CX), X4
+ MOVOU 80(CX), X5
+ MOVOU X0, 96(SP)
+ MOVOU X1, 112(SP)
+ MOVOU X2, 128(SP)
+ MOVOU X3, 144(SP)
+ MOVOU X4, 160(SP)
+ MOVOU X5, 176(SP)
- MOVOU (16*0)(BX), X0
- MOVOU (16*1)(BX), X1
- MOVOU (16*2)(BX), X2
- MOVOU (16*3)(BX), X3
- MOVOU (16*4)(BX), X4
- MOVOU (16*5)(BX), X5
-
- MOVOU X0, x1in(16*0)
- MOVOU X1, x1in(16*1)
- MOVOU X2, y1in(16*0)
- MOVOU X3, y1in(16*1)
- MOVOU X4, z1in(16*0)
- MOVOU X5, z1in(16*1)
-
- MOVOU (16*0)(CX), X0
- MOVOU (16*1)(CX), X1
- MOVOU (16*2)(CX), X2
- MOVOU (16*3)(CX), X3
- MOVOU (16*4)(CX), X4
- MOVOU (16*5)(CX), X5
-
- MOVOU X0, x2in(16*0)
- MOVOU X1, x2in(16*1)
- MOVOU X2, y2in(16*0)
- MOVOU X3, y2in(16*1)
- MOVOU X4, z2in(16*0)
- MOVOU X5, z2in(16*1)
// Store pointer to result
- MOVQ AX, rptr
- // Begin point add
- LDacc (z2in)
- CALL p256SqrInternal(SB) // z2ˆ2
- ST (z2sqr)
- LDt (z2in)
- CALL p256MulInternal(SB) // z2ˆ3
- LDt (y1in)
- CALL p256MulInternal(SB) // s1 = z2ˆ3*y1
- ST (s1)
-
- LDacc (z1in)
- CALL p256SqrInternal(SB) // z1ˆ2
- ST (z1sqr)
- LDt (z1in)
- CALL p256MulInternal(SB) // z1ˆ3
- LDt (y2in)
- CALL p256MulInternal(SB) // s2 = z1ˆ3*y2
- ST (s2)
-
- LDt (s1)
- CALL p256SubInternal(SB) // r = s2 - s1
- ST (r)
- CALL p256IsZero(SB)
- MOVQ AX, points_eq
-
- LDacc (z2sqr)
- LDt (x1in)
- CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2
- ST (u1)
- LDacc (z1sqr)
- LDt (x2in)
- CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2
- ST (u2)
-
- LDt (u1)
- CALL p256SubInternal(SB) // h = u2 - u1
- ST (h)
- CALL p256IsZero(SB)
- ANDQ points_eq, AX
- MOVQ AX, points_eq
-
- LDacc (r)
- CALL p256SqrInternal(SB) // rsqr = rˆ2
- ST (rsqr)
-
- LDacc (h)
- CALL p256SqrInternal(SB) // hsqr = hˆ2
- ST (hsqr)
-
- LDt (h)
- CALL p256MulInternal(SB) // hcub = hˆ3
- ST (hcub)
-
- LDt (s1)
- CALL p256MulInternal(SB)
- ST (s2)
-
- LDacc (z1in)
- LDt (z2in)
- CALL p256MulInternal(SB) // z1 * z2
- LDt (h)
- CALL p256MulInternal(SB) // z1 * z2 * h
- ST (zout)
-
- LDacc (hsqr)
- LDt (u1)
- CALL p256MulInternal(SB) // hˆ2 * u1
- ST (u2)
-
- p256MulBy2Inline // u1 * hˆ2 * 2, inline
- LDacc (rsqr)
- CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
+ MOVQ AX, 640(SP)
- LDt (hcub)
- CALL p256SubInternal(SB)
- ST (xout)
-
- MOVQ acc4, t0
- MOVQ acc5, t1
- MOVQ acc6, t2
- MOVQ acc7, t3
- LDacc (u2)
- CALL p256SubInternal(SB)
-
- LDt (r)
- CALL p256MulInternal(SB)
+ // Begin point add
+ MOVQ 160(SP), R10
+ MOVQ 168(SP), R11
+ MOVQ 176(SP), R12
+ MOVQ 184(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 448(SP)
+ MOVQ R11, 456(SP)
+ MOVQ R12, 464(SP)
+ MOVQ R13, 472(SP)
+ MOVQ 160(SP), R14
+ MOVQ 168(SP), R15
+ MOVQ 176(SP), DI
+ MOVQ 184(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ 32(SP), R14
+ MOVQ 40(SP), R15
+ MOVQ 48(SP), DI
+ MOVQ 56(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 352(SP)
+ MOVQ R11, 360(SP)
+ MOVQ R12, 368(SP)
+ MOVQ R13, 376(SP)
+ MOVQ 64(SP), R10
+ MOVQ 72(SP), R11
+ MOVQ 80(SP), R12
+ MOVQ 88(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 416(SP)
+ MOVQ R11, 424(SP)
+ MOVQ R12, 432(SP)
+ MOVQ R13, 440(SP)
+ MOVQ 64(SP), R14
+ MOVQ 72(SP), R15
+ MOVQ 80(SP), DI
+ MOVQ 88(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ 128(SP), R14
+ MOVQ 136(SP), R15
+ MOVQ 144(SP), DI
+ MOVQ 152(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 384(SP)
+ MOVQ R11, 392(SP)
+ MOVQ R12, 400(SP)
+ MOVQ R13, 408(SP)
+ MOVQ 352(SP), R14
+ MOVQ 360(SP), R15
+ MOVQ 368(SP), DI
+ MOVQ 376(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 512(SP)
+ MOVQ R11, 520(SP)
+ MOVQ R12, 528(SP)
+ MOVQ R13, 536(SP)
+ CALL p256IsZero(SB)
+ MOVQ AX, 648(SP)
+ MOVQ 448(SP), R10
+ MOVQ 456(SP), R11
+ MOVQ 464(SP), R12
+ MOVQ 472(SP), R13
+ MOVQ (SP), R14
+ MOVQ 8(SP), R15
+ MOVQ 16(SP), DI
+ MOVQ 24(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 288(SP)
+ MOVQ R11, 296(SP)
+ MOVQ R12, 304(SP)
+ MOVQ R13, 312(SP)
+ MOVQ 416(SP), R10
+ MOVQ 424(SP), R11
+ MOVQ 432(SP), R12
+ MOVQ 440(SP), R13
+ MOVQ 96(SP), R14
+ MOVQ 104(SP), R15
+ MOVQ 112(SP), DI
+ MOVQ 120(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 320(SP)
+ MOVQ R11, 328(SP)
+ MOVQ R12, 336(SP)
+ MOVQ R13, 344(SP)
+ MOVQ 288(SP), R14
+ MOVQ 296(SP), R15
+ MOVQ 304(SP), DI
+ MOVQ 312(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 480(SP)
+ MOVQ R11, 488(SP)
+ MOVQ R12, 496(SP)
+ MOVQ R13, 504(SP)
+ CALL p256IsZero(SB)
+ ANDQ 648(SP), AX
+ MOVQ AX, 648(SP)
+ MOVQ 512(SP), R10
+ MOVQ 520(SP), R11
+ MOVQ 528(SP), R12
+ MOVQ 536(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 576(SP)
+ MOVQ R11, 584(SP)
+ MOVQ R12, 592(SP)
+ MOVQ R13, 600(SP)
+ MOVQ 480(SP), R10
+ MOVQ 488(SP), R11
+ MOVQ 496(SP), R12
+ MOVQ 504(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 544(SP)
+ MOVQ R11, 552(SP)
+ MOVQ R12, 560(SP)
+ MOVQ R13, 568(SP)
+ MOVQ 480(SP), R14
+ MOVQ 488(SP), R15
+ MOVQ 496(SP), DI
+ MOVQ 504(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 608(SP)
+ MOVQ R11, 616(SP)
+ MOVQ R12, 624(SP)
+ MOVQ R13, 632(SP)
+ MOVQ 352(SP), R14
+ MOVQ 360(SP), R15
+ MOVQ 368(SP), DI
+ MOVQ 376(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 384(SP)
+ MOVQ R11, 392(SP)
+ MOVQ R12, 400(SP)
+ MOVQ R13, 408(SP)
+ MOVQ 64(SP), R10
+ MOVQ 72(SP), R11
+ MOVQ 80(SP), R12
+ MOVQ 88(SP), R13
+ MOVQ 160(SP), R14
+ MOVQ 168(SP), R15
+ MOVQ 176(SP), DI
+ MOVQ 184(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ 480(SP), R14
+ MOVQ 488(SP), R15
+ MOVQ 496(SP), DI
+ MOVQ 504(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 256(SP)
+ MOVQ R11, 264(SP)
+ MOVQ R12, 272(SP)
+ MOVQ R13, 280(SP)
+ MOVQ 544(SP), R10
+ MOVQ 552(SP), R11
+ MOVQ 560(SP), R12
+ MOVQ 568(SP), R13
+ MOVQ 288(SP), R14
+ MOVQ 296(SP), R15
+ MOVQ 304(SP), DI
+ MOVQ 312(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 320(SP)
+ MOVQ R11, 328(SP)
+ MOVQ R12, 336(SP)
+ MOVQ R13, 344(SP)
+ XORQ AX, AX
+ ADDQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ 576(SP), R10
+ MOVQ 584(SP), R11
+ MOVQ 592(SP), R12
+ MOVQ 600(SP), R13
+ CALL p256SubInternal(SB)
+ MOVQ 608(SP), R14
+ MOVQ 616(SP), R15
+ MOVQ 624(SP), DI
+ MOVQ 632(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 192(SP)
+ MOVQ R11, 200(SP)
+ MOVQ R12, 208(SP)
+ MOVQ R13, 216(SP)
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ MOVQ 320(SP), R10
+ MOVQ 328(SP), R11
+ MOVQ 336(SP), R12
+ MOVQ 344(SP), R13
+ CALL p256SubInternal(SB)
+ MOVQ 512(SP), R14
+ MOVQ 520(SP), R15
+ MOVQ 528(SP), DI
+ MOVQ 536(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ 384(SP), R14
+ MOVQ 392(SP), R15
+ MOVQ 400(SP), DI
+ MOVQ 408(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ R10, 224(SP)
+ MOVQ R11, 232(SP)
+ MOVQ R12, 240(SP)
+ MOVQ R13, 248(SP)
+ MOVOU 192(SP), X0
+ MOVOU 208(SP), X1
+ MOVOU 224(SP), X2
+ MOVOU 240(SP), X3
+ MOVOU 256(SP), X4
+ MOVOU 272(SP), X5
- LDt (s2)
- CALL p256SubInternal(SB)
- ST (yout)
-
- MOVOU xout(16*0), X0
- MOVOU xout(16*1), X1
- MOVOU yout(16*0), X2
- MOVOU yout(16*1), X3
- MOVOU zout(16*0), X4
- MOVOU zout(16*1), X5
// Finally output the result
- MOVQ rptr, AX
- MOVQ $0, rptr
- MOVOU X0, (16*0)(AX)
- MOVOU X1, (16*1)(AX)
- MOVOU X2, (16*2)(AX)
- MOVOU X3, (16*3)(AX)
- MOVOU X4, (16*4)(AX)
- MOVOU X5, (16*5)(AX)
-
- MOVQ points_eq, AX
- MOVQ AX, ret+24(FP)
-
+ MOVQ 640(SP), AX
+ MOVQ $0x00000000, 640(SP)
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, 32(AX)
+ MOVOU X3, 48(AX)
+ MOVOU X4, 64(AX)
+ MOVOU X5, 80(AX)
+ MOVQ 648(SP), AX
+ MOVQ AX, ret+24(FP)
RET
-#undef x1in
-#undef y1in
-#undef z1in
-#undef x2in
-#undef y2in
-#undef z2in
-#undef xout
-#undef yout
-#undef zout
-#undef s1
-#undef s2
-#undef u1
-#undef u2
-#undef z1sqr
-#undef z2sqr
-#undef h
-#undef r
-#undef hsqr
-#undef rsqr
-#undef hcub
-#undef rptr
-/* ---------------------------------------*/
-#define x(off) (32*0 + off)(SP)
-#define y(off) (32*1 + off)(SP)
-#define z(off) (32*2 + off)(SP)
-
-#define s(off) (32*3 + off)(SP)
-#define m(off) (32*4 + off)(SP)
-#define zsqr(off) (32*5 + off)(SP)
-#define tmp(off) (32*6 + off)(SP)
-#define rptr (32*7)(SP)
-
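
The slots above land at x=0(SP), y=32(SP), z=64(SP), s=96(SP), m=128(SP), zsqr=160(SP), tmp=192(SP) and rptr=224(SP). The body computes the usual Jacobian doubling: zsqr = z^2, m = 3*(x - z^2)*(x + z^2), z3 = 2*y*z, s = 4*x*y^2, x3 = m^2 - 2*s, y3 = m*(s - x3) - 8*y^4. The "Divide by 2" block further down is a modular halving: when the value is odd it first adds p (even result, unchanged mod p) and then shifts the four limbs right by one, feeding the carry from the addition back in as the top bit. As a plain model, reusing p from the earlier math/big sketch and again not constant time:

	// halve returns a/2 mod p, as the "Divide by 2" block does.
	func halve(a *big.Int) *big.Int {
		if a.Bit(0) == 1 {
			a = new(big.Int).Add(a, p)
		}
		return new(big.Int).Rsh(a, 1)
	}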
-//func p256PointDoubleAsm(res, in *P256Point)
-TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
- // Move input to stack in order to free registers
- MOVQ res+0(FP), AX
- MOVQ in+8(FP), BX
-
- MOVOU (16*0)(BX), X0
- MOVOU (16*1)(BX), X1
- MOVOU (16*2)(BX), X2
- MOVOU (16*3)(BX), X3
- MOVOU (16*4)(BX), X4
- MOVOU (16*5)(BX), X5
-
- MOVOU X0, x(16*0)
- MOVOU X1, x(16*1)
- MOVOU X2, y(16*0)
- MOVOU X3, y(16*1)
- MOVOU X4, z(16*0)
- MOVOU X5, z(16*1)
+
+// func p256PointDoubleAsm(res *P256Point, in *P256Point)
+// Requires: CMOV, SSE2
+TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
+ MOVQ res+0(FP), AX
+ MOVQ in+8(FP), BX
+ MOVOU (BX), X0
+ MOVOU 16(BX), X1
+ MOVOU 32(BX), X2
+ MOVOU 48(BX), X3
+ MOVOU 64(BX), X4
+ MOVOU 80(BX), X5
+ MOVOU X0, (SP)
+ MOVOU X1, 16(SP)
+ MOVOU X2, 32(SP)
+ MOVOU X3, 48(SP)
+ MOVOU X4, 64(SP)
+ MOVOU X5, 80(SP)
+
// Store pointer to result
- MOVQ AX, rptr
- // Begin point double
- LDacc (z)
- CALL p256SqrInternal(SB)
- ST (zsqr)
+ MOVQ AX, 224(SP)
- LDt (x)
- p256AddInline
- STt (m)
+ // Begin point double
+ MOVQ 64(SP), R10
+ MOVQ 72(SP), R11
+ MOVQ 80(SP), R12
+ MOVQ 88(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 160(SP)
+ MOVQ R11, 168(SP)
+ MOVQ R12, 176(SP)
+ MOVQ R13, 184(SP)
+ MOVQ (SP), R14
+ MOVQ 8(SP), R15
+ MOVQ 16(SP), DI
+ MOVQ 24(SP), SI
+ XORQ AX, AX
+ ADDQ R14, R10
+ ADCQ R15, R11
+ ADCQ DI, R12
+ ADCQ SI, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ R14, 128(SP)
+ MOVQ R15, 136(SP)
+ MOVQ DI, 144(SP)
+ MOVQ SI, 152(SP)
+ MOVQ 64(SP), R10
+ MOVQ 72(SP), R11
+ MOVQ 80(SP), R12
+ MOVQ 88(SP), R13
+ MOVQ 32(SP), R14
+ MOVQ 40(SP), R15
+ MOVQ 48(SP), DI
+ MOVQ 56(SP), SI
+ CALL p256MulInternal(SB)
+ XORQ AX, AX
+ ADDQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ 224(SP), AX
- LDacc (z)
- LDt (y)
- CALL p256MulInternal(SB)
- p256MulBy2Inline
- MOVQ rptr, AX
// Store z
- MOVQ t0, (16*4 + 8*0)(AX)
- MOVQ t1, (16*4 + 8*1)(AX)
- MOVQ t2, (16*4 + 8*2)(AX)
- MOVQ t3, (16*4 + 8*3)(AX)
-
- LDacc (x)
- LDt (zsqr)
+ MOVQ R14, 64(AX)
+ MOVQ R15, 72(AX)
+ MOVQ DI, 80(AX)
+ MOVQ SI, 88(AX)
+ MOVQ (SP), R10
+ MOVQ 8(SP), R11
+ MOVQ 16(SP), R12
+ MOVQ 24(SP), R13
+ MOVQ 160(SP), R14
+ MOVQ 168(SP), R15
+ MOVQ 176(SP), DI
+ MOVQ 184(SP), SI
CALL p256SubInternal(SB)
- LDt (m)
+ MOVQ 128(SP), R14
+ MOVQ 136(SP), R15
+ MOVQ 144(SP), DI
+ MOVQ 152(SP), SI
CALL p256MulInternal(SB)
- ST (m)
+ MOVQ R10, 128(SP)
+ MOVQ R11, 136(SP)
+ MOVQ R12, 144(SP)
+ MOVQ R13, 152(SP)
+
// Multiply by 3
- p256MulBy2Inline
- LDacc (m)
- p256AddInline
- STt (m)
- ////////////////////////
- LDacc (y)
- p256MulBy2Inline
- t2acc
- CALL p256SqrInternal(SB)
- ST (s)
- CALL p256SqrInternal(SB)
- // Divide by 2
- XORQ mul0, mul0
- MOVQ acc4, t0
- MOVQ acc5, t1
- MOVQ acc6, t2
- MOVQ acc7, t3
-
- ADDQ $-1, acc4
- ADCQ p256const0<>(SB), acc5
- ADCQ $0, acc6
- ADCQ p256const1<>(SB), acc7
- ADCQ $0, mul0
- TESTQ $1, t0
-
- CMOVQEQ t0, acc4
- CMOVQEQ t1, acc5
- CMOVQEQ t2, acc6
- CMOVQEQ t3, acc7
- ANDQ t0, mul0
-
- SHRQ $1, acc5, acc4
- SHRQ $1, acc6, acc5
- SHRQ $1, acc7, acc6
- SHRQ $1, mul0, acc7
- ST (y)
- /////////////////////////
- LDacc (x)
- LDt (s)
- CALL p256MulInternal(SB)
- ST (s)
- p256MulBy2Inline
- STt (tmp)
+ XORQ AX, AX
+ ADDQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ 128(SP), R10
+ MOVQ 136(SP), R11
+ MOVQ 144(SP), R12
+ MOVQ 152(SP), R13
+ XORQ AX, AX
+ ADDQ R14, R10
+ ADCQ R15, R11
+ ADCQ DI, R12
+ ADCQ SI, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ R14, 128(SP)
+ MOVQ R15, 136(SP)
+ MOVQ DI, 144(SP)
+ MOVQ SI, 152(SP)
+
+ // ////////////////////////
+ MOVQ 32(SP), R10
+ MOVQ 40(SP), R11
+ MOVQ 48(SP), R12
+ MOVQ 56(SP), R13
+ XORQ AX, AX
+ ADDQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ R14, R10
+ MOVQ R15, R11
+ MOVQ DI, R12
+ MOVQ SI, R13
+ CALL p256SqrInternal(SB)
+ MOVQ R10, 96(SP)
+ MOVQ R11, 104(SP)
+ MOVQ R12, 112(SP)
+ MOVQ R13, 120(SP)
+ CALL p256SqrInternal(SB)
- LDacc (m)
- CALL p256SqrInternal(SB)
- LDt (tmp)
- CALL p256SubInternal(SB)
+ // Divide by 2
+ XORQ AX, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ ADDQ $-1, R10
+ ADCQ p256const0<>+0(SB), R11
+ ADCQ $0x00, R12
+ ADCQ p256const1<>+0(SB), R13
+ ADCQ $0x00, AX
+ TESTQ $0x00000001, R14
+ CMOVQEQ R14, R10
+ CMOVQEQ R15, R11
+ CMOVQEQ DI, R12
+ CMOVQEQ SI, R13
+ ANDQ R14, AX
+ SHRQ $0x01, R11, R10
+ SHRQ $0x01, R12, R11
+ SHRQ $0x01, R13, R12
+ SHRQ $0x01, AX, R13
+ MOVQ R10, 32(SP)
+ MOVQ R11, 40(SP)
+ MOVQ R12, 48(SP)
+ MOVQ R13, 56(SP)
+
+ // /////////////////////////
+ MOVQ (SP), R10
+ MOVQ 8(SP), R11
+ MOVQ 16(SP), R12
+ MOVQ 24(SP), R13
+ MOVQ 96(SP), R14
+ MOVQ 104(SP), R15
+ MOVQ 112(SP), DI
+ MOVQ 120(SP), SI
+ CALL p256MulInternal(SB)
+ MOVQ R10, 96(SP)
+ MOVQ R11, 104(SP)
+ MOVQ R12, 112(SP)
+ MOVQ R13, 120(SP)
+ XORQ AX, AX
+ ADDQ R10, R10
+ ADCQ R11, R11
+ ADCQ R12, R12
+ ADCQ R13, R13
+ ADCQ $+0, AX
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ SUBQ $-1, R14
+ SBBQ p256const0<>+0(SB), R15
+ SBBQ $+0, DI
+ SBBQ p256const1<>+0(SB), SI
+ SBBQ $+0, AX
+ CMOVQCS R10, R14
+ CMOVQCS R11, R15
+ CMOVQCS R12, DI
+ CMOVQCS R13, SI
+ MOVQ R14, 192(SP)
+ MOVQ R15, 200(SP)
+ MOVQ DI, 208(SP)
+ MOVQ SI, 216(SP)
+ MOVQ 128(SP), R10
+ MOVQ 136(SP), R11
+ MOVQ 144(SP), R12
+ MOVQ 152(SP), R13
+ CALL p256SqrInternal(SB)
+ MOVQ 192(SP), R14
+ MOVQ 200(SP), R15
+ MOVQ 208(SP), DI
+ MOVQ 216(SP), SI
+ CALL p256SubInternal(SB)
+ MOVQ 224(SP), AX
- MOVQ rptr, AX
// Store x
- MOVQ acc4, (16*0 + 8*0)(AX)
- MOVQ acc5, (16*0 + 8*1)(AX)
- MOVQ acc6, (16*0 + 8*2)(AX)
- MOVQ acc7, (16*0 + 8*3)(AX)
-
- acc2t
- LDacc (s)
+ MOVQ R10, (AX)
+ MOVQ R11, 8(AX)
+ MOVQ R12, 16(AX)
+ MOVQ R13, 24(AX)
+ MOVQ R10, R14
+ MOVQ R11, R15
+ MOVQ R12, DI
+ MOVQ R13, SI
+ MOVQ 96(SP), R10
+ MOVQ 104(SP), R11
+ MOVQ 112(SP), R12
+ MOVQ 120(SP), R13
CALL p256SubInternal(SB)
-
- LDt (m)
+ MOVQ 128(SP), R14
+ MOVQ 136(SP), R15
+ MOVQ 144(SP), DI
+ MOVQ 152(SP), SI
CALL p256MulInternal(SB)
-
- LDt (y)
+ MOVQ 32(SP), R14
+ MOVQ 40(SP), R15
+ MOVQ 48(SP), DI
+ MOVQ 56(SP), SI
CALL p256SubInternal(SB)
- MOVQ rptr, AX
+ MOVQ 224(SP), AX
+
// Store y
- MOVQ acc4, (16*2 + 8*0)(AX)
- MOVQ acc5, (16*2 + 8*1)(AX)
- MOVQ acc6, (16*2 + 8*2)(AX)
- MOVQ acc7, (16*2 + 8*3)(AX)
- ///////////////////////
- MOVQ $0, rptr
+ MOVQ R10, 32(AX)
+ MOVQ R11, 40(AX)
+ MOVQ R12, 48(AX)
+ MOVQ R13, 56(AX)
+ // ///////////////////////
+ MOVQ $0x00000000, 224(SP)
RET
-/* ---------------------------------------*/