From 7bacfc640fba4fb2e50bbcc16a4c15fe4bf5b870 Mon Sep 17 00:00:00 2001
From: Vlad Krasnov
Date: Fri, 17 Apr 2015 06:10:35 -0700
Subject: [PATCH] crypto/elliptic,crypto/ecdsa: P256 amd64 assembly

This is based on the implementation used in OpenSSL, from a submission
by Shay Gueron and myself. Besides using assembly, this implementation
employs several optimizations described in:

S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography
with 256-bit primes"

In addition, a new and improved modular inverse modulo N is implemented
here.

The performance measured on a Haswell-based MacBook Pro shows a 21X
speedup for the sign operation and a 9X speedup for verify. The
BaseMult operation is 30X faster (and the Diffie-Hellman/ECDSA key
generation routines that use it are sped up as well).

The code was adapted to Go with the help of Filippo Valsorda.

Updated the submission for faster verify/ecdh, fixed some asm syntax
and API problems, and added benchmarks.

Change-Id: I86a33636747d5c92f15e0c8344caa2e7e07e0028
Reviewed-on: https://go-review.googlesource.com/8968
Run-TryBot: Adam Langley
TryBot-Result: Gobot Gobot
Reviewed-by: Adam Langley
---
 src/crypto/ecdsa/ecdsa.go            |   39 +-
 src/crypto/ecdsa/ecdsa_test.go       |   35 +
 src/crypto/elliptic/elliptic_test.go |   12 +
 src/crypto/elliptic/p256.go          |    2 +
 src/crypto/elliptic/p256_amd64.go    |  552 +++++++
 src/crypto/elliptic/p256_asm_amd64.s | 2295 ++++++++++++++++++++++++++
 6 files changed, 2930 insertions(+), 5 deletions(-)
 create mode 100644 src/crypto/elliptic/p256_amd64.go
 create mode 100644 src/crypto/elliptic/p256_asm_amd64.s

diff --git a/src/crypto/ecdsa/ecdsa.go b/src/crypto/ecdsa/ecdsa.go
index 8d66477fd1..0731f2b670 100644
--- a/src/crypto/ecdsa/ecdsa.go
+++ b/src/crypto/ecdsa/ecdsa.go
@@ -27,6 +27,17 @@ import (
 	"math/big"
 )
 
+// An invertible implements fast inverse mod Curve.Params().N
+type invertible interface {
+	// Inverse returns the inverse of k mod Curve.Params().N
+	Inverse(k *big.Int) *big.Int
+}
+
+// combinedMult implements fast multiplication S1*g + S2*p (g - generator, p - arbitrary point)
+type combinedMult interface {
+	CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int)
+}
+
 const (
 	aesIV = "IV for ECDSA CTR"
 )
@@ -179,7 +190,12 @@ func Sign(rand io.Reader, priv *PrivateKey, hash []byte) (r, s *big.Int, err err
 			return
 		}
 
-		kInv = fermatInverse(k, N)
+		if in, ok := priv.Curve.(invertible); ok {
+			kInv = in.Inverse(k)
+		} else {
+			kInv = fermatInverse(k, N)
+		}
+
 		r, _ = priv.Curve.ScalarBaseMult(k.Bytes())
 		r.Mod(r, N)
 		if r.Sign() != 0 {
@@ -214,16 +230,29 @@ func Verify(pub *PublicKey, hash []byte, r, s *big.Int) bool {
 		return false
 	}
 	e := hashToInt(hash, c)
-	w := new(big.Int).ModInverse(s, N)
+
+	var w *big.Int
+	if in, ok := c.(invertible); ok {
+		w = in.Inverse(s)
+	} else {
+		w = new(big.Int).ModInverse(s, N)
+	}
 
 	u1 := e.Mul(e, w)
 	u1.Mod(u1, N)
 	u2 := w.Mul(r, w)
 	u2.Mod(u2, N)
 
-	x1, y1 := c.ScalarBaseMult(u1.Bytes())
-	x2, y2 := c.ScalarMult(pub.X, pub.Y, u2.Bytes())
-	x, y := c.Add(x1, y1, x2, y2)
+	// Check whether c implements the combinedMult interface (fast S1*g + S2*p).
+	var x, y *big.Int
+	if opt, ok := c.(combinedMult); ok {
+		x, y = opt.CombinedMult(pub.X, pub.Y, u1.Bytes(), u2.Bytes())
+	} else {
+		x1, y1 := c.ScalarBaseMult(u1.Bytes())
+		x2, y2 := c.ScalarMult(pub.X, pub.Y, u2.Bytes())
+		x, y = c.Add(x1, y1, x2, y2)
+	}
+
 	if x.Sign() == 0 && y.Sign() == 0 {
 		return false
 	}
diff --git a/src/crypto/ecdsa/ecdsa_test.go b/src/crypto/ecdsa/ecdsa_test.go
index 2bd31b850e..62a3fcc496 100644
--- a/src/crypto/ecdsa/ecdsa_test.go
+++ b/src/crypto/ecdsa/ecdsa_test.go
@@ -42,6 +42,41 @@ func TestKeyGeneration(t *testing.T) {
 	testKeyGeneration(t, elliptic.P521(), "p521")
 }
 
+func BenchmarkSignP256(b *testing.B) {
+	b.ResetTimer()
+	p256 := elliptic.P256()
+	hashed := []byte("testing")
+	priv, _ := GenerateKey(p256, rand.Reader)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_, _, _ = Sign(rand.Reader, priv, hashed)
+	}
+}
+
+func BenchmarkVerifyP256(b *testing.B) {
+	b.ResetTimer()
+	p256 := elliptic.P256()
+	hashed := []byte("testing")
+	priv, _ := GenerateKey(p256, rand.Reader)
+	r, s, _ := Sign(rand.Reader, priv, hashed)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		Verify(&priv.PublicKey, hashed, r, s)
+	}
+}
+
+func BenchmarkKeyGeneration(b *testing.B) {
+	b.ResetTimer()
+	p256 := elliptic.P256()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		GenerateKey(p256, rand.Reader)
+	}
+}
+
 func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) {
 	priv, _ := GenerateKey(c, rand.Reader)
diff --git a/src/crypto/elliptic/elliptic_test.go b/src/crypto/elliptic/elliptic_test.go
index 7e27913dcd..7f3f1a2118 100644
--- a/src/crypto/elliptic/elliptic_test.go
+++ b/src/crypto/elliptic/elliptic_test.go
@@ -441,6 +441,18 @@ func BenchmarkBaseMultP256(b *testing.B) {
 	}
 }
 
+func BenchmarkScalarMultP256(b *testing.B) {
+	b.ResetTimer()
+	p256 := P256()
+	_, x, y, _ := GenerateKey(p256, rand.Reader)
+	priv, _, _, _ := GenerateKey(p256, rand.Reader)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		p256.ScalarMult(x, y, priv)
+	}
+}
+
 func TestMarshal(t *testing.T) {
 	p224 := P224()
 	_, x, y, err := GenerateKey(p224, rand.Reader)
diff --git a/src/crypto/elliptic/p256.go b/src/crypto/elliptic/p256.go
index 82bc7b3019..5103e86de9 100644
--- a/src/crypto/elliptic/p256.go
+++ b/src/crypto/elliptic/p256.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !amd64
+
 package elliptic
 
 // This file contains a constant-time, 32-bit implementation of P256.
diff --git a/src/crypto/elliptic/p256_amd64.go b/src/crypto/elliptic/p256_amd64.go
new file mode 100644
index 0000000000..586cd10c4f
--- /dev/null
+++ b/src/crypto/elliptic/p256_amd64.go
@@ -0,0 +1,552 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains the Go wrapper for the constant-time, 64-bit assembly
+// implementation of P256.
+// The optimizations performed here are described in detail in:
+// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
+// 256-bit primes"
+// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
+// https://eprint.iacr.org/2013/816.pdf
+
+// +build amd64
+
+package elliptic
+
+import (
+	"math/big"
+	"sync"
+)
+
+type (
+	p256Curve struct {
+		*CurveParams
+	}
+
+	p256Point struct {
+		xyz [12]uint64
+	}
+)
+
+var (
+	p256            p256Curve
+	p256Precomputed *[37][64 * 8]uint64
+	precomputeOnce  sync.Once
+)
+
+func initP256() {
+	// See FIPS 186-3, section D.2.3
+	p256.CurveParams = &CurveParams{Name: "P-256"}
+	p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
+	p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
+	p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
+	p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
+	p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
+	p256.BitSize = 256
+}
+
+func (curve p256Curve) Params() *CurveParams {
+	return curve.CurveParams
+}
+
+// Functions implemented in p256_asm_amd64.s
+// Montgomery multiplication modulo P256
+func p256Mul(res, in1, in2 []uint64)
+
+// Montgomery square modulo P256
+func p256Sqr(res, in []uint64)
+
+// Montgomery multiplication by 1
+func p256FromMont(res, in []uint64)
+
+// iff cond == 1 val <- -val
+func p256NegCond(val []uint64, cond int)
+
+// if cond == 0 res <- b; else res <- a
+func p256MovCond(res, a, b []uint64, cond int)
+
+// Endianness swap
+func p256BigToLittle(res []uint64, in []byte)
+func p256LittleToBig(res []byte, in []uint64)
+
+// Constant time table access
+func p256Select(point, table []uint64, idx int)
+func p256SelectBase(point, table []uint64, idx int)
+
+// Montgomery multiplication modulo Ord(G)
+func p256OrdMul(res, in1, in2 []uint64)
+
+// Montgomery square modulo Ord(G), repeated n times
+func p256OrdSqr(res, in []uint64, n int)
+
+// Point add with in2 being affine point
+// If sign == 1 -> in2 = -in2
+// If sel == 0 -> res = in1
+// if zero == 0 -> res = in2
+func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
+
+// Point add
+func p256PointAddAsm(res, in1, in2 []uint64)
+
+// Point double
+func p256PointDoubleAsm(res, in []uint64)
+
+func (curve p256Curve) Inverse(k *big.Int) *big.Int {
+	if k.Cmp(p256.N) >= 0 {
+		// This should never happen.
+		reducedK := new(big.Int).Mod(k, p256.N)
+		k = reducedK
+	}
+
+	// table will store precomputed powers of x. The four words at index
+	// 4×i store x^(i+1).
+	var table [4 * 15]uint64
+
+	x := make([]uint64, 4)
+	fromBig(x[:], k)
+	// This code operates in the Montgomery domain where R = 2^256 mod n
+	// and n is the order of the scalar field. (See initP256 for the
+	// value.) Elements in the Montgomery domain take the form a×R, and
+	// multiplication of x and y in that domain calculates
+	// (x × y × R^-1) mod n. RR is R×R mod n, so Montgomery multiplication
+	// of x and RR gives x×R, i.e. converts x into the Montgomery domain.
+	RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
+	p256OrdMul(table[:4], x, RR)
+
+	// Prepare the table; there is no need for constant-time access since
+	// the power is not a secret. (No entry for a zero window is needed:
+	// expLo below contains no zero nibbles.)
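+	// Each iteration of the loop below fills two slots: slot i-1 receives
+	// x^i by squaring the entry that holds x^(i/2), and slot i receives
+	// x^(i+1) by one further multiplication with x. For example, i == 2
+	// computes x^2 = (x^1)^2 and then x^3 = x^2 × x.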
+	for i := 2; i < 16; i += 2 {
+		p256OrdSqr(table[4*(i-1):], table[4*((i/2)-1):], 1)
+		p256OrdMul(table[4*i:], table[4*(i-1):], table[:4])
+	}
+
+	x[0] = table[4*14+0] // f
+	x[1] = table[4*14+1]
+	x[2] = table[4*14+2]
+	x[3] = table[4*14+3]
+
+	p256OrdSqr(x, x, 4)
+	p256OrdMul(x, x, table[4*14:4*14+4]) // ff
+	t := make([]uint64, 4)
+	t[0] = x[0]
+	t[1] = x[1]
+	t[2] = x[2]
+	t[3] = x[3]
+
+	p256OrdSqr(x, x, 8)
+	p256OrdMul(x, x, t) // ffff
+	t[0] = x[0]
+	t[1] = x[1]
+	t[2] = x[2]
+	t[3] = x[3]
+
+	p256OrdSqr(x, x, 16)
+	p256OrdMul(x, x, t) // ffffffff
+	t[0] = x[0]
+	t[1] = x[1]
+	t[2] = x[2]
+	t[3] = x[3]
+
+	p256OrdSqr(x, x, 64) // ffffffff0000000000000000
+	p256OrdMul(x, x, t)  // ffffffff00000000ffffffff
+	p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000
+	p256OrdMul(x, x, t)  // ffffffff00000000ffffffffffffffff
+
+	// Remaining 32 windows
+	expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf}
+	for i := 0; i < 32; i++ {
+		p256OrdSqr(x, x, 4)
+		p256OrdMul(x, x, table[4*(expLo[i]-1):])
+	}
+
+	// Multiplying by one in the Montgomery domain converts a Montgomery
+	// value out of the domain.
+	one := []uint64{1, 0, 0, 0}
+	p256OrdMul(x, x, one)
+
+	xOut := make([]byte, 32)
+	p256LittleToBig(xOut, x)
+	return new(big.Int).SetBytes(xOut)
+}
+
+// fromBig converts a *big.Int into a format used by this code.
+func fromBig(out []uint64, big *big.Int) {
+	for i := range out {
+		out[i] = 0
+	}
+
+	for i, v := range big.Bits() {
+		out[i] = uint64(v)
+	}
+}
+
+// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
+// to out. If the scalar is equal to or greater than the order of the group,
+// it is reduced modulo that order.
+func p256GetScalar(out []uint64, in []byte) {
+	n := new(big.Int).SetBytes(in)
+
+	if n.Cmp(p256.N) >= 0 {
+		n.Mod(n, p256.N)
+	}
+	fromBig(out, n)
+}
+
+// p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
+// underlying field of the curve. (See initP256 for the value.) Thus rr here is
+// R×R mod p. See the comment in Inverse about how this is used.
+var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}
+
+func maybeReduceModP(in *big.Int) *big.Int {
+	if in.Cmp(p256.P) < 0 {
+		return in
+	}
+	return new(big.Int).Mod(in, p256.P)
+}
+
+func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	var r1, r2 p256Point
+	p256GetScalar(scalarReversed, baseScalar)
+	r1.p256BaseMult(scalarReversed)
+
+	p256GetScalar(scalarReversed, scalar)
+	fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
+	fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
+	p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
+	p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
+
+	// This sets r2's Z value to 1, in the Montgomery domain.
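+	// (One in the Montgomery domain is R mod p = 2^256 mod p, written
+	// least-significant word first. As a sanity check in big.Int terms,
+	// assuming initP256 has run, the four words below are the limbs of:
+	//
+	//	one := new(big.Int).Lsh(big.NewInt(1), 256)
+	//	one.Mod(one, p256.P)
+	//
+	// This is an illustrative sketch, not code from this package.)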
+	r2.xyz[8] = 0x0000000000000001
+	r2.xyz[9] = 0xffffffff00000000
+	r2.xyz[10] = 0xffffffffffffffff
+	r2.xyz[11] = 0x00000000fffffffe
+
+	r2.p256ScalarMult(scalarReversed)
+	p256PointAddAsm(r1.xyz[:], r1.xyz[:], r2.xyz[:])
+	return r1.p256PointToAffine()
+}
+
+func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	p256GetScalar(scalarReversed, scalar)
+
+	var r p256Point
+	r.p256BaseMult(scalarReversed)
+	return r.p256PointToAffine()
+}
+
+func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	p256GetScalar(scalarReversed, scalar)
+
+	var r p256Point
+	fromBig(r.xyz[0:4], maybeReduceModP(bigX))
+	fromBig(r.xyz[4:8], maybeReduceModP(bigY))
+	p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
+	p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
+	// This sets r's Z value to 1, in the Montgomery domain.
+	r.xyz[8] = 0x0000000000000001
+	r.xyz[9] = 0xffffffff00000000
+	r.xyz[10] = 0xffffffffffffffff
+	r.xyz[11] = 0x00000000fffffffe
+
+	r.p256ScalarMult(scalarReversed)
+	return r.p256PointToAffine()
+}
+
+func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	p256Inverse(zInv, p.xyz[8:12])
+	p256Sqr(zInvSq, zInv)
+	p256Mul(zInv, zInv, zInvSq)
+
+	p256Mul(zInvSq, p.xyz[0:4], zInvSq)
+	p256Mul(zInv, p.xyz[4:8], zInv)
+
+	p256FromMont(zInvSq, zInvSq)
+	p256FromMont(zInv, zInv)
+
+	xOut := make([]byte, 32)
+	yOut := make([]byte, 32)
+	p256LittleToBig(xOut, zInvSq)
+	p256LittleToBig(yOut, zInv)
+
+	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
+}
+
+// p256Inverse sets out to in^-1 mod p.
+func p256Inverse(out, in []uint64) {
+	var stack [6 * 4]uint64
+	p2 := stack[4*0 : 4*0+4]
+	p4 := stack[4*1 : 4*1+4]
+	p8 := stack[4*2 : 4*2+4]
+	p16 := stack[4*3 : 4*3+4]
+	p32 := stack[4*4 : 4*4+4]
+
+	p256Sqr(out, in)
+	p256Mul(p2, out, in) // 3*p
+
+	p256Sqr(out, p2)
+	p256Sqr(out, out)
+	p256Mul(p4, out, p2) // f*p
+
+	p256Sqr(out, p4)
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Mul(p8, out, p4) // ff*p
+
+	p256Sqr(out, p8)
+	for i := 0; i < 7; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(p16, out, p8) // ffff*p
+
+	p256Sqr(out, p16)
+	for i := 0; i < 15; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(p32, out, p16) // ffffffff*p
+
+	p256Sqr(out, p32)
+	for i := 0; i < 31; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(out, out, in)
+
+	for i := 0; i < 32*4; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(out, out, p32)
+
+	for i := 0; i < 32; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(out, out, p32)
+
+	for i := 0; i < 16; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(out, out, p16)
+
+	for i := 0; i < 8; i++ {
+		p256Sqr(out, out)
+	}
+	p256Mul(out, out, p8)
+
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Mul(out, out, p4)
+
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Mul(out, out, p2)
+
+	p256Sqr(out, out)
+	p256Sqr(out, out)
+	p256Mul(out, out, in)
+}
+
+// p256StorePoint stores the point p into table r at the given index;
+// each entry is 12 words (x, y, z).
+func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
+	copy(r[index*12:], p.xyz[:])
+}
+
+// boothW5 recodes a 6-bit window into a signed digit in [0, 16] plus a
+// sign bit (Booth recoding).
+func boothW5(in uint) (int, int) {
+	var s uint = ^((in >> 5) - 1)
+	var d uint = (1 << 6) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+// boothW7 is the 8-bit-window analogue of boothW5, yielding a digit in
+// [0, 64].
+func boothW7(in uint) (int, int) {
+	var s uint = ^((in >> 7) - 1)
+	var d uint = (1 << 8) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+func initTable() {
+	p256Precomputed = new([37][64 * 8]uint64)
+
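+	// Layout: the 256-bit scalar is processed as 37 signed 7-bit (Booth)
+	// windows. Entry j of window i holds the point (j+1)·2^(7i)·G as an
+	// affine x,y pair (8 words, still in Montgomery form), as computed by
+	// the loop below.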
+ basePoint := []uint64{ + 0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6, + 0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85, + 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe, + } + t1 := make([]uint64, 12) + t2 := make([]uint64, 12) + copy(t2, basePoint) + + zInv := make([]uint64, 4) + zInvSq := make([]uint64, 4) + for j := 0; j < 64; j++ { + copy(t1, t2) + for i := 0; i < 37; i++ { + // The window size is 7 so we need to double 7 times. + if i != 0 { + for k := 0; k < 7; k++ { + p256PointDoubleAsm(t1, t1) + } + } + // Convert the point to affine form. (Its values are + // still in Montgomery form however.) + p256Inverse(zInv, t1[8:12]) + p256Sqr(zInvSq, zInv) + p256Mul(zInv, zInv, zInvSq) + + p256Mul(t1[:4], t1[:4], zInvSq) + p256Mul(t1[4:8], t1[4:8], zInv) + + copy(t1[8:12], basePoint[8:12]) + // Update the table entry + copy(p256Precomputed[i][j*8:], t1[:8]) + } + if j == 0 { + p256PointDoubleAsm(t2, basePoint) + } else { + p256PointAddAsm(t2, t2, basePoint) + } + } +} + +func (p *p256Point) p256BaseMult(scalar []uint64) { + precomputeOnce.Do(initTable) + + wvalue := (scalar[0] << 1) & 0xff + sel, sign := boothW7(uint(wvalue)) + p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel) + p256NegCond(p.xyz[4:8], sign) + + // (This is one, in the Montgomery domain.) + p.xyz[8] = 0x0000000000000001 + p.xyz[9] = 0xffffffff00000000 + p.xyz[10] = 0xffffffffffffffff + p.xyz[11] = 0x00000000fffffffe + + var t0 p256Point + // (This is one, in the Montgomery domain.) + t0.xyz[8] = 0x0000000000000001 + t0.xyz[9] = 0xffffffff00000000 + t0.xyz[10] = 0xffffffffffffffff + t0.xyz[11] = 0x00000000fffffffe + + index := uint(6) + zero := sel + + for i := 1; i < 37; i++ { + if index < 192 { + wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0xff + } else { + wvalue = (scalar[index/64] >> (index % 64)) & 0xff + } + index += 7 + sel, sign = boothW7(uint(wvalue)) + p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel) + p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero) + zero |= sel + } +} + +func (p *p256Point) p256ScalarMult(scalar []uint64) { + // precomp is a table of precomputed points that stores powers of p + // from p^1 to p^16. 
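+	// ("Powers" is multiplicative shorthand: precomp entry i holds the
+	// scalar multiple (i+1)·P. The signed 5-bit Booth windows used below
+	// only ever need the multiples 1·P through 16·P, because negating a
+	// point is cheap.)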
+ var precomp [16 * 4 * 3]uint64 + var t0, t1, t2, t3 p256Point + + // Prepare the table + p.p256StorePoint(&precomp, 0) // 1 + + p256PointDoubleAsm(t0.xyz[:], p.xyz[:]) + p256PointDoubleAsm(t1.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t2.xyz[:], t1.xyz[:]) + p256PointDoubleAsm(t3.xyz[:], t2.xyz[:]) + t0.p256StorePoint(&precomp, 1) // 2 + t1.p256StorePoint(&precomp, 3) // 4 + t2.p256StorePoint(&precomp, 7) // 8 + t3.p256StorePoint(&precomp, 15) // 16 + + p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) + p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) + t0.p256StorePoint(&precomp, 2) // 3 + t1.p256StorePoint(&precomp, 4) // 5 + t2.p256StorePoint(&precomp, 8) // 9 + + p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t1.xyz[:], t1.xyz[:]) + t0.p256StorePoint(&precomp, 5) // 6 + t1.p256StorePoint(&precomp, 9) // 10 + + p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:]) + t2.p256StorePoint(&precomp, 6) // 7 + t1.p256StorePoint(&precomp, 10) // 11 + + p256PointDoubleAsm(t0.xyz[:], t0.xyz[:]) + p256PointDoubleAsm(t2.xyz[:], t2.xyz[:]) + t0.p256StorePoint(&precomp, 11) // 12 + t2.p256StorePoint(&precomp, 13) // 14 + + p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:]) + p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:]) + t0.p256StorePoint(&precomp, 12) // 13 + t2.p256StorePoint(&precomp, 14) // 15 + + // Start scanning the window from top bit + index := uint(254) + var sel, sign int + + wvalue := (scalar[index/64] >> (index % 64)) & 0x3f + sel, _ = boothW5(uint(wvalue)) + + p256Select(p.xyz[0:12], precomp[0:], sel) + zero := sel + + for index > 4 { + index -= 5 + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + + if index < 192 { + wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f + } else { + wvalue = (scalar[index/64] >> (index % 64)) & 0x3f + } + + sel, sign = boothW5(uint(wvalue)) + + p256Select(t0.xyz[0:], precomp[0:], sel) + p256NegCond(t0.xyz[4:8], sign) + p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) + p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) + p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) + zero |= sel + } + + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + p256PointDoubleAsm(p.xyz[:], p.xyz[:]) + + wvalue = (scalar[0] << 1) & 0x3f + sel, sign = boothW5(uint(wvalue)) + + p256Select(t0.xyz[0:], precomp[0:], sel) + p256NegCond(t0.xyz[4:8], sign) + p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:]) + p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel) + p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero) +} diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s new file mode 100644 index 0000000000..6c7bde16e5 --- /dev/null +++ b/src/crypto/elliptic/p256_asm_amd64.s @@ -0,0 +1,2295 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file contains constant-time, 64-bit assembly implementation of +// P256. 
The optimizations performed here are described in detail in: +// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with +// 256-bit primes" +// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x +// https://eprint.iacr.org/2013/816.pdf + +#include "textflag.h" + +#define res_ptr DI +#define x_ptr SI +#define y_ptr CX + +#define acc0 R8 +#define acc1 R9 +#define acc2 R10 +#define acc3 R11 +#define acc4 R12 +#define acc5 R13 +#define t0 R14 +#define t1 R15 + +DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff +DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 +DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f +DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 +DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84 +DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 +DATA p256one<>+0x00(SB)/8, $0x0000000000000001 +DATA p256one<>+0x08(SB)/8, $0xffffffff00000000 +DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff +DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe +GLOBL p256const0<>(SB), 8, $8 +GLOBL p256const1<>(SB), 8, $8 +GLOBL p256ordK0<>(SB), 8, $8 +GLOBL p256ord<>(SB), 8, $32 +GLOBL p256one<>(SB), 8, $32 + +/* ---------------------------------------*/ +// func p256LittleToBig(res []byte, in []uint64) +TEXT ·p256LittleToBig(SB),NOSPLIT,$0 + JMP ·p256BigToLittle(SB) +/* ---------------------------------------*/ +// func p256BigToLittle(res []uint64, in []byte) +TEXT ·p256BigToLittle(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + + BSWAPQ acc0 + BSWAPQ acc1 + BSWAPQ acc2 + BSWAPQ acc3 + + MOVQ acc3, (8*0)(res_ptr) + MOVQ acc2, (8*1)(res_ptr) + MOVQ acc1, (8*2)(res_ptr) + MOVQ acc0, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256MovCond(res, a, b []uint64, cond int) +// If cond == 0 res=b, else res=a +TEXT ·p256MovCond(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ a+24(FP), x_ptr + MOVQ b+48(FP), y_ptr + MOVQ cond+72(FP), X12 + + PXOR X13, X13 + PSHUFD $0, X12, X12 + PCMPEQL X13, X12 + + MOVOU X12, X0 + PANDN (16*0)(x_ptr), X0 + MOVOU X12, X1 + PANDN (16*1)(x_ptr), X1 + MOVOU X12, X2 + PANDN (16*2)(x_ptr), X2 + MOVOU X12, X3 + PANDN (16*3)(x_ptr), X3 + MOVOU X12, X4 + PANDN (16*4)(x_ptr), X4 + MOVOU X12, X5 + PANDN (16*5)(x_ptr), X5 + + MOVOU (16*0)(y_ptr), X6 + MOVOU (16*1)(y_ptr), X7 + MOVOU (16*2)(y_ptr), X8 + MOVOU (16*3)(y_ptr), X9 + MOVOU (16*4)(y_ptr), X10 + MOVOU (16*5)(y_ptr), X11 + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + MOVOU X0, (16*0)(res_ptr) + MOVOU X1, (16*1)(res_ptr) + MOVOU X2, (16*2)(res_ptr) + MOVOU X3, (16*3)(res_ptr) + MOVOU X4, (16*4)(res_ptr) + MOVOU X5, (16*5)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256NegCond(val []uint64, cond int) +TEXT ·p256NegCond(SB),NOSPLIT,$0 + MOVQ val+0(FP), res_ptr + MOVQ cond+24(FP), t0 + // acc = poly + MOVQ $-1, acc0 + MOVQ p256const0<>(SB), acc1 + MOVQ $0, acc2 + MOVQ p256const1<>(SB), acc3 + // Load the original value + MOVQ (8*0)(res_ptr), acc5 + MOVQ (8*1)(res_ptr), x_ptr + MOVQ (8*2)(res_ptr), y_ptr + MOVQ (8*3)(res_ptr), t1 + // Speculatively subtract + SUBQ acc5, acc0 + SBBQ x_ptr, acc1 + SBBQ y_ptr, acc2 + SBBQ t1, acc3 + // If condition is 0, keep original value + TESTQ t0, t0 + CMOVQEQ acc5, acc0 + CMOVQEQ x_ptr, acc1 + CMOVQEQ y_ptr, acc2 + 
CMOVQEQ t1, acc3 + // Store result + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256Sqr(res, in []uint64) +TEXT ·p256Sqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // First reduction step + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc0 + ADCQ $0, DX + MOVQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc0 + ADCQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + // Last reduction step + XORQ t0, t0 + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc0 + ADCQ t1, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // Subtract p256 + SUBQ $-1, acc0 + SBBQ p256const0<>(SB) ,acc1 + SBBQ $0, acc2 + SBBQ p256const1<>(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256Mul(res, in1, in2 []uint64) +TEXT ·p256Mul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+24(FP), x_ptr + MOVQ in2+48(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, 
acc5 + XORQ acc0, acc0 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + XORQ acc1, acc1 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc4 + ADCQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + XORQ acc2, acc2 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc4 + ADCQ t1, acc5 + ADCQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc0 + SBBQ p256const1<>(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256FromMont(res, in []uint64) +TEXT ·p256FromMont(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + + MOVQ (8*0)(x_ptr), acc0 + MOVQ (8*1)(x_ptr), acc1 + MOVQ (8*2)(x_ptr), acc2 + MOVQ (8*3)(x_ptr), acc3 + XORQ acc4, acc4 + + // Only reduce, no multiplications are needed + // First stage + MOVQ acc0, AX + MOVQ acc0, t1 + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc0, acc1 + ADCQ t1, acc2 + ADCQ AX, acc3 + ADCQ DX, acc4 + XORQ acc5, acc5 + // Second stage + MOVQ acc1, AX + MOVQ acc1, t1 + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc1, acc2 + ADCQ t1, acc3 + ADCQ AX, acc4 + ADCQ DX, acc5 + XORQ acc0, acc0 + // Third stage + MOVQ acc2, AX + MOVQ acc2, t1 + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc2, acc3 + ADCQ t1, acc4 + ADCQ AX, acc5 + ADCQ DX, acc0 + XORQ acc1, acc1 + // Last stage + MOVQ acc3, AX + MOVQ acc3, t1 + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, t1 + ADDQ acc3, acc4 + ADCQ t1, acc5 + ADCQ AX, acc0 + ADCQ DX, acc1 + + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + 
MOVQ acc0, t0 + MOVQ acc1, t1 + + SUBQ $-1, acc4 + SBBQ p256const0<>(SB), acc5 + SBBQ $0, acc0 + SBBQ p256const1<>(SB), acc1 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, (8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// Constant time point access to arbitrary point table. +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256Select(point, table []uint64, idx int) +TEXT ·p256Select(SB),NOSPLIT,$0 + MOVQ idx+48(FP),AX + MOVQ table+24(FP),DI + MOVQ point+0(FP),DX + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + MOVQ $16, AX + + MOVOU X15, X13 + +loop_select: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X6 + MOVOU (16*1)(DI), X7 + MOVOU (16*2)(DI), X8 + MOVOU (16*3)(DI), X9 + MOVOU (16*4)(DI), X10 + MOVOU (16*5)(DI), X11 + ADDQ $(16*6), DI + + PAND X12, X6 + PAND X12, X7 + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X6, X0 + PXOR X7, X1 + PXOR X8, X2 + PXOR X9, X3 + PXOR X10, X4 + PXOR X11, X5 + + DECQ AX + JNE loop_select + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + MOVOU X4, (16*4)(DX) + MOVOU X5, (16*5)(DX) + + RET +/* ---------------------------------------*/ +// Constant time point access to base point table. +// func p256SelectBase(point, table []uint64, idx int) +TEXT ·p256SelectBase(SB),NOSPLIT,$0 + MOVQ idx+48(FP),AX + MOVQ table+24(FP),DI + MOVQ point+0(FP),DX + + PXOR X15, X15 // X15 = 0 + PCMPEQL X14, X14 // X14 = -1 + PSUBL X14, X15 // X15 = 1 + MOVL AX, X14 + PSHUFD $0, X14, X14 + + PXOR X0, X0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + MOVQ $32, AX + + MOVOU X15, X13 + +loop_select_base: + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + MOVOU (16*0)(DI), X4 + MOVOU (16*1)(DI), X5 + MOVOU (16*2)(DI), X6 + MOVOU (16*3)(DI), X7 + + MOVOU (16*4)(DI), X8 + MOVOU (16*5)(DI), X9 + MOVOU (16*6)(DI), X10 + MOVOU (16*7)(DI), X11 + + ADDQ $(16*8), DI + + PAND X12, X4 + PAND X12, X5 + PAND X12, X6 + PAND X12, X7 + + MOVOU X13, X12 + PADDL X15, X13 + PCMPEQL X14, X12 + + PAND X12, X8 + PAND X12, X9 + PAND X12, X10 + PAND X12, X11 + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + + PXOR X8, X0 + PXOR X9, X1 + PXOR X10, X2 + PXOR X11, X3 + + DECQ AX + JNE loop_select_base + + MOVOU X0, (16*0)(DX) + MOVOU X1, (16*1)(DX) + MOVOU X2, (16*2)(DX) + MOVOU X3, (16*3)(DX) + + RET +/* ---------------------------------------*/ +// func p256OrdMul(res, in1, in2 []uint64) +TEXT ·p256OrdMul(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in1+24(FP), x_ptr + MOVQ in2+48(FP), y_ptr + // x * y[0] + MOVQ (8*0)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + MOVQ AX, acc0 + MOVQ DX, acc1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + XORQ acc5, acc5 + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 
+ + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ DX, acc4 + ADCQ $0, acc5 + // x * y[1] + MOVQ (8*1)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ DX, acc5 + ADCQ $0, acc0 + // x * y[2] + MOVQ (8*2)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ DX, acc0 + ADCQ $0, acc1 + // x * y[3] + MOVQ (8*3)(y_ptr), t0 + + MOVQ (8*0)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x10(SB), AX + MULQ t0 + ADDQ t1, acc5 + ADCQ $0, DX + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x18(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ DX, acc1 + ADCQ $0, acc2 + // Copy result [255:0] + MOVQ acc4, x_ptr + MOVQ acc5, acc3 + MOVQ acc0, t0 + MOVQ acc1, t1 + // Subtract p256 + SUBQ p256ord<>+0x00(SB), acc4 + SBBQ p256ord<>+0x08(SB) ,acc5 + SBBQ p256ord<>+0x10(SB), acc0 + SBBQ p256ord<>+0x18(SB), acc1 + SBBQ $0, acc2 + + CMOVQCS x_ptr, acc4 + CMOVQCS acc3, acc5 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + + MOVQ acc4, (8*0)(res_ptr) + MOVQ acc5, (8*1)(res_ptr) + MOVQ acc0, 
(8*2)(res_ptr) + MOVQ acc1, (8*3)(res_ptr) + + RET +/* ---------------------------------------*/ +// func p256OrdSqr(res, in []uint64, n int) +TEXT ·p256OrdSqr(SB),NOSPLIT,$0 + MOVQ res+0(FP), res_ptr + MOVQ in+24(FP), x_ptr + MOVQ n+48(FP), BX + +ordSqrLoop: + + // y[1:] * y[0] + MOVQ (8*0)(x_ptr), t0 + + MOVQ (8*1)(x_ptr), AX + MULQ t0 + MOVQ AX, acc1 + MOVQ DX, acc2 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, acc3 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, acc4 + // y[2:] * y[1] + MOVQ (8*1)(x_ptr), t0 + + MOVQ (8*2)(x_ptr), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ t1, acc4 + ADCQ $0, DX + ADDQ AX, acc4 + ADCQ $0, DX + MOVQ DX, acc5 + // y[3] * y[2] + MOVQ (8*2)(x_ptr), t0 + + MOVQ (8*3)(x_ptr), AX + MULQ t0 + ADDQ AX, acc5 + ADCQ $0, DX + MOVQ DX, y_ptr + XORQ t1, t1 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ acc4, acc4 + ADCQ acc5, acc5 + ADCQ y_ptr, y_ptr + ADCQ $0, t1 + // Missing products + MOVQ (8*0)(x_ptr), AX + MULQ AX + MOVQ AX, acc0 + MOVQ DX, t0 + + MOVQ (8*1)(x_ptr), AX + MULQ AX + ADDQ t0, acc1 + ADCQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*2)(x_ptr), AX + MULQ AX + ADDQ t0, acc3 + ADCQ AX, acc4 + ADCQ $0, DX + MOVQ DX, t0 + + MOVQ (8*3)(x_ptr), AX + MULQ AX + ADDQ t0, acc5 + ADCQ AX, y_ptr + ADCQ DX, t1 + MOVQ t1, x_ptr + // First reduction step + MOVQ acc0, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc1 + ADCQ $0, DX + ADDQ AX, acc1 + + MOVQ t0, t1 + ADCQ DX, acc2 + ADCQ $0, t1 + SUBQ t0, acc2 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc0 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc3 + ADCQ $0, acc0 + SUBQ AX, acc3 + SBBQ DX, acc0 + // Second reduction step + MOVQ acc1, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc1 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc2 + ADCQ $0, DX + ADDQ AX, acc2 + + MOVQ t0, t1 + ADCQ DX, acc3 + ADCQ $0, t1 + SUBQ t0, acc3 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc1 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc0 + ADCQ $0, acc1 + SUBQ AX, acc0 + SBBQ DX, acc1 + // Third reduction step + MOVQ acc2, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc2 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc3 + ADCQ $0, DX + ADDQ AX, acc3 + + MOVQ t0, t1 + ADCQ DX, acc0 + ADCQ $0, t1 + SUBQ t0, acc0 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc2 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc1 + ADCQ $0, acc2 + SUBQ AX, acc1 + SBBQ DX, acc2 + // Last reduction step + MOVQ acc3, AX + MULQ p256ordK0<>(SB) + MOVQ AX, t0 + + MOVQ p256ord<>+0x00(SB), AX + MULQ t0 + ADDQ AX, acc3 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ p256ord<>+0x08(SB), AX + MULQ t0 + ADDQ t1, acc0 + ADCQ $0, DX + ADDQ AX, acc0 + ADCQ $0, DX + MOVQ DX, t1 + + MOVQ t0, t1 + ADCQ DX, acc1 + ADCQ $0, t1 + SUBQ t0, acc1 + SBBQ $0, t1 + + MOVQ t0, AX + MOVQ t0, DX + MOVQ t0, acc3 + SHLQ $32, AX + SHRQ $32, DX + + ADDQ t1, acc2 + ADCQ $0, acc3 + SUBQ AX, acc2 + SBBQ DX, acc3 + XORQ t0, t0 + // Add bits [511:256] of the sqr result + ADCQ acc4, acc0 + ADCQ acc5, acc1 + ADCQ y_ptr, acc2 + ADCQ x_ptr, acc3 + ADCQ $0, t0 + + MOVQ acc0, acc4 + MOVQ acc1, acc5 + MOVQ acc2, y_ptr + MOVQ acc3, t1 + // 
Subtract p256 + SUBQ p256ord<>+0x00(SB), acc0 + SBBQ p256ord<>+0x08(SB) ,acc1 + SBBQ p256ord<>+0x10(SB), acc2 + SBBQ p256ord<>+0x18(SB), acc3 + SBBQ $0, t0 + + CMOVQCS acc4, acc0 + CMOVQCS acc5, acc1 + CMOVQCS y_ptr, acc2 + CMOVQCS t1, acc3 + + MOVQ acc0, (8*0)(res_ptr) + MOVQ acc1, (8*1)(res_ptr) + MOVQ acc2, (8*2)(res_ptr) + MOVQ acc3, (8*3)(res_ptr) + MOVQ res_ptr, x_ptr + DECQ BX + JNE ordSqrLoop + + RET +/* ---------------------------------------*/ +#undef res_ptr +#undef x_ptr +#undef y_ptr + +#undef acc0 +#undef acc1 +#undef acc2 +#undef acc3 +#undef acc4 +#undef acc5 +#undef t0 +#undef t1 +/* ---------------------------------------*/ +#define mul0 AX +#define mul1 DX +#define acc0 BX +#define acc1 CX +#define acc2 R8 +#define acc3 R9 +#define acc4 R10 +#define acc5 R11 +#define acc6 R12 +#define acc7 R13 +#define t0 R14 +#define t1 R15 +#define t2 DI +#define t3 SI +#define hlp BP +/* ---------------------------------------*/ +TEXT p256SubInternal(SB),NOSPLIT,$0 + XORQ mul0, mul0 + SUBQ t0, acc4 + SBBQ t1, acc5 + SBBQ t2, acc6 + SBBQ t3, acc7 + SBBQ $0, mul0 + + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + + ADDQ $-1, acc4 + ADCQ p256const0<>(SB), acc5 + ADCQ $0, acc6 + ADCQ p256const1<>(SB), acc7 + ADCQ $0, mul0 + + CMOVQNE acc0, acc4 + CMOVQNE acc1, acc5 + CMOVQNE acc2, acc6 + CMOVQNE acc3, acc7 + + RET +/* ---------------------------------------*/ +TEXT p256MulInternal(SB),NOSPLIT,$0 + MOVQ acc4, mul0 + MULQ t0 + MOVQ mul0, acc0 + MOVQ mul1, acc1 + + MOVQ acc4, mul0 + MULQ t1 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ t2 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ t3 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc4 + + MOVQ acc5, mul0 + MULQ t0 + ADDQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t1 + ADDQ hlp, acc2 + ADCQ $0, mul1 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t2 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ t3 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, acc5 + + MOVQ acc6, mul0 + MULQ t0 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t1 + ADDQ hlp, acc3 + ADCQ $0, mul1 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t2 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc6, mul0 + MULQ t3 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, acc6 + + MOVQ acc7, mul0 + MULQ t0 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t1 + ADDQ hlp, acc4 + ADCQ $0, mul1 + ADDQ mul0, acc4 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t2 + ADDQ hlp, acc5 + ADCQ $0, mul1 + ADDQ mul0, acc5 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc7, mul0 + MULQ t3 + ADDQ hlp, acc6 + ADCQ $0, mul1 + ADDQ mul0, acc6 + ADCQ $0, mul1 + MOVQ mul1, acc7 + // First reduction step + MOVQ acc0, mul0 + MOVQ acc0, hlp + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc0, acc1 + ADCQ hlp, acc2 + ADCQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, hlp + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc1, acc2 + ADCQ hlp, acc3 + ADCQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, hlp + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, 
hlp + ADDQ acc2, acc3 + ADCQ hlp, acc0 + ADCQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, hlp + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc3, acc0 + ADCQ hlp, acc1 + ADCQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP + // Add bits [511:256] of the result + ADCQ acc0, acc4 + ADCQ acc1, acc5 + ADCQ acc2, acc6 + ADCQ acc3, acc7 + ADCQ $0, hlp + // Copy result + MOVQ acc4, acc0 + MOVQ acc5, acc1 + MOVQ acc6, acc2 + MOVQ acc7, acc3 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc6 + SBBQ p256const1<>(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS acc0, acc4 + CMOVQCS acc1, acc5 + CMOVQCS acc2, acc6 + CMOVQCS acc3, acc7 + + RET +/* ---------------------------------------*/ +TEXT p256SqrInternal(SB),NOSPLIT,$0 + + MOVQ acc4, mul0 + MULQ acc5 + MOVQ mul0, acc1 + MOVQ mul1, acc2 + + MOVQ acc4, mul0 + MULQ acc6 + ADDQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + + MOVQ acc4, mul0 + MULQ acc7 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, t0 + + MOVQ acc5, mul0 + MULQ acc6 + ADDQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, hlp + + MOVQ acc5, mul0 + MULQ acc7 + ADDQ hlp, t0 + ADCQ $0, mul1 + ADDQ mul0, t0 + ADCQ $0, mul1 + MOVQ mul1, t1 + + MOVQ acc6, mul0 + MULQ acc7 + ADDQ mul0, t1 + ADCQ $0, mul1 + MOVQ mul1, t2 + XORQ t3, t3 + // *2 + ADDQ acc1, acc1 + ADCQ acc2, acc2 + ADCQ acc3, acc3 + ADCQ t0, t0 + ADCQ t1, t1 + ADCQ t2, t2 + ADCQ $0, t3 + // Missing products + MOVQ acc4, mul0 + MULQ mul0 + MOVQ mul0, acc0 + MOVQ DX, acc4 + + MOVQ acc5, mul0 + MULQ mul0 + ADDQ acc4, acc1 + ADCQ mul0, acc2 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc6, mul0 + MULQ mul0 + ADDQ acc4, acc3 + ADCQ mul0, t0 + ADCQ $0, DX + MOVQ DX, acc4 + + MOVQ acc7, mul0 + MULQ mul0 + ADDQ acc4, t1 + ADCQ mul0, t2 + ADCQ DX, t3 + // First reduction step + MOVQ acc0, mul0 + MOVQ acc0, hlp + SHLQ $32, acc0 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc0, acc1 + ADCQ hlp, acc2 + ADCQ mul0, acc3 + ADCQ $0, mul1 + MOVQ mul1, acc0 + // Second reduction step + MOVQ acc1, mul0 + MOVQ acc1, hlp + SHLQ $32, acc1 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc1, acc2 + ADCQ hlp, acc3 + ADCQ mul0, acc0 + ADCQ $0, mul1 + MOVQ mul1, acc1 + // Third reduction step + MOVQ acc2, mul0 + MOVQ acc2, hlp + SHLQ $32, acc2 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc2, acc3 + ADCQ hlp, acc0 + ADCQ mul0, acc1 + ADCQ $0, mul1 + MOVQ mul1, acc2 + // Last reduction step + MOVQ acc3, mul0 + MOVQ acc3, hlp + SHLQ $32, acc3 + MULQ p256const1<>(SB) + SHRQ $32, hlp + ADDQ acc3, acc0 + ADCQ hlp, acc1 + ADCQ mul0, acc2 + ADCQ $0, mul1 + MOVQ mul1, acc3 + BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP + // Add bits [511:256] of the result + ADCQ acc0, t0 + ADCQ acc1, t1 + ADCQ acc2, t2 + ADCQ acc3, t3 + ADCQ $0, hlp + // Copy result + MOVQ t0, acc4 + MOVQ t1, acc5 + MOVQ t2, acc6 + MOVQ t3, acc7 + // Subtract p256 + SUBQ $-1, acc4 + SBBQ p256const0<>(SB) ,acc5 + SBBQ $0, acc6 + SBBQ p256const1<>(SB), acc7 + SBBQ $0, hlp + // If the result of the subtraction is negative, restore the previous result + CMOVQCS t0, acc4 + CMOVQCS t1, acc5 + CMOVQCS t2, acc6 + CMOVQCS t3, acc7 + + RET +/* ---------------------------------------*/ +#define p256MulBy2Inline\ + XORQ mul0, mul0;\ + ADDQ acc4, acc4;\ + ADCQ acc5, acc5;\ + ADCQ acc6, acc6;\ + 
ADCQ acc7, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256const0<>(SB), t1;\ + SBBQ $0, t2;\ + SBBQ p256const1<>(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +#define p256AddInline \ + XORQ mul0, mul0;\ + ADDQ t0, acc4;\ + ADCQ t1, acc5;\ + ADCQ t2, acc6;\ + ADCQ t3, acc7;\ + ADCQ $0, mul0;\ + MOVQ acc4, t0;\ + MOVQ acc5, t1;\ + MOVQ acc6, t2;\ + MOVQ acc7, t3;\ + SUBQ $-1, t0;\ + SBBQ p256const0<>(SB), t1;\ + SBBQ $0, t2;\ + SBBQ p256const1<>(SB), t3;\ + SBBQ $0, mul0;\ + CMOVQCS acc4, t0;\ + CMOVQCS acc5, t1;\ + CMOVQCS acc6, t2;\ + CMOVQCS acc7, t3; +/* ---------------------------------------*/ +#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 +#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 +#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) +#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) +#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 +#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define xout(off) (32*5 + off)(SP) +#define yout(off) (32*6 + off)(SP) +#define zout(off) (32*7 + off)(SP) +#define s2(off) (32*8 + off)(SP) +#define z1sqr(off) (32*9 + off)(SP) +#define h(off) (32*10 + off)(SP) +#define r(off) (32*11 + off)(SP) +#define hsqr(off) (32*12 + off)(SP) +#define rsqr(off) (32*13 + off)(SP) +#define hcub(off) (32*14 + off)(SP) +#define rptr (32*15)(SP) +#define sel_save (32*15 + 8)(SP) +#define zero_save (32*15 + 8 + 4)(SP) + +// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) +TEXT ·p256PointAddAffineAsm(SB),0,$512-96 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+24(FP), BX + MOVQ in2+48(FP), CX + MOVQ sign+72(FP), DX + MOVQ sel+80(FP), t1 + MOVQ zero+88(FP), t2 + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU (16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + + MOVOU X0, x2in(16*0) + MOVOU X1, x2in(16*1) + // Store pointer to result + MOVQ mul0, rptr + MOVL t1, sel_save + MOVL t2, zero_save + // Negate y2in based on sign + MOVQ (16*2 + 8*0)(CX), acc4 + MOVQ (16*2 + 8*1)(CX), acc5 + MOVQ (16*2 + 8*2)(CX), acc6 + MOVQ (16*2 + 8*3)(CX), acc7 + MOVQ $-1, acc0 + MOVQ p256const0<>(SB), acc1 + MOVQ $0, acc2 + MOVQ p256const1<>(SB), acc3 + XORQ mul0, mul0 + // Speculatively subtract + SUBQ acc4, acc0 + SBBQ acc5, acc1 + SBBQ acc6, acc2 + SBBQ acc7, acc3 + SBBQ $0, mul0 + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + MOVQ acc3, t3 + // Add in case the operand was > p256 + ADDQ $-1, acc0 + ADCQ p256const0<>(SB), acc1 + ADCQ $0, acc2 + ADCQ p256const1<>(SB), acc3 + ADCQ $0, mul0 + CMOVQNE t0, acc0 + CMOVQNE t1, acc1 + CMOVQNE t2, acc2 + CMOVQNE t3, acc3 + // If condition is 0, keep original value + TESTQ DX, DX + CMOVQEQ acc4, acc0 + CMOVQEQ acc5, acc1 + 
CMOVQEQ acc6, acc2 + CMOVQEQ acc7, acc3 + // Store result + MOVQ acc0, y2in(8*0) + MOVQ acc1, y2in(8*1) + MOVQ acc2, y2in(8*2) + MOVQ acc3, y2in(8*3) + // Begin point add + LDacc (z1in) + CALL p256SqrInternal(SB) // z1ˆ2 + ST (z1sqr) + + LDt (x2in) + CALL p256MulInternal(SB) // x2 * z1ˆ2 + + LDt (x1in) + CALL p256SubInternal(SB) // h = u2 - u1 + ST (h) + + LDt (z1in) + CALL p256MulInternal(SB) // z3 = h * z1 + ST (zout) + + LDacc (z1sqr) + CALL p256MulInternal(SB) // z1ˆ3 + + LDt (y2in) + CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 + ST (s2) + + LDt (y1in) + CALL p256SubInternal(SB) // r = s2 - s1 + ST (r) + + CALL p256SqrInternal(SB) // rsqr = rˆ2 + ST (rsqr) + + LDacc (h) + CALL p256SqrInternal(SB) // hsqr = hˆ2 + ST (hsqr) + + LDt (h) + CALL p256MulInternal(SB) // hcub = hˆ3 + ST (hcub) + + LDt (y1in) + CALL p256MulInternal(SB) // y1 * hˆ3 + ST (s2) + + LDacc (x1in) + LDt (hsqr) + CALL p256MulInternal(SB) // u1 * hˆ2 + ST (h) + + p256MulBy2Inline // u1 * hˆ2 * 2, inline + LDacc (rsqr) + CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 + + LDt (hcub) + CALL p256SubInternal(SB) + ST (xout) + + MOVQ acc4, t0 + MOVQ acc5, t1 + MOVQ acc6, t2 + MOVQ acc7, t3 + LDacc (h) + CALL p256SubInternal(SB) + + LDt (r) + CALL p256MulInternal(SB) + + LDt (s2) + CALL p256SubInternal(SB) + ST (yout) + // Load stored values from stack + MOVQ rptr, AX + MOVL sel_save, BX + MOVL zero_save, CX + // The result is not valid if (sel == 0), conditional choose + MOVOU xout(16*0), X0 + MOVOU xout(16*1), X1 + MOVOU yout(16*0), X2 + MOVOU yout(16*1), X3 + MOVOU zout(16*0), X4 + MOVOU zout(16*1), X5 + + MOVL BX, X6 + MOVL CX, X7 + + PXOR X8, X8 + PCMPEQL X9, X9 + + PSHUFD $0, X6, X6 + PSHUFD $0, X7, X7 + + PCMPEQL X8, X6 + PCMPEQL X8, X7 + + MOVOU X6, X15 + PANDN X9, X15 + + MOVOU x1in(16*0), X9 + MOVOU x1in(16*1), X10 + MOVOU y1in(16*0), X11 + MOVOU y1in(16*1), X12 + MOVOU z1in(16*0), X13 + MOVOU z1in(16*1), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X6, X9 + PAND X6, X10 + PAND X6, X11 + PAND X6, X12 + PAND X6, X13 + PAND X6, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Similarly if zero == 0 + PCMPEQL X9, X9 + MOVOU X7, X15 + PANDN X9, X15 + + MOVOU x2in(16*0), X9 + MOVOU x2in(16*1), X10 + MOVOU y2in(16*0), X11 + MOVOU y2in(16*1), X12 + MOVOU p256one<>+0x00(SB), X13 + MOVOU p256one<>+0x10(SB), X14 + + PAND X15, X0 + PAND X15, X1 + PAND X15, X2 + PAND X15, X3 + PAND X15, X4 + PAND X15, X5 + + PAND X7, X9 + PAND X7, X10 + PAND X7, X11 + PAND X7, X12 + PAND X7, X13 + PAND X7, X14 + + PXOR X9, X0 + PXOR X10, X1 + PXOR X11, X2 + PXOR X12, X3 + PXOR X13, X4 + PXOR X14, X5 + // Finally output the result + MOVOU X0, (16*0)(AX) + MOVOU X1, (16*1)(AX) + MOVOU X2, (16*2)(AX) + MOVOU X3, (16*3)(AX) + MOVOU X4, (16*4)(AX) + MOVOU X5, (16*5)(AX) + MOVQ $0, rptr + + RET +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef xout +#undef yout +#undef zout +#undef s2 +#undef z1sqr +#undef h +#undef r +#undef hsqr +#undef rsqr +#undef hcub +#undef rptr +#undef sel_save +#undef zero_save +/* ---------------------------------------*/ +#define x1in(off) (32*0 + off)(SP) +#define y1in(off) (32*1 + off)(SP) +#define z1in(off) (32*2 + off)(SP) +#define x2in(off) (32*3 + off)(SP) +#define y2in(off) (32*4 + off)(SP) +#define z2in(off) (32*5 + off)(SP) + +#define xout(off) (32*6 + off)(SP) +#define yout(off) (32*7 + off)(SP) +#define zout(off) (32*8 + off)(SP) + +#define u1(off) (32*9 + off)(SP) +#define u2(off) 
(32*10 + off)(SP) +#define s1(off) (32*11 + off)(SP) +#define s2(off) (32*12 + off)(SP) +#define z1sqr(off) (32*13 + off)(SP) +#define z2sqr(off) (32*14 + off)(SP) +#define h(off) (32*15 + off)(SP) +#define r(off) (32*16 + off)(SP) +#define hsqr(off) (32*17 + off)(SP) +#define rsqr(off) (32*18 + off)(SP) +#define hcub(off) (32*19 + off)(SP) +#define rptr (32*20)(SP) + +//func p256PointAddAsm(res, in1, in2 []uint64) +TEXT ·p256PointAddAsm(SB),0,$672-72 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in1+24(FP), BX + MOVQ in2+48(FP), CX + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x1in(16*0) + MOVOU X1, x1in(16*1) + MOVOU X2, y1in(16*0) + MOVOU X3, y1in(16*1) + MOVOU X4, z1in(16*0) + MOVOU X5, z1in(16*1) + + MOVOU (16*0)(CX), X0 + MOVOU (16*1)(CX), X1 + MOVOU (16*2)(CX), X2 + MOVOU (16*3)(CX), X3 + MOVOU (16*4)(CX), X4 + MOVOU (16*5)(CX), X5 + + MOVOU X0, x2in(16*0) + MOVOU X1, x2in(16*1) + MOVOU X2, y2in(16*0) + MOVOU X3, y2in(16*1) + MOVOU X4, z2in(16*0) + MOVOU X5, z2in(16*1) + // Store pointer to result + MOVQ AX, rptr + // Begin point add + LDacc (z2in) + CALL p256SqrInternal(SB) // z2ˆ2 + ST (z2sqr) + LDt (z2in) + CALL p256MulInternal(SB) // z2ˆ3 + LDt (y1in) + CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 + ST (s1) + + LDacc (z1in) + CALL p256SqrInternal(SB) // z1ˆ2 + ST (z1sqr) + LDt (z1in) + CALL p256MulInternal(SB) // z1ˆ3 + LDt (y2in) + CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 + ST (s2) + + LDt (s1) + CALL p256SubInternal(SB) // r = s2 - s1 + ST (r) + + LDacc (z2sqr) + LDt (x1in) + CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 + ST (u1) + LDacc (z1sqr) + LDt (x2in) + CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 + ST (u2) + + LDt (u1) + CALL p256SubInternal(SB) // h = u2 - u1 + ST (h) + + LDacc (r) + CALL p256SqrInternal(SB) // rsqr = rˆ2 + ST (rsqr) + + LDacc (h) + CALL p256SqrInternal(SB) // hsqr = hˆ2 + ST (hsqr) + + LDt (h) + CALL p256MulInternal(SB) // hcub = hˆ3 + ST (hcub) + + LDt (s1) + CALL p256MulInternal(SB) + ST (s2) + + LDacc (z1in) + LDt (z2in) + CALL p256MulInternal(SB) // z1 * z2 + LDt (h) + CALL p256MulInternal(SB) // z1 * z2 * h + ST (zout) + + LDacc (hsqr) + LDt (u1) + CALL p256MulInternal(SB) // hˆ2 * u1 + ST (u2) + + p256MulBy2Inline // u1 * hˆ2 * 2, inline + LDacc (rsqr) + CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 + + LDt (hcub) + CALL p256SubInternal(SB) + ST (xout) + + MOVQ acc4, t0 + MOVQ acc5, t1 + MOVQ acc6, t2 + MOVQ acc7, t3 + LDacc (u2) + CALL p256SubInternal(SB) + + LDt (r) + CALL p256MulInternal(SB) + + LDt (s2) + CALL p256SubInternal(SB) + ST (yout) + + MOVOU xout(16*0), X0 + MOVOU xout(16*1), X1 + MOVOU yout(16*0), X2 + MOVOU yout(16*1), X3 + MOVOU zout(16*0), X4 + MOVOU zout(16*1), X5 + // Finally output the result + MOVQ rptr, AX + MOVQ $0, rptr + MOVOU X0, (16*0)(AX) + MOVOU X1, (16*1)(AX) + MOVOU X2, (16*2)(AX) + MOVOU X3, (16*3)(AX) + MOVOU X4, (16*4)(AX) + MOVOU X5, (16*5)(AX) + + RET +#undef x1in +#undef y1in +#undef z1in +#undef x2in +#undef y2in +#undef z2in +#undef xout +#undef yout +#undef zout +#undef s1 +#undef s2 +#undef u1 +#undef u2 +#undef z1sqr +#undef z2sqr +#undef h +#undef r +#undef hsqr +#undef rsqr +#undef hcub +#undef rptr +/* ---------------------------------------*/ +#define x(off) (32*0 + off)(SP) +#define y(off) (32*1 + off)(SP) +#define z(off) (32*2 + off)(SP) + +#define s(off) (32*3 + off)(SP) +#define m(off) (32*4 + off)(SP) +#define zsqr(off) (32*5 + off)(SP) +#define tmp(off) (32*6 
+ off)(SP) +#define rptr (32*7)(SP) + +//func p256PointDoubleAsm(res, in []uint64) +TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 + // Move input to stack in order to free registers + MOVQ res+0(FP), AX + MOVQ in+24(FP), BX + + MOVOU (16*0)(BX), X0 + MOVOU (16*1)(BX), X1 + MOVOU (16*2)(BX), X2 + MOVOU (16*3)(BX), X3 + MOVOU (16*4)(BX), X4 + MOVOU (16*5)(BX), X5 + + MOVOU X0, x(16*0) + MOVOU X1, x(16*1) + MOVOU X2, y(16*0) + MOVOU X3, y(16*1) + MOVOU X4, z(16*0) + MOVOU X5, z(16*1) + // Store pointer to result + MOVQ AX, rptr + // Begin point double + LDacc (z) + CALL p256SqrInternal(SB) + ST (zsqr) + + LDt (x) + p256AddInline + STt (m) + + LDacc (z) + LDt (y) + CALL p256MulInternal(SB) + p256MulBy2Inline + MOVQ rptr, AX + // Store z + MOVQ t0, (16*4 + 8*0)(AX) + MOVQ t1, (16*4 + 8*1)(AX) + MOVQ t2, (16*4 + 8*2)(AX) + MOVQ t3, (16*4 + 8*3)(AX) + + LDacc (x) + LDt (zsqr) + CALL p256SubInternal(SB) + LDt (m) + CALL p256MulInternal(SB) + ST (m) + // Multiply by 3 + p256MulBy2Inline + LDacc (m) + p256AddInline + STt (m) + //////////////////////// + LDacc (y) + p256MulBy2Inline + t2acc + CALL p256SqrInternal(SB) + ST (s) + CALL p256SqrInternal(SB) + // Divide by 2 + XORQ mul0, mul0 + MOVQ acc4, t0 + MOVQ acc5, t1 + MOVQ acc6, t2 + MOVQ acc7, t3 + + ADDQ $-1, acc4 + ADCQ p256const0<>(SB), acc5 + ADCQ $0, acc6 + ADCQ p256const1<>(SB), acc7 + ADCQ $0, mul0 + TESTQ $1, t0 + + CMOVQEQ t0, acc4 + CMOVQEQ t1, acc5 + CMOVQEQ t2, acc6 + CMOVQEQ t3, acc7 + ANDQ t0, mul0 + + SHRQ $1, acc4:acc5 + SHRQ $1, acc5:acc6 + SHRQ $1, acc6:acc7 + SHRQ $1, acc7:mul0 + ST (y) + ///////////////////////// + LDacc (x) + LDt (s) + CALL p256MulInternal(SB) + ST (s) + p256MulBy2Inline + STt (tmp) + + LDacc (m) + CALL p256SqrInternal(SB) + LDt (tmp) + CALL p256SubInternal(SB) + + MOVQ rptr, AX + // Store x + MOVQ acc4, (16*0 + 8*0)(AX) + MOVQ acc5, (16*0 + 8*1)(AX) + MOVQ acc6, (16*0 + 8*2)(AX) + MOVQ acc7, (16*0 + 8*3)(AX) + + acc2t + LDacc (s) + CALL p256SubInternal(SB) + + LDt (m) + CALL p256MulInternal(SB) + + LDt (y) + CALL p256SubInternal(SB) + MOVQ rptr, AX + // Store y + MOVQ acc4, (16*2 + 8*0)(AX) + MOVQ acc5, (16*2 + 8*1)(AX) + MOVQ acc6, (16*2 + 8*2)(AX) + MOVQ acc7, (16*2 + 8*3)(AX) + /////////////////////// + MOVQ $0, rptr + + RET +/* ---------------------------------------*/ + -- 2.48.1