From 73f284e2f24b890b907ffc5e274084359a39d2e5 Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Wed, 29 Nov 2017 13:20:08 -0600 Subject: [PATCH] crypto/elliptic: reduce allocations on amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This is inspired by https://blog.cloudflare.com/go-dont-collect-my-garbage/ This CL adds allocation tracking and parallelizes p256-related benchmarks. Amount of allocations can be significantly reduced by marking amd64 asm functions as noescape. This exposes a bug in p256MovCond: PANDN with memory argument will fault if memory is not aligned, so they are replaced with MOVDQU (which is ok with unaligned memory) and register version of PANDN. Results on 88-thread machine (2x 22 cores) below: crypto/elliptic: name old time/op new time/op delta BaseMultP256-88 1.50µs ±11% 1.19µs ± 5% -20.20% (p=0.000 n=10+10) ScalarMultP256-88 5.47µs ± 5% 3.63µs ±10% -33.66% (p=0.000 n=9+10) name old alloc/op new alloc/op delta BaseMultP256-88 800B ± 0% 288B ± 0% -64.00% (p=0.000 n=10+10) ScalarMultP256-88 2.59kB ± 0% 0.26kB ± 0% -90.12% (p=0.000 n=10+10) name old allocs/op new allocs/op delta BaseMultP256-88 13.0 ± 0% 6.0 ± 0% -53.85% (p=0.000 n=10+10) ScalarMultP256-88 16.0 ± 0% 5.0 ± 0% -68.75% (p=0.000 n=10+10) crypto/ecdsa: name old time/op new time/op delta SignP256-88 8.63µs ±37% 7.55µs ±38% ~ (p=0.393 n=10+10) VerifyP256-88 13.9µs ± 8% 7.0µs ± 7% -49.29% (p=0.000 n=10+9) KeyGeneration-88 2.77µs ±11% 2.34µs ±11% -15.57% (p=0.000 n=10+10) name old alloc/op new alloc/op delta SignP256-88 4.14kB ± 1% 2.98kB ± 2% -27.94% (p=0.000 n=10+10) VerifyP256-88 4.47kB ± 0% 0.99kB ± 0% -77.84% (p=0.000 n=9+10) KeyGeneration-88 1.21kB ± 0% 0.69kB ± 0% -42.78% (p=0.000 n=10+10) name old allocs/op new allocs/op delta SignP256-88 47.0 ± 0% 34.0 ± 0% -27.66% (p=0.000 n=10+10) VerifyP256-88 38.0 ± 0% 17.0 ± 0% -55.26% (p=0.000 n=10+10) KeyGeneration-88 20.0 ± 0% 13.0 ± 0% -35.00% (p=0.000 n=10+10) On machine with only 4 cores, results are much less impressive: around 2% performance gain. Change-Id: I8a2f8168f83d27ad9ace1b4b1a1e11cb83edf717 Reviewed-on: https://go-review.googlesource.com/80757 Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot Reviewed-by: Ian Lance Taylor --- src/crypto/ecdsa/ecdsa_test.go | 36 ++++++++++++++++++---------- src/crypto/elliptic/elliptic_test.go | 27 ++++++++++++++------- src/crypto/elliptic/p256_amd64.go | 16 +++++++++++++ src/crypto/elliptic/p256_asm_amd64.s | 18 +++++++++----- 4 files changed, 70 insertions(+), 27 deletions(-) diff --git a/src/crypto/ecdsa/ecdsa_test.go b/src/crypto/ecdsa/ecdsa_test.go index 2b3d44ac7a..9224a039f3 100644 --- a/src/crypto/ecdsa/ecdsa_test.go +++ b/src/crypto/ecdsa/ecdsa_test.go @@ -48,10 +48,13 @@ func BenchmarkSignP256(b *testing.B) { hashed := []byte("testing") priv, _ := GenerateKey(p256, rand.Reader) + b.ReportAllocs() b.ResetTimer() - for i := 0; i < b.N; i++ { - _, _, _ = Sign(rand.Reader, priv, hashed) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + _, _, _ = Sign(rand.Reader, priv, hashed) + } + }) } func BenchmarkSignP384(b *testing.B) { @@ -60,10 +63,13 @@ func BenchmarkSignP384(b *testing.B) { hashed := []byte("testing") priv, _ := GenerateKey(p384, rand.Reader) + b.ReportAllocs() b.ResetTimer() - for i := 0; i < b.N; i++ { - _, _, _ = Sign(rand.Reader, priv, hashed) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + _, _, _ = Sign(rand.Reader, priv, hashed) + } + }) } func BenchmarkVerifyP256(b *testing.B) { @@ -73,20 +79,26 @@ func BenchmarkVerifyP256(b *testing.B) { priv, _ := GenerateKey(p256, rand.Reader) r, s, _ := Sign(rand.Reader, priv, hashed) + b.ReportAllocs() b.ResetTimer() - for i := 0; i < b.N; i++ { - Verify(&priv.PublicKey, hashed, r, s) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + Verify(&priv.PublicKey, hashed, r, s) + } + }) } func BenchmarkKeyGeneration(b *testing.B) { b.ResetTimer() p256 := elliptic.P256() + b.ReportAllocs() b.ResetTimer() - for i := 0; i < b.N; i++ { - GenerateKey(p256, rand.Reader) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + GenerateKey(p256, rand.Reader) + } + }) } func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) { diff --git a/src/crypto/elliptic/elliptic_test.go b/src/crypto/elliptic/elliptic_test.go index 55c6e894b0..f661359c35 100644 --- a/src/crypto/elliptic/elliptic_test.go +++ b/src/crypto/elliptic/elliptic_test.go @@ -523,10 +523,13 @@ func BenchmarkBaseMult(b *testing.B) { p224 := P224() e := p224BaseMultTests[25] k, _ := new(big.Int).SetString(e.k, 10) + b.ReportAllocs() b.StartTimer() - for i := 0; i < b.N; i++ { - p224.ScalarBaseMult(k.Bytes()) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + p224.ScalarBaseMult(k.Bytes()) + } + }) } func BenchmarkBaseMultP256(b *testing.B) { @@ -534,10 +537,13 @@ func BenchmarkBaseMultP256(b *testing.B) { p256 := P256() e := p224BaseMultTests[25] k, _ := new(big.Int).SetString(e.k, 10) + b.ReportAllocs() b.StartTimer() - for i := 0; i < b.N; i++ { - p256.ScalarBaseMult(k.Bytes()) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + p256.ScalarBaseMult(k.Bytes()) + } + }) } func BenchmarkScalarMultP256(b *testing.B) { @@ -546,10 +552,13 @@ func BenchmarkScalarMultP256(b *testing.B) { _, x, y, _ := GenerateKey(p256, rand.Reader) priv, _, _, _ := GenerateKey(p256, rand.Reader) + b.ReportAllocs() b.StartTimer() - for i := 0; i < b.N; i++ { - p256.ScalarMult(x, y, priv) - } + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + p256.ScalarMult(x, y, priv) + } + }) } func TestMarshal(t *testing.T) { diff --git a/src/crypto/elliptic/p256_amd64.go b/src/crypto/elliptic/p256_amd64.go index bde8e3dfbb..6f8c9999e6 100644 --- a/src/crypto/elliptic/p256_amd64.go +++ b/src/crypto/elliptic/p256_amd64.go @@ -52,46 +52,62 @@ func (curve p256Curve) Params() *CurveParams { // Functions implemented in p256_asm_amd64.s // Montgomery multiplication modulo P256 +//go:noescape func p256Mul(res, in1, in2 []uint64) // Montgomery square modulo P256 +//go:noescape func p256Sqr(res, in []uint64) // Montgomery multiplication by 1 +//go:noescape func p256FromMont(res, in []uint64) // iff cond == 1 val <- -val +//go:noescape func p256NegCond(val []uint64, cond int) // if cond == 0 res <- b; else res <- a +//go:noescape func p256MovCond(res, a, b []uint64, cond int) // Endianness swap +//go:noescape func p256BigToLittle(res []uint64, in []byte) + +//go:noescape func p256LittleToBig(res []byte, in []uint64) // Constant time table access +//go:noescape func p256Select(point, table []uint64, idx int) + +//go:noescape func p256SelectBase(point, table []uint64, idx int) // Montgomery multiplication modulo Ord(G) +//go:noescape func p256OrdMul(res, in1, in2 []uint64) // Montgomery square modulo Ord(G), repeated n times +//go:noescape func p256OrdSqr(res, in []uint64, n int) // Point add with in2 being affine point // If sign == 1 -> in2 = -in2 // If sel == 0 -> res = in1 // if zero == 0 -> res = in2 +//go:noescape func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) // Point add. Returns one if the two input points were equal and zero // otherwise. (Note that, due to the way that the equations work out, some // representations of ∞ are considered equal to everything by this function.) +//go:noescape func p256PointAddAsm(res, in1, in2 []uint64) int // Point double +//go:noescape func p256PointDoubleAsm(res, in []uint64) func (curve p256Curve) Inverse(k *big.Int) *big.Int { diff --git a/src/crypto/elliptic/p256_asm_amd64.s b/src/crypto/elliptic/p256_asm_amd64.s index 73f0fdd159..3f9d624270 100644 --- a/src/crypto/elliptic/p256_asm_amd64.s +++ b/src/crypto/elliptic/p256_asm_amd64.s @@ -81,17 +81,23 @@ TEXT ·p256MovCond(SB),NOSPLIT,$0 PCMPEQL X13, X12 MOVOU X12, X0 - PANDN (16*0)(x_ptr), X0 + MOVOU (16*0)(x_ptr), X6 + PANDN X6, X0 MOVOU X12, X1 - PANDN (16*1)(x_ptr), X1 + MOVOU (16*1)(x_ptr), X7 + PANDN X7, X1 MOVOU X12, X2 - PANDN (16*2)(x_ptr), X2 + MOVOU (16*2)(x_ptr), X8 + PANDN X8, X2 MOVOU X12, X3 - PANDN (16*3)(x_ptr), X3 + MOVOU (16*3)(x_ptr), X9 + PANDN X9, X3 MOVOU X12, X4 - PANDN (16*4)(x_ptr), X4 + MOVOU (16*4)(x_ptr), X10 + PANDN X10, X4 MOVOU X12, X5 - PANDN (16*5)(x_ptr), X5 + MOVOU (16*5)(x_ptr), X11 + PANDN X11, X5 MOVOU (16*0)(y_ptr), X6 MOVOU (16*1)(y_ptr), X7 -- 2.50.0