From 1079e574d4f09520c987c7a0405c2d1ae2a275b3 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 5 May 2023 11:29:39 +0200 Subject: [PATCH] crypto/rsa,crypto/internal/bigmod: optimized short exponentiations MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit RSA encryption and verification performs an exponentiation by a value usually just a few bits long. The current strategy with table precomputation is not efficient. Add an ExpShort bigmod method, and use it in RSA public key operations. After this, almost all CPU time in encryption/verification is spent preparing the constants for the modulus, because PublicKey doesn't have a Precompute function. This speeds up signing a bit too, because it performs a verification to protect against faults. name old time/op new time/op delta DecryptPKCS1v15/2048-4 1.13ms ± 0% 1.13ms ± 0% -0.43% (p=0.000 n=8+9) DecryptPKCS1v15/3072-4 3.20ms ± 0% 3.15ms ± 0% -1.59% (p=0.000 n=10+8) DecryptPKCS1v15/4096-4 6.45ms ± 0% 6.42ms ± 0% -0.49% (p=0.000 n=10+10) EncryptPKCS1v15/2048-4 132µs ± 0% 108µs ± 0% -17.99% (p=0.000 n=10+10) DecryptOAEP/2048-4 1.13ms ± 0% 1.14ms ± 0% +0.91% (p=0.000 n=10+10) EncryptOAEP/2048-4 132µs ± 0% 108µs ± 0% -18.09% (p=0.000 n=10+10) SignPKCS1v15/2048-4 1.18ms ± 0% 1.14ms ± 1% -3.30% (p=0.000 n=10+10) VerifyPKCS1v15/2048-4 131µs ± 0% 107µs ± 0% -18.30% (p=0.000 n=9+10) SignPSS/2048-4 1.18ms ± 0% 1.15ms ± 1% -1.87% (p=0.000 n=10+10) VerifyPSS/2048-4 132µs ± 0% 108µs ± 0% -18.30% (p=0.000 n=10+9) Updates #57752 Change-Id: Ic89273a58002b32b1c5c3185a35262694ceef409 Reviewed-on: https://go-review.googlesource.com/c/go/+/492935 Run-TryBot: Filippo Valsorda Auto-Submit: Filippo Valsorda Reviewed-by: Roland Shoemaker TryBot-Result: Gopher Robot Reviewed-by: Matthew Dempsky --- src/crypto/internal/bigmod/nat.go | 56 ++++++++++++++++++-------- src/crypto/internal/bigmod/nat_test.go | 11 +++++ src/crypto/rsa/rsa.go | 22 ++++------ 3 files changed, 57 insertions(+), 32 deletions(-) diff --git a/src/crypto/internal/bigmod/nat.go b/src/crypto/internal/bigmod/nat.go index 3cad382b53..a08c12b76e 100644 --- a/src/crypto/internal/bigmod/nat.go +++ b/src/crypto/internal/bigmod/nat.go @@ -522,7 +522,7 @@ func (x *Nat) Add(y *Nat, m *Modulus) *Nat { func (x *Nat) montgomeryRepresentation(m *Modulus) *Nat { // A Montgomery multiplication (which computes a * b / R) by R * R works out // to a multiplication by R, which takes the value out of the Montgomery domain. - return x.montgomeryMul(NewNat().set(x), m.rr, m) + return x.montgomeryMul(x, m.rr, m) } // montgomeryReduction calculates x = x / R mod m, with R = 2^(_W * n) and @@ -533,10 +533,9 @@ func (x *Nat) montgomeryReduction(m *Modulus) *Nat { // By Montgomery multiplying with 1 not in Montgomery representation, we // convert out back from Montgomery representation, because it works out to // dividing by R. - t0 := NewNat().set(x) - t1 := NewNat().ExpandFor(m) - t1.limbs[0] = 1 - return x.montgomeryMul(t0, t1, m) + one := NewNat().ExpandFor(m) + one.limbs[0] = 1 + return x.montgomeryMul(x, one, m) } // montgomeryMul calculates x = a * b / R mod m, with R = 2^(_W * n) and @@ -681,10 +680,10 @@ func addMulVVW(z, x []uint, y uint) (carry uint) { return carry } -// Mul calculates x *= y mod m. +// Mul calculates x = x * y mod m. // -// x and y must already be reduced modulo m, they must share its announced -// length, and they may not alias. +// The length of both operands must be the same as the modulus. Both operands +// must already be reduced modulo m. func (x *Nat) Mul(y *Nat, m *Modulus) *Nat { // A Montgomery multiplication by a value out of the Montgomery domain // takes the result out of Montgomery representation. @@ -716,28 +715,51 @@ func (out *Nat) Exp(x *Nat, e []byte, m *Modulus) *Nat { out.resetFor(m) out.limbs[0] = 1 out.montgomeryRepresentation(m) - t0 := NewNat().ExpandFor(m) - t1 := NewNat().ExpandFor(m) + tmp := NewNat().ExpandFor(m) for _, b := range e { for _, j := range []int{4, 0} { // Square four times. Optimization note: this can be implemented // more efficiently than with generic Montgomery multiplication. - t1.montgomeryMul(out, out, m) - out.montgomeryMul(t1, t1, m) - t1.montgomeryMul(out, out, m) - out.montgomeryMul(t1, t1, m) + out.montgomeryMul(out, out, m) + out.montgomeryMul(out, out, m) + out.montgomeryMul(out, out, m) + out.montgomeryMul(out, out, m) // Select x^k in constant time from the table. k := uint((b >> j) & 0b1111) for i := range table { - t0.assign(ctEq(k, uint(i+1)), table[i]) + tmp.assign(ctEq(k, uint(i+1)), table[i]) } // Multiply by x^k, discarding the result if k = 0. - t1.montgomeryMul(out, t0, m) - out.assign(not(ctEq(k, 0)), t1) + tmp.montgomeryMul(out, tmp, m) + out.assign(not(ctEq(k, 0)), tmp) } } return out.montgomeryReduction(m) } + +// ExpShort calculates out = x^e mod m. +// +// The output will be resized to the size of m and overwritten. x must already +// be reduced modulo m. This leaks the exact bit size of the exponent. +func (out *Nat) ExpShort(x *Nat, e uint, m *Modulus) *Nat { + xR := NewNat().set(x).montgomeryRepresentation(m) + + out.resetFor(m) + out.limbs[0] = 1 + out.montgomeryRepresentation(m) + + // For short exponents, precomputing a table and using a window like in Exp + // doesn't pay off. Instead, we do a simple constant-time conditional + // square-and-multiply chain, skipping the initial run of zeroes. + tmp := NewNat().ExpandFor(m) + for i := bits.UintSize - bitLen(e); i < bits.UintSize; i++ { + out.montgomeryMul(out, out, m) + k := (e >> (bits.UintSize - i - 1)) & 1 + tmp.montgomeryMul(out, xR, m) + out.assign(ctEq(k, 1), tmp) + } + return out.montgomeryReduction(m) +} diff --git a/src/crypto/internal/bigmod/nat_test.go b/src/crypto/internal/bigmod/nat_test.go index cc5ffe7bb7..1c615b9888 100644 --- a/src/crypto/internal/bigmod/nat_test.go +++ b/src/crypto/internal/bigmod/nat_test.go @@ -299,6 +299,17 @@ func TestExp(t *testing.T) { } } +func TestExpShort(t *testing.T) { + m := modulusFromBytes([]byte{13}) + x := &Nat{[]uint{3}} + out := &Nat{[]uint{0}} + out.ExpShort(x, 12, m) + expected := &Nat{[]uint{1}} + if out.Equal(expected) != 1 { + t.Errorf("%+v != %+v", out, expected) + } +} + // TestMulReductions tests that Mul reduces results equal or slightly greater // than the modulus. Some Montgomery algorithms don't and need extra care to // return correct results. See https://go.dev/issue/13907. diff --git a/src/crypto/rsa/rsa.go b/src/crypto/rsa/rsa.go index 6f0221d74b..1d01ff3ed1 100644 --- a/src/crypto/rsa/rsa.go +++ b/src/crypto/rsa/rsa.go @@ -33,7 +33,6 @@ import ( "crypto/internal/randutil" "crypto/rand" "crypto/subtle" - "encoding/binary" "errors" "hash" "io" @@ -462,25 +461,18 @@ var ErrMessageTooLong = errors.New("crypto/rsa: message too long for RSA key siz func encrypt(pub *PublicKey, plaintext []byte) ([]byte, error) { boring.Unreachable() + // Most of the CPU time for encryption and verification is spent in this + // NewModulusFromBig call, because PublicKey doesn't have a Precomputed + // field. If performance becomes an issue, consider placing a private + // sync.Once on PublicKey to compute this. N := bigmod.NewModulusFromBig(pub.N) m, err := bigmod.NewNat().SetBytes(plaintext, N) if err != nil { return nil, err } - e := intToBytes(pub.E) + e := uint(pub.E) - return bigmod.NewNat().Exp(m, e, N).Bytes(N), nil -} - -// intToBytes returns i as a big-endian slice of bytes with no leading zeroes, -// leaking only the bit size of i through timing side-channels. -func intToBytes(i int) []byte { - b := make([]byte, 8) - binary.BigEndian.PutUint64(b, uint64(i)) - for len(b) > 1 && b[0] == 0 { - b = b[1:] - } - return b + return bigmod.NewNat().ExpShort(m, e, N).Bytes(N), nil } // EncryptOAEP encrypts the given message with RSA-OAEP. @@ -648,7 +640,7 @@ func decrypt(priv *PrivateKey, ciphertext []byte, check bool) ([]byte, error) { } if check { - c1 := bigmod.NewNat().Exp(m, intToBytes(priv.E), N) + c1 := bigmod.NewNat().ExpShort(m, uint(priv.E), N) if c1.Equal(c) != 1 { return nil, ErrDecryption } -- 2.48.1