From 1079e574d4f09520c987c7a0405c2d1ae2a275b3 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo@golang.org>
Date: Fri, 5 May 2023 11:29:39 +0200
Subject: [PATCH] crypto/rsa,crypto/internal/bigmod: optimized short
 exponentiations
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

RSA encryption and verification performs an exponentiation by a value
usually just a few bits long. The current strategy with table
precomputation is not efficient.

Add an ExpShort bigmod method, and use it in RSA public key operations.

After this, almost all CPU time in encryption/verification is spent
preparing the constants for the modulus, because PublicKey doesn't have
a Precompute function.

This speeds up signing a bit too, because it performs a verification to
protect against faults.

name                    old time/op  new time/op  delta
DecryptPKCS1v15/2048-4  1.13ms Â± 0%  1.13ms Â± 0%   -0.43%  (p=0.000 n=8+9)
DecryptPKCS1v15/3072-4  3.20ms Â± 0%  3.15ms Â± 0%   -1.59%  (p=0.000 n=10+8)
DecryptPKCS1v15/4096-4  6.45ms Â± 0%  6.42ms Â± 0%   -0.49%  (p=0.000 n=10+10)
EncryptPKCS1v15/2048-4   132Âµs Â± 0%   108Âµs Â± 0%  -17.99%  (p=0.000 n=10+10)
DecryptOAEP/2048-4      1.13ms Â± 0%  1.14ms Â± 0%   +0.91%  (p=0.000 n=10+10)
EncryptOAEP/2048-4       132Âµs Â± 0%   108Âµs Â± 0%  -18.09%  (p=0.000 n=10+10)
SignPKCS1v15/2048-4     1.18ms Â± 0%  1.14ms Â± 1%   -3.30%  (p=0.000 n=10+10)
VerifyPKCS1v15/2048-4    131Âµs Â± 0%   107Âµs Â± 0%  -18.30%  (p=0.000 n=9+10)
SignPSS/2048-4          1.18ms Â± 0%  1.15ms Â± 1%   -1.87%  (p=0.000 n=10+10)
VerifyPSS/2048-4         132Âµs Â± 0%   108Âµs Â± 0%  -18.30%  (p=0.000 n=10+9)

Updates #57752

Change-Id: Ic89273a58002b32b1c5c3185a35262694ceef409
Reviewed-on: https://go-review.googlesource.com/c/go/+/492935
Run-TryBot: Filippo Valsorda <filippo@golang.org>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
---
 src/crypto/internal/bigmod/nat.go      | 56 ++++++++++++++++++--------
 src/crypto/internal/bigmod/nat_test.go | 11 +++++
 src/crypto/rsa/rsa.go                  | 22 ++++------
 3 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/src/crypto/internal/bigmod/nat.go b/src/crypto/internal/bigmod/nat.go
index 3cad382b53..a08c12b76e 100644
--- a/src/crypto/internal/bigmod/nat.go
+++ b/src/crypto/internal/bigmod/nat.go
@@ -522,7 +522,7 @@ func (x *Nat) Add(y *Nat, m *Modulus) *Nat {
 func (x *Nat) montgomeryRepresentation(m *Modulus) *Nat {
 	// A Montgomery multiplication (which computes a * b / R) by R * R works out
 	// to a multiplication by R, which takes the value out of the Montgomery domain.
-	return x.montgomeryMul(NewNat().set(x), m.rr, m)
+	return x.montgomeryMul(x, m.rr, m)
 }
 
 // montgomeryReduction calculates x = x / R mod m, with R = 2^(_W * n) and
@@ -533,10 +533,9 @@ func (x *Nat) montgomeryReduction(m *Modulus) *Nat {
 	// By Montgomery multiplying with 1 not in Montgomery representation, we
 	// convert out back from Montgomery representation, because it works out to
 	// dividing by R.
-	t0 := NewNat().set(x)
-	t1 := NewNat().ExpandFor(m)
-	t1.limbs[0] = 1
-	return x.montgomeryMul(t0, t1, m)
+	one := NewNat().ExpandFor(m)
+	one.limbs[0] = 1
+	return x.montgomeryMul(x, one, m)
 }
 
 // montgomeryMul calculates x = a * b / R mod m, with R = 2^(_W * n) and
@@ -681,10 +680,10 @@ func addMulVVW(z, x []uint, y uint) (carry uint) {
 	return carry
 }
 
-// Mul calculates x *= y mod m.
+// Mul calculates x = x * y mod m.
 //
-// x and y must already be reduced modulo m, they must share its announced
-// length, and they may not alias.
+// The length of both operands must be the same as the modulus. Both operands
+// must already be reduced modulo m.
 func (x *Nat) Mul(y *Nat, m *Modulus) *Nat {
 	// A Montgomery multiplication by a value out of the Montgomery domain
 	// takes the result out of Montgomery representation.
@@ -716,28 +715,51 @@ func (out *Nat) Exp(x *Nat, e []byte, m *Modulus) *Nat {
 	out.resetFor(m)
 	out.limbs[0] = 1
 	out.montgomeryRepresentation(m)
-	t0 := NewNat().ExpandFor(m)
-	t1 := NewNat().ExpandFor(m)
+	tmp := NewNat().ExpandFor(m)
 	for _, b := range e {
 		for _, j := range []int{4, 0} {
 			// Square four times. Optimization note: this can be implemented
 			// more efficiently than with generic Montgomery multiplication.
-			t1.montgomeryMul(out, out, m)
-			out.montgomeryMul(t1, t1, m)
-			t1.montgomeryMul(out, out, m)
-			out.montgomeryMul(t1, t1, m)
+			out.montgomeryMul(out, out, m)
+			out.montgomeryMul(out, out, m)
+			out.montgomeryMul(out, out, m)
+			out.montgomeryMul(out, out, m)
 
 			// Select x^k in constant time from the table.
 			k := uint((b >> j) & 0b1111)
 			for i := range table {
-				t0.assign(ctEq(k, uint(i+1)), table[i])
+				tmp.assign(ctEq(k, uint(i+1)), table[i])
 			}
 
 			// Multiply by x^k, discarding the result if k = 0.
-			t1.montgomeryMul(out, t0, m)
-			out.assign(not(ctEq(k, 0)), t1)
+			tmp.montgomeryMul(out, tmp, m)
+			out.assign(not(ctEq(k, 0)), tmp)
 		}
 	}
 
 	return out.montgomeryReduction(m)
 }
+
+// ExpShort calculates out = x^e mod m.
+//
+// The output will be resized to the size of m and overwritten. x must already
+// be reduced modulo m. This leaks the exact bit size of the exponent.
+func (out *Nat) ExpShort(x *Nat, e uint, m *Modulus) *Nat {
+	xR := NewNat().set(x).montgomeryRepresentation(m)
+
+	out.resetFor(m)
+	out.limbs[0] = 1
+	out.montgomeryRepresentation(m)
+
+	// For short exponents, precomputing a table and using a window like in Exp
+	// doesn't pay off. Instead, we do a simple constant-time conditional
+	// square-and-multiply chain, skipping the initial run of zeroes.
+	tmp := NewNat().ExpandFor(m)
+	for i := bits.UintSize - bitLen(e); i < bits.UintSize; i++ {
+		out.montgomeryMul(out, out, m)
+		k := (e >> (bits.UintSize - i - 1)) & 1
+		tmp.montgomeryMul(out, xR, m)
+		out.assign(ctEq(k, 1), tmp)
+	}
+	return out.montgomeryReduction(m)
+}
diff --git a/src/crypto/internal/bigmod/nat_test.go b/src/crypto/internal/bigmod/nat_test.go
index cc5ffe7bb7..1c615b9888 100644
--- a/src/crypto/internal/bigmod/nat_test.go
+++ b/src/crypto/internal/bigmod/nat_test.go
@@ -299,6 +299,17 @@ func TestExp(t *testing.T) {
 	}
 }
 
+func TestExpShort(t *testing.T) {
+	m := modulusFromBytes([]byte{13})
+	x := &Nat{[]uint{3}}
+	out := &Nat{[]uint{0}}
+	out.ExpShort(x, 12, m)
+	expected := &Nat{[]uint{1}}
+	if out.Equal(expected) != 1 {
+		t.Errorf("%+v != %+v", out, expected)
+	}
+}
+
 // TestMulReductions tests that Mul reduces results equal or slightly greater
 // than the modulus. Some Montgomery algorithms don't and need extra care to
 // return correct results. See https://go.dev/issue/13907.
diff --git a/src/crypto/rsa/rsa.go b/src/crypto/rsa/rsa.go
index 6f0221d74b..1d01ff3ed1 100644
--- a/src/crypto/rsa/rsa.go
+++ b/src/crypto/rsa/rsa.go
@@ -33,7 +33,6 @@ import (
 	"crypto/internal/randutil"
 	"crypto/rand"
 	"crypto/subtle"
-	"encoding/binary"
 	"errors"
 	"hash"
 	"io"
@@ -462,25 +461,18 @@ var ErrMessageTooLong = errors.New("crypto/rsa: message too long for RSA key siz
 func encrypt(pub *PublicKey, plaintext []byte) ([]byte, error) {
 	boring.Unreachable()
 
+	// Most of the CPU time for encryption and verification is spent in this
+	// NewModulusFromBig call, because PublicKey doesn't have a Precomputed
+	// field. If performance becomes an issue, consider placing a private
+	// sync.Once on PublicKey to compute this.
 	N := bigmod.NewModulusFromBig(pub.N)
 	m, err := bigmod.NewNat().SetBytes(plaintext, N)
 	if err != nil {
 		return nil, err
 	}
-	e := intToBytes(pub.E)
+	e := uint(pub.E)
 
-	return bigmod.NewNat().Exp(m, e, N).Bytes(N), nil
-}
-
-// intToBytes returns i as a big-endian slice of bytes with no leading zeroes,
-// leaking only the bit size of i through timing side-channels.
-func intToBytes(i int) []byte {
-	b := make([]byte, 8)
-	binary.BigEndian.PutUint64(b, uint64(i))
-	for len(b) > 1 && b[0] == 0 {
-		b = b[1:]
-	}
-	return b
+	return bigmod.NewNat().ExpShort(m, e, N).Bytes(N), nil
 }
 
 // EncryptOAEP encrypts the given message with RSA-OAEP.
@@ -648,7 +640,7 @@ func decrypt(priv *PrivateKey, ciphertext []byte, check bool) ([]byte, error) {
 	}
 
 	if check {
-		c1 := bigmod.NewNat().Exp(m, intToBytes(priv.E), N)
+		c1 := bigmod.NewNat().ExpShort(m, uint(priv.E), N)
 		if c1.Equal(c) != 1 {
 			return nil, ErrDecryption
 		}
-- 
2.50.0