From 50b1add5a73f05b0142adf783b6825c0c149882b Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo@golang.org>
Date: Wed, 4 May 2022 17:49:37 -0400
Subject: [PATCH] crypto/elliptic: precompute ScalarBaseMult doublings
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

name                    old time/op    new time/op    delta
pkg:crypto/ecdsa goos:darwin goarch:amd64
Sign/P224-16               250Âµs Â± 2%      91Âµs Â± 2%  -63.42%  (p=0.000 n=10+9)
Sign/P384-16               955Âµs Â± 3%     311Âµs Â± 2%  -67.48%  (p=0.000 n=10+10)
Sign/P521-16              2.74ms Â± 2%    0.82ms Â± 2%  -69.95%  (p=0.000 n=10+10)
Verify/P224-16             440Âµs Â± 3%     282Âµs Â± 5%  -35.94%  (p=0.000 n=9+10)
Verify/P384-16            1.72ms Â± 2%    1.07ms Â± 1%  -38.02%  (p=0.000 n=10+9)
Verify/P521-16            5.10ms Â± 2%    3.18ms Â± 3%  -37.70%  (p=0.000 n=10+10)
GenerateKey/P224-16        225Âµs Â± 3%      67Âµs Â± 4%  -70.42%  (p=0.000 n=9+10)
GenerateKey/P384-16        881Âµs Â± 1%     241Âµs Â± 2%  -72.67%  (p=0.000 n=10+10)
GenerateKey/P521-16       2.62ms Â± 3%    0.69ms Â± 3%  -73.78%  (p=0.000 n=10+9)
pkg:crypto/elliptic/internal/nistec goos:darwin goarch:amd64
ScalarMult/P224-16         219Âµs Â± 4%     209Âµs Â± 3%   -4.57%  (p=0.003 n=10+10)
ScalarMult/P384-16         838Âµs Â± 2%     823Âµs Â± 1%   -1.72%  (p=0.004 n=10+9)
ScalarMult/P521-16        2.48ms Â± 2%    2.45ms Â± 2%     ~     (p=0.052 n=10+10)
ScalarBaseMult/P224-16     214Âµs Â± 4%      54Âµs Â± 4%  -74.88%  (p=0.000 n=10+10)
ScalarBaseMult/P384-16     828Âµs Â± 2%     196Âµs Â± 3%  -76.38%  (p=0.000 n=10+10)
ScalarBaseMult/P521-16    2.50ms Â± 3%    0.55ms Â± 2%  -77.96%  (p=0.000 n=10+10)

Updates #52424
For #52182

Change-Id: I2be3c2b8cdeead512063ef489e43805f4ee71d0f
Reviewed-on: https://go-review.googlesource.com/c/go/+/404174
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Fernando Lobato Meeser <felobato@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
---
 .../elliptic/internal/nistec/generate.go      | 115 ++++++++++++++----
 src/crypto/elliptic/internal/nistec/p224.go   | 115 ++++++++++++++----
 src/crypto/elliptic/internal/nistec/p256.go   | 115 ++++++++++++++----
 src/crypto/elliptic/internal/nistec/p384.go   | 115 ++++++++++++++----
 src/crypto/elliptic/internal/nistec/p521.go   | 115 ++++++++++++++----
 5 files changed, 465 insertions(+), 110 deletions(-)

diff --git a/src/crypto/elliptic/internal/nistec/generate.go b/src/crypto/elliptic/internal/nistec/generate.go
index f3726a06ca..fbca6c3741 100644
--- a/src/crypto/elliptic/internal/nistec/generate.go
+++ b/src/crypto/elliptic/internal/nistec/generate.go
@@ -97,12 +97,15 @@ import (
 	"crypto/elliptic/internal/fiat"
 	"crypto/subtle"
 	"errors"
+	"sync"
 )
 
 var {{.p}}B, _ = new({{.Element}}).SetBytes({{.B}})
 
 var {{.p}}G, _ = New{{.P}}Point().SetBytes({{.G}})
 
+// {{.p}}ElementLength is the length of an element of the base or scalar field,
+// which have the same bytes length for all NIST P curves.
 const {{.p}}ElementLength = {{ .ElementLen }}
 
 // {{.P}}Point is a {{.P}} point. The zero value is NOT valid.
@@ -329,34 +332,54 @@ func (q *{{.P}}Point) Select(p1, p2 *{{.P}}Point, cond int) *{{.P}}Point {
 	return q
 }
 
+// A {{.p}}Table holds the first 15 multiples of a point at offset -1, so [1]P
+// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity
+// point.
+type {{.p}}Table [15]*{{.P}}Point
+
+// Select selects the n-th multiple of the table base point into p. It works in
+// constant time by iterating over every entry of the table. n must be in [0, 15].
+func (table *{{.p}}Table) Select(p *{{.P}}Point, n uint8) {
+	if n >= 16 {
+		panic("nistec: internal error: {{.p}}Table called with out-of-bounds value")
+	}
+	p.Set(New{{.P}}Point())
+	for i := uint8(1); i < 16; i++ {
+		cond := subtle.ConstantTimeByteEq(i, n)
+		p.Select(table[i-1], p, cond)
+	}
+}
+
 // ScalarMult sets p = scalar * q, and returns p.
 func (p *{{.P}}Point) ScalarMult(q *{{.P}}Point, scalar []byte) (*{{.P}}Point, error) {
-	// table holds the first 16 multiples of q. The explicit new{{.P}}Point calls
-	// get inlined, letting the allocations live on the stack.
-	var table = [16]*{{.P}}Point{
-		New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(),
-		New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(),
+	// Compute a {{.p}}Table for the base point q. The explicit New{{.P}}Point
+	// calls get inlined, letting the allocations live on the stack.
+	var table = {{.p}}Table{New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(),
 		New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(),
 		New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(),
-	}
-	for i := 1; i < 16; i++ {
-        table[i].Add(table[i-1], q)
+		New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point()}
+	table[0].Set(q)
+	for i := 1; i < 15; i += 2 {
+		table[i].Double(table[i/2])
+		table[i+1].Add(table[i], q)
 	}
 
 	// Instead of doing the classic double-and-add chain, we do it with a
 	// four-bit window: we double four times, and then add [0-15]P.
 	t := New{{.P}}Point()
 	p.Set(New{{.P}}Point())
-	for _, byte := range scalar {
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte>>4, i)
-			t.Select(table[i], t, cond)
+	for i, byte := range scalar {
+		// No need to double on the first iteration, as p is the identity at
+		// this point, and [N]â = â.
+		if i != 0 {
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
 		}
+
+		windowValue := byte >> 4
+		table.Select(t, windowValue)
 		p.Add(p, t)
 
 		p.Double(p)
@@ -364,19 +387,67 @@ func (p *{{.P}}Point) ScalarMult(q *{{.P}}Point, scalar []byte) (*{{.P}}Point, e
 		p.Double(p)
 		p.Double(p)
 
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte&0b1111, i)
-			t.Select(table[i], t, cond)
-		}
+		windowValue = byte & 0b1111
+		table.Select(t, windowValue)
 		p.Add(p, t)
 	}
 
 	return p, nil
 }
 
+var {{.p}}GeneratorTable *[{{.p}}ElementLength * 2]{{.p}}Table
+var {{.p}}GeneratorTableOnce sync.Once
+
+// generatorTable returns a sequence of {{.p}}Tables. The first table contains
+// multiples of G. Each successive table is the previous table doubled four
+// times.
+func (p *{{.P}}Point) generatorTable() *[{{.p}}ElementLength * 2]{{.p}}Table {
+	{{.p}}GeneratorTableOnce.Do(func() {
+		{{.p}}GeneratorTable = new([{{.p}}ElementLength * 2]{{.p}}Table)
+		base := New{{.P}}Generator()
+		for i := 0; i < {{.p}}ElementLength*2; i++ {
+			{{.p}}GeneratorTable[i][0] = New{{.P}}Point().Set(base)
+			for j := 1; j < 15; j++ {
+				{{.p}}GeneratorTable[i][j] = New{{.P}}Point().Add({{.p}}GeneratorTable[i][j-1], base)
+			}
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+		}
+	})
+	return {{.p}}GeneratorTable
+}
+
 // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and
 // returns p.
 func (p *{{.P}}Point) ScalarBaseMult(scalar []byte) (*{{.P}}Point, error) {
-	return p.ScalarMult(New{{.P}}Generator(), scalar)
+	if len(scalar) != {{.p}}ElementLength {
+		return nil, errors.New("invalid scalar length")
+	}
+	tables := p.generatorTable()
+
+	// This is also a scalar multiplication with a four-bit window like in
+	// ScalarMult, but in this case the doublings are precomputed. The value
+	// [windowValue]G added at iteration k would normally get doubled
+	// (totIterations-k)Ã4 times, but with a larger precomputation we can
+	// instead add [2^((totIterations-k)Ã4)][windowValue]G and avoid the
+	// doublings between iterations.
+	t := New{{.P}}Point()
+	p.Set(New{{.P}}Point())
+	tableIndex := len(tables) - 1
+	for _, byte := range scalar {
+		windowValue := byte >> 4
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+
+		windowValue = byte & 0b1111
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+	}
+
+	return p, nil
 }
 `
diff --git a/src/crypto/elliptic/internal/nistec/p224.go b/src/crypto/elliptic/internal/nistec/p224.go
index 7f3dcca742..0db4ba1316 100644
--- a/src/crypto/elliptic/internal/nistec/p224.go
+++ b/src/crypto/elliptic/internal/nistec/p224.go
@@ -10,12 +10,15 @@ import (
 	"crypto/elliptic/internal/fiat"
 	"crypto/subtle"
 	"errors"
+	"sync"
 )
 
 var p224B, _ = new(fiat.P224Element).SetBytes([]byte{0xb4, 0x5, 0xa, 0x85, 0xc, 0x4, 0xb3, 0xab, 0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7, 0xd7, 0xbf, 0xd8, 0xba, 0x27, 0xb, 0x39, 0x43, 0x23, 0x55, 0xff, 0xb4})
 
 var p224G, _ = NewP224Point().SetBytes([]byte{0x4, 0xb7, 0xe, 0xc, 0xbd, 0x6b, 0xb4, 0xbf, 0x7f, 0x32, 0x13, 0x90, 0xb9, 0x4a, 0x3, 0xc1, 0xd3, 0x56, 0xc2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xd6, 0x11, 0x5c, 0x1d, 0x21, 0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, 0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x7, 0x47, 0x64, 0x44, 0xd5, 0x81, 0x99, 0x85, 0x0, 0x7e, 0x34})
 
+// p224ElementLength is the length of an element of the base or scalar field,
+// which have the same bytes length for all NIST P curves.
 const p224ElementLength = 28
 
 // P224Point is a P224 point. The zero value is NOT valid.
@@ -242,34 +245,54 @@ func (q *P224Point) Select(p1, p2 *P224Point, cond int) *P224Point {
 	return q
 }
 
+// A p224Table holds the first 15 multiples of a point at offset -1, so [1]P
+// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity
+// point.
+type p224Table [15]*P224Point
+
+// Select selects the n-th multiple of the table base point into p. It works in
+// constant time by iterating over every entry of the table. n must be in [0, 15].
+func (table *p224Table) Select(p *P224Point, n uint8) {
+	if n >= 16 {
+		panic("nistec: internal error: p224Table called with out-of-bounds value")
+	}
+	p.Set(NewP224Point())
+	for i := uint8(1); i < 16; i++ {
+		cond := subtle.ConstantTimeByteEq(i, n)
+		p.Select(table[i-1], p, cond)
+	}
+}
+
 // ScalarMult sets p = scalar * q, and returns p.
 func (p *P224Point) ScalarMult(q *P224Point, scalar []byte) (*P224Point, error) {
-	// table holds the first 16 multiples of q. The explicit newP224Point calls
-	// get inlined, letting the allocations live on the stack.
-	var table = [16]*P224Point{
-		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
-		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
+	// Compute a p224Table for the base point q. The explicit NewP224Point
+	// calls get inlined, letting the allocations live on the stack.
+	var table = p224Table{NewP224Point(), NewP224Point(), NewP224Point(),
 		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
 		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(),
-	}
-	for i := 1; i < 16; i++ {
-		table[i].Add(table[i-1], q)
+		NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point()}
+	table[0].Set(q)
+	for i := 1; i < 15; i += 2 {
+		table[i].Double(table[i/2])
+		table[i+1].Add(table[i], q)
 	}
 
 	// Instead of doing the classic double-and-add chain, we do it with a
 	// four-bit window: we double four times, and then add [0-15]P.
 	t := NewP224Point()
 	p.Set(NewP224Point())
-	for _, byte := range scalar {
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte>>4, i)
-			t.Select(table[i], t, cond)
+	for i, byte := range scalar {
+		// No need to double on the first iteration, as p is the identity at
+		// this point, and [N]â = â.
+		if i != 0 {
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
 		}
+
+		windowValue := byte >> 4
+		table.Select(t, windowValue)
 		p.Add(p, t)
 
 		p.Double(p)
@@ -277,18 +300,66 @@ func (p *P224Point) ScalarMult(q *P224Point, scalar []byte) (*P224Point, error)
 		p.Double(p)
 		p.Double(p)
 
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte&0b1111, i)
-			t.Select(table[i], t, cond)
-		}
+		windowValue = byte & 0b1111
+		table.Select(t, windowValue)
 		p.Add(p, t)
 	}
 
 	return p, nil
 }
 
+var p224GeneratorTable *[p224ElementLength * 2]p224Table
+var p224GeneratorTableOnce sync.Once
+
+// generatorTable returns a sequence of p224Tables. The first table contains
+// multiples of G. Each successive table is the previous table doubled four
+// times.
+func (p *P224Point) generatorTable() *[p224ElementLength * 2]p224Table {
+	p224GeneratorTableOnce.Do(func() {
+		p224GeneratorTable = new([p224ElementLength * 2]p224Table)
+		base := NewP224Generator()
+		for i := 0; i < p224ElementLength*2; i++ {
+			p224GeneratorTable[i][0] = NewP224Point().Set(base)
+			for j := 1; j < 15; j++ {
+				p224GeneratorTable[i][j] = NewP224Point().Add(p224GeneratorTable[i][j-1], base)
+			}
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+		}
+	})
+	return p224GeneratorTable
+}
+
 // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and
 // returns p.
 func (p *P224Point) ScalarBaseMult(scalar []byte) (*P224Point, error) {
-	return p.ScalarMult(NewP224Generator(), scalar)
+	if len(scalar) != p224ElementLength {
+		return nil, errors.New("invalid scalar length")
+	}
+	tables := p.generatorTable()
+
+	// This is also a scalar multiplication with a four-bit window like in
+	// ScalarMult, but in this case the doublings are precomputed. The value
+	// [windowValue]G added at iteration k would normally get doubled
+	// (totIterations-k)Ã4 times, but with a larger precomputation we can
+	// instead add [2^((totIterations-k)Ã4)][windowValue]G and avoid the
+	// doublings between iterations.
+	t := NewP224Point()
+	p.Set(NewP224Point())
+	tableIndex := len(tables) - 1
+	for _, byte := range scalar {
+		windowValue := byte >> 4
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+
+		windowValue = byte & 0b1111
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+	}
+
+	return p, nil
 }
diff --git a/src/crypto/elliptic/internal/nistec/p256.go b/src/crypto/elliptic/internal/nistec/p256.go
index c288a2d75f..81812df159 100644
--- a/src/crypto/elliptic/internal/nistec/p256.go
+++ b/src/crypto/elliptic/internal/nistec/p256.go
@@ -12,12 +12,15 @@ import (
 	"crypto/elliptic/internal/fiat"
 	"crypto/subtle"
 	"errors"
+	"sync"
 )
 
 var p256B, _ = new(fiat.P256Element).SetBytes([]byte{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, 0x65, 0x1d, 0x6, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b})
 
 var p256G, _ = NewP256Point().SetBytes([]byte{0x4, 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x3, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96, 0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0xf, 0x9e, 0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5})
 
+// p256ElementLength is the length of an element of the base or scalar field,
+// which have the same bytes length for all NIST P curves.
 const p256ElementLength = 32
 
 // P256Point is a P256 point. The zero value is NOT valid.
@@ -244,34 +247,54 @@ func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
 	return q
 }
 
+// A p256Table holds the first 15 multiples of a point at offset -1, so [1]P
+// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity
+// point.
+type p256Table [15]*P256Point
+
+// Select selects the n-th multiple of the table base point into p. It works in
+// constant time by iterating over every entry of the table. n must be in [0, 15].
+func (table *p256Table) Select(p *P256Point, n uint8) {
+	if n >= 16 {
+		panic("nistec: internal error: p256Table called with out-of-bounds value")
+	}
+	p.Set(NewP256Point())
+	for i := uint8(1); i < 16; i++ {
+		cond := subtle.ConstantTimeByteEq(i, n)
+		p.Select(table[i-1], p, cond)
+	}
+}
+
 // ScalarMult sets p = scalar * q, and returns p.
 func (p *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) {
-	// table holds the first 16 multiples of q. The explicit newP256Point calls
-	// get inlined, letting the allocations live on the stack.
-	var table = [16]*P256Point{
-		NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(),
-		NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(),
+	// Compute a p256Table for the base point q. The explicit NewP256Point
+	// calls get inlined, letting the allocations live on the stack.
+	var table = p256Table{NewP256Point(), NewP256Point(), NewP256Point(),
 		NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(),
 		NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(),
-	}
-	for i := 1; i < 16; i++ {
-		table[i].Add(table[i-1], q)
+		NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point()}
+	table[0].Set(q)
+	for i := 1; i < 15; i += 2 {
+		table[i].Double(table[i/2])
+		table[i+1].Add(table[i], q)
 	}
 
 	// Instead of doing the classic double-and-add chain, we do it with a
 	// four-bit window: we double four times, and then add [0-15]P.
 	t := NewP256Point()
 	p.Set(NewP256Point())
-	for _, byte := range scalar {
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte>>4, i)
-			t.Select(table[i], t, cond)
+	for i, byte := range scalar {
+		// No need to double on the first iteration, as p is the identity at
+		// this point, and [N]â = â.
+		if i != 0 {
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
 		}
+
+		windowValue := byte >> 4
+		table.Select(t, windowValue)
 		p.Add(p, t)
 
 		p.Double(p)
@@ -279,18 +302,66 @@ func (p *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error)
 		p.Double(p)
 		p.Double(p)
 
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte&0b1111, i)
-			t.Select(table[i], t, cond)
-		}
+		windowValue = byte & 0b1111
+		table.Select(t, windowValue)
 		p.Add(p, t)
 	}
 
 	return p, nil
 }
 
+var p256GeneratorTable *[p256ElementLength * 2]p256Table
+var p256GeneratorTableOnce sync.Once
+
+// generatorTable returns a sequence of p256Tables. The first table contains
+// multiples of G. Each successive table is the previous table doubled four
+// times.
+func (p *P256Point) generatorTable() *[p256ElementLength * 2]p256Table {
+	p256GeneratorTableOnce.Do(func() {
+		p256GeneratorTable = new([p256ElementLength * 2]p256Table)
+		base := NewP256Generator()
+		for i := 0; i < p256ElementLength*2; i++ {
+			p256GeneratorTable[i][0] = NewP256Point().Set(base)
+			for j := 1; j < 15; j++ {
+				p256GeneratorTable[i][j] = NewP256Point().Add(p256GeneratorTable[i][j-1], base)
+			}
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+		}
+	})
+	return p256GeneratorTable
+}
+
 // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and
 // returns p.
 func (p *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) {
-	return p.ScalarMult(NewP256Generator(), scalar)
+	if len(scalar) != p256ElementLength {
+		return nil, errors.New("invalid scalar length")
+	}
+	tables := p.generatorTable()
+
+	// This is also a scalar multiplication with a four-bit window like in
+	// ScalarMult, but in this case the doublings are precomputed. The value
+	// [windowValue]G added at iteration k would normally get doubled
+	// (totIterations-k)Ã4 times, but with a larger precomputation we can
+	// instead add [2^((totIterations-k)Ã4)][windowValue]G and avoid the
+	// doublings between iterations.
+	t := NewP256Point()
+	p.Set(NewP256Point())
+	tableIndex := len(tables) - 1
+	for _, byte := range scalar {
+		windowValue := byte >> 4
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+
+		windowValue = byte & 0b1111
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+	}
+
+	return p, nil
 }
diff --git a/src/crypto/elliptic/internal/nistec/p384.go b/src/crypto/elliptic/internal/nistec/p384.go
index dff3feefb8..1830149b2b 100644
--- a/src/crypto/elliptic/internal/nistec/p384.go
+++ b/src/crypto/elliptic/internal/nistec/p384.go
@@ -10,12 +10,15 @@ import (
 	"crypto/elliptic/internal/fiat"
 	"crypto/subtle"
 	"errors"
+	"sync"
 )
 
 var p384B, _ = new(fiat.P384Element).SetBytes([]byte{0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4, 0x98, 0x8e, 0x5, 0x6b, 0xe3, 0xf8, 0x2d, 0x19, 0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12, 0x3, 0x14, 0x8, 0x8f, 0x50, 0x13, 0x87, 0x5a, 0xc6, 0x56, 0x39, 0x8d, 0x8a, 0x2e, 0xd1, 0x9d, 0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef})
 
 var p384G, _ = NewP384Point().SetBytes([]byte{0x4, 0xaa, 0x87, 0xca, 0x22, 0xbe, 0x8b, 0x5, 0x37, 0x8e, 0xb1, 0xc7, 0x1e, 0xf3, 0x20, 0xad, 0x74, 0x6e, 0x1d, 0x3b, 0x62, 0x8b, 0xa7, 0x9b, 0x98, 0x59, 0xf7, 0x41, 0xe0, 0x82, 0x54, 0x2a, 0x38, 0x55, 0x2, 0xf2, 0x5d, 0xbf, 0x55, 0x29, 0x6c, 0x3a, 0x54, 0x5e, 0x38, 0x72, 0x76, 0xa, 0xb7, 0x36, 0x17, 0xde, 0x4a, 0x96, 0x26, 0x2c, 0x6f, 0x5d, 0x9e, 0x98, 0xbf, 0x92, 0x92, 0xdc, 0x29, 0xf8, 0xf4, 0x1d, 0xbd, 0x28, 0x9a, 0x14, 0x7c, 0xe9, 0xda, 0x31, 0x13, 0xb5, 0xf0, 0xb8, 0xc0, 0xa, 0x60, 0xb1, 0xce, 0x1d, 0x7e, 0x81, 0x9d, 0x7a, 0x43, 0x1d, 0x7c, 0x90, 0xea, 0xe, 0x5f})
 
+// p384ElementLength is the length of an element of the base or scalar field,
+// which have the same bytes length for all NIST P curves.
 const p384ElementLength = 48
 
 // P384Point is a P384 point. The zero value is NOT valid.
@@ -242,34 +245,54 @@ func (q *P384Point) Select(p1, p2 *P384Point, cond int) *P384Point {
 	return q
 }
 
+// A p384Table holds the first 15 multiples of a point at offset -1, so [1]P
+// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity
+// point.
+type p384Table [15]*P384Point
+
+// Select selects the n-th multiple of the table base point into p. It works in
+// constant time by iterating over every entry of the table. n must be in [0, 15].
+func (table *p384Table) Select(p *P384Point, n uint8) {
+	if n >= 16 {
+		panic("nistec: internal error: p384Table called with out-of-bounds value")
+	}
+	p.Set(NewP384Point())
+	for i := uint8(1); i < 16; i++ {
+		cond := subtle.ConstantTimeByteEq(i, n)
+		p.Select(table[i-1], p, cond)
+	}
+}
+
 // ScalarMult sets p = scalar * q, and returns p.
 func (p *P384Point) ScalarMult(q *P384Point, scalar []byte) (*P384Point, error) {
-	// table holds the first 16 multiples of q. The explicit newP384Point calls
-	// get inlined, letting the allocations live on the stack.
-	var table = [16]*P384Point{
-		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
-		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
+	// Compute a p384Table for the base point q. The explicit NewP384Point
+	// calls get inlined, letting the allocations live on the stack.
+	var table = p384Table{NewP384Point(), NewP384Point(), NewP384Point(),
 		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
 		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(),
-	}
-	for i := 1; i < 16; i++ {
-		table[i].Add(table[i-1], q)
+		NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point()}
+	table[0].Set(q)
+	for i := 1; i < 15; i += 2 {
+		table[i].Double(table[i/2])
+		table[i+1].Add(table[i], q)
 	}
 
 	// Instead of doing the classic double-and-add chain, we do it with a
 	// four-bit window: we double four times, and then add [0-15]P.
 	t := NewP384Point()
 	p.Set(NewP384Point())
-	for _, byte := range scalar {
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte>>4, i)
-			t.Select(table[i], t, cond)
+	for i, byte := range scalar {
+		// No need to double on the first iteration, as p is the identity at
+		// this point, and [N]â = â.
+		if i != 0 {
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
 		}
+
+		windowValue := byte >> 4
+		table.Select(t, windowValue)
 		p.Add(p, t)
 
 		p.Double(p)
@@ -277,18 +300,66 @@ func (p *P384Point) ScalarMult(q *P384Point, scalar []byte) (*P384Point, error)
 		p.Double(p)
 		p.Double(p)
 
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte&0b1111, i)
-			t.Select(table[i], t, cond)
-		}
+		windowValue = byte & 0b1111
+		table.Select(t, windowValue)
 		p.Add(p, t)
 	}
 
 	return p, nil
 }
 
+var p384GeneratorTable *[p384ElementLength * 2]p384Table
+var p384GeneratorTableOnce sync.Once
+
+// generatorTable returns a sequence of p384Tables. The first table contains
+// multiples of G. Each successive table is the previous table doubled four
+// times.
+func (p *P384Point) generatorTable() *[p384ElementLength * 2]p384Table {
+	p384GeneratorTableOnce.Do(func() {
+		p384GeneratorTable = new([p384ElementLength * 2]p384Table)
+		base := NewP384Generator()
+		for i := 0; i < p384ElementLength*2; i++ {
+			p384GeneratorTable[i][0] = NewP384Point().Set(base)
+			for j := 1; j < 15; j++ {
+				p384GeneratorTable[i][j] = NewP384Point().Add(p384GeneratorTable[i][j-1], base)
+			}
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+		}
+	})
+	return p384GeneratorTable
+}
+
 // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and
 // returns p.
 func (p *P384Point) ScalarBaseMult(scalar []byte) (*P384Point, error) {
-	return p.ScalarMult(NewP384Generator(), scalar)
+	if len(scalar) != p384ElementLength {
+		return nil, errors.New("invalid scalar length")
+	}
+	tables := p.generatorTable()
+
+	// This is also a scalar multiplication with a four-bit window like in
+	// ScalarMult, but in this case the doublings are precomputed. The value
+	// [windowValue]G added at iteration k would normally get doubled
+	// (totIterations-k)Ã4 times, but with a larger precomputation we can
+	// instead add [2^((totIterations-k)Ã4)][windowValue]G and avoid the
+	// doublings between iterations.
+	t := NewP384Point()
+	p.Set(NewP384Point())
+	tableIndex := len(tables) - 1
+	for _, byte := range scalar {
+		windowValue := byte >> 4
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+
+		windowValue = byte & 0b1111
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+	}
+
+	return p, nil
 }
diff --git a/src/crypto/elliptic/internal/nistec/p521.go b/src/crypto/elliptic/internal/nistec/p521.go
index d60c3a7065..731af4758f 100644
--- a/src/crypto/elliptic/internal/nistec/p521.go
+++ b/src/crypto/elliptic/internal/nistec/p521.go
@@ -10,12 +10,15 @@ import (
 	"crypto/elliptic/internal/fiat"
 	"crypto/subtle"
 	"errors"
+	"sync"
 )
 
 var p521B, _ = new(fiat.P521Element).SetBytes([]byte{0x0, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, 0x9, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, 0xbf, 0x7, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, 0x3f, 0x0})
 
 var p521G, _ = NewP521Point().SetBytes([]byte{0x4, 0x0, 0xc6, 0x85, 0x8e, 0x6, 0xb7, 0x4, 0x4, 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x5, 0x3f, 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, 0xbd, 0x66, 0x1, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, 0xc0, 0x4, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x1, 0x3f, 0xad, 0x7, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, 0x66, 0x50})
 
+// p521ElementLength is the length of an element of the base or scalar field,
+// which have the same bytes length for all NIST P curves.
 const p521ElementLength = 66
 
 // P521Point is a P521 point. The zero value is NOT valid.
@@ -242,34 +245,54 @@ func (q *P521Point) Select(p1, p2 *P521Point, cond int) *P521Point {
 	return q
 }
 
+// A p521Table holds the first 15 multiples of a point at offset -1, so [1]P
+// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity
+// point.
+type p521Table [15]*P521Point
+
+// Select selects the n-th multiple of the table base point into p. It works in
+// constant time by iterating over every entry of the table. n must be in [0, 15].
+func (table *p521Table) Select(p *P521Point, n uint8) {
+	if n >= 16 {
+		panic("nistec: internal error: p521Table called with out-of-bounds value")
+	}
+	p.Set(NewP521Point())
+	for i := uint8(1); i < 16; i++ {
+		cond := subtle.ConstantTimeByteEq(i, n)
+		p.Select(table[i-1], p, cond)
+	}
+}
+
 // ScalarMult sets p = scalar * q, and returns p.
 func (p *P521Point) ScalarMult(q *P521Point, scalar []byte) (*P521Point, error) {
-	// table holds the first 16 multiples of q. The explicit newP521Point calls
-	// get inlined, letting the allocations live on the stack.
-	var table = [16]*P521Point{
-		NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(),
-		NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(),
+	// Compute a p521Table for the base point q. The explicit NewP521Point
+	// calls get inlined, letting the allocations live on the stack.
+	var table = p521Table{NewP521Point(), NewP521Point(), NewP521Point(),
 		NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(),
 		NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(),
-	}
-	for i := 1; i < 16; i++ {
-		table[i].Add(table[i-1], q)
+		NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point()}
+	table[0].Set(q)
+	for i := 1; i < 15; i += 2 {
+		table[i].Double(table[i/2])
+		table[i+1].Add(table[i], q)
 	}
 
 	// Instead of doing the classic double-and-add chain, we do it with a
 	// four-bit window: we double four times, and then add [0-15]P.
 	t := NewP521Point()
 	p.Set(NewP521Point())
-	for _, byte := range scalar {
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-		p.Double(p)
-
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte>>4, i)
-			t.Select(table[i], t, cond)
+	for i, byte := range scalar {
+		// No need to double on the first iteration, as p is the identity at
+		// this point, and [N]â = â.
+		if i != 0 {
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
+			p.Double(p)
 		}
+
+		windowValue := byte >> 4
+		table.Select(t, windowValue)
 		p.Add(p, t)
 
 		p.Double(p)
@@ -277,18 +300,66 @@ func (p *P521Point) ScalarMult(q *P521Point, scalar []byte) (*P521Point, error)
 		p.Double(p)
 		p.Double(p)
 
-		for i := uint8(0); i < 16; i++ {
-			cond := subtle.ConstantTimeByteEq(byte&0b1111, i)
-			t.Select(table[i], t, cond)
-		}
+		windowValue = byte & 0b1111
+		table.Select(t, windowValue)
 		p.Add(p, t)
 	}
 
 	return p, nil
 }
 
+var p521GeneratorTable *[p521ElementLength * 2]p521Table
+var p521GeneratorTableOnce sync.Once
+
+// generatorTable returns a sequence of p521Tables. The first table contains
+// multiples of G. Each successive table is the previous table doubled four
+// times.
+func (p *P521Point) generatorTable() *[p521ElementLength * 2]p521Table {
+	p521GeneratorTableOnce.Do(func() {
+		p521GeneratorTable = new([p521ElementLength * 2]p521Table)
+		base := NewP521Generator()
+		for i := 0; i < p521ElementLength*2; i++ {
+			p521GeneratorTable[i][0] = NewP521Point().Set(base)
+			for j := 1; j < 15; j++ {
+				p521GeneratorTable[i][j] = NewP521Point().Add(p521GeneratorTable[i][j-1], base)
+			}
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+			base.Double(base)
+		}
+	})
+	return p521GeneratorTable
+}
+
 // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and
 // returns p.
 func (p *P521Point) ScalarBaseMult(scalar []byte) (*P521Point, error) {
-	return p.ScalarMult(NewP521Generator(), scalar)
+	if len(scalar) != p521ElementLength {
+		return nil, errors.New("invalid scalar length")
+	}
+	tables := p.generatorTable()
+
+	// This is also a scalar multiplication with a four-bit window like in
+	// ScalarMult, but in this case the doublings are precomputed. The value
+	// [windowValue]G added at iteration k would normally get doubled
+	// (totIterations-k)Ã4 times, but with a larger precomputation we can
+	// instead add [2^((totIterations-k)Ã4)][windowValue]G and avoid the
+	// doublings between iterations.
+	t := NewP521Point()
+	p.Set(NewP521Point())
+	tableIndex := len(tables) - 1
+	for _, byte := range scalar {
+		windowValue := byte >> 4
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+
+		windowValue = byte & 0b1111
+		tables[tableIndex].Select(t, windowValue)
+		p.Add(p, t)
+		tableIndex--
+	}
+
+	return p, nil
 }
-- 
2.52.0