From 50b1add5a73f05b0142adf783b6825c0c149882b Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 4 May 2022 17:49:37 -0400 Subject: [PATCH] crypto/elliptic: precompute ScalarBaseMult doublings MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta pkg:crypto/ecdsa goos:darwin goarch:amd64 Sign/P224-16 250µs ± 2% 91µs ± 2% -63.42% (p=0.000 n=10+9) Sign/P384-16 955µs ± 3% 311µs ± 2% -67.48% (p=0.000 n=10+10) Sign/P521-16 2.74ms ± 2% 0.82ms ± 2% -69.95% (p=0.000 n=10+10) Verify/P224-16 440µs ± 3% 282µs ± 5% -35.94% (p=0.000 n=9+10) Verify/P384-16 1.72ms ± 2% 1.07ms ± 1% -38.02% (p=0.000 n=10+9) Verify/P521-16 5.10ms ± 2% 3.18ms ± 3% -37.70% (p=0.000 n=10+10) GenerateKey/P224-16 225µs ± 3% 67µs ± 4% -70.42% (p=0.000 n=9+10) GenerateKey/P384-16 881µs ± 1% 241µs ± 2% -72.67% (p=0.000 n=10+10) GenerateKey/P521-16 2.62ms ± 3% 0.69ms ± 3% -73.78% (p=0.000 n=10+9) pkg:crypto/elliptic/internal/nistec goos:darwin goarch:amd64 ScalarMult/P224-16 219µs ± 4% 209µs ± 3% -4.57% (p=0.003 n=10+10) ScalarMult/P384-16 838µs ± 2% 823µs ± 1% -1.72% (p=0.004 n=10+9) ScalarMult/P521-16 2.48ms ± 2% 2.45ms ± 2% ~ (p=0.052 n=10+10) ScalarBaseMult/P224-16 214µs ± 4% 54µs ± 4% -74.88% (p=0.000 n=10+10) ScalarBaseMult/P384-16 828µs ± 2% 196µs ± 3% -76.38% (p=0.000 n=10+10) ScalarBaseMult/P521-16 2.50ms ± 3% 0.55ms ± 2% -77.96% (p=0.000 n=10+10) Updates #52424 For #52182 Change-Id: I2be3c2b8cdeead512063ef489e43805f4ee71d0f Reviewed-on: https://go-review.googlesource.com/c/go/+/404174 TryBot-Result: Gopher Robot Run-TryBot: Filippo Valsorda Reviewed-by: Fernando Lobato Meeser Reviewed-by: Roland Shoemaker --- .../elliptic/internal/nistec/generate.go | 115 ++++++++++++++---- src/crypto/elliptic/internal/nistec/p224.go | 115 ++++++++++++++---- src/crypto/elliptic/internal/nistec/p256.go | 115 ++++++++++++++---- src/crypto/elliptic/internal/nistec/p384.go | 115 ++++++++++++++---- src/crypto/elliptic/internal/nistec/p521.go | 115 ++++++++++++++---- 5 files changed, 465 insertions(+), 110 deletions(-) diff --git a/src/crypto/elliptic/internal/nistec/generate.go b/src/crypto/elliptic/internal/nistec/generate.go index f3726a06ca..fbca6c3741 100644 --- a/src/crypto/elliptic/internal/nistec/generate.go +++ b/src/crypto/elliptic/internal/nistec/generate.go @@ -97,12 +97,15 @@ import ( "crypto/elliptic/internal/fiat" "crypto/subtle" "errors" + "sync" ) var {{.p}}B, _ = new({{.Element}}).SetBytes({{.B}}) var {{.p}}G, _ = New{{.P}}Point().SetBytes({{.G}}) +// {{.p}}ElementLength is the length of an element of the base or scalar field, +// which have the same bytes length for all NIST P curves. const {{.p}}ElementLength = {{ .ElementLen }} // {{.P}}Point is a {{.P}} point. The zero value is NOT valid. @@ -329,34 +332,54 @@ func (q *{{.P}}Point) Select(p1, p2 *{{.P}}Point, cond int) *{{.P}}Point { return q } +// A {{.p}}Table holds the first 15 multiples of a point at offset -1, so [1]P +// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity +// point. +type {{.p}}Table [15]*{{.P}}Point + +// Select selects the n-th multiple of the table base point into p. It works in +// constant time by iterating over every entry of the table. n must be in [0, 15]. +func (table *{{.p}}Table) Select(p *{{.P}}Point, n uint8) { + if n >= 16 { + panic("nistec: internal error: {{.p}}Table called with out-of-bounds value") + } + p.Set(New{{.P}}Point()) + for i := uint8(1); i < 16; i++ { + cond := subtle.ConstantTimeByteEq(i, n) + p.Select(table[i-1], p, cond) + } +} + // ScalarMult sets p = scalar * q, and returns p. func (p *{{.P}}Point) ScalarMult(q *{{.P}}Point, scalar []byte) (*{{.P}}Point, error) { - // table holds the first 16 multiples of q. The explicit new{{.P}}Point calls - // get inlined, letting the allocations live on the stack. - var table = [16]*{{.P}}Point{ - New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), - New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), + // Compute a {{.p}}Table for the base point q. The explicit New{{.P}}Point + // calls get inlined, letting the allocations live on the stack. + var table = {{.p}}Table{New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), - } - for i := 1; i < 16; i++ { - table[i].Add(table[i-1], q) + New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point(), New{{.P}}Point()} + table[0].Set(q) + for i := 1; i < 15; i += 2 { + table[i].Double(table[i/2]) + table[i+1].Add(table[i], q) } // Instead of doing the classic double-and-add chain, we do it with a // four-bit window: we double four times, and then add [0-15]P. t := New{{.P}}Point() p.Set(New{{.P}}Point()) - for _, byte := range scalar { - p.Double(p) - p.Double(p) - p.Double(p) - p.Double(p) - - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte>>4, i) - t.Select(table[i], t, cond) + for i, byte := range scalar { + // No need to double on the first iteration, as p is the identity at + // this point, and [N]∞ = ∞. + if i != 0 { + p.Double(p) + p.Double(p) + p.Double(p) + p.Double(p) } + + windowValue := byte >> 4 + table.Select(t, windowValue) p.Add(p, t) p.Double(p) @@ -364,19 +387,67 @@ func (p *{{.P}}Point) ScalarMult(q *{{.P}}Point, scalar []byte) (*{{.P}}Point, e p.Double(p) p.Double(p) - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte&0b1111, i) - t.Select(table[i], t, cond) - } + windowValue = byte & 0b1111 + table.Select(t, windowValue) p.Add(p, t) } return p, nil } +var {{.p}}GeneratorTable *[{{.p}}ElementLength * 2]{{.p}}Table +var {{.p}}GeneratorTableOnce sync.Once + +// generatorTable returns a sequence of {{.p}}Tables. The first table contains +// multiples of G. Each successive table is the previous table doubled four +// times. +func (p *{{.P}}Point) generatorTable() *[{{.p}}ElementLength * 2]{{.p}}Table { + {{.p}}GeneratorTableOnce.Do(func() { + {{.p}}GeneratorTable = new([{{.p}}ElementLength * 2]{{.p}}Table) + base := New{{.P}}Generator() + for i := 0; i < {{.p}}ElementLength*2; i++ { + {{.p}}GeneratorTable[i][0] = New{{.P}}Point().Set(base) + for j := 1; j < 15; j++ { + {{.p}}GeneratorTable[i][j] = New{{.P}}Point().Add({{.p}}GeneratorTable[i][j-1], base) + } + base.Double(base) + base.Double(base) + base.Double(base) + base.Double(base) + } + }) + return {{.p}}GeneratorTable +} + // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and // returns p. func (p *{{.P}}Point) ScalarBaseMult(scalar []byte) (*{{.P}}Point, error) { - return p.ScalarMult(New{{.P}}Generator(), scalar) + if len(scalar) != {{.p}}ElementLength { + return nil, errors.New("invalid scalar length") + } + tables := p.generatorTable() + + // This is also a scalar multiplication with a four-bit window like in + // ScalarMult, but in this case the doublings are precomputed. The value + // [windowValue]G added at iteration k would normally get doubled + // (totIterations-k)×4 times, but with a larger precomputation we can + // instead add [2^((totIterations-k)×4)][windowValue]G and avoid the + // doublings between iterations. + t := New{{.P}}Point() + p.Set(New{{.P}}Point()) + tableIndex := len(tables) - 1 + for _, byte := range scalar { + windowValue := byte >> 4 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + + windowValue = byte & 0b1111 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + } + + return p, nil } ` diff --git a/src/crypto/elliptic/internal/nistec/p224.go b/src/crypto/elliptic/internal/nistec/p224.go index 7f3dcca742..0db4ba1316 100644 --- a/src/crypto/elliptic/internal/nistec/p224.go +++ b/src/crypto/elliptic/internal/nistec/p224.go @@ -10,12 +10,15 @@ import ( "crypto/elliptic/internal/fiat" "crypto/subtle" "errors" + "sync" ) var p224B, _ = new(fiat.P224Element).SetBytes([]byte{0xb4, 0x5, 0xa, 0x85, 0xc, 0x4, 0xb3, 0xab, 0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7, 0xd7, 0xbf, 0xd8, 0xba, 0x27, 0xb, 0x39, 0x43, 0x23, 0x55, 0xff, 0xb4}) var p224G, _ = NewP224Point().SetBytes([]byte{0x4, 0xb7, 0xe, 0xc, 0xbd, 0x6b, 0xb4, 0xbf, 0x7f, 0x32, 0x13, 0x90, 0xb9, 0x4a, 0x3, 0xc1, 0xd3, 0x56, 0xc2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xd6, 0x11, 0x5c, 0x1d, 0x21, 0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, 0xdf, 0xe6, 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x7, 0x47, 0x64, 0x44, 0xd5, 0x81, 0x99, 0x85, 0x0, 0x7e, 0x34}) +// p224ElementLength is the length of an element of the base or scalar field, +// which have the same bytes length for all NIST P curves. const p224ElementLength = 28 // P224Point is a P224 point. The zero value is NOT valid. @@ -242,34 +245,54 @@ func (q *P224Point) Select(p1, p2 *P224Point, cond int) *P224Point { return q } +// A p224Table holds the first 15 multiples of a point at offset -1, so [1]P +// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity +// point. +type p224Table [15]*P224Point + +// Select selects the n-th multiple of the table base point into p. It works in +// constant time by iterating over every entry of the table. n must be in [0, 15]. +func (table *p224Table) Select(p *P224Point, n uint8) { + if n >= 16 { + panic("nistec: internal error: p224Table called with out-of-bounds value") + } + p.Set(NewP224Point()) + for i := uint8(1); i < 16; i++ { + cond := subtle.ConstantTimeByteEq(i, n) + p.Select(table[i-1], p, cond) + } +} + // ScalarMult sets p = scalar * q, and returns p. func (p *P224Point) ScalarMult(q *P224Point, scalar []byte) (*P224Point, error) { - // table holds the first 16 multiples of q. The explicit newP224Point calls - // get inlined, letting the allocations live on the stack. - var table = [16]*P224Point{ - NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), - NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), + // Compute a p224Table for the base point q. The explicit NewP224Point + // calls get inlined, letting the allocations live on the stack. + var table = p224Table{NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point(), - } - for i := 1; i < 16; i++ { - table[i].Add(table[i-1], q) + NewP224Point(), NewP224Point(), NewP224Point(), NewP224Point()} + table[0].Set(q) + for i := 1; i < 15; i += 2 { + table[i].Double(table[i/2]) + table[i+1].Add(table[i], q) } // Instead of doing the classic double-and-add chain, we do it with a // four-bit window: we double four times, and then add [0-15]P. t := NewP224Point() p.Set(NewP224Point()) - for _, byte := range scalar { - p.Double(p) - p.Double(p) - p.Double(p) - p.Double(p) - - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte>>4, i) - t.Select(table[i], t, cond) + for i, byte := range scalar { + // No need to double on the first iteration, as p is the identity at + // this point, and [N]∞ = ∞. + if i != 0 { + p.Double(p) + p.Double(p) + p.Double(p) + p.Double(p) } + + windowValue := byte >> 4 + table.Select(t, windowValue) p.Add(p, t) p.Double(p) @@ -277,18 +300,66 @@ func (p *P224Point) ScalarMult(q *P224Point, scalar []byte) (*P224Point, error) p.Double(p) p.Double(p) - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte&0b1111, i) - t.Select(table[i], t, cond) - } + windowValue = byte & 0b1111 + table.Select(t, windowValue) p.Add(p, t) } return p, nil } +var p224GeneratorTable *[p224ElementLength * 2]p224Table +var p224GeneratorTableOnce sync.Once + +// generatorTable returns a sequence of p224Tables. The first table contains +// multiples of G. Each successive table is the previous table doubled four +// times. +func (p *P224Point) generatorTable() *[p224ElementLength * 2]p224Table { + p224GeneratorTableOnce.Do(func() { + p224GeneratorTable = new([p224ElementLength * 2]p224Table) + base := NewP224Generator() + for i := 0; i < p224ElementLength*2; i++ { + p224GeneratorTable[i][0] = NewP224Point().Set(base) + for j := 1; j < 15; j++ { + p224GeneratorTable[i][j] = NewP224Point().Add(p224GeneratorTable[i][j-1], base) + } + base.Double(base) + base.Double(base) + base.Double(base) + base.Double(base) + } + }) + return p224GeneratorTable +} + // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and // returns p. func (p *P224Point) ScalarBaseMult(scalar []byte) (*P224Point, error) { - return p.ScalarMult(NewP224Generator(), scalar) + if len(scalar) != p224ElementLength { + return nil, errors.New("invalid scalar length") + } + tables := p.generatorTable() + + // This is also a scalar multiplication with a four-bit window like in + // ScalarMult, but in this case the doublings are precomputed. The value + // [windowValue]G added at iteration k would normally get doubled + // (totIterations-k)×4 times, but with a larger precomputation we can + // instead add [2^((totIterations-k)×4)][windowValue]G and avoid the + // doublings between iterations. + t := NewP224Point() + p.Set(NewP224Point()) + tableIndex := len(tables) - 1 + for _, byte := range scalar { + windowValue := byte >> 4 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + + windowValue = byte & 0b1111 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + } + + return p, nil } diff --git a/src/crypto/elliptic/internal/nistec/p256.go b/src/crypto/elliptic/internal/nistec/p256.go index c288a2d75f..81812df159 100644 --- a/src/crypto/elliptic/internal/nistec/p256.go +++ b/src/crypto/elliptic/internal/nistec/p256.go @@ -12,12 +12,15 @@ import ( "crypto/elliptic/internal/fiat" "crypto/subtle" "errors" + "sync" ) var p256B, _ = new(fiat.P256Element).SetBytes([]byte{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, 0x65, 0x1d, 0x6, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b}) var p256G, _ = NewP256Point().SetBytes([]byte{0x4, 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x3, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96, 0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0xf, 0x9e, 0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}) +// p256ElementLength is the length of an element of the base or scalar field, +// which have the same bytes length for all NIST P curves. const p256ElementLength = 32 // P256Point is a P256 point. The zero value is NOT valid. @@ -244,34 +247,54 @@ func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point { return q } +// A p256Table holds the first 15 multiples of a point at offset -1, so [1]P +// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity +// point. +type p256Table [15]*P256Point + +// Select selects the n-th multiple of the table base point into p. It works in +// constant time by iterating over every entry of the table. n must be in [0, 15]. +func (table *p256Table) Select(p *P256Point, n uint8) { + if n >= 16 { + panic("nistec: internal error: p256Table called with out-of-bounds value") + } + p.Set(NewP256Point()) + for i := uint8(1); i < 16; i++ { + cond := subtle.ConstantTimeByteEq(i, n) + p.Select(table[i-1], p, cond) + } +} + // ScalarMult sets p = scalar * q, and returns p. func (p *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) { - // table holds the first 16 multiples of q. The explicit newP256Point calls - // get inlined, letting the allocations live on the stack. - var table = [16]*P256Point{ - NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), - NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), + // Compute a p256Table for the base point q. The explicit NewP256Point + // calls get inlined, letting the allocations live on the stack. + var table = p256Table{NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point(), - } - for i := 1; i < 16; i++ { - table[i].Add(table[i-1], q) + NewP256Point(), NewP256Point(), NewP256Point(), NewP256Point()} + table[0].Set(q) + for i := 1; i < 15; i += 2 { + table[i].Double(table[i/2]) + table[i+1].Add(table[i], q) } // Instead of doing the classic double-and-add chain, we do it with a // four-bit window: we double four times, and then add [0-15]P. t := NewP256Point() p.Set(NewP256Point()) - for _, byte := range scalar { - p.Double(p) - p.Double(p) - p.Double(p) - p.Double(p) - - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte>>4, i) - t.Select(table[i], t, cond) + for i, byte := range scalar { + // No need to double on the first iteration, as p is the identity at + // this point, and [N]∞ = ∞. + if i != 0 { + p.Double(p) + p.Double(p) + p.Double(p) + p.Double(p) } + + windowValue := byte >> 4 + table.Select(t, windowValue) p.Add(p, t) p.Double(p) @@ -279,18 +302,66 @@ func (p *P256Point) ScalarMult(q *P256Point, scalar []byte) (*P256Point, error) p.Double(p) p.Double(p) - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte&0b1111, i) - t.Select(table[i], t, cond) - } + windowValue = byte & 0b1111 + table.Select(t, windowValue) p.Add(p, t) } return p, nil } +var p256GeneratorTable *[p256ElementLength * 2]p256Table +var p256GeneratorTableOnce sync.Once + +// generatorTable returns a sequence of p256Tables. The first table contains +// multiples of G. Each successive table is the previous table doubled four +// times. +func (p *P256Point) generatorTable() *[p256ElementLength * 2]p256Table { + p256GeneratorTableOnce.Do(func() { + p256GeneratorTable = new([p256ElementLength * 2]p256Table) + base := NewP256Generator() + for i := 0; i < p256ElementLength*2; i++ { + p256GeneratorTable[i][0] = NewP256Point().Set(base) + for j := 1; j < 15; j++ { + p256GeneratorTable[i][j] = NewP256Point().Add(p256GeneratorTable[i][j-1], base) + } + base.Double(base) + base.Double(base) + base.Double(base) + base.Double(base) + } + }) + return p256GeneratorTable +} + // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and // returns p. func (p *P256Point) ScalarBaseMult(scalar []byte) (*P256Point, error) { - return p.ScalarMult(NewP256Generator(), scalar) + if len(scalar) != p256ElementLength { + return nil, errors.New("invalid scalar length") + } + tables := p.generatorTable() + + // This is also a scalar multiplication with a four-bit window like in + // ScalarMult, but in this case the doublings are precomputed. The value + // [windowValue]G added at iteration k would normally get doubled + // (totIterations-k)×4 times, but with a larger precomputation we can + // instead add [2^((totIterations-k)×4)][windowValue]G and avoid the + // doublings between iterations. + t := NewP256Point() + p.Set(NewP256Point()) + tableIndex := len(tables) - 1 + for _, byte := range scalar { + windowValue := byte >> 4 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + + windowValue = byte & 0b1111 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + } + + return p, nil } diff --git a/src/crypto/elliptic/internal/nistec/p384.go b/src/crypto/elliptic/internal/nistec/p384.go index dff3feefb8..1830149b2b 100644 --- a/src/crypto/elliptic/internal/nistec/p384.go +++ b/src/crypto/elliptic/internal/nistec/p384.go @@ -10,12 +10,15 @@ import ( "crypto/elliptic/internal/fiat" "crypto/subtle" "errors" + "sync" ) var p384B, _ = new(fiat.P384Element).SetBytes([]byte{0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4, 0x98, 0x8e, 0x5, 0x6b, 0xe3, 0xf8, 0x2d, 0x19, 0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12, 0x3, 0x14, 0x8, 0x8f, 0x50, 0x13, 0x87, 0x5a, 0xc6, 0x56, 0x39, 0x8d, 0x8a, 0x2e, 0xd1, 0x9d, 0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef}) var p384G, _ = NewP384Point().SetBytes([]byte{0x4, 0xaa, 0x87, 0xca, 0x22, 0xbe, 0x8b, 0x5, 0x37, 0x8e, 0xb1, 0xc7, 0x1e, 0xf3, 0x20, 0xad, 0x74, 0x6e, 0x1d, 0x3b, 0x62, 0x8b, 0xa7, 0x9b, 0x98, 0x59, 0xf7, 0x41, 0xe0, 0x82, 0x54, 0x2a, 0x38, 0x55, 0x2, 0xf2, 0x5d, 0xbf, 0x55, 0x29, 0x6c, 0x3a, 0x54, 0x5e, 0x38, 0x72, 0x76, 0xa, 0xb7, 0x36, 0x17, 0xde, 0x4a, 0x96, 0x26, 0x2c, 0x6f, 0x5d, 0x9e, 0x98, 0xbf, 0x92, 0x92, 0xdc, 0x29, 0xf8, 0xf4, 0x1d, 0xbd, 0x28, 0x9a, 0x14, 0x7c, 0xe9, 0xda, 0x31, 0x13, 0xb5, 0xf0, 0xb8, 0xc0, 0xa, 0x60, 0xb1, 0xce, 0x1d, 0x7e, 0x81, 0x9d, 0x7a, 0x43, 0x1d, 0x7c, 0x90, 0xea, 0xe, 0x5f}) +// p384ElementLength is the length of an element of the base or scalar field, +// which have the same bytes length for all NIST P curves. const p384ElementLength = 48 // P384Point is a P384 point. The zero value is NOT valid. @@ -242,34 +245,54 @@ func (q *P384Point) Select(p1, p2 *P384Point, cond int) *P384Point { return q } +// A p384Table holds the first 15 multiples of a point at offset -1, so [1]P +// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity +// point. +type p384Table [15]*P384Point + +// Select selects the n-th multiple of the table base point into p. It works in +// constant time by iterating over every entry of the table. n must be in [0, 15]. +func (table *p384Table) Select(p *P384Point, n uint8) { + if n >= 16 { + panic("nistec: internal error: p384Table called with out-of-bounds value") + } + p.Set(NewP384Point()) + for i := uint8(1); i < 16; i++ { + cond := subtle.ConstantTimeByteEq(i, n) + p.Select(table[i-1], p, cond) + } +} + // ScalarMult sets p = scalar * q, and returns p. func (p *P384Point) ScalarMult(q *P384Point, scalar []byte) (*P384Point, error) { - // table holds the first 16 multiples of q. The explicit newP384Point calls - // get inlined, letting the allocations live on the stack. - var table = [16]*P384Point{ - NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), - NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), + // Compute a p384Table for the base point q. The explicit NewP384Point + // calls get inlined, letting the allocations live on the stack. + var table = p384Table{NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point(), - } - for i := 1; i < 16; i++ { - table[i].Add(table[i-1], q) + NewP384Point(), NewP384Point(), NewP384Point(), NewP384Point()} + table[0].Set(q) + for i := 1; i < 15; i += 2 { + table[i].Double(table[i/2]) + table[i+1].Add(table[i], q) } // Instead of doing the classic double-and-add chain, we do it with a // four-bit window: we double four times, and then add [0-15]P. t := NewP384Point() p.Set(NewP384Point()) - for _, byte := range scalar { - p.Double(p) - p.Double(p) - p.Double(p) - p.Double(p) - - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte>>4, i) - t.Select(table[i], t, cond) + for i, byte := range scalar { + // No need to double on the first iteration, as p is the identity at + // this point, and [N]∞ = ∞. + if i != 0 { + p.Double(p) + p.Double(p) + p.Double(p) + p.Double(p) } + + windowValue := byte >> 4 + table.Select(t, windowValue) p.Add(p, t) p.Double(p) @@ -277,18 +300,66 @@ func (p *P384Point) ScalarMult(q *P384Point, scalar []byte) (*P384Point, error) p.Double(p) p.Double(p) - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte&0b1111, i) - t.Select(table[i], t, cond) - } + windowValue = byte & 0b1111 + table.Select(t, windowValue) p.Add(p, t) } return p, nil } +var p384GeneratorTable *[p384ElementLength * 2]p384Table +var p384GeneratorTableOnce sync.Once + +// generatorTable returns a sequence of p384Tables. The first table contains +// multiples of G. Each successive table is the previous table doubled four +// times. +func (p *P384Point) generatorTable() *[p384ElementLength * 2]p384Table { + p384GeneratorTableOnce.Do(func() { + p384GeneratorTable = new([p384ElementLength * 2]p384Table) + base := NewP384Generator() + for i := 0; i < p384ElementLength*2; i++ { + p384GeneratorTable[i][0] = NewP384Point().Set(base) + for j := 1; j < 15; j++ { + p384GeneratorTable[i][j] = NewP384Point().Add(p384GeneratorTable[i][j-1], base) + } + base.Double(base) + base.Double(base) + base.Double(base) + base.Double(base) + } + }) + return p384GeneratorTable +} + // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and // returns p. func (p *P384Point) ScalarBaseMult(scalar []byte) (*P384Point, error) { - return p.ScalarMult(NewP384Generator(), scalar) + if len(scalar) != p384ElementLength { + return nil, errors.New("invalid scalar length") + } + tables := p.generatorTable() + + // This is also a scalar multiplication with a four-bit window like in + // ScalarMult, but in this case the doublings are precomputed. The value + // [windowValue]G added at iteration k would normally get doubled + // (totIterations-k)×4 times, but with a larger precomputation we can + // instead add [2^((totIterations-k)×4)][windowValue]G and avoid the + // doublings between iterations. + t := NewP384Point() + p.Set(NewP384Point()) + tableIndex := len(tables) - 1 + for _, byte := range scalar { + windowValue := byte >> 4 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + + windowValue = byte & 0b1111 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + } + + return p, nil } diff --git a/src/crypto/elliptic/internal/nistec/p521.go b/src/crypto/elliptic/internal/nistec/p521.go index d60c3a7065..731af4758f 100644 --- a/src/crypto/elliptic/internal/nistec/p521.go +++ b/src/crypto/elliptic/internal/nistec/p521.go @@ -10,12 +10,15 @@ import ( "crypto/elliptic/internal/fiat" "crypto/subtle" "errors" + "sync" ) var p521B, _ = new(fiat.P521Element).SetBytes([]byte{0x0, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, 0x9, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, 0xbf, 0x7, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, 0x3f, 0x0}) var p521G, _ = NewP521Point().SetBytes([]byte{0x4, 0x0, 0xc6, 0x85, 0x8e, 0x6, 0xb7, 0x4, 0x4, 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x5, 0x3f, 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, 0xbd, 0x66, 0x1, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, 0xc0, 0x4, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x1, 0x3f, 0xad, 0x7, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, 0x66, 0x50}) +// p521ElementLength is the length of an element of the base or scalar field, +// which have the same bytes length for all NIST P curves. const p521ElementLength = 66 // P521Point is a P521 point. The zero value is NOT valid. @@ -242,34 +245,54 @@ func (q *P521Point) Select(p1, p2 *P521Point, cond int) *P521Point { return q } +// A p521Table holds the first 15 multiples of a point at offset -1, so [1]P +// is at table[0], [15]P is at table[14], and [0]P is implicitly the identity +// point. +type p521Table [15]*P521Point + +// Select selects the n-th multiple of the table base point into p. It works in +// constant time by iterating over every entry of the table. n must be in [0, 15]. +func (table *p521Table) Select(p *P521Point, n uint8) { + if n >= 16 { + panic("nistec: internal error: p521Table called with out-of-bounds value") + } + p.Set(NewP521Point()) + for i := uint8(1); i < 16; i++ { + cond := subtle.ConstantTimeByteEq(i, n) + p.Select(table[i-1], p, cond) + } +} + // ScalarMult sets p = scalar * q, and returns p. func (p *P521Point) ScalarMult(q *P521Point, scalar []byte) (*P521Point, error) { - // table holds the first 16 multiples of q. The explicit newP521Point calls - // get inlined, letting the allocations live on the stack. - var table = [16]*P521Point{ - NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), - NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), + // Compute a p521Table for the base point q. The explicit NewP521Point + // calls get inlined, letting the allocations live on the stack. + var table = p521Table{NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point(), - } - for i := 1; i < 16; i++ { - table[i].Add(table[i-1], q) + NewP521Point(), NewP521Point(), NewP521Point(), NewP521Point()} + table[0].Set(q) + for i := 1; i < 15; i += 2 { + table[i].Double(table[i/2]) + table[i+1].Add(table[i], q) } // Instead of doing the classic double-and-add chain, we do it with a // four-bit window: we double four times, and then add [0-15]P. t := NewP521Point() p.Set(NewP521Point()) - for _, byte := range scalar { - p.Double(p) - p.Double(p) - p.Double(p) - p.Double(p) - - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte>>4, i) - t.Select(table[i], t, cond) + for i, byte := range scalar { + // No need to double on the first iteration, as p is the identity at + // this point, and [N]∞ = ∞. + if i != 0 { + p.Double(p) + p.Double(p) + p.Double(p) + p.Double(p) } + + windowValue := byte >> 4 + table.Select(t, windowValue) p.Add(p, t) p.Double(p) @@ -277,18 +300,66 @@ func (p *P521Point) ScalarMult(q *P521Point, scalar []byte) (*P521Point, error) p.Double(p) p.Double(p) - for i := uint8(0); i < 16; i++ { - cond := subtle.ConstantTimeByteEq(byte&0b1111, i) - t.Select(table[i], t, cond) - } + windowValue = byte & 0b1111 + table.Select(t, windowValue) p.Add(p, t) } return p, nil } +var p521GeneratorTable *[p521ElementLength * 2]p521Table +var p521GeneratorTableOnce sync.Once + +// generatorTable returns a sequence of p521Tables. The first table contains +// multiples of G. Each successive table is the previous table doubled four +// times. +func (p *P521Point) generatorTable() *[p521ElementLength * 2]p521Table { + p521GeneratorTableOnce.Do(func() { + p521GeneratorTable = new([p521ElementLength * 2]p521Table) + base := NewP521Generator() + for i := 0; i < p521ElementLength*2; i++ { + p521GeneratorTable[i][0] = NewP521Point().Set(base) + for j := 1; j < 15; j++ { + p521GeneratorTable[i][j] = NewP521Point().Add(p521GeneratorTable[i][j-1], base) + } + base.Double(base) + base.Double(base) + base.Double(base) + base.Double(base) + } + }) + return p521GeneratorTable +} + // ScalarBaseMult sets p = scalar * B, where B is the canonical generator, and // returns p. func (p *P521Point) ScalarBaseMult(scalar []byte) (*P521Point, error) { - return p.ScalarMult(NewP521Generator(), scalar) + if len(scalar) != p521ElementLength { + return nil, errors.New("invalid scalar length") + } + tables := p.generatorTable() + + // This is also a scalar multiplication with a four-bit window like in + // ScalarMult, but in this case the doublings are precomputed. The value + // [windowValue]G added at iteration k would normally get doubled + // (totIterations-k)×4 times, but with a larger precomputation we can + // instead add [2^((totIterations-k)×4)][windowValue]G and avoid the + // doublings between iterations. + t := NewP521Point() + p.Set(NewP521Point()) + tableIndex := len(tables) - 1 + for _, byte := range scalar { + windowValue := byte >> 4 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + + windowValue = byte & 0b1111 + tables[tableIndex].Select(t, windowValue) + p.Add(p, t) + tableIndex-- + } + + return p, nil } -- 2.50.0