From: Egon Elbre
Date: Wed, 26 Feb 2025 12:21:20 +0000 (+0200)
Subject: crypto/internal/fips140/edwards25519/field: optimize AMD64
X-Git-Tag: go1.25rc1~893
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=983e30bd3b78ca77a5028a94323c6da363358648;p=gostls13.git

crypto/internal/fips140/edwards25519/field: optimize AMD64

Replace constant multiplications with shifts and adds; this reduces
pressure on the multiplication unit, making things overall faster.

goos: windows
goarch: amd64
pkg: crypto/internal/fips140/edwards25519/field
cpu: AMD Ryzen Threadripper 2950X 16-Core Processor
            │   v0.log~   │               v1.log~               │
            │   sec/op    │    sec/op     vs base               │
Add-32        4.768n ± 1%   4.763n ± 0%        ~ (p=0.683 n=20)
Multiply-32   20.93n ± 0%   19.48n ± 0%   -6.88% (p=0.000 n=20)
Square-32     15.88n ± 0%   15.00n ± 0%   -5.51% (p=0.000 n=20)
Invert-32     4.291µ ± 0%   4.072µ ± 0%   -5.10% (p=0.000 n=20)
Mult32-32     5.184n ± 0%   5.169n ± 0%   -0.30% (p=0.032 n=20)
Bytes-32      11.36n ± 0%   11.34n ± 0%        ~ (p=0.106 n=20)
geomean       27.15n        26.32n        -3.06%

Change-Id: I9c2f588fad29d89c3e6c712c092b32b66479f596
Reviewed-on: https://go-review.googlesource.com/c/go/+/652716
Reviewed-by: Filippo Valsorda
Reviewed-by: Junyang Shao
LUCI-TryBot-Result: Go LUCI
Commit-Queue: Junyang Shao
Reviewed-by: Michael Pratt
Auto-Submit: Filippo Valsorda
---

diff --git a/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go b/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
index e509052160..ecb713b3c4 100644
--- a/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
+++ b/src/crypto/internal/fips140/edwards25519/field/_asm/fe_amd64_asm.go
@@ -256,6 +256,23 @@ func addMul64(r uint128, i uint64, aX, bX namedComponent) {
 	case 1:
 		Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX))
 		Load(aX, RAX)
+	case 2:
+		Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
+		Load(aX, RAX)
+		SHLQ(U8(1), RAX)
+	case 19:
+		Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
+		// 19 * v ==> v + (v+v*8)*2
+		tmp := Load(aX, GP64())
+		LEAQ(Mem{Base: tmp, Index: tmp, Scale: 8}, RAX)
+		LEAQ(Mem{Base: tmp, Index: RAX, Scale: 2}, RAX)
+	case 38:
+		Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
+		// 38 * v ==> (v + (v+v*8)*2) * 2
+		tmp := Load(aX, GP64())
+		LEAQ(Mem{Base: tmp, Index: tmp, Scale: 8}, RAX)
+		LEAQ(Mem{Base: tmp, Index: RAX, Scale: 2}, RAX)
+		SHLQ(U8(1), RAX)
 	default:
 		Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
 		IMUL3Q(Imm(i), Load(aX, GP64()), RAX)
diff --git a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
index 657851c85e..5e06e242ed 100644
--- a/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
+++ b/src/crypto/internal/fips140/edwards25519/field/fe_amd64.s
@@ -16,32 +16,36 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	MOVQ DX, SI

 	// r0 += 19×a1×b4
-	MOVQ 8(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 32(BX)
-	ADDQ AX, DI
-	ADCQ DX, SI
+	MOVQ 8(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI

 	// r0 += 19×a2×b3
-	MOVQ 16(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 24(BX)
-	ADDQ AX, DI
-	ADCQ DX, SI
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI

 	// r0 += 19×a3×b2
-	MOVQ 24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 16(BX)
-	ADDQ AX, DI
-	ADCQ DX, SI
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 16(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI

 	// r0 += 19×a4×b1
-	MOVQ 32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 8(BX)
-	ADDQ AX, DI
-	ADCQ DX, SI
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 8(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI

 	// r1 = a0×b1
 	MOVQ (CX), AX
@@ -56,25 +60,28 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	ADCQ DX, R8

 	// r1 += 19×a2×b4
-	MOVQ 16(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 32(BX)
-	ADDQ AX, R9
-	ADCQ DX, R8
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, R9
+	ADCQ DX, R8

 	// r1 += 19×a3×b3
-	MOVQ 24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 24(BX)
-	ADDQ AX, R9
-	ADCQ DX, R8
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(BX)
+	ADDQ AX, R9
+	ADCQ DX, R8

 	// r1 += 19×a4×b2
-	MOVQ 32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 16(BX)
-	ADDQ AX, R9
-	ADCQ DX, R8
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 16(BX)
+	ADDQ AX, R9
+	ADCQ DX, R8

 	// r2 = a0×b2
 	MOVQ (CX), AX
@@ -95,18 +102,20 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	ADCQ DX, R10

 	// r2 += 19×a3×b4
-	MOVQ 24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 32(BX)
-	ADDQ AX, R11
-	ADCQ DX, R10
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, R11
+	ADCQ DX, R10

 	// r2 += 19×a4×b3
-	MOVQ 32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 24(BX)
-	ADDQ AX, R11
-	ADCQ DX, R10
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(BX)
+	ADDQ AX, R11
+	ADCQ DX, R10

 	// r3 = a0×b3
 	MOVQ (CX), AX
@@ -133,11 +142,12 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	ADCQ DX, R12

 	// r3 += 19×a4×b4
-	MOVQ 32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 32(BX)
-	ADDQ AX, R13
-	ADCQ DX, R12
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, R13
+	ADCQ DX, R12

 	// r4 = a0×b4
 	MOVQ (CX), AX
@@ -231,18 +241,22 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, BX

 	// r0 += 38×l1×l4
-	MOVQ 8(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ 32(CX)
-	ADDQ AX, SI
-	ADCQ DX, BX
+	MOVQ 8(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 32(CX)
+	ADDQ AX, SI
+	ADCQ DX, BX

 	// r0 += 38×l2×l3
-	MOVQ 16(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ 24(CX)
-	ADDQ AX, SI
-	ADCQ DX, BX
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 24(CX)
+	ADDQ AX, SI
+	ADCQ DX, BX

 	// r1 = 2×l0×l1
 	MOVQ (CX), AX
@@ -252,18 +266,21 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, DI

 	// r1 += 38×l2×l4
-	MOVQ 16(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ 32(CX)
-	ADDQ AX, R8
-	ADCQ DX, DI
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 32(CX)
+	ADDQ AX, R8
+	ADCQ DX, DI

 	// r1 += 19×l3×l3
-	MOVQ 24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 24(CX)
-	ADDQ AX, R8
-	ADCQ DX, DI
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(CX)
+	ADDQ AX, R8
+	ADCQ DX, DI

 	// r2 = 2×l0×l2
 	MOVQ (CX), AX
@@ -279,11 +296,13 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	ADCQ DX, R9

 	// r2 += 38×l3×l4
-	MOVQ 24(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ 32(CX)
-	ADDQ AX, R10
-	ADCQ DX, R9
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 32(CX)
+	ADDQ AX, R10
+	ADCQ DX, R9

 	// r3 = 2×l0×l3
 	MOVQ (CX), AX
@@ -293,18 +312,19 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, R11

 	// r3 += 2×l1×l2
-	MOVQ 8(CX), AX
-	IMUL3Q $0x02, AX, AX
-	MULQ 16(CX)
-	ADDQ AX, R12
-	ADCQ DX, R11
+	MOVQ 8(CX), AX
+	SHLQ $0x01, AX
+	MULQ 16(CX)
+	ADDQ AX, R12
+	ADCQ DX, R11

 	// r3 += 19×l4×l4
-	MOVQ 32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ 32(CX)
-	ADDQ AX, R12
-	ADCQ DX, R11
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(CX)
+	ADDQ AX, R12
+	ADCQ DX, R11

 	// r4 = 2×l0×l4
 	MOVQ (CX), AX
@@ -314,11 +334,11 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, R13

 	// r4 += 2×l1×l3
-	MOVQ 8(CX), AX
-	IMUL3Q $0x02, AX, AX
-	MULQ 24(CX)
-	ADDQ AX, R14
-	ADCQ DX, R13
+	MOVQ 8(CX), AX
+	SHLQ $0x01, AX
+	MULQ 24(CX)
+	ADDQ AX, R14
+	ADCQ DX, R13

 	// r4 += l2×l2
 	MOVQ 16(CX), AX
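
The rewrite relies on two identities: 19×v = v + (v + 8×v)×2, which maps onto
a pair of LEAQ instructions using scale-8 and scale-2 addressing, and
38×v = (19×v)×2, which appends a one-bit left shift (SHLQ); the 2×v case is a
single shift. Below is a minimal Go sketch, not part of the commit (mul19 and
mul38 are hypothetical helper names), checking the decompositions. Since
uint64 arithmetic wraps modulo 2^64 on both sides of each comparison, the
equalities hold even when the products overflow, just as the register
arithmetic does.

package main

import "fmt"

// mul19 mirrors the generated pair of LEAQ instructions:
//	LEAQ (DX)(DX*8), AX   // t = v + v*8 = 9×v
//	LEAQ (DX)(AX*2), AX   // v + t*2    = 19×v
func mul19(v uint64) uint64 {
	t := v + v*8
	return v + t*2
}

// mul38 mirrors the same pair followed by SHLQ $0x01: 38×v = (19×v)×2.
func mul38(v uint64) uint64 {
	return mul19(v) << 1
}

func main() {
	// Includes values near the 51-bit limb bound and full 64-bit wraparound.
	for _, v := range []uint64{0, 1, 19, 1<<51 - 1, ^uint64(0)} {
		fmt.Println(mul19(v) == 19*v, mul38(v) == 38*v)
	}
}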