From 2c7c62b97235c376205653200c2bd14ac03baa41 Mon Sep 17 00:00:00 2001 From: Neal Patel Date: Tue, 14 Oct 2025 16:14:00 -0400 Subject: [PATCH] crypto/internal/fips140/sha512: interleave scheduling with rounds for 10.3% speed-up MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit goos: linux goarch: amd64 pkg: crypto/sha512 cpu: AMD EPYC 7B13 │ before │ after │ │ sec/op │ sec/op vs base │ Hash8Bytes/New 486.7n ± 1% 440.3n ± 0% -9.53% (p=0.001 n=7) Hash8Bytes/New-2 487.3n ± 1% 442.6n ± 1% -9.17% (p=0.001 n=7) Hash8Bytes/New-4 488.0n ± 1% 442.3n ± 0% -9.36% (p=0.001 n=7) Hash8Bytes/Sum384 495.1n ± 0% 451.2n ± 1% -8.87% (p=0.001 n=7) Hash8Bytes/Sum384-2 495.0n ± 1% 450.8n ± 1% -8.93% (p=0.001 n=7) Hash8Bytes/Sum384-4 500.2n ± 2% 453.6n ± 2% -9.32% (p=0.001 n=7) Hash8Bytes/Sum512 497.3n ± 1% 452.1n ± 0% -9.09% (p=0.001 n=7) Hash8Bytes/Sum512-2 496.0n ± 1% 453.5n ± 0% -8.57% (p=0.001 n=7) Hash8Bytes/Sum512-4 498.5n ± 0% 452.3n ± 1% -9.27% (p=0.001 n=7) Hash1K/New 3.985µ ± 1% 3.543µ ± 1% -11.09% (p=0.001 n=7) Hash1K/New-2 4.004µ ± 2% 3.558µ ± 1% -11.14% (p=0.001 n=7) Hash1K/New-4 3.997µ ± 0% 3.563µ ± 1% -10.86% (p=0.001 n=7) Hash1K/Sum384 3.996µ ± 1% 3.560µ ± 1% -10.91% (p=0.001 n=7) Hash1K/Sum384-2 4.011µ ± 1% 3.576µ ± 1% -10.85% (p=0.001 n=7) Hash1K/Sum384-4 4.004µ ± 0% 3.564µ ± 0% -10.99% (p=0.001 n=7) Hash1K/Sum512 3.998µ ± 1% 3.555µ ± 1% -11.08% (p=0.001 n=7) Hash1K/Sum512-2 3.996µ ± 0% 3.560µ ± 1% -10.91% (p=0.001 n=7) Hash1K/Sum512-4 4.004µ ± 1% 3.573µ ± 1% -10.76% (p=0.001 n=7) Hash8K/New 28.34µ ± 1% 25.29µ ± 1% -10.77% (p=0.001 n=7) Hash8K/New-2 28.45µ ± 1% 25.48µ ± 1% -10.44% (p=0.001 n=7) Hash8K/New-4 28.42µ ± 1% 25.37µ ± 1% -10.71% (p=0.001 n=7) Hash8K/Sum384 28.38µ ± 0% 25.18µ ± 0% -11.28% (p=0.001 n=7) Hash8K/Sum384-2 28.50µ ± 1% 25.35µ ± 1% -11.07% (p=0.001 n=7) Hash8K/Sum384-4 28.51µ ± 1% 25.35µ ± 0% -11.07% (p=0.001 n=7) Hash8K/Sum512 28.34µ ± 1% 25.23µ ± 0% -10.96% (p=0.001 n=7) Hash8K/Sum512-2 28.33µ ± 1% 25.23µ ± 1% -10.95% (p=0.001 n=7) Hash8K/Sum512-4 28.48µ ± 1% 25.31µ ± 1% -11.13% (p=0.001 n=7) geomean 3.828µ 3.433µ -10.34% │ before │ after │ │ B/s │ B/s vs base │ Hash8Bytes/New 15.68Mi ± 1% 17.33Mi ± 0% +10.52% (p=0.001 n=7) Hash8Bytes/New-2 15.66Mi ± 1% 17.23Mi ± 1% +10.05% (p=0.001 n=7) Hash8Bytes/New-4 15.63Mi ± 0% 17.25Mi ± 0% +10.37% (p=0.001 n=7) Hash8Bytes/Sum384 15.41Mi ± 0% 16.91Mi ± 1% +9.72% (p=0.001 n=7) Hash8Bytes/Sum384-2 15.41Mi ± 1% 16.93Mi ± 1% +9.84% (p=0.001 n=7) Hash8Bytes/Sum384-4 15.25Mi ± 2% 16.82Mi ± 2% +10.32% (p=0.001 n=7) Hash8Bytes/Sum512 15.34Mi ± 1% 16.87Mi ± 0% +9.94% (p=0.001 n=7) Hash8Bytes/Sum512-2 15.38Mi ± 1% 16.82Mi ± 0% +9.36% (p=0.001 n=7) Hash8Bytes/Sum512-4 15.31Mi ± 0% 16.87Mi ± 1% +10.22% (p=0.001 n=7) Hash1K/New 245.0Mi ± 1% 275.6Mi ± 1% +12.47% (p=0.001 n=7) Hash1K/New-2 243.9Mi ± 2% 274.5Mi ± 1% +12.55% (p=0.001 n=7) Hash1K/New-4 244.3Mi ± 0% 274.1Mi ± 1% +12.21% (p=0.001 n=7) Hash1K/Sum384 244.4Mi ± 0% 274.3Mi ± 1% +12.24% (p=0.001 n=7) Hash1K/Sum384-2 243.5Mi ± 1% 273.1Mi ± 1% +12.16% (p=0.001 n=7) Hash1K/Sum384-4 243.9Mi ± 0% 274.0Mi ± 0% +12.35% (p=0.001 n=7) Hash1K/Sum512 244.3Mi ± 1% 274.7Mi ± 1% +12.46% (p=0.001 n=7) Hash1K/Sum512-2 244.4Mi ± 0% 274.3Mi ± 1% +12.25% (p=0.001 n=7) Hash1K/Sum512-4 243.9Mi ± 1% 273.3Mi ± 1% +12.08% (p=0.001 n=7) Hash8K/New 275.7Mi ± 1% 309.0Mi ± 1% +12.07% (p=0.001 n=7) Hash8K/New-2 274.6Mi ± 1% 306.6Mi ± 1% +11.67% (p=0.001 n=7) Hash8K/New-4 274.9Mi ± 1% 307.9Mi ± 1% +11.99% (p=0.001 n=7) Hash8K/Sum384 275.3Mi ± 0% 310.3Mi ± 0% +12.71% (p=0.001 n=7) Hash8K/Sum384-2 274.1Mi ± 1% 308.2Mi ± 1% +12.45% (p=0.001 n=7) Hash8K/Sum384-4 274.1Mi ± 1% 308.2Mi ± 0% +12.44% (p=0.001 n=7) Hash8K/Sum512 275.7Mi ± 1% 309.6Mi ± 0% +12.31% (p=0.001 n=7) Hash8K/Sum512-2 275.8Mi ± 1% 309.7Mi ± 1% +12.29% (p=0.001 n=7) Hash8K/Sum512-4 274.3Mi ± 1% 308.7Mi ± 1% +12.52% (p=0.001 n=7) geomean 101.2Mi 112.9Mi +11.53% │ before │ after │ │ B/op │ B/op vs base │ Hash8Bytes/New 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/New-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum384 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum384-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum384-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum512 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum512-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum512-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/New 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/New-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum384 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum384-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum384-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum512 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum512-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum512-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/New 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/New-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum384 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum384-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum384-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum512 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum512-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum512-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ geomean ² +0.00% ² ¹ all samples are equal ² summaries must be >0 to compute geomean │ before │ after │ │ allocs/op │ allocs/op vs base │ Hash8Bytes/New 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/New-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum384 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum384-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum384-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum512 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum512-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8Bytes/Sum512-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/New 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/New-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum384 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum384-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum384-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum512 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum512-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash1K/Sum512-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/New 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/New-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum384 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum384-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum384-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum512 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum512-2 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ Hash8K/Sum512-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=7) ¹ geomean ² +0.00% ² ¹ all samples are equal ² summaries must be >0 to compute geomean Change-Id: I3791244b3f69e093203f6aa46dc59428afcb9223 Reviewed-on: https://go-review.googlesource.com/c/go/+/711844 LUCI-TryBot-Result: Go LUCI Reviewed-by: Filippo Valsorda Reviewed-by: Roland Shoemaker --- .../internal/fips140/sha512/sha512block.go | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/crypto/internal/fips140/sha512/sha512block.go b/src/crypto/internal/fips140/sha512/sha512block.go index 517e8389f7..88b75574d7 100644 --- a/src/crypto/internal/fips140/sha512/sha512block.go +++ b/src/crypto/internal/fips140/sha512/sha512block.go @@ -97,23 +97,22 @@ func blockGeneric(dig *Digest, p []byte) { var w [80]uint64 h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] for len(p) >= chunk { - for i := 0; i < 16; i++ { - j := i * 8 - w[i] = uint64(p[j])<<56 | uint64(p[j+1])<<48 | uint64(p[j+2])<<40 | uint64(p[j+3])<<32 | - uint64(p[j+4])<<24 | uint64(p[j+5])<<16 | uint64(p[j+6])<<8 | uint64(p[j+7]) - } - for i := 16; i < 80; i++ { - v1 := w[i-2] - t1 := bits.RotateLeft64(v1, -19) ^ bits.RotateLeft64(v1, -61) ^ (v1 >> 6) - v2 := w[i-15] - t2 := bits.RotateLeft64(v2, -1) ^ bits.RotateLeft64(v2, -8) ^ (v2 >> 7) + a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7 - w[i] = t1 + w[i-7] + t2 + w[i-16] - } + for i := range 80 { + if i < 16 { + j := i * 8 + w[i] = uint64(p[j])<<56 | uint64(p[j+1])<<48 | uint64(p[j+2])<<40 | uint64(p[j+3])<<32 | + uint64(p[j+4])<<24 | uint64(p[j+5])<<16 | uint64(p[j+6])<<8 | uint64(p[j+7]) + } else { + v1 := w[i-2] + t1 := bits.RotateLeft64(v1, -19) ^ bits.RotateLeft64(v1, -61) ^ (v1 >> 6) + v2 := w[i-15] + t2 := bits.RotateLeft64(v2, -1) ^ bits.RotateLeft64(v2, -8) ^ (v2 >> 7) - a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7 + w[i] = t1 + w[i-7] + t2 + w[i-16] + } - for i := 0; i < 80; i++ { t1 := h + (bits.RotateLeft64(e, -14) ^ bits.RotateLeft64(e, -18) ^ bits.RotateLeft64(e, -41)) + ((e & f) ^ (^e & g)) + _K[i] + w[i] t2 := (bits.RotateLeft64(a, -28) ^ bits.RotateLeft64(a, -34) ^ bits.RotateLeft64(a, -39)) + ((a & b) ^ (a & c) ^ (b & c)) -- 2.52.0