]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/sha3: un-interleave EOR instructions
authorRoland Shoemaker <roland@golang.org>
Wed, 21 May 2025 18:43:35 +0000 (11:43 -0700)
committerGopher Robot <gobot@golang.org>
Wed, 21 May 2025 22:09:49 +0000 (15:09 -0700)
Move two EOR instructions out of blocks of RAX and
BCAX instructions. This appears to get a teeny
performance improvement, and matches what the
Linux kernel implementation does.

goos: darwin
goarch: arm64
pkg: crypto/sha3
cpu: Apple M1 Pro
                 │ sha3-non-interleaved │          sha3-interleaved           │
                 │        sec/op        │    sec/op     vs base               │
Sha3_512_MTU-10            3.122µ ±  2%   3.107µ ±  1%       ~ (p=0.382 n=10)
Sha3_384_MTU-10            2.266µ ±  7%   2.287µ ± 11%       ~ (p=0.424 n=10)
Sha3_256_MTU-10            1.770µ ±  5%   1.793µ ±  4%       ~ (p=0.353 n=10)
Sha3_224_MTU-10            1.675µ ±  1%   1.664µ ±  2%       ~ (p=0.210 n=10)
Shake128_MTU-10            1.459µ ±  1%   1.446µ ±  1%  -0.89% (p=0.000 n=10)
Shake256_MTU-10            1.591µ ±  1%   1.597µ ±  1%       ~ (p=0.342 n=10)
Shake256_16x-10            27.46µ ± 13%   27.58µ ±  1%       ~ (p=0.247 n=10)
Shake256_1MiB-10           1.269m ± 10%   1.233m ±  1%  -2.89% (p=0.000 n=10)
Sha3_512_1MiB-10           2.283m ±  2%   2.275m ±  0%       ~ (p=0.247 n=10)
geomean                    11.62µ         11.59µ        -0.25%

                 │ sha3-non-interleaved │           sha3-interleaved           │
                 │         B/s          │      B/s       vs base               │
Sha3_512_MTU-10           412.4Mi ±  2%   414.4Mi ±  1%       ~ (p=0.393 n=10)
Sha3_384_MTU-10           568.3Mi ±  6%   563.2Mi ± 10%       ~ (p=0.436 n=10)
Sha3_256_MTU-10           727.7Mi ±  4%   718.0Mi ±  4%       ~ (p=0.353 n=10)
Sha3_224_MTU-10           768.8Mi ±  1%   773.7Mi ±  1%       ~ (p=0.218 n=10)
Shake128_MTU-10           882.7Mi ±  1%   890.9Mi ±  1%  +0.92% (p=0.000 n=10)
Shake256_MTU-10           808.9Mi ±  1%   806.2Mi ±  1%       ~ (p=0.353 n=10)
Shake256_16x-10           569.0Mi ± 11%   566.6Mi ±  1%       ~ (p=0.247 n=10)
Shake256_1MiB-10          787.9Mi ±  9%   811.3Mi ±  1%  +2.97% (p=0.000 n=10)
Sha3_512_1MiB-10          438.0Mi ±  2%   439.6Mi ±  0%       ~ (p=0.247 n=10)
geomean                   641.4Mi         643.1Mi        +0.26%

Change-Id: I5f358d954aeccb91928caa79be96c2902d9ac97e
Reviewed-on: https://go-review.googlesource.com/c/go/+/675136
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Roland Shoemaker <roland@golang.org>
Reviewed-by: Hongxiang Jiang <hxjiang@golang.org>
Reviewed-by: Daniel McCarney <daniel@binaryparadox.net>
src/crypto/internal/fips140/sha3/sha3_arm64.s

index 6a2c121e2c506e648a9044aac7b460fceb6df2d5..7688d178d51c4cd198fab1f799917fddbb9aba7e 100644 (file)
@@ -48,6 +48,8 @@ loop:
        VRAX1   V26.D2, V29.D2, V29.D2
 
        // theta and rho and Pi
+       VEOR    V29.B16, V0.B16, V0.B16
+
        VXAR    $63, V30.D2, V1.D2, V25.D2
 
        VXAR    $20, V30.D2, V6.D2, V1.D2
@@ -74,8 +76,6 @@ loop:
 
        VXAR    $36, V27.D2, V3.D2, V5.D2
 
-       VEOR    V29.B16, V0.B16, V0.B16
-
        VXAR    $43, V27.D2, V18.D2, V27.D2
        VXAR    $49, V31.D2, V17.D2, V3.D2
        VXAR    $54, V30.D2, V11.D2, V30.D2
@@ -113,11 +113,12 @@ loop:
        VBCAX   V0.B16, V1.B16, V28.B16, V4.B16
 
        VBCAX   V1.B16, V2.B16, V0.B16, V0.B16  // iota (chi part)
-       VEOR    V26.B16, V0.B16, V0.B16 // iota
 
        VBCAX   V2.B16, V27.B16, V1.B16, V1.B16
        VBCAX   V27.B16, V28.B16, V2.B16, V2.B16
 
+       VEOR    V26.B16, V0.B16, V0.B16 // iota
+
        SUB             $1, R2, R2
        CBNZ    R2, loop