From d99be5c44449a3a40a62942272e99642962a37d9 Mon Sep 17 00:00:00 2001
From: Julian Zhu
Date: Wed, 24 Dec 2025 21:16:56 +0800
Subject: [PATCH] crypto/sha1: use const table for key loading on loong64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Load constant keys from a static memory table rather than loading
immediates into registers on loong64.

Benchmark for Loongson-3A5000:
goos: linux
goarch: loong64
pkg: crypto/sha1
cpu: Loongson-3A5000-HV @ 2500.00MHz
                   │     old      │                new                │
                   │    sec/op    │   sec/op     vs base              │
Hash8Bytes/New-4      235.9n ± 0%   229.1n ± 0%  -2.88% (p=0.000 n=8)
Hash8Bytes/Sum-4      1.892µ ± 0%   1.882µ ± 0%  -0.50% (p=0.000 n=8)
Hash320Bytes/New-4   1022.0n ± 0%   963.8n ± 0%  -5.70% (p=0.000 n=8)
Hash320Bytes/Sum-4   1037.0n ± 0%   981.1n ± 0%  -5.39% (p=0.000 n=8)
Hash1K/New-4          2.760µ ± 0%   2.594µ ± 0%  -6.01% (p=0.000 n=8)
Hash1K/Sum-4          2.775µ ± 0%   2.610µ ± 0%  -5.95% (p=0.000 n=8)
Hash8K/New-4          20.46µ ± 0%   19.20µ ± 0%  -6.17% (p=0.000 n=8)
Hash8K/Sum-4          20.49µ ± 0%   19.22µ ± 0%  -6.17% (p=0.000 n=8)
geomean               2.498µ        2.377µ       -4.87%

                   │     old      │                new                 │
                   │     B/s      │     B/s       vs base              │
Hash8Bytes/New-4     32.34Mi ± 0%   33.30Mi ± 0%  +2.98% (p=0.000 n=8)
Hash8Bytes/Sum-4     4.034Mi ± 0%   4.053Mi ± 0%  +0.47% (p=0.000 n=8)
Hash320Bytes/New-4   298.7Mi ± 0%   316.7Mi ± 0%  +6.02% (p=0.000 n=8)
Hash320Bytes/Sum-4   294.3Mi ± 0%   311.0Mi ± 0%  +5.69% (p=0.000 n=8)
Hash1K/New-4         353.8Mi ± 0%   376.5Mi ± 0%  +6.41% (p=0.000 n=8)
Hash1K/Sum-4         351.9Mi ± 0%   374.1Mi ± 0%  +6.31% (p=0.000 n=8)
Hash8K/New-4         381.8Mi ± 0%   406.9Mi ± 0%  +6.57% (p=0.000 n=8)
Hash8K/Sum-4         381.4Mi ± 0%   406.4Mi ± 0%  +6.58% (p=0.000 n=8)
geomean              146.1Mi        153.6Mi       +5.11%

Change-Id: I7305caefa1434ab2bb4ce94a1c789d4ee5b7ccf3
Reviewed-on: https://go-review.googlesource.com/c/go/+/732580
LUCI-TryBot-Result: Go LUCI
Reviewed-by: abner chenc
Reviewed-by: Dmitri Shuralyov
Reviewed-by: Carlos Amedee
---
 src/crypto/sha1/sha1block_loong64.s | 32 +++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git 
a/src/crypto/sha1/sha1block_loong64.s b/src/crypto/sha1/sha1block_loong64.s index b76b193ad0..cd2e5dcfdc 100644 --- a/src/crypto/sha1/sha1block_loong64.s +++ b/src/crypto/sha1/sha1block_loong64.s @@ -26,6 +26,10 @@ #define REGTMP1 R17 #define REGTMP2 R18 #define REGTMP3 R19 +#define KEYREG1 R25 +#define KEYREG2 R26 +#define KEYREG3 R27 +#define KEYREG4 R28 #define LOAD1(index) \ MOVW (index*4)(R5), REGTMP3; \ @@ -63,38 +67,38 @@ #define FUNC4 FUNC2 -#define MIX(a, b, c, d, e, const) \ +#define MIX(a, b, c, d, e, key) \ ROTR $2, b; \ // b << 30 ADD REGTMP1, e; \ // e = e + f ROTR $27, a, REGTMP2; \ // a << 5 ADD REGTMP3, e; \ // e = e + w[i] - ADDV $const, e; \ // e = e + k + ADDV key, e; \ // e = e + k ADD REGTMP2, e // e = e + a<<5 #define ROUND1(a, b, c, d, e, index) \ LOAD1(index); \ FUNC1(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x5A827999) + MIX(a, b, c, d, e, KEYREG1) #define ROUND1x(a, b, c, d, e, index) \ LOAD(index); \ FUNC1(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x5A827999) + MIX(a, b, c, d, e, KEYREG1) #define ROUND2(a, b, c, d, e, index) \ LOAD(index); \ FUNC2(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x6ED9EBA1) + MIX(a, b, c, d, e, KEYREG2) #define ROUND3(a, b, c, d, e, index) \ LOAD(index); \ FUNC3(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x8F1BBCDC) + MIX(a, b, c, d, e, KEYREG3) #define ROUND4(a, b, c, d, e, index) \ LOAD(index); \ FUNC4(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0xCA62C1D6) + MIX(a, b, c, d, e, KEYREG4) // A stack frame size of 64 bytes is required here, because // the frame size used for data expansion is 64 bytes. 
@@ -108,13 +112,19 @@ TEXT ·block(SB),NOSPLIT,$64-32 BEQ R6, zero // p_len >= 64 - ADDV R5, R6, R24 + ADDV R5, R6, R24 MOVW (0*4)(R4), R7 MOVW (1*4)(R4), R8 MOVW (2*4)(R4), R9 MOVW (3*4)(R4), R10 MOVW (4*4)(R4), R11 + MOVV $·_K(SB), R21 + MOVW (0*4)(R21), KEYREG1 + MOVW (1*4)(R21), KEYREG2 + MOVW (2*4)(R21), KEYREG3 + MOVW (3*4)(R21), KEYREG4 + loop: MOVW R7, R12 MOVW R8, R13 @@ -224,3 +234,9 @@ end: MOVW R11, (4*4)(R4) zero: RET + +GLOBL ·_K(SB),RODATA,$16 +DATA ·_K+0(SB)/4, $0x5A827999 +DATA ·_K+4(SB)/4, $0x6ED9EBA1 +DATA ·_K+8(SB)/4, $0x8F1BBCDC +DATA ·_K+12(SB)/4, $0xCA62C1D6 -- 2.52.0