From: Xiaolin Zhao Date: Tue, 29 Apr 2025 13:04:11 +0000 (+0800) Subject: crypto/sha512: improve performance of loong64 X-Git-Tag: go1.25rc1~321 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0c8615be08673838ed5c1c324dd138f73dfe0689;p=gostls13.git crypto/sha512: improve performance of loong64 1. Replaced WORD with instruction REVBV. 2. Simplified the implementation of Ch and Maj by reducing instructions, refer to the implementation of riscv64. goos: linux goarch: loong64 pkg: crypto/sha512 cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Hash8Bytes/New 415.6n ± 0% 398.9n ± 0% -4.01% (p=0.000 n=10) Hash8Bytes/Sum384 427.6n ± 0% 409.7n ± 0% -4.20% (p=0.000 n=10) Hash8Bytes/Sum512 432.1n ± 0% 415.3n ± 0% -3.89% (p=0.000 n=10) Hash1K/New 3.087µ ± 0% 2.931µ ± 0% -5.05% (p=0.000 n=10) Hash1K/Sum384 3.094µ ± 0% 2.938µ ± 0% -5.04% (p=0.000 n=10) Hash1K/Sum512 3.102µ ± 0% 2.946µ ± 0% -5.01% (p=0.000 n=10) Hash8K/New 21.81µ ± 0% 20.67µ ± 0% -5.25% (p=0.000 n=10) Hash8K/Sum384 21.81µ ± 0% 20.66µ ± 0% -5.26% (p=0.000 n=10) Hash8K/Sum512 21.82µ ± 0% 20.69µ ± 0% -5.21% (p=0.000 n=10) geomean 3.061µ 2.915µ -4.77% goos: linux goarch: loong64 pkg: crypto/sha512 cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Hash8Bytes/New 509.4n ± 0% 484.9n ± 0% -4.79% (p=0.000 n=10) Hash8Bytes/Sum384 522.9n ± 0% 498.2n ± 0% -4.71% (p=0.000 n=10) Hash8Bytes/Sum512 529.0n ± 0% 504.5n ± 0% -4.63% (p=0.000 n=10) Hash1K/New 3.578µ ± 0% 3.364µ ± 0% -5.98% (p=0.000 n=10) Hash1K/Sum384 3.593µ ± 0% 3.382µ ± 0% -5.87% (p=0.000 n=10) Hash1K/Sum512 3.599µ ± 0% 3.386µ ± 0% -5.93% (p=0.000 n=10) Hash8K/New 25.10µ ± 0% 23.56µ ± 0% -6.14% (p=0.000 n=10) Hash8K/Sum384 25.12µ ± 0% 23.58µ ± 0% -6.13% (p=0.000 n=10) Hash8K/Sum512 25.12µ ± 0% 23.59µ ± 0% -6.12% (p=0.000 n=10) geomean 3.607µ 3.405µ -5.59% Change-Id: I8307ea0fd2d474671f1eef2da2ba5fe899c645d5 Reviewed-on: https://go-review.googlesource.com/c/go/+/668835 Reviewed-by: abner chenc LUCI-TryBot-Result: Go LUCI Reviewed-by: Carlos Amedee Reviewed-by: Cherry Mui --- diff --git a/src/crypto/internal/fips140/sha512/sha512block_loong64.s b/src/crypto/internal/fips140/sha512/sha512block_loong64.s index 00f686c9f7..f65d563ca3 100644 --- a/src/crypto/internal/fips140/sha512/sha512block_loong64.s +++ b/src/crypto/internal/fips140/sha512/sha512block_loong64.s @@ -18,7 +18,7 @@ // W[i] = M[i]; for 0 <= i <= 15 #define LOAD0(index) \ MOVV (index*8)(R5), REGTMP4; \ - WORD $0x3ce7; \ //REVBV REGTMP4, REGTMP4 + REVBV REGTMP4, REGTMP4; \ MOVV REGTMP4, (index*8)(R3) // W[i] = SIGMA1(W[i-2]) + W[i-7] + SIGMA0(W[i-15]) + W[i-16]; for 16 <= i <= 79 @@ -50,38 +50,37 @@ // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + K[i] + W[i] // BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x) // Ch(x, y, z) = (x AND y) XOR (NOT x AND z) +// = ((y XOR z) AND x) XOR z // Calculate T1 in REGTMP4 #define SHA512T1(const, e, f, g, h) \ ADDV $const, h; \ ADDV REGTMP4, h; \ - ROTRV $14, e, REGTMP4; \ + ROTRV $14, e, REGTMP5; \ ROTRV $18, e, REGTMP; \ ROTRV $41, e, REGTMP3; \ - AND f, e, REGTMP2; \ - XOR REGTMP, REGTMP4; \ - MOVV $0xffffffffffffffff, REGTMP; \ - XOR REGTMP4, REGTMP3; \ - XOR REGTMP, e, REGTMP5; \ + XOR f, g, REGTMP2; \ + XOR REGTMP, REGTMP5; \ + AND e, REGTMP2; \ + XOR REGTMP5, REGTMP3; \ + XOR g, REGTMP2; \ ADDV REGTMP3, h; \ - AND g, REGTMP5; \ - XOR REGTMP2, REGTMP5; \ - ADDV h, REGTMP5, REGTMP4 + ADDV h, REGTMP2, REGTMP4 // T2 = BIGSIGMA0(a) + Maj(a, b, c) // BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x) // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) +// = ((y XOR z) AND x) XOR (y AND z) // Calculate T2 in REGTMP1 #define SHA512T2(a, b, c) \ ROTRV $28, a, REGTMP5; \ - AND b, c, REGTMP1; \ ROTRV $34, a, REGTMP3; \ - AND c, a, REGTMP; \ - XOR REGTMP3, REGTMP5; \ - XOR REGTMP, REGTMP1; \ ROTRV $39, a, REGTMP2; \ - AND a, b, REGTMP3; \ - XOR REGTMP3, REGTMP1; \ + XOR b, c, REGTMP; \ + AND b, c, REGTMP1; \ + XOR REGTMP3, REGTMP5; \ + AND REGTMP, a, REGTMP; \ XOR REGTMP2, REGTMP5; \ + XOR REGTMP, REGTMP1; \ ADDV REGTMP5, REGTMP1 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.