From: Julian Zhu
Date: Wed, 24 Dec 2025 12:31:46 +0000 (+0800)
Subject: crypto/sha1: provide optimised assembly for riscv64
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=a59593313d75d9e7c99da0cff0e12555597621ec;p=gostls13.git

crypto/sha1: provide optimised assembly for riscv64

Provide an optimised assembly implementation of sha1 for riscv64.

goos: linux
goarch: riscv64
pkg: crypto/sha1
cpu: Spacemit(R) X60
                     │   oldsha1    │               newsha1               │
                     │    sec/op    │    sec/op     vs base               │
Hash8Bytes/New-8        2.136µ ± 0%    1.173µ ± 0%  -45.09% (p=0.000 n=8)
Hash8Bytes/Sum-8        2.079µ ± 0%    1.116µ ± 0%  -46.32% (p=0.000 n=8)
Hash320Bytes/New-8     10.704µ ± 0%    4.954µ ± 0%  -53.72% (p=0.000 n=8)
Hash320Bytes/Sum-8     10.645µ ± 0%    4.872µ ± 0%  -54.23% (p=0.000 n=8)
Hash1K/New-8            29.66µ ± 0%    13.38µ ± 0%  -54.90% (p=0.000 n=8)
Hash1K/Sum-8            29.63µ ± 0%    13.24µ ± 0%  -55.32% (p=0.000 n=8)
Hash8K/New-8            226.8µ ± 1%    104.7µ ± 2%  -53.84% (p=0.000 n=8)
Hash8K/Sum-8            226.7µ ± 1%    102.9µ ± 1%  -54.62% (p=0.000 n=8)
geomean                 19.72µ         9.387µ       -52.40%

                     │   oldsha1    │                newsha1                │
                     │     B/s      │      B/s       vs base                │
Hash8Bytes/New-8       3.572Mi ± 0%    6.504Mi ± 0%   +82.11% (p=0.000 n=8)
Hash8Bytes/Sum-8       3.672Mi ± 0%    6.838Mi ± 0%   +86.23% (p=0.000 n=8)
Hash320Bytes/New-8     28.51Mi ± 0%    61.60Mi ± 0%  +116.02% (p=0.000 n=8)
Hash320Bytes/Sum-8     28.67Mi ± 0%    62.64Mi ± 0%  +118.51% (p=0.000 n=8)
Hash1K/New-8           32.92Mi ± 0%    73.00Mi ± 0%  +121.74% (p=0.000 n=8)
Hash1K/Sum-8           32.96Mi ± 0%    73.76Mi ± 0%  +123.78% (p=0.000 n=8)
Hash8K/New-8           34.44Mi ± 1%    74.61Mi ± 2%  +116.61% (p=0.000 n=8)
Hash8K/Sum-8           34.46Mi ± 1%    75.93Mi ± 1%  +120.37% (p=0.000 n=8)
geomean                18.51Mi         38.89Mi       +110.07%

Change-Id: I3d4d05fe19872412fdf77a337395e0bf84c41dd5
Reviewed-on: https://go-review.googlesource.com/c/go/+/732560
Reviewed-by: Roland Shoemaker
Reviewed-by: Dmitri Shuralyov
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Joel Sing
---

diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go
index 887d8cad01..f32008a13a 100644
--- a/src/crypto/sha1/sha1block_decl.go
+++ b/src/crypto/sha1/sha1block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (386 || arm || loong64) && !purego
+//go:build (386 || arm || loong64 || riscv64) && !purego
 
 package sha1
 
diff --git a/src/crypto/sha1/sha1block_generic.go b/src/crypto/sha1/sha1block_generic.go
index 5989a24347..5dcfe6a36f 100644
--- a/src/crypto/sha1/sha1block_generic.go
+++ b/src/crypto/sha1/sha1block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !s390x) || purego
+//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !riscv64 && !s390x) || purego
 
 package sha1
 
diff --git a/src/crypto/sha1/sha1block_riscv64.s b/src/crypto/sha1/sha1block_riscv64.s
new file mode 100644
index 0000000000..0849694008
--- /dev/null
+++ b/src/crypto/sha1/sha1block_riscv64.s
@@ -0,0 +1,225 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego
+
+#include "textflag.h"
+
+#define LOAD(index) \
+	MOVBU	((index*4)+0)(X29), X5; \
+	MOVBU	((index*4)+1)(X29), X6; \
+	MOVBU	((index*4)+2)(X29), X7; \
+	MOVBU	((index*4)+3)(X29), X8; \
+	SLL	$24, X5; \
+	SLL	$16, X6; \
+	OR	X5, X6, X5; \
+	SLL	$8, X7; \
+	OR	X5, X7, X5; \
+	OR	X5, X8, X5; \
+	MOVW	X5, (index*4)(X19)
+
+#define SHUFFLE(index) \
+	MOVWU	(((index)&0xf)*4)(X19), X5; \
+	MOVWU	(((index-3)&0xf)*4)(X19), X6; \
+	MOVWU	(((index-8)&0xf)*4)(X19), X7; \
+	MOVWU	(((index-14)&0xf)*4)(X19), X8; \
+	XOR	X6, X5; \
+	XOR	X7, X5; \
+	XOR	X8, X5; \
+	RORW	$31, X5; \
+	MOVW	X5, (((index)&0xf)*4)(X19)
+
+// f = d ^ (b & (c ^ d))
+#define FUNC1(a, b, c, d, e) \
+	XOR	c, d, X7; \
+	AND	b, X7; \
+	XOR	d, X7
+
+// f = b ^ c ^ d
+#define FUNC2(a, b, c, d, e) \
+	XOR	b, c, X7; \
+	XOR	d, X7
+
+// f = (b & c) | ((b | c) & d)
+#define FUNC3(a, b, c, d, e) \
+	OR	b, c, X8; \
+	AND	b, c, X6; \
+	AND	d, X8; \
+	OR	X6, X8, X7
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, key) \
+	RORW	$2, b; \
+	ADD	X7, e; \
+	RORW	$27, a, X8; \
+	ADD	X5, e; \
+	ADD	key, e; \
+	ADD	X8, e
+
+#define ROUND1(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, X15)
+
+#define ROUND1x(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, X15)
+
+#define ROUND2(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC2(a, b, c, d, e); \
+	MIX(a, b, c, d, e, X16)
+
+#define ROUND3(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC3(a, b, c, d, e); \
+	MIX(a, b, c, d, e, X17)
+
+#define ROUND4(a, b, c, d, e, index) \
+	SHUFFLE(index); \
+	FUNC4(a, b, c, d, e); \
+	MIX(a, b, c, d, e, X18)
+
+// func block(dig *Digest, p []byte)
+TEXT ·block(SB),NOSPLIT,$64-32
+	MOV	p_base+8(FP), X29
+	MOV	p_len+16(FP), X30
+	SRL	$6, X30
+	SLL	$6, X30
+
+	ADD	X29, X30, X28
+	BEQ	X28, X29, end
+
+	ADD	$8, X2, X19	// message schedule buffer on stack
+
+	MOV	dig+0(FP), X20
+	MOVWU	(0*4)(X20), X10	// a = H0
+	MOVWU	(1*4)(X20), X11	// b = H1
+	MOVWU	(2*4)(X20), X12	// c = H2
+	MOVWU	(3*4)(X20), X13	// d = H3
+	MOVWU	(4*4)(X20), X14	// e = H4
+
+	MOV	$·_K(SB), X21
+	MOVW	(0*4)(X21), X15
+	MOVW	(1*4)(X21), X16
+	MOVW	(2*4)(X21), X17
+	MOVW	(3*4)(X21), X18
+
+loop:
+	MOVW	X10, X22
+	MOVW	X11, X23
+	MOVW	X12, X24
+	MOVW	X13, X25
+	MOVW	X14, X26
+
+	ROUND1(X10, X11, X12, X13, X14, 0)
+	ROUND1(X14, X10, X11, X12, X13, 1)
+	ROUND1(X13, X14, X10, X11, X12, 2)
+	ROUND1(X12, X13, X14, X10, X11, 3)
+	ROUND1(X11, X12, X13, X14, X10, 4)
+	ROUND1(X10, X11, X12, X13, X14, 5)
+	ROUND1(X14, X10, X11, X12, X13, 6)
+	ROUND1(X13, X14, X10, X11, X12, 7)
+	ROUND1(X12, X13, X14, X10, X11, 8)
+	ROUND1(X11, X12, X13, X14, X10, 9)
+	ROUND1(X10, X11, X12, X13, X14, 10)
+	ROUND1(X14, X10, X11, X12, X13, 11)
+	ROUND1(X13, X14, X10, X11, X12, 12)
+	ROUND1(X12, X13, X14, X10, X11, 13)
+	ROUND1(X11, X12, X13, X14, X10, 14)
+	ROUND1(X10, X11, X12, X13, X14, 15)
+
+	ROUND1x(X14, X10, X11, X12, X13, 16)
+	ROUND1x(X13, X14, X10, X11, X12, 17)
+	ROUND1x(X12, X13, X14, X10, X11, 18)
+	ROUND1x(X11, X12, X13, X14, X10, 19)
+
+	ROUND2(X10, X11, X12, X13, X14, 20)
+	ROUND2(X14, X10, X11, X12, X13, 21)
+	ROUND2(X13, X14, X10, X11, X12, 22)
+	ROUND2(X12, X13, X14, X10, X11, 23)
+	ROUND2(X11, X12, X13, X14, X10, 24)
+	ROUND2(X10, X11, X12, X13, X14, 25)
+	ROUND2(X14, X10, X11, X12, X13, 26)
+	ROUND2(X13, X14, X10, X11, X12, 27)
+	ROUND2(X12, X13, X14, X10, X11, 28)
+	ROUND2(X11, X12, X13, X14, X10, 29)
+	ROUND2(X10, X11, X12, X13, X14, 30)
+	ROUND2(X14, X10, X11, X12, X13, 31)
+	ROUND2(X13, X14, X10, X11, X12, 32)
+	ROUND2(X12, X13, X14, X10, X11, 33)
+	ROUND2(X11, X12, X13, X14, X10, 34)
+	ROUND2(X10, X11, X12, X13, X14, 35)
+	ROUND2(X14, X10, X11, X12, X13, 36)
+	ROUND2(X13, X14, X10, X11, X12, 37)
+	ROUND2(X12, X13, X14, X10, X11, 38)
+	ROUND2(X11, X12, X13, X14, X10, 39)
+
+	ROUND3(X10, X11, X12, X13, X14, 40)
+	ROUND3(X14, X10, X11, X12, X13, 41)
+	ROUND3(X13, X14, X10, X11, X12, 42)
+	ROUND3(X12, X13, X14, X10, X11, 43)
+	ROUND3(X11, X12, X13, X14, X10, 44)
+	ROUND3(X10, X11, X12, X13, X14, 45)
+	ROUND3(X14, X10, X11, X12, X13, 46)
+	ROUND3(X13, X14, X10, X11, X12, 47)
+	ROUND3(X12, X13, X14, X10, X11, 48)
+	ROUND3(X11, X12, X13, X14, X10, 49)
+	ROUND3(X10, X11, X12, X13, X14, 50)
+	ROUND3(X14, X10, X11, X12, X13, 51)
+	ROUND3(X13, X14, X10, X11, X12, 52)
+	ROUND3(X12, X13, X14, X10, X11, 53)
+	ROUND3(X11, X12, X13, X14, X10, 54)
+	ROUND3(X10, X11, X12, X13, X14, 55)
+	ROUND3(X14, X10, X11, X12, X13, 56)
+	ROUND3(X13, X14, X10, X11, X12, 57)
+	ROUND3(X12, X13, X14, X10, X11, 58)
+	ROUND3(X11, X12, X13, X14, X10, 59)
+
+	ROUND4(X10, X11, X12, X13, X14, 60)
+	ROUND4(X14, X10, X11, X12, X13, 61)
+	ROUND4(X13, X14, X10, X11, X12, 62)
+	ROUND4(X12, X13, X14, X10, X11, 63)
+	ROUND4(X11, X12, X13, X14, X10, 64)
+	ROUND4(X10, X11, X12, X13, X14, 65)
+	ROUND4(X14, X10, X11, X12, X13, 66)
+	ROUND4(X13, X14, X10, X11, X12, 67)
+	ROUND4(X12, X13, X14, X10, X11, 68)
+	ROUND4(X11, X12, X13, X14, X10, 69)
+	ROUND4(X10, X11, X12, X13, X14, 70)
+	ROUND4(X14, X10, X11, X12, X13, 71)
+	ROUND4(X13, X14, X10, X11, X12, 72)
+	ROUND4(X12, X13, X14, X10, X11, 73)
+	ROUND4(X11, X12, X13, X14, X10, 74)
+	ROUND4(X10, X11, X12, X13, X14, 75)
+	ROUND4(X14, X10, X11, X12, X13, 76)
+	ROUND4(X13, X14, X10, X11, X12, 77)
+	ROUND4(X12, X13, X14, X10, X11, 78)
+	ROUND4(X11, X12, X13, X14, X10, 79)
+
+	ADD	X22, X10
+	ADD	X23, X11
+	ADD	X24, X12
+	ADD	X25, X13
+	ADD	X26, X14
+
+	ADD	$64, X29
+	BNE	X28, X29, loop
+
+end:
+	MOVW	X10, (0*4)(X20)
+	MOVW	X11, (1*4)(X20)
+	MOVW	X12, (2*4)(X20)
+	MOVW	X13, (3*4)(X20)
+	MOVW	X14, (4*4)(X20)
+
+	RET
+
+GLOBL	·_K(SB),RODATA,$16
+DATA	·_K+0(SB)/4, $0x5A827999
+DATA	·_K+4(SB)/4, $0x6ED9EBA1
+DATA	·_K+8(SB)/4, $0x8F1BBCDC
+DATA	·_K+12(SB)/4, $0xCA62C1D6
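
For reference (not part of the change above): a plain-Go sketch of the per-round computation that the SHUFFLE, FUNC1-FUNC4 and MIX macros implement. The helper names (f1, f2, f3, shuffle, round) and the package name are illustrative only, not code from the Go tree.

    package sha1sketch // hypothetical package, for illustration only

    import "math/bits"

    // The three SHA-1 boolean functions, matching the comments on FUNC1,
    // FUNC2/FUNC4 and FUNC3 in the assembly above.
    func f1(b, c, d uint32) uint32 { return d ^ (b & (c ^ d)) }
    func f2(b, c, d uint32) uint32 { return b ^ c ^ d }
    func f3(b, c, d uint32) uint32 { return (b & c) | ((b | c) & d) }

    // shuffle is the SHUFFLE step for round i >= 16. The message schedule is
    // kept in a 16-word circular buffer, so w[i&0xf] currently holds w[i-16],
    // and RORW $31 above is a rotate left by 1.
    func shuffle(w *[16]uint32, i int) uint32 {
    	x := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[i&0xf]
    	x = bits.RotateLeft32(x, 1)
    	w[i&0xf] = x
    	return x
    }

    // round is one ROUND*/MIX step: e accumulates f(b, c, d), the schedule
    // word w, the round constant k (X15..X18 above) and a rotated left by 5
    // (RORW $27), while b is rotated left by 30 (RORW $2). The returned
    // values are the inputs to the next round, which is why each ROUND*
    // invocation in the assembly permutes its register arguments.
    func round(a, b, c, d, e, w, k uint32, f func(b, c, d uint32) uint32) (uint32, uint32, uint32, uint32, uint32) {
    	e += f(b, c, d) + w + k + bits.RotateLeft32(a, 5)
    	return e, a, bits.RotateLeft32(b, 30), c, d
    }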
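
For context (also not part of the change above): the build-tag edits route riscv64 to the assembly-backed declaration in sha1block_decl.go instead of the generic Go implementation. That file's body is not shown in this diff, so the sketch below is an assumption; the signature matches the "// func block(dig *Digest, p []byte)" comment in the assembly, and the //go:noescape directive is assumed here.

    //go:build (386 || arm || loong64 || riscv64) && !purego

    package sha1

    // block is implemented in per-architecture assembly, here
    // sha1block_riscv64.s.
    //go:noescape
    func block(dig *Digest, p []byte)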