From 5752a946776e55af1985b87472c5a9fdb8d7ea03 Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Wed, 13 Sep 2023 10:59:41 +0200
Subject: [PATCH] crypto/md5: provide optimised assembly for riscv64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Provide an optimised assembly implementation of MD5 for riscv64. The
performance improvements are significant: a geomean of 15.8% with
GORISCV64=rva20u64 and 23.2% with GORISCV64=rva22u64 on the benchmarks
below. The assembler takes advantage of Zbb instructions when they are
available, i.e., when building with GORISCV64=rva22u64.

Results for the VisionFive 2 running Ubuntu 24.04 with
GORISCV64=rva20u64.

goos: linux
goarch: riscv64
pkg: crypto/md5
                      │ md5_go.txt  │             md5_ass.txt              │
                      │   sec/op    │    sec/op     vs base                │
Hash8Bytes              1.202µ ± 0%   1.220µ ± 0%    +1.50% (p=0.000 n=10)
Hash64                  1.665µ ± 0%   1.518µ ± 0%    -8.83% (p=0.000 n=10)
Hash128                 2.165µ ± 0%   1.885µ ± 0%   -12.94% (p=0.000 n=10)
Hash256                 3.162µ ± 0%   2.613µ ± 0%   -17.38% (p=0.000 n=10)
Hash512                 5.146µ ± 0%   4.063µ ± 0%   -21.05% (p=0.000 n=10)
Hash1K                  9.115µ ± 0%   6.959µ ± 0%   -23.65% (p=0.000 n=10)
Hash8K                  64.68µ ± 0%   47.52µ ± 0%   -26.54% (p=0.000 n=10)
Hash1M                  8.131m ± 0%   5.936m ± 0%   -27.00% (p=0.000 n=10)
Hash8M                  65.06m ± 0%   47.50m ± 0%   -26.99% (p=0.000 n=10)
Hash8BytesUnaligned     1.210µ ± 0%   1.199µ ± 0%    -0.91% (p=0.000 n=10)
Hash1KUnaligned         9.114µ ± 0%   8.266µ ± 0%    -9.30% (p=0.000 n=10)
Hash8KUnaligned         64.68µ ± 0%   57.97µ ± 0%   -10.38% (p=0.000 n=10)
geomean                 22.37µ        18.83µ        -15.82%

Results for the VisionFive 2 running Ubuntu 24.04 with
GORISCV64=rva22u64.

goos: linux
goarch: riscv64
pkg: crypto/md5
                      │ md5_g22.txt │             md5_a22.txt              │
                      │   sec/op    │    sec/op     vs base                │
Hash8Bytes              1.175µ ± 0%   1.002µ ± 0%   -14.72% (p=0.000 n=10)
Hash64                  1.575µ ± 0%   1.274µ ± 0%   -19.11% (p=0.000 n=10)
Hash128                 2.033µ ± 0%   1.587µ ± 0%   -21.92% (p=0.000 n=10)
Hash256                 2.943µ ± 0%   2.209µ ± 0%   -24.93% (p=0.000 n=10)
Hash512                 4.755µ ± 0%   3.443µ ± 0%   -27.58% (p=0.000 n=10)
Hash1K                  8.378µ ± 0%   5.910µ ± 0%   -29.46% (p=0.000 n=10)
Hash8K                  59.12µ ± 0%   40.45µ ± 0%   -31.58% (p=0.000 n=10)
Hash1M                  7.426m ± 0%   5.056m ± 0%   -31.92% (p=0.000 n=10)
Hash8M                  59.41m ± 0%   40.45m ± 0%   -31.91% (p=0.000 n=10)
Hash8BytesUnaligned     1.169µ ± 0%   1.012µ ± 0%   -13.43% (p=0.000 n=10)
Hash1KUnaligned         8.379µ ± 0%   7.213µ ± 0%   -13.91% (p=0.000 n=10)
Hash8KUnaligned         59.12µ ± 0%   50.90µ ± 0%   -13.91% (p=0.000 n=10)
geomean                 20.83µ        15.99µ        -23.21%

Change-Id: I61e3fa802c2cc50e0b5f71f151b4741691ccb481
Reviewed-on: https://go-review.googlesource.com/c/go/+/527936
Reviewed-by: Joel Sing
Auto-Submit: Tim King
Reviewed-by: Dmitri Shuralyov
Reviewed-by: Meng Zhuo
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Tim King
---
 src/crypto/md5/md5block_decl.go    |   2 +-
 src/crypto/md5/md5block_generic.go |   2 +-
 src/crypto/md5/md5block_riscv64.s  | 281 +++++++++++++++++++++++++++++
 3 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 src/crypto/md5/md5block_riscv64.s

diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go
index 57b7462bb2..0af9c69a5c 100644
--- a/src/crypto/md5/md5block_decl.go
+++ b/src/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || s390x) && !purego
+//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || riscv64 || s390x) && !purego
 
 package md5
 
diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go
index d6b852db91..22d0831300 100644
--- a/src/crypto/md5/md5block_generic.go
+++ b/src/crypto/md5/md5block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !s390x) || purego
+//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !riscv64 && !s390x) || purego
 
 package md5
 
diff --git a/src/crypto/md5/md5block_riscv64.s b/src/crypto/md5/md5block_riscv64.s
new file mode 100644
index 0000000000..017c70b936
--- /dev/null
+++ b/src/crypto/md5/md5block_riscv64.s
@@ -0,0 +1,281 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// RISCV64 version of md5block.go
+// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go
+
+//go:build !purego
+
+#include "textflag.h"
+
+#define LOAD32U(base, offset, tmp, dest) \
+	MOVBU	(offset+0*1)(base), dest; \
+	MOVBU	(offset+1*1)(base), tmp; \
+	SLL	$8, tmp; \
+	OR	tmp, dest; \
+	MOVBU	(offset+2*1)(base), tmp; \
+	SLL	$16, tmp; \
+	OR	tmp, dest; \
+	MOVBU	(offset+3*1)(base), tmp; \
+	SLL	$24, tmp; \
+	OR	tmp, dest
+
+#define LOAD64U(base, offset, tmp1, tmp2, dst) \
+	LOAD32U(base, offset, tmp1, dst); \
+	LOAD32U(base, offset+4, tmp1, tmp2); \
+	SLL	$32, tmp2; \
+	OR	tmp2, dst
+
+#define ROUND1EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	AND	b, X23; \
+	XOR	d, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND1ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	AND	b, X23; \
+	XOR	d, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND2EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	XOR	b, c, X23; \
+	AND	d, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND2ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	XOR	b, c, X23; \
+	AND	d, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND3EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	XOR	b, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND3ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	XOR	b, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND4EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	ORN	d, b, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND4ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	ORN	d, b, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+// Register use for the block function
+//
+// X5 - X12  : contain the 16 32-bit data items in the block we're
+//             processing. Odd numbered values, e.g., x1, x3, are stored
+//             in the upper 32 bits of the register.
+// X13 - X16 : a, b, c, d
+// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb,
+//             cc, dd. X17 and X18 are also used as temporary registers
+//             when loading unaligned data.
+// X22       : pointer to dig.s
+// X23       : temporary register
+// X28       : pointer to the first byte beyond the end of p
+// X29       : pointer to the current 64-byte block of data, initially
+//             set to &p[0]
+// X30       : temporary register
+
+TEXT ·block(SB),NOSPLIT,$0-32
+	MOV	p+8(FP), X29
+	MOV	p_len+16(FP), X30
+	SRL	$6, X30
+	SLL	$6, X30
+	BEQZ	X30, zero
+
+	ADD	X29, X30, X28
+
+	MOV	dig+0(FP), X22
+	MOVWU	(0*4)(X22), X13	// a = s[0]
+	MOVWU	(1*4)(X22), X14	// b = s[1]
+	MOVWU	(2*4)(X22), X15	// c = s[2]
+	MOVWU	(3*4)(X22), X16	// d = s[3]
+
+loop:
+
+	// Load the 64 bytes of data in x0-15 into eight 64-bit registers,
+	// X5-X12. Different paths are taken to load the values depending on
+	// whether the buffer is 8-byte aligned or not. We load all the
+	// values up front here at the start of the loop to avoid multiple
+	// alignment checks and to reduce code size. It takes 10 instructions
+	// to load an unaligned 32-bit value, and this value will be used 4
+	// times in the main body of the loop below.
+
+	AND	$7, X29, X30
+	BEQZ	X30, aligned
+
+	LOAD64U(X29,0, X17, X18, X5)
+	LOAD64U(X29,8, X17, X18, X6)
+	LOAD64U(X29,16, X17, X18, X7)
+	LOAD64U(X29,24, X17, X18, X8)
+	LOAD64U(X29,32, X17, X18, X9)
+	LOAD64U(X29,40, X17, X18, X10)
+	LOAD64U(X29,48, X17, X18, X11)
+	LOAD64U(X29,56, X17, X18, X12)
+	JMP	block_loaded
+
+aligned:
+	MOV	(0*8)(X29), X5
+	MOV	(1*8)(X29), X6
+	MOV	(2*8)(X29), X7
+	MOV	(3*8)(X29), X8
+	MOV	(4*8)(X29), X9
+	MOV	(5*8)(X29), X10
+	MOV	(6*8)(X29), X11
+	MOV	(7*8)(X29), X12
+
+block_loaded:
+	MOV	X13, X17
+	MOV	X14, X18
+	MOV	X15, X19
+	MOV	X16, X20
+
+	// Some of the hex constants below are too large to fit into a
+	// signed 32-bit value. The assembler will handle these constants
+	// in a special way to ensure that they are zero extended. Our
+	// algorithm is only interested in the bottom 32 bits and doesn't
+	// care whether constants are sign or zero extended when moved into
+	// 64-bit registers, so we use signed constants instead of hex when
+	// bit 31 is set, so that all constants can be loaded by lui+addi.
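+	//
+	// For example, the first round 1 constant, 0xd76aa478, has bit 31
+	// set and so appears below as the signed value -680876936.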
+
+	ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478
+	ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756
+	ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db
+	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
+	ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf
+	ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a
+	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
+	ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501
+	ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8
+	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
+	ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1
+	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
+	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
+	ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193
+	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
+	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821
+
+	ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // 0xf61e2562
+	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // 0xc040b340
+	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 0x265e5a51
+	ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // 0xe9b6c7aa
+	ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // 0xd62f105d
+	ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 0x2441453
+	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // 0xd8a1e681
+	ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // 0xe7d3fbc8
+	ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 0x21e1cde6
+	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // 0xc33707d6
+	ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // 0xf4d50d87
+	ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 0x455a14ed
+	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // 0xa9e3e905
+	ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // 0xfcefa3f8
+	ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 0x676f02d9
+	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 0x8d2a4c8a
+
+	ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // 0xfffa3942
+	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 0x8771f681
+	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 0x6d9d6122
+	ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // 0xfde5380c
+	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // 0xa4beea44
+	ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 0x4bdecfa9
+	ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // 0xf6bb4b60
+	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // 0xbebfbc70
+	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 0x289b7ec6
+	ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // 0xeaa127fa
+	ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // 0xd4ef3085
+	ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 0x4881d05
+	ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // 0xd9d4d039
+	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // 0xe6db99e5
+	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 0x1fa27cf8
+	ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // 0xc4ac5665
+
+	ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // 0xf4292244
+	ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 0x432aff97
+	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // 0xab9423a7
+	ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // 0xfc93a039
+	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 0x655b59c3
+	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 0x8f0ccc92
+	ROUND4EVN(X15,X16,X13,X14,X10, -1051523,15); // 0xffeff47d
+	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 0x85845dd1
+	ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 0x6fa87e4f
+	ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // 0xfe2ce6e0
+	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // 0xa3014314
+	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 0x4e0811a1
+	ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // 0xf7537e82
+	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // 0xbd3af235
+	ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 0x2ad7d2bb
+	ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // 0xeb86d391
+
+	ADDW	X17, X13
+	ADDW	X18, X14
+	ADDW	X19, X15
+	ADDW	X20, X16
+
+	ADD	$64, X29
+	BNE	X28, X29, loop
+
+	MOVW	X13, (0*4)(X22)
+	MOVW	X14, (1*4)(X22)
+	MOVW	X15, (2*4)(X22)
+	MOVW	X16, (3*4)(X22)
+
+zero:
+	RET
-- 
2.48.1
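
For anyone cross-checking the assembly against the portable md5block.go, the
following standalone Go sketch (not part of the patch) shows what the
unaligned-load and round 1 macros compute; the names load32u and round1 are
illustrative only. F(b,c,d) = (b&c)|(^b&d) is computed as ((c^d)&b)^d, exactly
as the ROUND1 macros do with the XOR/AND/XOR sequence, and a RORIW by
(32-shift) is the same as a rotate left by shift. ROUND1ODD differs only in
fetching x from the upper 32 bits of its register via SRL $32.

package main

import (
	"fmt"
	"math/bits"
)

// load32u mirrors the LOAD32U macro: a little-endian 32-bit load assembled
// from four byte loads, so it is safe at any alignment.
func load32u(p []byte) uint32 {
	return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
}

// round1 mirrors one ROUND1EVN step:
//	a += x + const
//	a += F(b, c, d)         // F(b,c,d) computed as ((c^d)&b)^d
//	a = rotl(a, shift) + b  // RORIW by (32-shift) == rotate left by shift
func round1(a, b, c, d, x, k uint32, shift int) uint32 {
	a += x + k
	a += ((c ^ d) & b) ^ d
	return bits.RotateLeft32(a, shift) + b
}

func main() {
	fmt.Printf("%#x\n", load32u([]byte{0x78, 0x56, 0x34, 0x12})) // 0x12345678
	// First MD5 step: initial chaining values, x0 = 0, K = 0xd76aa478, s = 7.
	a, b, c, d := uint32(0x67452301), uint32(0xefcdab89), uint32(0x98badcfe), uint32(0x10325476)
	fmt.Printf("%#x\n", round1(a, b, c, d, 0, 0xd76aa478, 7))
}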