From 5752a946776e55af1985b87472c5a9fdb8d7ea03 Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Wed, 13 Sep 2023 10:59:41 +0200
Subject: [PATCH] crypto/md5: provide optimised assembly for riscv64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Provide an optimised assembly implementation of MD5 for riscv64. The
performance improvements are significant: a geomean of 15.8% with
GORISCV64=rva20u64 and 23.2% with GORISCV64=rva22u64 on the benchmarks
below. The assembler takes advantage of Zbb instructions when they are
available, i.e., when building with GORISCV64=rva22u64.

Results for the VisionFive 2 running Ubuntu 24.04 with
GORISCV64=rva20u64.

goos: linux
goarch: riscv64
pkg: crypto/md5
                      │ md5_go.txt  │             md5_ass.txt              │
                      │   sec/op    │    sec/op     vs base                │
Hash8Bytes              1.202µ ± 0%   1.220µ ± 0%    +1.50% (p=0.000 n=10)
Hash64                  1.665µ ± 0%   1.518µ ± 0%    -8.83% (p=0.000 n=10)
Hash128                 2.165µ ± 0%   1.885µ ± 0%   -12.94% (p=0.000 n=10)
Hash256                 3.162µ ± 0%   2.613µ ± 0%   -17.38% (p=0.000 n=10)
Hash512                 5.146µ ± 0%   4.063µ ± 0%   -21.05% (p=0.000 n=10)
Hash1K                  9.115µ ± 0%   6.959µ ± 0%   -23.65% (p=0.000 n=10)
Hash8K                  64.68µ ± 0%   47.52µ ± 0%   -26.54% (p=0.000 n=10)
Hash1M                  8.131m ± 0%   5.936m ± 0%   -27.00% (p=0.000 n=10)
Hash8M                  65.06m ± 0%   47.50m ± 0%   -26.99% (p=0.000 n=10)
Hash8BytesUnaligned     1.210µ ± 0%   1.199µ ± 0%    -0.91% (p=0.000 n=10)
Hash1KUnaligned         9.114µ ± 0%   8.266µ ± 0%    -9.30% (p=0.000 n=10)
Hash8KUnaligned         64.68µ ± 0%   57.97µ ± 0%   -10.38% (p=0.000 n=10)
geomean                 22.37µ        18.83µ        -15.82%

Results for the VisionFive 2 running Ubuntu 24.04 with
GORISCV64=rva22u64.

goos: linux
goarch: riscv64
pkg: crypto/md5
                      │ md5_g22.txt │             md5_a22.txt              │
                      │   sec/op    │    sec/op     vs base                │
Hash8Bytes              1.175µ ± 0%   1.002µ ± 0%   -14.72% (p=0.000 n=10)
Hash64                  1.575µ ± 0%   1.274µ ± 0%   -19.11% (p=0.000 n=10)
Hash128                 2.033µ ± 0%   1.587µ ± 0%   -21.92% (p=0.000 n=10)
Hash256                 2.943µ ± 0%   2.209µ ± 0%   -24.93% (p=0.000 n=10)
Hash512                 4.755µ ± 0%   3.443µ ± 0%   -27.58% (p=0.000 n=10)
Hash1K                  8.378µ ± 0%   5.910µ ± 0%   -29.46% (p=0.000 n=10)
Hash8K                  59.12µ ± 0%   40.45µ ± 0%   -31.58% (p=0.000 n=10)
Hash1M                  7.426m ± 0%   5.056m ± 0%   -31.92% (p=0.000 n=10)
Hash8M                  59.41m ± 0%   40.45m ± 0%   -31.91% (p=0.000 n=10)
Hash8BytesUnaligned     1.169µ ± 0%   1.012µ ± 0%   -13.43% (p=0.000 n=10)
Hash1KUnaligned         8.379µ ± 0%   7.213µ ± 0%   -13.91% (p=0.000 n=10)
Hash8KUnaligned         59.12µ ± 0%   50.90µ ± 0%   -13.91% (p=0.000 n=10)
geomean                 20.83µ        15.99µ        -23.21%

Change-Id: I61e3fa802c2cc50e0b5f71f151b4741691ccb481
Reviewed-on: https://go-review.googlesource.com/c/go/+/527936
Reviewed-by: Joel Sing
Auto-Submit: Tim King
Reviewed-by: Dmitri Shuralyov
Reviewed-by: Meng Zhuo
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Tim King
---
 src/crypto/md5/md5block_decl.go    |   2 +-
 src/crypto/md5/md5block_generic.go |   2 +-
 src/crypto/md5/md5block_riscv64.s  | 281 +++++++++++++++++++++++++++++
 3 files changed, 283 insertions(+), 2 deletions(-)
 create mode 100644 src/crypto/md5/md5block_riscv64.s

diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go
index 57b7462bb2..0af9c69a5c 100644
--- a/src/crypto/md5/md5block_decl.go
+++ b/src/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || s390x) && !purego
+//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || riscv64 || s390x) && !purego
 
 package md5
 
diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go
index d6b852db91..22d0831300 100644
--- a/src/crypto/md5/md5block_generic.go
+++ b/src/crypto/md5/md5block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !s390x) || purego
+//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !riscv64 && !s390x) || purego
 
 package md5
 
diff --git a/src/crypto/md5/md5block_riscv64.s b/src/crypto/md5/md5block_riscv64.s
new file mode 100644
index 0000000000..017c70b936
--- /dev/null
+++ b/src/crypto/md5/md5block_riscv64.s
@@ -0,0 +1,281 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// RISCV64 version of md5block.go
+// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go
+
+//go:build !purego
+
+#include "textflag.h"
+
+#define LOAD32U(base, offset, tmp, dest) \
+	MOVBU	(offset+0*1)(base), dest; \
+	MOVBU	(offset+1*1)(base), tmp; \
+	SLL	$8, tmp; \
+	OR	tmp, dest; \
+	MOVBU	(offset+2*1)(base), tmp; \
+	SLL	$16, tmp; \
+	OR	tmp, dest; \
+	MOVBU	(offset+3*1)(base), tmp; \
+	SLL	$24, tmp; \
+	OR	tmp, dest
+
+#define LOAD64U(base, offset, tmp1, tmp2, dst) \
+	LOAD32U(base, offset, tmp1, dst); \
+	LOAD32U(base, offset+4, tmp1, tmp2); \
+	SLL	$32, tmp2; \
+	OR	tmp2, dst
+
+#define ROUND1EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	AND	b, X23; \
+	XOR	d, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND1ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	AND	b, X23; \
+	XOR	d, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND2EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	XOR	b, c, X23; \
+	AND	d, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND2ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	XOR	b, c, X23; \
+	AND	d, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND3EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	XOR	b, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND3ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	XOR	c, d, X23; \
+	XOR	b, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND4EVN(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	x, a; \
+	ADDW	X23, a; \
+	ORN	d, b, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+#define ROUND4ODD(a, b, c, d, x, const, shift) \
+	MOV	$const, X23; \
+	ADDW	X23, a; \
+	SRL	$32, x, X23; \
+	ADDW	X23, a; \
+	ORN	d, b, X23; \
+	XOR	c, X23; \
+	ADDW	X23, a; \
+	RORIW	$(32-shift), a; \
+	ADDW	b, a
+
+// Register use for the block function
+//
+// X5 - X12  : contain the 16 32-bit data items in the block we're
+//             processing. Odd numbered values, e.g., x1, x3, are stored
+//             in the upper 32 bits of the register.
+// X13 - X16 : a, b, c, d
+// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb,
+//             cc, dd. X17 and X18 are also used as temporary registers
+//             when loading unaligned data.
+// X22       : pointer to dig.s
+// X23       : temporary register
+// X28       : pointer to the first byte beyond the end of p
+// X29       : pointer to the current 64-byte block of data, initially
+//             set to &p[0]
+// X30       : temporary register
+
+TEXT ·block(SB),NOSPLIT,$0-32
+	MOV	p+8(FP), X29
+	MOV	p_len+16(FP), X30
+	SRL	$6, X30
+	SLL	$6, X30
+	BEQZ	X30, zero
+
+	ADD	X29, X30, X28
+
+	MOV	dig+0(FP), X22
+	MOVWU	(0*4)(X22), X13	// a = s[0]
+	MOVWU	(1*4)(X22), X14	// b = s[1]
+	MOVWU	(2*4)(X22), X15	// c = s[2]
+	MOVWU	(3*4)(X22), X16	// d = s[3]
+
+loop:
+
+	// Load the 64 bytes of data in x0-15 into eight 64-bit registers,
+	// X5-X12. Different paths are taken to load the values depending on
+	// whether the buffer is 8-byte aligned or not. We load all the
+	// values up front here at the start of the loop to avoid multiple
+	// alignment checks and to reduce code size. It takes 10 instructions
+	// to load an unaligned 32-bit value, and this value will be used 4
+	// times in the main body of the loop below.
+
+	AND	$7, X29, X30
+	BEQZ	X30, aligned
+
+	LOAD64U(X29,0, X17, X18, X5)
+	LOAD64U(X29,8, X17, X18, X6)
+	LOAD64U(X29,16, X17, X18, X7)
+	LOAD64U(X29,24, X17, X18, X8)
+	LOAD64U(X29,32, X17, X18, X9)
+	LOAD64U(X29,40, X17, X18, X10)
+	LOAD64U(X29,48, X17, X18, X11)
+	LOAD64U(X29,56, X17, X18, X12)
+	JMP	block_loaded
+
+aligned:
+	MOV	(0*8)(X29), X5
+	MOV	(1*8)(X29), X6
+	MOV	(2*8)(X29), X7
+	MOV	(3*8)(X29), X8
+	MOV	(4*8)(X29), X9
+	MOV	(5*8)(X29), X10
+	MOV	(6*8)(X29), X11
+	MOV	(7*8)(X29), X12
+
+block_loaded:
+	MOV	X13, X17
+	MOV	X14, X18
+	MOV	X15, X19
+	MOV	X16, X20
+
+	// Some of the hex constants below are too large to fit into a
+	// signed 32-bit value. The assembler will handle these constants
+	// in a special way to ensure that they are zero extended. Our
+	// algorithm is only interested in the bottom 32 bits and doesn't
+	// care whether constants are sign or zero extended when moved into
+	// 64-bit registers, so we use signed constants instead of hex when
+	// bit 31 is set, so that all constants can be loaded by lui+addi.
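+	//
+	// For example, the first round 1 constant, 0xd76aa478, has bit 31
+	// set and so appears below as the signed value -680876936.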
+
+	ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478
+	ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756
+	ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db
+	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
+	ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf
+	ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a
+	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
+	ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501
+	ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8
+	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
+	ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1
+	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
+	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
+	ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193
+	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
+	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821
+
+	ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // 0xf61e2562
+	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // 0xc040b340
+	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 0x265e5a51
+	ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // 0xe9b6c7aa
+	ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // 0xd62f105d
+	ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 0x2441453
+	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // 0xd8a1e681
+	ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // 0xe7d3fbc8
+	ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 0x21e1cde6
+	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // 0xc33707d6
+	ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // 0xf4d50d87
+	ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 0x455a14ed
+	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // 0xa9e3e905
+	ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // 0xfcefa3f8
+	ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 0x676f02d9
+	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 0x8d2a4c8a
+
+	ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // 0xfffa3942
+	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 0x8771f681
+	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 0x6d9d6122
+	ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // 0xfde5380c
+	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // 0xa4beea44
+	ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 0x4bdecfa9
+	ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // 0xf6bb4b60
+	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // 0xbebfbc70
+	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 0x289b7ec6
+	ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // 0xeaa127fa
+	ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // 0xd4ef3085
+	ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 0x4881d05
+	ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // 0xd9d4d039
+	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // 0xe6db99e5
+	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 0x1fa27cf8
+	ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // 0xc4ac5665
+
+	ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // 0xf4292244
+	ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 0x432aff97
+	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // 0xab9423a7
+	ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // 0xfc93a039
+	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 0x655b59c3
+	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 0x8f0ccc92
+	ROUND4EVN(X15,X16,X13,X14,X10, -1051523,15); // 0xffeff47d
+	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 0x85845dd1
+	ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 0x6fa87e4f
+	ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // 0xfe2ce6e0
+	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // 0xa3014314
+	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 0x4e0811a1
+	ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // 0xf7537e82
+	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // 0xbd3af235
+	ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 0x2ad7d2bb
+	ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // 0xeb86d391
+
+	ADDW	X17, X13
+	ADDW	X18, X14
+	ADDW	X19, X15
+	ADDW	X20, X16
+
+	ADD	$64, X29
+	BNE	X28, X29, loop
+
+	MOVW	X13, (0*4)(X22)
+	MOVW	X14, (1*4)(X22)
+	MOVW	X15, (2*4)(X22)
+	MOVW	X16, (3*4)(X22)
+
+zero:
+	RET
-- 
2.48.1
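
For anyone cross-checking the assembly against the portable md5block.go, the
following standalone Go sketch (not part of the patch) shows what the
unaligned-load and round 1 macros compute; the names load32u and round1 are
illustrative only. F(b,c,d) = (b&c)|(^b&d) is computed as ((c^d)&b)^d, exactly
as the ROUND1 macros do with the XOR/AND/XOR sequence, and a RORIW by
(32-shift) is the same as a rotate left by shift. ROUND1ODD differs only in
fetching x from the upper 32 bits of its register via SRL $32.

package main

import (
	"fmt"
	"math/bits"
)

// load32u mirrors the LOAD32U macro: a little-endian 32-bit load assembled
// from four byte loads, so it is safe at any alignment.
func load32u(p []byte) uint32 {
	return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
}

// round1 mirrors one ROUND1EVN step:
//	a += x + const
//	a += F(b, c, d)         // F(b,c,d) computed as ((c^d)&b)^d
//	a = rotl(a, shift) + b  // RORIW by (32-shift) == rotate left by shift
func round1(a, b, c, d, x, k uint32, shift int) uint32 {
	a += x + k
	a += ((c ^ d) & b) ^ d
	return bits.RotateLeft32(a, shift) + b
}

func main() {
	fmt.Printf("%#x\n", load32u([]byte{0x78, 0x56, 0x34, 0x12})) // 0x12345678
	// First MD5 step: initial chaining values, x0 = 0, K = 0xd76aa478, s = 7.
	a, b, c, d := uint32(0x67452301), uint32(0xefcdab89), uint32(0x98badcfe), uint32(0x10325476)
	fmt.Printf("%#x\n", round1(a, b, c, d, 0, 0xd76aa478, 7))
}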