// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64

package bytealg
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le
+//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64

package bytealg
--- /dev/null
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
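+// func Compare(a, b []byte) int
+//
+// Compare returns -1, 0 or +1 depending on whether a sorts before,
+// the same as, or after b.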
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOV a_base+0(FP), X5
+ MOV a_len+8(FP), X6
+ MOV b_base+24(FP), X7
+ MOV b_len+32(FP), X8
+ MOV $ret+48(FP), X9
+ JMP compare<>(SB)
+
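+// runtime·cmpstring has the same semantics as Compare, but its arguments
+// are strings (pointer and length only, 16 bytes each) rather than slices
+// (pointer, length and capacity, 24 bytes each), hence the different
+// frame offsets.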
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+ MOV a_base+0(FP), X5
+ MOV a_len+8(FP), X6
+ MOV b_base+16(FP), X7
+ MOV b_len+24(FP), X8
+ MOV $ret+32(FP), X9
+ JMP compare<>(SB)
+
+// On entry:
+// X5 points to start of a
+// X6 length of a
+// X7 points to start of b
+// X8 length of b
+// X9 points to the address to store the return value (-1/0/1)
+TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
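+ // If a and b share a base pointer the overlapping prefix is identical,
+ // so only the lengths need to be compared.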
+ BEQ X5, X7, cmp_len
+
+ MOV X6, X10
+ BGE X8, X10, use_a_len // X10 = min(len(a), len(b))
+ MOV X8, X10
+use_a_len:
+ BEQZ X10, cmp_len
+
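+ // Inputs shorter than 32 bytes are compared with the byte-load loops
+ // below; longer inputs use 8-byte loads where alignment allows.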
+ MOV $32, X11
+ BLT X10, X11, loop4_check
+
+ // Check alignment - if the alignment of a and b differs we have to
+ // compare one byte at a time.
+ AND $7, X5, X12
+ AND $7, X7, X13
+ BNE X12, X13, loop4_check
+ BEQZ X12, loop32_check
+
+ // Compare one byte at a time until X5 and X7 reach 8 byte alignment.
+ SUB X12, X0, X12
+ ADD $8, X12 // X12 = 8 - (X5 & 7), the number of bytes until alignment
+ SUB X12, X10, X10
+align:
+ ADD $-1, X12
+ MOVBU 0(X5), X13
+ MOVBU 0(X7), X14
+ BNE X13, X14, cmp
+ ADD $1, X5
+ ADD $1, X7
+ BNEZ X12, align
+
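+ // Compare 32 bytes per iteration via four 8-byte loads from each input.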
+loop32_check:
+ MOV $32, X12
+ BLT X10, X12, loop16_check
+loop32:
+ MOV 0(X5), X15
+ MOV 0(X7), X16
+ MOV 8(X5), X17
+ MOV 8(X7), X18
+ BEQ X15, X16, loop32a
+ JMP cmp8a
+loop32a:
+ BEQ X17, X18, loop32b
+ JMP cmp8b
+loop32b:
+ MOV 16(X5), X15
+ MOV 16(X7), X16
+ MOV 24(X5), X17
+ MOV 24(X7), X18
+ BEQ X15, X16, loop32c
+ JMP cmp8a
+loop32c:
+ BEQ X17, X18, loop32d
+ JMP cmp8b
+loop32d:
+ ADD $32, X5
+ ADD $32, X7
+ ADD $-32, X10
+ BGE X10, X12, loop32
+ BEQZ X10, cmp_len
+
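+ // Compare 16 bytes per iteration via two 8-byte loads from each input.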
+loop16_check:
+ MOV $16, X11
+ BLT X10, X11, loop4_check
+loop16:
+ MOV 0(X5), X15
+ MOV 0(X7), X16
+ MOV 8(X5), X17
+ MOV 8(X7), X18
+ BEQ X15, X16, loop16a
+ JMP cmp8a
+loop16a:
+ BEQ X17, X18, loop16b
+ JMP cmp8b
+loop16b:
+ ADD $16, X5
+ ADD $16, X7
+ ADD $-16, X10
+ BGE X10, X11, loop16
+ BEQZ X10, cmp_len
+
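+ // Compare 4 bytes per iteration using byte loads (also the path taken
+ // for short or differently aligned inputs).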
+loop4_check:
+ MOV $4, X11
+ BLT X10, X11, loop1
+loop4:
+ MOVBU 0(X5), X13
+ MOVBU 0(X7), X14
+ MOVBU 1(X5), X15
+ MOVBU 1(X7), X16
+ BEQ X13, X14, loop4a
+ SLTU X14, X13, X10
+ SLTU X13, X14, X11
+ JMP cmp_ret
+loop4a:
+ BEQ X15, X16, loop4b
+ SLTU X16, X15, X10
+ SLTU X15, X16, X11
+ JMP cmp_ret
+loop4b:
+ MOVBU 2(X5), X21
+ MOVBU 2(X7), X22
+ MOVBU 3(X5), X23
+ MOVBU 3(X7), X24
+ BEQ X21, X22, loop4c
+ SLTU X22, X21, X10
+ SLTU X21, X22, X11
+ JMP cmp_ret
+loop4c:
+ BEQ X23, X24, loop4d
+ SLTU X24, X23, X10
+ SLTU X23, X24, X11
+ JMP cmp_ret
+loop4d:
+ ADD $4, X5
+ ADD $4, X7
+ ADD $-4, X10
+ BGE X10, X11, loop4
+
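+ // Compare any remaining bytes one at a time.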
+loop1:
+ BEQZ X10, cmp_len
+ MOVBU 0(X5), X13
+ MOVBU 0(X7), X14
+ BNE X13, X14, cmp
+ ADD $1, X5
+ ADD $1, X7
+ ADD $-1, X10
+ JMP loop1
+
+ // Compare 8 bytes of memory in X15/X16 that are known to differ.
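+ // riscv64 is little-endian, so the least significant byte of the word is
+ // the lowest-addressed byte; scanning upward from it finds the first
+ // differing byte, which decides the ordering.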
+cmp8a:
+ MOV $0xff, X19
+cmp8a_loop:
+ AND X15, X19, X13
+ AND X16, X19, X14
+ BNE X13, X14, cmp
+ SLLI $8, X19
+ JMP cmp8a_loop
+
+ // Compare 8 bytes of memory in X17/X18 that are known to differ.
+cmp8b:
+ MOV $0xff, X19
+cmp8b_loop:
+ AND X17, X19, X13
+ AND X18, X19, X14
+ BNE X13, X14, cmp
+ SLLI $8, X19
+ JMP cmp8b_loop
+
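+ // The compared prefix is identical, so the result is decided by the
+ // lengths alone.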
+cmp_len:
+ MOV X6, X13
+ MOV X8, X14
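+ // X10/X11 record which operand is smaller; the final result X11 - X10
+ // is -1 if a < b, 0 if they are equal and +1 if a > b.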
+cmp:
+ SLTU X14, X13, X10
+ SLTU X13, X14, X11
+cmp_ret:
+ SUB X10, X11, X12
+ MOV X12, (X9)
+ RET
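
As a quick reference, a minimal sketch of the behaviour the routine above implements, exercised through the public bytes package (bytes.Compare dispatches to internal/bytealg.Compare); the inputs are illustrative only:

	package main

	import (
		"bytes"
		"fmt"
	)

	func main() {
		fmt.Println(bytes.Compare([]byte("abc"), []byte("abd")))  // -1: a sorts before b
		fmt.Println(bytes.Compare([]byte("abc"), []byte("abc")))  // 0: equal
		fmt.Println(bytes.Compare([]byte("abcd"), []byte("abc"))) // 1: b is a proper prefix of a
	}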