From 5b17e2f92782bd81589b89d4cd9fbb26cae2bcd5 Mon Sep 17 00:00:00 2001
From: Xiaolin Zhao
Date: Fri, 16 May 2025 11:05:03 +0800
Subject: [PATCH] crypto/subtle: optimize function xorBytes using SIMD on loong64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

On the Loongson-3A6000-HV and Loongson-3A5000, there has been a
significant improvement in all performance metrics except for '8Bytes',
which has experienced a decline, as follows.

goos: linux
goarch: loong64
pkg: crypto/subtle
cpu: Loongson-3A6000-HV @ 2500.00MHz
                                     |  bench.old  |             bench.new              |
                                     |   sec/op    |   sec/op      vs base              |
XORBytes/8Bytes                        7.282n ± 0%    8.805n ± 0%  +20.91% (p=0.000 n=10)
XORBytes/128Bytes                      14.43n ± 0%    10.01n ± 0%  -30.63% (p=0.000 n=10)
XORBytes/2048Bytes                    110.60n ± 0%    46.57n ± 0%  -57.89% (p=0.000 n=10)
XORBytes/8192Bytes                     418.7n ± 0%    161.8n ± 0%  -61.36% (p=0.000 n=10)
XORBytes/32768Bytes                    3.220µ ± 0%    1.673µ ± 0%  -48.04% (p=0.000 n=10)
XORBytesAlignment/8Bytes0Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes1Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes2Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes3Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes4Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes5Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes6Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/8Bytes7Offset        7.621n ± 0%    9.305n ± 0%  +22.10% (p=0.000 n=10)
XORBytesAlignment/128Bytes0Offset     14.430n ± 0%    9.973n ± 0%  -30.88% (p=0.000 n=10)
XORBytesAlignment/128Bytes1Offset      20.83n ± 0%    11.03n ± 0%  -47.05% (p=0.000 n=10)
XORBytesAlignment/128Bytes2Offset      20.83n ± 0%    11.03n ± 0%  -47.07% (p=0.000 n=10)
XORBytesAlignment/128Bytes3Offset      20.83n ± 0%    11.03n ± 0%  -47.07% (p=0.000 n=10)
XORBytesAlignment/128Bytes4Offset      20.83n ± 0%    11.03n ± 0%  -47.05% (p=0.000 n=10)
XORBytesAlignment/128Bytes5Offset      20.83n ± 0%    11.03n ± 0%  -47.05% (p=0.000 n=10)
XORBytesAlignment/128Bytes6Offset      20.83n ± 0%    11.03n ± 0%  -47.05% (p=0.000 n=10)
XORBytesAlignment/128Bytes7Offset      20.83n ± 0%    11.03n ± 0%  -47.05% (p=0.000 n=10)
XORBytesAlignment/2048Bytes0Offset    110.60n ± 0%    46.82n ± 0%  -57.67% (p=0.000 n=10)
XORBytesAlignment/2048Bytes1Offset     234.4n ± 0%    109.3n ± 0%  -53.37% (p=0.000 n=10)
XORBytesAlignment/2048Bytes2Offset     234.4n ± 0%    109.3n ± 0%  -53.37% (p=0.000 n=10)
XORBytesAlignment/2048Bytes3Offset     234.4n ± 0%    109.3n ± 0%  -53.37% (p=0.000 n=10)
XORBytesAlignment/2048Bytes4Offset     234.5n ± 0%    109.3n ± 0%  -53.39% (p=0.000 n=10)
XORBytesAlignment/2048Bytes5Offset     234.4n ± 0%    109.3n ± 0%  -53.37% (p=0.000 n=10)
XORBytesAlignment/2048Bytes6Offset     234.4n ± 0%    109.3n ± 0%  -53.37% (p=0.000 n=10)
XORBytesAlignment/2048Bytes7Offset     234.5n ± 0%    109.3n ± 0%  -53.39% (p=0.000 n=10)
geomean                                39.42n         26.00n       -34.05%

goos: linux
goarch: loong64
pkg: crypto/subtle
cpu: Loongson-3A5000 @ 2500.00MHz
                                     |  bench.old  |             bench.new              |
                                     |   sec/op    |   sec/op      vs base              |
XORBytes/8Bytes                        11.21n ± 0%    12.41n ± 1%  +10.70% (p=0.000 n=10)
XORBytes/128Bytes                      18.22n ± 0%    13.61n ± 0%  -25.30% (p=0.000 n=10)
XORBytes/2048Bytes                    162.20n ± 0%    48.46n ± 0%  -70.13% (p=0.000 n=10)
XORBytes/8192Bytes                     629.8n ± 0%    163.8n ± 0%  -73.99% (p=0.000 n=10)
XORBytes/32768Bytes                   4731.0n ± 1%    632.8n ± 0%  -86.63% (p=0.000 n=10)
XORBytesAlignment/8Bytes0Offset        11.61n ± 1%    12.42n ± 0%   +6.98% (p=0.000 n=10)
XORBytesAlignment/8Bytes1Offset        11.61n ± 0%    12.41n ± 0%   +6.89% (p=0.000 n=10)
XORBytesAlignment/8Bytes2Offset        11.61n ± 0%    12.42n ± 0%   +6.98% (p=0.000 n=10)
XORBytesAlignment/8Bytes3Offset        11.61n ± 0%    12.41n ± 0%   +6.89% (p=0.000 n=10)
XORBytesAlignment/8Bytes4Offset        11.61n ± 0%    12.42n ± 0%   +6.98% (p=0.000 n=10)
XORBytesAlignment/8Bytes5Offset        11.61n ± 0%    12.41n ± 0%   +6.89% (p=0.000 n=10)
XORBytesAlignment/8Bytes6Offset        11.61n ± 0%    12.41n ± 1%   +6.89% (p=0.000 n=10)
XORBytesAlignment/8Bytes7Offset        11.61n ± 0%    12.42n ± 0%   +6.98% (p=0.000 n=10)
XORBytesAlignment/128Bytes0Offset      17.82n ± 0%    13.62n ± 0%  -23.57% (p=0.000 n=10)
XORBytesAlignment/128Bytes1Offset      26.62n ± 0%    18.43n ± 0%  -30.78% (p=0.000 n=10)
XORBytesAlignment/128Bytes2Offset      26.64n ± 0%    18.43n ± 0%  -30.85% (p=0.000 n=10)
XORBytesAlignment/128Bytes3Offset      26.65n ± 0%    18.42n ± 0%  -30.90% (p=0.000 n=10)
XORBytesAlignment/128Bytes4Offset      26.65n ± 0%    18.42n ± 0%  -30.88% (p=0.000 n=10)
XORBytesAlignment/128Bytes5Offset      26.62n ± 0%    18.42n ± 0%  -30.82% (p=0.000 n=10)
XORBytesAlignment/128Bytes6Offset      26.63n ± 0%    18.42n ± 0%  -30.84% (p=0.000 n=10)
XORBytesAlignment/128Bytes7Offset      26.64n ± 0%    18.42n ± 0%  -30.86% (p=0.000 n=10)
XORBytesAlignment/2048Bytes0Offset    161.80n ± 0%    48.25n ± 0%  -70.18% (p=0.000 n=10)
XORBytesAlignment/2048Bytes1Offset     354.6n ± 0%    189.2n ± 0%  -46.64% (p=0.000 n=10)
XORBytesAlignment/2048Bytes2Offset     354.6n ± 0%    189.2n ± 0%  -46.64% (p=0.000 n=10)
XORBytesAlignment/2048Bytes3Offset     354.7n ± 0%    189.2n ± 0%  -46.66% (p=0.000 n=10)
XORBytesAlignment/2048Bytes4Offset     354.7n ± 0%    189.2n ± 1%  -46.66% (p=0.000 n=10)
XORBytesAlignment/2048Bytes5Offset     354.7n ± 0%    189.2n ± 0%  -46.66% (p=0.000 n=10)
XORBytesAlignment/2048Bytes6Offset     354.7n ± 0%    189.2n ± 0%  -46.66% (p=0.000 n=10)
XORBytesAlignment/2048Bytes7Offset     354.8n ± 0%    189.2n ± 0%  -46.67% (p=0.000 n=10)
geomean                                56.46n         36.46n       -35.42%

Change-Id: I66e150b132517e9ff4827abf796812ffe608c052
Reviewed-on: https://go-review.googlesource.com/c/go/+/673355
LUCI-TryBot-Result: Go LUCI
Reviewed-by: abner chenc
Reviewed-by: David Chase
Reviewed-by: Michael Knyszek
---
 src/crypto/internal/fips140/subtle/xor_asm.go |   2 +-
 .../internal/fips140/subtle/xor_loong64.go    |  39 ++
 .../internal/fips140/subtle/xor_loong64.s     | 389 ++++++++++++++----
 src/crypto/internal/fips140deps/cpu/cpu.go    |   3 +
 4 files changed, 359 insertions(+), 74 deletions(-)
 create mode 100644 src/crypto/internal/fips140/subtle/xor_loong64.go

diff --git a/src/crypto/internal/fips140/subtle/xor_asm.go b/src/crypto/internal/fips140/subtle/xor_asm.go
index 216ae7ffeb..00f3497a02 100644
--- a/src/crypto/internal/fips140/subtle/xor_asm.go
+++ b/src/crypto/internal/fips140/subtle/xor_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (amd64 || arm || arm64 || loong64 || mips || mipsle || mips64 || mips64le || ppc64 || ppc64le || riscv64) && !purego
+//go:build (amd64 || arm || arm64 || mips || mipsle || mips64 || mips64le || ppc64 || ppc64le || riscv64) && !purego
 
 package subtle
 
diff --git a/src/crypto/internal/fips140/subtle/xor_loong64.go b/src/crypto/internal/fips140/subtle/xor_loong64.go
new file mode 100644
index 0000000000..ad66824d88
--- /dev/null
+++ b/src/crypto/internal/fips140/subtle/xor_loong64.go
@@ -0,0 +1,39 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego
+
+package subtle
+
+import (
+	"crypto/internal/fips140deps/cpu"
+	"crypto/internal/impl"
+)
+
+var useLSX = cpu.LOONG64HasLSX
+var useLASX = cpu.LOONG64HasLASX
+
+func init() {
+	impl.Register("subtle", "LSX", &useLSX)
+	impl.Register("subtle", "LASX", &useLASX)
+}
+
+//go:noescape
+func xorBytesBasic(dst, a, b *byte, n int)
+
+//go:noescape
+func xorBytesLSX(dst, a, b *byte, n int)
+
+//go:noescape
+func xorBytesLASX(dst, a, b *byte, n int)
+
+func xorBytes(dst, a, b *byte, n int) {
+	if useLASX {
+		xorBytesLASX(dst, a, b, n)
+	} else if useLSX {
+		xorBytesLSX(dst, a, b, n)
+	} else {
+		xorBytesBasic(dst, a, b, n)
+	}
+}
diff --git a/src/crypto/internal/fips140/subtle/xor_loong64.s b/src/crypto/internal/fips140/subtle/xor_loong64.s
index 09dc80eb93..36c18a6277 100644
--- a/src/crypto/internal/fips140/subtle/xor_loong64.s
+++ b/src/crypto/internal/fips140/subtle/xor_loong64.s
@@ -6,30 +6,76 @@
 
 #include "textflag.h"
 
-// func xorBytes(dst, a, b *byte, n int)
-TEXT ·xorBytes(SB), NOSPLIT, $0
+#define SMALL_TAIL \
+	SGTU	$2, R7, R8; \
+	BNE	R8, xor_1; \
+	SGTU	$4, R7, R8; \
+	BNE	R8, xor_2; \
+	SGTU	$8, R7, R8; \
+	BNE	R8, xor_4; \
+	SGTU	$16, R7, R8; \
+	BNE	R8, xor_8; \
+
+#define SMALL \
+xor_8_check:; \
+	SGTU	$8, R7, R8; \
+	BNE	R8, xor_4_check; \
+xor_8:; \
+	SUBV	$8, R7; \
+	MOVV	(R5), R10; \
+	MOVV	(R6), R11; \
+	XOR	R10, R11; \
+	MOVV	R11, (R4); \
+	ADDV	$8, R5; \
+	ADDV	$8, R6; \
+	ADDV	$8, R4; \
+	BEQ	R7, R0, end; \
+xor_4_check:; \
+	SGTU	$4, R7, R8; \
+	BNE	R8, xor_2_check; \
+xor_4:; \
+	SUBV	$4, R7; \
+	MOVW	(R5), R10; \
+	MOVW	(R6), R11; \
+	XOR	R10, R11; \
+	MOVW	R11, (R4); \
+	ADDV	$4, R5; \
+	ADDV	$4, R6; \
+	ADDV	$4, R4; \
+	BEQ	R7, R0, end; \
+xor_2_check:; \
+	SGTU	$2, R7, R8; \
+	BNE	R8, xor_1; \
+xor_2:; \
+	SUBV	$2, R7; \
+	MOVH	(R5), R10; \
+	MOVH	(R6), R11; \
+	XOR	R10, R11; \
+	MOVH	R11, (R4); \
+	ADDV	$2, R5; \
+	ADDV	$2, R6; \
+	ADDV	$2, R4; \
+	BEQ	R7, R0, end; \
+xor_1:; \
+	MOVB	(R5), R10; \
+	MOVB	(R6), R11; \
+	XOR	R10, R11; \
+	MOVB	R11, (R4); \
+
+// func xorBytesBasic(dst, a, b *byte, n int)
+TEXT ·xorBytesBasic(SB), NOSPLIT, $0
 	MOVV	dst+0(FP), R4
 	MOVV	a+8(FP), R5
 	MOVV	b+16(FP), R6
 	MOVV	n+24(FP), R7
-	MOVV	$64, R9
-	BGEU	R7, R9, loop64	// n >= 64
-tail:
-	SRLV	$1, R9
-	BGEU	R7, R9, xor_32	// n >= 32 && n < 64
-	SRLV	$1, R9
-	BGEU	R7, R9, xor_16	// n >= 16 && n < 32
-	SRLV	$1, R9
-	BGEU	R7, R9, xor_8	// n >= 8 && n < 16
-	SRLV	$1, R9
-	BGEU	R7, R9, xor_4	// n >= 4 && n < 8
-	SRLV	$1, R9
-	BGEU	R7, R9, xor_2	// n >= 2 && n < 4
-	SRLV	$1, R9
-	BGEU	R7, R9, xor_1	// n = 1
-
-loop64:
+	SMALL_TAIL
+
+xor_64_check:
+	SGTU	$64, R7, R8
+	BNE	R8, xor_32_check
+xor_64_loop:
+	SUBV	$64, R7
 	MOVV	(R5), R10
 	MOVV	8(R5), R11
 	MOVV	16(R5), R12
@@ -62,18 +108,18 @@
 	MOVV	R15, 40(R4)
 	MOVV	R16, 48(R4)
 	MOVV	R17, 56(R4)
+	SGTU	$64, R7, R8
 	ADDV	$64, R5
 	ADDV	$64, R6
 	ADDV	$64, R4
-	SUBV	$64, R7
-	// 64 in R9
-	BGEU	R7, R9, loop64
-	BEQ	R7, R0, end
+	BEQ	R8, xor_64_loop
+	BEQ	R7, end
 
 xor_32_check:
-	SRLV	$1, R9
-	BLT	R7, R9, xor_16_check
+	SGTU	$32, R7, R8
+	BNE	R8, xor_16_check
 xor_32:
+	SUBV	$32, R7
 	MOVV	(R5), R10
 	MOVV	8(R5), R11
 	MOVV	16(R5), R12
@@ -93,13 +139,13 @@ xor_32:
 	ADDV	$32, R5
 	ADDV	$32, R6
 	ADDV	$32, R4
-	SUBV	$32, R7
 	BEQ	R7, R0, end
 
 xor_16_check:
-	SRLV	$1, R9
-	BLT	R7, R9, xor_8_check
+	SGTU	$16, R7, R8
+	BNE	R8, xor_8_check
 xor_16:
+	SUBV	$16, R7
 	MOVV	(R5), R10
 	MOVV	8(R5), R11
 	MOVV	(R6), R12
@@ -111,56 +157,253 @@ xor_16:
 	ADDV	$16, R5
 	ADDV	$16, R6
 	ADDV	$16, R4
-	SUBV	$16, R7
 	BEQ	R7, R0, end
 
-xor_8_check:
-	SRLV	$1, R9
-	BLT	R7, R9, xor_4_check
-xor_8:
-	MOVV	(R5), R10
-	MOVV	(R6), R11
-	XOR	R10, R11
-	MOVV	R11, (R4)
-	ADDV	$8, R5
-	ADDV	$8, R6
-	ADDV	$8, R4
-	SUBV	$8, R7
-	BEQ	R7, R0, end
+	SMALL
+end:
+	RET
 
-xor_4_check:
-	SRLV	$1, R9
-	BLT	R7, R9, xor_2_check
-xor_4:
-	MOVW	(R5), R10
-	MOVW	(R6), R11
-	XOR	R10, R11
-	MOVW	R11, (R4)
-	ADDV	$4, R5
-	ADDV	$4, R6
-	ADDV	$4, R4
-	SUBV	$4, R7
-	BEQ	R7, R0, end
+
+// func xorBytesLSX(dst, a, b *byte, n int)
+TEXT ·xorBytesLSX(SB), NOSPLIT, $0
+	MOVV	dst+0(FP), R4
+	MOVV	a+8(FP), R5
+	MOVV	b+16(FP), R6
+	MOVV	n+24(FP), R7
 
-xor_2_check:
-	SRLV	$1, R9
-	BLT	R7, R9, xor_1
-xor_2:
-	MOVH	(R5), R10
-	MOVH	(R6), R11
-	XOR	R10, R11
-	MOVH	R11, (R4)
-	ADDV	$2, R5
-	ADDV	$2, R6
-	ADDV	$2, R4
-	SUBV	$2, R7
-	BEQ	R7, R0, end
+	SMALL_TAIL
+
+xor_128_lsx_check:
+	SGTU	$128, R7, R8
+	BNE	R8, xor_64_lsx_check
+xor_128_lsx_loop:
+	SUBV	$128, R7
+	VMOVQ	(R5), V0
+	VMOVQ	16(R5), V1
+	VMOVQ	32(R5), V2
+	VMOVQ	48(R5), V3
+	VMOVQ	64(R5), V4
+	VMOVQ	80(R5), V5
+	VMOVQ	96(R5), V6
+	VMOVQ	112(R5), V7
+	VMOVQ	(R6), V8
+	VMOVQ	16(R6), V9
+	VMOVQ	32(R6), V10
+	VMOVQ	48(R6), V11
+	VMOVQ	64(R6), V12
+	VMOVQ	80(R6), V13
+	VMOVQ	96(R6), V14
+	VMOVQ	112(R6), V15
+	VXORV	V0, V8, V8
+	VXORV	V1, V9, V9
+	VXORV	V2, V10, V10
+	VXORV	V3, V11, V11
+	VXORV	V4, V12, V12
+	VXORV	V5, V13, V13
+	VXORV	V6, V14, V14
+	VXORV	V7, V15, V15
+	VMOVQ	V8, (R4)
+	VMOVQ	V9, 16(R4)
+	VMOVQ	V10, 32(R4)
+	VMOVQ	V11, 48(R4)
+	VMOVQ	V12, 64(R4)
+	VMOVQ	V13, 80(R4)
+	VMOVQ	V14, 96(R4)
+	VMOVQ	V15, 112(R4)
+	SGTU	$128, R7, R8
+	ADDV	$128, R5
+	ADDV	$128, R6
+	ADDV	$128, R4
+	BEQ	R8, xor_128_lsx_loop
+	BEQ	R7, end
+
+xor_64_lsx_check:
+	SGTU	$64, R7, R8
+	BNE	R8, xor_32_lsx_check
+xor_64_lsx:
+	SUBV	$64, R7
+	VMOVQ	(R5), V0
+	VMOVQ	16(R5), V1
+	VMOVQ	32(R5), V2
+	VMOVQ	48(R5), V3
+	VMOVQ	(R6), V4
+	VMOVQ	16(R6), V5
+	VMOVQ	32(R6), V6
+	VMOVQ	48(R6), V7
+	VXORV	V0, V4, V4
+	VXORV	V1, V5, V5
+	VXORV	V2, V6, V6
+	VXORV	V3, V7, V7
+	VMOVQ	V4, (R4)
+	VMOVQ	V5, 16(R4)
+	VMOVQ	V6, 32(R4)
+	VMOVQ	V7, 48(R4)
+	ADDV	$64, R5
+	ADDV	$64, R6
+	ADDV	$64, R4
+	BEQ	R7, end
+
+xor_32_lsx_check:
+	SGTU	$32, R7, R8
+	BNE	R8, xor_16_lsx_check
+xor_32_lsx:
+	SUBV	$32, R7
+	VMOVQ	(R5), V0
+	VMOVQ	16(R5), V1
+	VMOVQ	(R6), V2
+	VMOVQ	16(R6), V3
+	VXORV	V0, V2, V2
+	VXORV	V1, V3, V3
+	VMOVQ	V2, (R4)
+	VMOVQ	V3, 16(R4)
+	ADDV	$32, R5
+	ADDV	$32, R6
+	ADDV	$32, R4
+	BEQ	R7, end
 
-xor_1:
-	MOVB	(R5), R10
-	MOVB	(R6), R11
-	XOR	R10, R11
-	MOVB	R11, (R4)
+xor_16_lsx_check:
+	SGTU	$16, R7, R8
+	BNE	R8, xor_8_check
+xor_16_lsx:
+	SUBV	$16, R7
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VXORV	V0, V1, V1
+	VMOVQ	V1, (R4)
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R4
+	BEQ	R7, end
 
+	SMALL
 end:
 	RET
+
+// func xorBytesLASX(dst, a, b *byte, n int)
+TEXT ·xorBytesLASX(SB), NOSPLIT, $0
+	MOVV	dst+0(FP), R4
+	MOVV	a+8(FP), R5
+	MOVV	b+16(FP), R6
+	MOVV	n+24(FP), R7
+
+	SMALL_TAIL
+
+xor_256_lasx_check:
+	SGTU	$256, R7, R8
+	BNE	R8, xor_128_lasx_check
+xor_256_lasx_loop:
+	SUBV	$256, R7
+	XVMOVQ	(R5), X0
+	XVMOVQ	32(R5), X1
+	XVMOVQ	64(R5), X2
+	XVMOVQ	96(R5), X3
+	XVMOVQ	128(R5), X4
+	XVMOVQ	160(R5), X5
+	XVMOVQ	192(R5), X6
+	XVMOVQ	224(R5), X7
+	XVMOVQ	(R6), X8
+	XVMOVQ	32(R6), X9
+	XVMOVQ	64(R6), X10
+	XVMOVQ	96(R6), X11
+	XVMOVQ	128(R6), X12
+	XVMOVQ	160(R6), X13
+	XVMOVQ	192(R6), X14
+	XVMOVQ	224(R6), X15
+	XVXORV	X0, X8, X8
+	XVXORV	X1, X9, X9
+	XVXORV	X2, X10, X10
+	XVXORV	X3, X11, X11
+	XVXORV	X4, X12, X12
+	XVXORV	X5, X13, X13
+	XVXORV	X6, X14, X14
+	XVXORV	X7, X15, X15
+	XVMOVQ	X8, (R4)
+	XVMOVQ	X9, 32(R4)
+	XVMOVQ	X10, 64(R4)
+	XVMOVQ	X11, 96(R4)
+	XVMOVQ	X12, 128(R4)
+	XVMOVQ	X13, 160(R4)
+	XVMOVQ	X14, 192(R4)
+	XVMOVQ	X15, 224(R4)
+	SGTU	$256, R7, R8
+	ADDV	$256, R5
+	ADDV	$256, R6
+	ADDV	$256, R4
+	BEQ	R8, xor_256_lasx_loop
+	BEQ	R7, end
+
+xor_128_lasx_check:
+	SGTU	$128, R7, R8
+	BNE	R8, xor_64_lasx_check
+xor_128_lasx:
+	SUBV	$128, R7
+	XVMOVQ	(R5), X0
+	XVMOVQ	32(R5), X1
+	XVMOVQ	64(R5), X2
+	XVMOVQ	96(R5), X3
+	XVMOVQ	(R6), X4
+	XVMOVQ	32(R6), X5
+	XVMOVQ	64(R6), X6
+	XVMOVQ	96(R6), X7
+	XVXORV	X0, X4, X4
+	XVXORV	X1, X5, X5
+	XVXORV	X2, X6, X6
+	XVXORV	X3, X7, X7
+	XVMOVQ	X4, (R4)
+	XVMOVQ	X5, 32(R4)
+	XVMOVQ	X6, 64(R4)
+	XVMOVQ	X7, 96(R4)
+	ADDV	$128, R5
+	ADDV	$128, R6
+	ADDV	$128, R4
+	BEQ	R7, end
+
+xor_64_lasx_check:
+	SGTU	$64, R7, R8
+	BNE	R8, xor_32_lasx_check
+xor_64_lasx:
+	SUBV	$64, R7
+	XVMOVQ	(R5), X0
+	XVMOVQ	32(R5), X1
+	XVMOVQ	(R6), X2
+	XVMOVQ	32(R6), X3
+	XVXORV	X0, X2, X2
+	XVXORV	X1, X3, X3
+	XVMOVQ	X2, (R4)
+	XVMOVQ	X3, 32(R4)
+	ADDV	$64, R5
+	ADDV	$64, R6
+	ADDV	$64, R4
+	BEQ	R7, end
+
+xor_32_lasx_check:
+	SGTU	$32, R7, R8
+	BNE	R8, xor_16_lasx_check
+xor_32_lasx:
+	SUBV	$32, R7
+	XVMOVQ	(R5), X0
+	XVMOVQ	(R6), X1
+	XVXORV	X0, X1, X1
+	XVMOVQ	X1, (R4)
+	ADDV	$32, R5
+	ADDV	$32, R6
+	ADDV	$32, R4
+	BEQ	R7, end
+
+xor_16_lasx_check:
+	SGTU	$16, R7, R8
+	BNE	R8, xor_8_check
+xor_16_lasx:
+	SUBV	$16, R7
+	VMOVQ	(R5), V0
+	VMOVQ	(R6), V1
+	VXORV	V0, V1, V1
+	VMOVQ	V1, (R4)
+	ADDV	$16, R5
+	ADDV	$16, R6
+	ADDV	$16, R4
+	BEQ	R7, end
+
+	SMALL
+end:
+	RET
+
diff --git a/src/crypto/internal/fips140deps/cpu/cpu.go b/src/crypto/internal/fips140deps/cpu/cpu.go
index cc9ac0035a..311e4f541b 100644
--- a/src/crypto/internal/fips140deps/cpu/cpu.go
+++ b/src/crypto/internal/fips140deps/cpu/cpu.go
@@ -23,6 +23,9 @@ var (
 	ARM64HasSHA2   = cpu.ARM64.HasSHA2
 	ARM64HasSHA512 = cpu.ARM64.HasSHA512
 
+	LOONG64HasLSX  = cpu.Loong64.HasLSX
+	LOONG64HasLASX = cpu.Loong64.HasLASX
+
 	S390XHasAES    = cpu.S390X.HasAES
 	S390XHasAESCBC = cpu.S390X.HasAESCBC
 	S390XHasAESCTR = cpu.S390X.HasAESCTR
-- 
2.50.0
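
For readers who do not follow LoongArch assembly, the short pure-Go sketch below
(illustrative only, not part of the patch; the helper name xorBytesSketch and the
64-byte block size are invented for the example) shows the chunk-then-tail strategy
that xorBytesBasic, xorBytesLSX and xorBytesLASX implement: XOR wide blocks while
enough input remains, then fall through to progressively smaller power-of-two tails,
as the SMALL_TAIL and SMALL macros do. The real routines operate on raw *byte
pointers, use 64-, 128- or 256-byte blocks depending on register width, and are
reached by user code through crypto/subtle.XORBytes.

package main

import "fmt"

// xorBytesSketch XORs a[:n] with b[:n] into dst[:n] using the same
// chunk-then-tail strategy as the assembly above.
func xorBytesSketch(dst, a, b []byte, n int) {
	i := 0
	// Main loop: 64-byte blocks stand in for the 128-byte LSX and
	// 256-byte LASX vector blocks used by the assembly.
	for ; n-i >= 64; i += 64 {
		for j := 0; j < 64; j++ {
			dst[i+j] = a[i+j] ^ b[i+j]
		}
	}
	// Tail: at most one pass each of 32, 16, 8, 4, 2 and 1 bytes,
	// which together consume whatever remainder (< 64) is left,
	// mirroring the SMALL_TAIL/SMALL fall-through.
	for _, step := range []int{32, 16, 8, 4, 2, 1} {
		if n-i >= step {
			for j := 0; j < step; j++ {
				dst[i+j] = a[i+j] ^ b[i+j]
			}
			i += step
		}
	}
}

func main() {
	a := []byte("hello, loong64!!")
	b := []byte("0123456789abcdef")
	dst := make([]byte, len(a))
	xorBytesSketch(dst, a, b, len(a))
	fmt.Printf("%x\n", dst) // callers normally use crypto/subtle.XORBytes instead
}

The dispatch between the three assembly routines happens in xor_loong64.go, driven
by the LOONG64HasLSX and LOONG64HasLASX feature bits added to fips140deps/cpu.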