]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/subtle: add vector implementation of xorBytes for riscv64
author Joel Sing <joel@sing.id.au>
Mon, 3 Feb 2025 12:53:13 +0000 (23:53 +1100)
committer Joel Sing <joel@sing.id.au>
Tue, 27 Jan 2026 11:27:26 +0000 (03:27 -0800)
On a Banana Pi F3:

                                     │   subtle.1    │               subtle.2               │
                                     │    sec/op     │    sec/op     vs base                │
ConstantTimeSelect-8                    4.391n ±  1%   4.390n ±  0%        ~ (p=0.500 n=10)
ConstantTimeByteEq-8                    3.763n ±  0%   3.764n ±  0%        ~ (p=0.549 n=10)
ConstantTimeEq-8                        3.767n ±  1%   3.764n ±  0%   -0.08% (p=0.002 n=10)
ConstantTimeLessOrEq-8                  3.136n ±  0%   3.138n ±  0%   +0.06% (p=0.002 n=10)
XORBytes/8Bytes-8                       53.42n ±  0%   52.28n ±  0%   -2.13% (p=0.000 n=10)
XORBytes/128Bytes-8                     64.79n ±  0%   64.12n ±  0%   -1.03% (p=0.000 n=10)
XORBytes/2048Bytes-8                    479.3n ±  0%   322.0n ±  0%  -32.84% (p=0.000 n=10)
XORBytes/8192Bytes-8                    8.897µ ± 26%   7.734µ ± 12%        ~ (p=0.165 n=10)
XORBytes/32768Bytes-8                   39.17µ ± 17%   35.40µ ± 24%   -9.63% (p=0.029 n=10)
XORBytesAlignment/8Bytes0Offset-8       51.74n ±  0%   54.18n ±  0%   +4.72% (p=0.000 n=10)
XORBytesAlignment/8Bytes1Offset-8       51.51n ±  1%   53.52n ±  0%   +3.92% (p=0.000 n=10)
XORBytesAlignment/8Bytes2Offset-8       51.35n ±  1%   53.58n ±  0%   +4.34% (p=0.000 n=10)
XORBytesAlignment/8Bytes3Offset-8       50.86n ±  0%   53.56n ±  0%   +5.31% (p=0.000 n=10)
XORBytesAlignment/8Bytes4Offset-8       51.62n ±  0%   54.20n ±  0%   +4.98% (p=0.000 n=10)
XORBytesAlignment/8Bytes5Offset-8       51.42n ±  1%   53.48n ±  0%   +4.02% (p=0.000 n=10)
XORBytesAlignment/8Bytes6Offset-8       51.08n ±  1%   53.46n ±  0%   +4.67% (p=0.000 n=10)
XORBytesAlignment/8Bytes7Offset-8       50.83n ±  0%   53.54n ±  0%   +5.33% (p=0.000 n=10)
XORBytesAlignment/128Bytes0Offset-8     63.67n ±  0%   66.04n ±  0%   +3.72% (p=0.000 n=10)
XORBytesAlignment/128Bytes1Offset-8    114.40n ±  0%   67.42n ±  0%  -41.07% (p=0.000 n=10)
XORBytesAlignment/128Bytes2Offset-8    113.85n ±  0%   67.43n ±  0%  -40.78% (p=0.000 n=10)
XORBytesAlignment/128Bytes3Offset-8    114.60n ±  0%   67.31n ±  0%  -41.27% (p=0.000 n=10)
XORBytesAlignment/128Bytes4Offset-8    109.30n ±  0%   67.45n ±  0%  -38.29% (p=0.000 n=10)
XORBytesAlignment/128Bytes5Offset-8    110.70n ±  0%   67.32n ±  1%  -39.19% (p=0.000 n=10)
XORBytesAlignment/128Bytes6Offset-8    110.05n ±  0%   67.45n ±  1%  -38.71% (p=0.000 n=10)
XORBytesAlignment/128Bytes7Offset-8    110.60n ±  0%   67.43n ±  0%  -39.04% (p=0.000 n=10)
XORBytesAlignment/2048Bytes0Offset-8    478.4n ±  0%   335.6n ±  0%  -29.85% (p=0.000 n=10)
XORBytesAlignment/2048Bytes1Offset-8    529.7n ±  0%   349.3n ±  0%  -34.05% (p=0.000 n=10)
XORBytesAlignment/2048Bytes2Offset-8    529.3n ±  0%   349.8n ±  0%  -33.91% (p=0.000 n=10)
XORBytesAlignment/2048Bytes3Offset-8    529.8n ±  0%   349.5n ±  0%  -34.02% (p=0.000 n=10)
XORBytesAlignment/2048Bytes4Offset-8    524.7n ±  0%   349.6n ±  0%  -33.38% (p=0.000 n=10)
XORBytesAlignment/2048Bytes5Offset-8    525.9n ±  0%   349.6n ±  0%  -33.52% (p=0.000 n=10)
XORBytesAlignment/2048Bytes6Offset-8    525.1n ±  0%   349.8n ±  0%  -33.39% (p=0.000 n=10)
XORBytesAlignment/2048Bytes7Offset-8    526.0n ±  0%   349.8n ±  0%  -33.51% (p=0.000 n=10)
geomean                                 120.0n         96.92n        -19.23%

                                     │   subtle.1    │                subtle.2                │
                                     │      B/s      │      B/s        vs base                │
XORBytes/8Bytes-8                      142.8Mi ±  0%    145.9Mi ±  0%   +2.19% (p=0.000 n=10)
XORBytes/128Bytes-8                    1.840Gi ±  0%    1.859Gi ±  0%   +1.05% (p=0.000 n=10)
XORBytes/2048Bytes-8                   3.979Gi ±  0%    5.925Gi ±  0%  +48.89% (p=0.000 n=10)
XORBytes/8192Bytes-8                   879.1Mi ± 35%   1010.2Mi ± 13%        ~ (p=0.165 n=10)
XORBytes/32768Bytes-8                  797.9Mi ± 21%    882.8Mi ± 31%  +10.64% (p=0.029 n=10)
XORBytesAlignment/8Bytes0Offset-8      147.5Mi ±  0%    140.8Mi ±  0%   -4.50% (p=0.000 n=10)
XORBytesAlignment/8Bytes1Offset-8      148.1Mi ±  1%    142.5Mi ±  0%   -3.77% (p=0.000 n=10)
XORBytesAlignment/8Bytes2Offset-8      148.6Mi ±  1%    142.4Mi ±  0%   -4.15% (p=0.000 n=10)
XORBytesAlignment/8Bytes3Offset-8      150.0Mi ±  0%    142.4Mi ±  0%   -5.04% (p=0.000 n=10)
XORBytesAlignment/8Bytes4Offset-8      147.8Mi ±  0%    140.8Mi ±  0%   -4.75% (p=0.000 n=10)
XORBytesAlignment/8Bytes5Offset-8      148.4Mi ±  1%    142.6Mi ±  0%   -3.87% (p=0.000 n=10)
XORBytesAlignment/8Bytes6Offset-8      149.4Mi ±  1%    142.7Mi ±  0%   -4.45% (p=0.000 n=10)
XORBytesAlignment/8Bytes7Offset-8      150.1Mi ±  0%    142.5Mi ±  0%   -5.05% (p=0.000 n=10)
XORBytesAlignment/128Bytes0Offset-8    1.872Gi ±  0%    1.805Gi ±  0%   -3.59% (p=0.000 n=10)
XORBytesAlignment/128Bytes1Offset-8    1.042Gi ±  0%    1.768Gi ±  0%  +69.65% (p=0.000 n=10)
XORBytesAlignment/128Bytes2Offset-8    1.047Gi ±  0%    1.768Gi ±  0%  +68.80% (p=0.000 n=10)
XORBytesAlignment/128Bytes3Offset-8    1.040Gi ±  0%    1.771Gi ±  0%  +70.27% (p=0.000 n=10)
XORBytesAlignment/128Bytes4Offset-8    1.090Gi ±  0%    1.767Gi ±  0%  +62.08% (p=0.000 n=10)
XORBytesAlignment/128Bytes5Offset-8    1.077Gi ±  0%    1.771Gi ±  1%  +64.41% (p=0.000 n=10)
XORBytesAlignment/128Bytes6Offset-8    1.083Gi ±  0%    1.767Gi ±  1%  +63.17% (p=0.000 n=10)
XORBytesAlignment/128Bytes7Offset-8    1.078Gi ±  0%    1.768Gi ±  0%  +64.07% (p=0.000 n=10)
XORBytesAlignment/2048Bytes0Offset-8   3.987Gi ±  0%    5.684Gi ±  0%  +42.55% (p=0.000 n=10)
XORBytesAlignment/2048Bytes1Offset-8   3.601Gi ±  0%    5.459Gi ±  0%  +51.61% (p=0.000 n=10)
XORBytesAlignment/2048Bytes2Offset-8   3.604Gi ±  0%    5.453Gi ±  0%  +51.31% (p=0.000 n=10)
XORBytesAlignment/2048Bytes3Offset-8   3.600Gi ±  0%    5.457Gi ±  0%  +51.56% (p=0.000 n=10)
XORBytesAlignment/2048Bytes4Offset-8   3.635Gi ±  0%    5.456Gi ±  0%  +50.10% (p=0.000 n=10)
XORBytesAlignment/2048Bytes5Offset-8   3.627Gi ±  0%    5.455Gi ±  0%  +50.39% (p=0.000 n=10)
XORBytesAlignment/2048Bytes6Offset-8   3.632Gi ±  0%    5.454Gi ±  0%  +50.14% (p=0.000 n=10)
XORBytesAlignment/2048Bytes7Offset-8   3.626Gi ±  0%    5.453Gi ±  0%  +50.39% (p=0.000 n=10)
geomean                                881.0Mi          1.097Gi        +27.51%

Change-Id: Id7f9d87fe1ea39aa91ea7d3fd1ba20737f0dda3c
Reviewed-on: https://go-review.googlesource.com/c/go/+/649657
Reviewed-by: Julian Zhu <jz531210@gmail.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
src/crypto/internal/fips140/subtle/xor_asm.go
src/crypto/internal/fips140/subtle/xor_riscv64.go [new file with mode: 0644]
src/crypto/internal/fips140/subtle/xor_riscv64.s
src/crypto/internal/fips140deps/cpu/cpu.go

index bb85aefef4013e248bb05363450fc338a5fe6350..d8dcb99ba4b513f214b04fe36c26f98365721959 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (amd64 || arm64 || ppc64 || ppc64le || riscv64) && !purego
+//go:build (amd64 || arm64 || ppc64 || ppc64le) && !purego
 
 package subtle
 
diff --git a/src/crypto/internal/fips140/subtle/xor_riscv64.go b/src/crypto/internal/fips140/subtle/xor_riscv64.go
new file mode 100644 (file)
index 0000000..7bec992
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build riscv64 && !purego
+
+package subtle
+
+import (
+       "crypto/internal/fips140deps/cpu"
+)
+
+// xorBytesRISCV64 is implemented in assembly (xor_riscv64.s). It takes
+// the same dst, a, b and n arguments as xorBytes, plus hasV, which the
+// assembly tests before entering its vector loop.
+//
+//go:noescape
+func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
+
+// xorBytes forwards its arguments to xorBytesRISCV64, passing
+// cpu.RISCV64HasV as the hasV argument.
+func xorBytes(dst, a, b *byte, n int) {
+       xorBytesRISCV64(dst, a, b, n, cpu.RISCV64HasV)
+}
index b5fa5dcef45e8285327453b66cf54da95d05162b..34331f940445a0e42b1cac629616235bec43e6db 100644 (file)
@@ -4,10 +4,12 @@
 
 //go:build !purego
 
+#include "asm_riscv64.h"
+#include "go_asm.h"
 #include "textflag.h"
 
-// func xorBytes(dst, a, b *byte, n int)
-TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
+// func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
+TEXT ·xorBytesRISCV64(SB), NOSPLIT|NOFRAME, $0
        MOV     dst+0(FP), X10
        MOV     a+8(FP), X11
        MOV     b+16(FP), X12
@@ -16,6 +18,44 @@ TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
        MOV     $32, X15
        BLT     X13, X15, loop4_check
 
+#ifndef hasV
+       // Unless hasV is defined by the preprocessor, test the hasV
+       // argument and use the scalar code when it is false.
+       MOVB    hasV+32(FP), X5
+       BEQZ    X5, xorbytes_scalar
+#endif
+
+       // Use vector if dst (X10), a (X11) or b (X12) is not 8 byte aligned.
+       // All three pointers are checked, since the branch to loop64_check
+       // below bypasses the alignment check at xorbytes_scalar.
+       OR      X10, X11, X5
+       OR      X12, X5
+       AND     $7, X5
+       BNEZ    X5, vector_loop
+
+       // Use scalar if 8 byte aligned and <= 64 bytes. X13 is the length
+       // (compared against 32 above and counted down in vector_loop).
+       SUB     $64, X13, X6
+       BLEZ    X6, loop64_check
+
+       PCALIGN $16
+vector_loop:
+       // XOR X15 bytes per iteration, where X15 is the vector length
+       // set by VSETVLI for the X13 bytes remaining. Tail/mask agnostic
+       // is sufficient as only the active elements are stored.
+       VSETVLI X13, E8, M8, TA, MA, X15
+       VLE8V   (X11), V8
+       VLE8V   (X12), V16
+       VXORVV  V8, V16, V24
+       VSE8V   V24, (X10)
+       ADD     X15, X10
+       ADD     X15, X11
+       ADD     X15, X12
+       SUB     X15, X13
+       BNEZ    X13, vector_loop
+       RET
+
+xorbytes_scalar:
        // Check alignment - if alignment differs we have to do one byte at a time.
        AND     $7, X10, X5
        AND     $7, X11, X6
index 2dfcc1a4d4aae3581e11d6275f704c47f7a2a715..be75431a14092e6f11e0c04b40a36c2625242b08 100644 (file)
@@ -27,6 +27,8 @@ var (
        LOONG64HasLSX  = cpu.Loong64.HasLSX
        LOONG64HasLASX = cpu.Loong64.HasLASX
 
+       RISCV64HasV = cpu.RISCV64.HasV
+
        S390XHasAES    = cpu.S390X.HasAES
        S390XHasAESCBC = cpu.S390X.HasAESCBC
        S390XHasAESCTR = cpu.S390X.HasAESCTR