]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: vector implementation of compare for riscv64
authorJoel Sing <joel@sing.id.au>
Wed, 12 Feb 2025 12:41:35 +0000 (23:41 +1100)
committerJoel Sing <joel@sing.id.au>
Fri, 8 Aug 2025 08:35:29 +0000 (01:35 -0700)
Provide a vector implementation of compare for riscv64, which is used
when compiled with the rva23u64 profile, or when vector is detected
to be available. Inputs that are 8 byte aligned will still be handled
via the non-vector code if the length is less than or equal to 128
bytes.

On a Banana Pi F3, with GORISCV64=rva23u64:

                                        │  compare.1   │              compare.2              │
                                        │    sec/op    │   sec/op     vs base                │
BytesCompare/1-8                           24.36n ± 0%   24.15n ± 0%   -0.84% (p=0.007 n=10)
BytesCompare/2-8                           26.75n ± 0%   26.97n ± 0%   +0.82% (p=0.000 n=10)
BytesCompare/4-8                           27.63n ± 0%   27.80n ± 0%   +0.60% (p=0.001 n=10)
BytesCompare/8-8                           35.91n ± 0%   35.19n ± 0%   -2.01% (p=0.000 n=10)
BytesCompare/16-8                          53.22n ± 0%   24.04n ± 1%  -54.82% (p=0.000 n=10)
BytesCompare/32-8                          25.12n ± 0%   26.09n ± 1%   +3.86% (p=0.000 n=10)
BytesCompare/64-8                          32.52n ± 0%   33.43n ± 1%   +2.78% (p=0.000 n=10)
BytesCompare/128-8                         46.59n ± 0%   48.22n ± 1%   +3.50% (p=0.000 n=10)
BytesCompare/256-8                         74.25n ± 0%   50.18n ± 0%  -32.42% (p=0.000 n=10)
BytesCompare/512-8                        129.85n ± 0%   83.12n ± 0%  -35.98% (p=0.000 n=10)
BytesCompare/1024-8                        244.6n ± 0%   148.0n ± 1%  -39.49% (p=0.000 n=10)
BytesCompare/2048-8                        465.9n ± 0%   282.8n ± 2%  -39.30% (p=0.000 n=10)
CompareBytesEqual-8                        51.96n ± 0%   52.90n ± 1%   +1.80% (p=0.000 n=10)
CompareBytesToNil-8                        15.77n ± 1%   15.68n ± 0%   -0.57% (p=0.000 n=10)
CompareBytesEmpty-8                        14.21n ± 1%   14.20n ± 1%        ~ (p=1.000 n=10)
CompareBytesIdentical-8                    14.20n ± 1%   15.07n ± 1%   +6.20% (p=0.000 n=10)
CompareBytesSameLength-8                   31.38n ± 0%   30.52n ± 0%   -2.74% (p=0.000 n=10)
CompareBytesDifferentLength-8              31.38n ± 0%   30.53n ± 0%   -2.71% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1-8       2401.0µ ± 0%   437.6µ ± 0%  -81.77% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2-8       2376.8µ ± 0%   437.4µ ± 0%  -81.60% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3-8       2384.1µ ± 0%   437.5µ ± 0%  -81.65% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4-8       2377.7µ ± 0%   437.4µ ± 0%  -81.60% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5-8       2366.3µ ± 0%   437.5µ ± 0%  -81.51% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6-8       2357.3µ ± 0%   437.3µ ± 0%  -81.45% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7-8       2385.3µ ± 0%   437.6µ ± 0%  -81.65% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0-8    447.2µ ± 0%   464.8µ ± 0%   +3.94% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1-8    447.7µ ± 0%   453.1µ ± 0%   +1.20% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2-8    447.9µ ± 0%   453.0µ ± 0%   +1.15% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3-8    448.0µ ± 0%   452.5µ ± 0%   +1.02% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4-8    448.0µ ± 0%   452.1µ ± 0%   +0.92% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5-8    447.8µ ± 0%   452.8µ ± 0%   +1.12% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6-8    447.9µ ± 0%   452.4µ ± 0%   +1.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7-8    447.9µ ± 0%   452.8µ ± 0%   +1.09% (p=0.000 n=10)
CompareBytesBig-8                          441.2µ ± 0%   461.8µ ± 0%   +4.66% (p=0.000 n=10)
CompareBytesBigIdentical-8                 13.81n ± 0%   13.80n ± 0%        ~ (p=0.519 n=10)
geomean                                    3.980µ        2.651µ       -33.40%

                                        │  compare.1   │               compare.2                │
                                        │     B/s      │      B/s       vs base                 │
CompareBytesBigUnaligned/offset=1-8       416.5Mi ± 0%   2285.1Mi ± 0%  +448.64% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2-8       420.7Mi ± 0%   2286.4Mi ± 0%  +443.43% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3-8       419.5Mi ± 0%   2285.9Mi ± 0%  +444.97% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4-8       420.6Mi ± 0%   2286.1Mi ± 0%  +443.57% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5-8       422.6Mi ± 0%   2285.7Mi ± 0%  +440.86% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6-8       424.2Mi ± 0%   2286.8Mi ± 0%  +439.07% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7-8       419.2Mi ± 0%   2285.2Mi ± 0%  +445.07% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0-8   2.184Gi ± 0%    2.101Gi ± 0%    -3.79% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1-8   2.181Gi ± 0%    2.155Gi ± 0%    -1.18% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2-8   2.180Gi ± 0%    2.156Gi ± 0%    -1.13% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3-8   2.180Gi ± 0%    2.158Gi ± 0%    -1.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4-8   2.180Gi ± 0%    2.160Gi ± 0%    -0.91% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5-8   2.181Gi ± 0%    2.157Gi ± 0%    -1.11% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6-8   2.181Gi ± 0%    2.159Gi ± 0%    -1.00% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7-8   2.180Gi ± 0%    2.157Gi ± 0%    -1.08% (p=0.000 n=10)
CompareBytesBig-8                         2.213Gi ± 0%    2.115Gi ± 0%    -4.45% (p=0.000 n=10)
CompareBytesBigIdentical-8                69.06Ti ± 0%    69.09Ti ± 0%         ~ (p=0.315 n=10)
geomean                                   2.022Gi         4.022Gi        +98.95%

Change-Id: Id3012faf8d353eb1be0e1fb01b78ac43fa4c7e8b
Reviewed-on: https://go-review.googlesource.com/c/go/+/646737
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
src/internal/bytealg/compare_riscv64.s

index 6388fcd2095dda2a8cb3ef31eb095f635f827411..3b1523dfbf7f3bed8e3eee673cb0b2eba8b90132 100644 (file)
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include "asm_riscv64.h"
 #include "go_asm.h"
 #include "textflag.h"
 
@@ -35,6 +36,46 @@ TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
        MIN     X11, X13, X5
        BEQZ    X5, cmp_len
 
+       MOV     $16, X6
+       BLT     X5, X6, check8_unaligned
+
+#ifndef hasV
+       MOVB    internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X6
+       BEQZ    X6, compare_scalar
+#endif
+
+       // Use vector if not 8 byte aligned.
+       OR      X10, X12, X6
+       AND     $7, X6
+       BNEZ    X6, vector_loop
+
+       // Use scalar if 8 byte aligned and <= 128 bytes.
+       SUB     $128, X5, X6
+       BLEZ    X6, compare_scalar_aligned
+
+       PCALIGN $16
+vector_loop:
+       VSETVLI X5, E8, M8, TA, MA, X6
+       VLE8V   (X10), V8
+       VLE8V   (X12), V16
+       VMSNEVV V8, V16, V0
+       VFIRSTM V0, X7
+       BGEZ    X7, vector_not_eq
+       ADD     X6, X10
+       ADD     X6, X12
+       SUB     X6, X5
+       BNEZ    X5, vector_loop
+       JMP     cmp_len
+
+vector_not_eq:
+       // Load first differing bytes in X8/X9.
+       ADD     X7, X10
+       ADD     X7, X12
+       MOVBU   (X10), X8
+       MOVBU   (X12), X9
+       JMP     cmp
+
+compare_scalar:
        MOV     $32, X6
        BLT     X5, X6, check8_unaligned
 
@@ -57,9 +98,9 @@ align:
        ADD     $1, X12
        BNEZ    X7, align
 
-check32:
-       // X6 contains $32
-       BLT     X5, X6, compare16
+compare_scalar_aligned:
+       MOV     $32, X6
+       BLT     X5, X6, check16
 compare32:
        MOV     0(X10), X15
        MOV     0(X12), X16