From 75ea2d05c01903a69dbdcd15e64b934da73c84ea Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Wed, 12 Feb 2025 23:41:22 +1100 Subject: [PATCH] internal/bytealg: vector implementation of equal for riscv64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Provide a vector implementation of equal for riscv64, which is used when compiled with the rva23u64 profile, or when vector is detected to be available. Inputs that are 8 byte aligned will still be handled via the non-vector code if the length is less than or equal to 64 bytes. On a Banana Pi F3, with GORISCV64=rva23u64: │ equal.1 │ equal.2 │ │ sec/op │ sec/op vs base │ Equal/0-8 1.254n ± 0% 1.254n ± 0% ~ (p=1.000 n=10) Equal/same/1-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.466 n=10) Equal/same/6-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.689 n=10) Equal/same/9-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.861 n=10) Equal/same/15-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.657 n=10) Equal/same/16-8 21.32n ± 0% 21.33n ± 0% ~ (p=0.075 n=10) Equal/same/20-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.249 n=10) Equal/same/32-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.303 n=10) Equal/same/4K-8 21.32n ± 0% 21.32n ± 0% ~ (p=1.000 n=10) Equal/same/4M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.582 n=10) Equal/same/64M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.930 n=10) Equal/1-8 39.16n ± 1% 38.71n ± 0% -1.15% (p=0.000 n=10) Equal/6-8 51.49n ± 1% 50.40n ± 1% -2.12% (p=0.000 n=10) Equal/9-8 54.46n ± 1% 53.89n ± 0% -1.04% (p=0.000 n=10) Equal/15-8 71.81n ± 1% 70.59n ± 0% -1.71% (p=0.000 n=10) Equal/16-8 69.14n ± 0% 68.21n ± 0% -1.34% (p=0.000 n=10) Equal/20-8 78.59n ± 0% 77.59n ± 0% -1.26% (p=0.000 n=10) Equal/32-8 41.55n ± 0% 41.16n ± 0% -0.96% (p=0.000 n=10) Equal/4K-8 925.5n ± 0% 561.4n ± 1% -39.34% (p=0.000 n=10) Equal/4M-8 3.110m ± 32% 2.463m ± 16% -20.80% (p=0.000 n=10) Equal/64M-8 47.34m ± 30% 39.89m ± 16% -15.75% (p=0.004 n=10) EqualBothUnaligned/64_0-8 32.17n ± 1% 32.11n ± 1% ~ (p=0.184 n=10) EqualBothUnaligned/64_1-8 79.48n ± 0% 48.24n ± 1% -39.31% (p=0.000 
n=10) EqualBothUnaligned/64_4-8 72.71n ± 0% 48.37n ± 1% -33.48% (p=0.000 n=10) EqualBothUnaligned/64_7-8 77.12n ± 0% 48.16n ± 1% -37.56% (p=0.000 n=10) EqualBothUnaligned/4096_0-8 908.4n ± 0% 562.4n ± 2% -38.09% (p=0.000 n=10) EqualBothUnaligned/4096_1-8 956.6n ± 0% 571.4n ± 3% -40.26% (p=0.000 n=10) EqualBothUnaligned/4096_4-8 949.6n ± 0% 571.6n ± 3% -39.81% (p=0.000 n=10) EqualBothUnaligned/4096_7-8 954.2n ± 0% 571.7n ± 3% -40.09% (p=0.000 n=10) EqualBothUnaligned/4194304_0-8 2.935m ± 29% 2.664m ± 19% ~ (p=0.089 n=10) EqualBothUnaligned/4194304_1-8 3.341m ± 13% 2.896m ± 34% ~ (p=0.075 n=10) EqualBothUnaligned/4194304_4-8 3.204m ± 39% 3.352m ± 33% ~ (p=0.796 n=10) EqualBothUnaligned/4194304_7-8 3.226m ± 30% 2.737m ± 34% -15.16% (p=0.043 n=10) EqualBothUnaligned/67108864_0-8 49.04m ± 17% 39.94m ± 12% -18.57% (p=0.005 n=10) EqualBothUnaligned/67108864_1-8 51.96m ± 15% 42.48m ± 15% -18.23% (p=0.015 n=10) EqualBothUnaligned/67108864_4-8 47.67m ± 17% 37.85m ± 41% -20.61% (p=0.035 n=10) EqualBothUnaligned/67108864_7-8 53.00m ± 22% 38.76m ± 21% -26.87% (p=0.000 n=10) CompareBytesEqual-8 51.71n ± 1% 52.00n ± 0% +0.57% (p=0.002 n=10) geomean 1.469µ 1.265µ -13.93% │ equal.1 │ equal.2 │ │ B/s │ B/s vs base │ Equal/same/1-8 44.73Mi ± 0% 44.72Mi ± 0% ~ (p=0.426 n=10) Equal/same/6-8 268.3Mi ± 0% 268.4Mi ± 0% ~ (p=0.753 n=10) Equal/same/9-8 402.6Mi ± 0% 402.5Mi ± 0% ~ (p=0.209 n=10) Equal/same/15-8 670.9Mi ± 0% 670.9Mi ± 0% ~ (p=0.724 n=10) Equal/same/16-8 715.6Mi ± 0% 715.4Mi ± 0% -0.04% (p=0.022 n=10) Equal/same/20-8 894.6Mi ± 0% 894.5Mi ± 0% ~ (p=0.060 n=10) Equal/same/32-8 1.398Gi ± 0% 1.398Gi ± 0% ~ (p=0.986 n=10) Equal/same/4K-8 178.9Gi ± 0% 178.9Gi ± 0% ~ (p=0.853 n=10) Equal/same/4M-8 178.9Ti ± 0% 178.9Ti ± 0% ~ (p=0.971 n=10) Equal/same/64M-8 2862.8Ti ± 0% 2862.6Ti ± 0% ~ (p=0.971 n=10) Equal/1-8 24.35Mi ± 1% 24.63Mi ± 0% +1.16% (p=0.000 n=10) Equal/6-8 111.1Mi ± 1% 113.5Mi ± 1% +2.17% (p=0.000 n=10) Equal/9-8 157.6Mi ± 1% 159.3Mi ± 0% +1.05% (p=0.000 n=10) Equal/15-8 
199.2Mi ± 1% 202.7Mi ± 0% +1.74% (p=0.000 n=10) Equal/16-8 220.7Mi ± 0% 223.7Mi ± 0% +1.36% (p=0.000 n=10) Equal/20-8 242.7Mi ± 0% 245.8Mi ± 0% +1.27% (p=0.000 n=10) Equal/32-8 734.3Mi ± 0% 741.6Mi ± 0% +0.98% (p=0.000 n=10) Equal/4K-8 4.122Gi ± 0% 6.795Gi ± 1% +64.84% (p=0.000 n=10) Equal/4M-8 1.258Gi ± 24% 1.586Gi ± 14% +26.12% (p=0.000 n=10) Equal/64M-8 1.320Gi ± 23% 1.567Gi ± 14% +18.69% (p=0.004 n=10) EqualBothUnaligned/64_0-8 1.853Gi ± 1% 1.856Gi ± 1% ~ (p=0.190 n=10) EqualBothUnaligned/64_1-8 767.9Mi ± 0% 1265.2Mi ± 1% +64.76% (p=0.000 n=10) EqualBothUnaligned/64_4-8 839.4Mi ± 0% 1261.9Mi ± 1% +50.33% (p=0.000 n=10) EqualBothUnaligned/64_7-8 791.4Mi ± 0% 1267.5Mi ± 1% +60.16% (p=0.000 n=10) EqualBothUnaligned/4096_0-8 4.199Gi ± 0% 6.784Gi ± 2% +61.54% (p=0.000 n=10) EqualBothUnaligned/4096_1-8 3.988Gi ± 0% 6.676Gi ± 3% +67.40% (p=0.000 n=10) EqualBothUnaligned/4096_4-8 4.017Gi ± 0% 6.674Gi ± 3% +66.14% (p=0.000 n=10) EqualBothUnaligned/4096_7-8 3.998Gi ± 0% 6.673Gi ± 3% +66.92% (p=0.000 n=10) EqualBothUnaligned/4194304_0-8 1.332Gi ± 22% 1.468Gi ± 16% ~ (p=0.089 n=10) EqualBothUnaligned/4194304_1-8 1.169Gi ± 12% 1.350Gi ± 25% ~ (p=0.075 n=10) EqualBothUnaligned/4194304_4-8 1.222Gi ± 28% 1.165Gi ± 48% ~ (p=0.796 n=10) EqualBothUnaligned/4194304_7-8 1.211Gi ± 23% 1.427Gi ± 26% +17.88% (p=0.043 n=10) EqualBothUnaligned/67108864_0-8 1.274Gi ± 14% 1.567Gi ± 14% +22.97% (p=0.005 n=10) EqualBothUnaligned/67108864_1-8 1.204Gi ± 14% 1.471Gi ± 13% +22.18% (p=0.015 n=10) EqualBothUnaligned/67108864_4-8 1.311Gi ± 14% 1.651Gi ± 29% +25.92% (p=0.035 n=10) EqualBothUnaligned/67108864_7-8 1.179Gi ± 18% 1.612Gi ± 17% +36.73% (p=0.000 n=10) geomean 1.870Gi 2.190Gi +17.16% Change-Id: I9c5270bcc6997d020a96d1e97c7e7cfc7ca7fd34 Reviewed-on: https://go-review.googlesource.com/c/go/+/646736 Reviewed-by: Mark Ryan Reviewed-by: Meng Zhuo LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov Reviewed-by: Mark Freeman --- src/internal/bytealg/bytealg.go | 10 ++++++---- 
src/internal/bytealg/equal_riscv64.s | 30 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go index 711df74baf..319ea54ba3 100644 --- a/src/internal/bytealg/bytealg.go +++ b/src/internal/bytealg/bytealg.go @@ -11,16 +11,18 @@ import ( // Offsets into internal/cpu records for use in assembly. const ( - offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) - offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) - offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) + offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) + + offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV) offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX) offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX) - offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) + offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) + offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) ) // MaxLen is the maximum length of the string to be searched for (argument b) in Index. diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s index 87b2d79302..58e033f847 100644 --- a/src/internal/bytealg/equal_riscv64.s +++ b/src/internal/bytealg/equal_riscv64.s @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include "asm_riscv64.h" #include "go_asm.h" #include "textflag.h" @@ -28,6 +29,35 @@ length_check: MOV $32, X23 BLT X12, X23, loop4_check +#ifndef hasV + MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5 + BEQZ X5, equal_scalar +#endif + + // Use vector if not 8 byte aligned. + OR X10, X11, X5 + AND $7, X5 + BNEZ X5, vector_loop + + // Use scalar if 8 byte aligned and <= 64 bytes. 
+ SUB $64, X12, X6 + BLEZ X6, loop32_check + + PCALIGN $16 +vector_loop: + VSETVLI X12, E8, M8, TA, MA, X5 + VLE8V (X10), V8 + VLE8V (X11), V16 + VMSNEVV V8, V16, V0 + VFIRSTM V0, X6 + BGEZ X6, done + ADD X5, X10 + ADD X5, X11 + SUB X5, X12 + BNEZ X12, vector_loop + JMP done + +equal_scalar: // Check alignment - if alignment differs we have to do one byte at a time. AND $7, X10, X9 AND $7, X11, X19 -- 2.51.0