From a4772a1a5974bbf5a5bc8ba84351062ae2d4cacd Mon Sep 17 00:00:00 2001
From: Mark Ryan
Date: Fri, 19 May 2023 14:00:10 +0200
Subject: [PATCH] runtime: fix alignment code in memclr_riscv64.s
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The existing code incorrectly determines whether the pointer passed to
memclrNoHeapPointers is 8-byte aligned (it currently checks whether it
is 4-byte aligned). In addition, the code that aligns the pointer, by
individually filling the first few bytes of the buffer with zeros, is
also incorrect. It adjusts the pointer by the wrong number of bytes,
resulting, in most cases, in an unaligned pointer.

This commit fixes both of these issues by ANDing the pointer with 7
rather than 3 to determine its alignment, and by individually filling
the first (8 - (pointer & 7)) bytes with 0 to align the buffer, rather
than the first (pointer & 3) bytes. We also remove an unnecessary
immediate MOV instruction.
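
To make the arithmetic concrete, here is a minimal Go sketch of the
head-byte computation; alignHead is a hypothetical helper used only for
illustration, not a function in the runtime:

	package main

	import "fmt"

	// alignHead mirrors the fixed logic: given a pointer value p, it
	// returns how many leading bytes must be zeroed one at a time
	// before p becomes 8-byte aligned.
	func alignHead(p uintptr) uintptr {
		mis := p & 7 // the buggy code computed p & 3 here
		if mis == 0 {
			return 0 // already 8-byte aligned, nothing to clear
		}
		return 8 - mis // the buggy code cleared p & 3 bytes, usually leaving p unaligned
	}

	func main() {
		for _, p := range []uintptr{0x1000, 0x1001, 0x1004, 0x1007} {
			n := alignHead(p)
			fmt.Printf("p=%#x: clear %d head byte(s), then %#x is 8-byte aligned\n", p, n, p+n)
		}
	}

For p=0x1001, for example, the old code cleared only one byte
(p & 3 = 1), leaving the pointer at the unaligned address 0x1002,
whereas the new code clears seven bytes, leaving it at 0x1008.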
A new benchmark is added to test the performance of memclrNoHeapPointers
on unaligned pointers. Results of the existing and new benchmarks on a
SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04 are
presented below.

Memclr/5-4  21.98n ± 7%  22.66n ± 9%  ~ (p=0.079 n=10)
Memclr/16-4  20.85n ± 3%  21.09n ± 5%  ~ (p=0.796 n=10)
Memclr/64-4  28.20n ± 4%  27.50n ± 3%  ~ (p=0.093 n=10)
Memclr/256-4  53.66n ± 8%  53.44n ± 8%  ~ (p=0.280 n=10)
Memclr/4096-4  522.6n ± 1%  523.4n ± 1%  ~ (p=0.240 n=10)
Memclr/65536-4  24.17µ ± 0%  24.13µ ± 0%  -0.19% (p=0.029 n=10)
Memclr/1M-4  446.9µ ± 0%  446.9µ ± 0%  ~ (p=0.684 n=10)
Memclr/4M-4  12.69m ± 2%  12.79m ± 3%  +0.78% (p=0.043 n=10)
Memclr/8M-4  29.75m ± 0%  29.76m ± 0%  +0.03% (p=0.015 n=10)
Memclr/16M-4  60.34m ± 0%  60.32m ± 0%  ~ (p=0.247 n=10)
Memclr/64M-4  241.2m ± 0%  241.3m ± 0%  ~ (p=0.247 n=10)
MemclrUnaligned/0_5-4  27.71n ± 0%  27.72n ± 1%  ~ (p=0.142 n=10)
MemclrUnaligned/0_16-4  26.95n ± 0%  26.04n ± 0%  -3.38% (p=0.000 n=10)
MemclrUnaligned/0_64-4  38.27n ± 4%  40.15n ± 6%  +4.89% (p=0.005 n=10)
MemclrUnaligned/0_256-4  63.95n ± 3%  64.19n ± 2%  ~ (p=0.971 n=10)
MemclrUnaligned/0_4096-4  532.6n ± 1%  530.9n ± 1%  ~ (p=0.324 n=10)
MemclrUnaligned/0_65536-4  24.30µ ± 0%  24.22µ ± 0%  -0.32% (p=0.023 n=10)
MemclrUnaligned/1_5-4  29.40n ± 0%  29.39n ± 0%  ~ (p=0.060 n=10)
MemclrUnaligned/1_16-4  632.65n ± 1%  63.80n ± 2%  -89.92% (p=0.000 n=10)
MemclrUnaligned/1_64-4  4091.00n ± 1%  73.23n ± 1%  -98.21% (p=0.000 n=10)
MemclrUnaligned/1_256-4  17803.50n ± 1%  92.03n ± 1%  -99.48% (p=0.000 n=10)
MemclrUnaligned/1_4096-4  294150.0n ± 1%  561.9n ± 1%  -99.81% (p=0.000 n=10)
MemclrUnaligned/1_65536-4  4692.80µ ± 1%  24.44µ ± 0%  -99.48% (p=0.000 n=10)
MemclrUnaligned/4_5-4  27.71n ± 0%  27.71n ± 0%  ~ (p=0.308 n=10)
MemclrUnaligned/4_16-4  1187.00n ± 1%  50.74n ± 3%  -95.72% (p=0.000 n=10)
MemclrUnaligned/4_64-4  4617.00n ± 1%  59.89n ± 2%  -98.70% (p=0.000 n=10)
MemclrUnaligned/4_256-4  18472.50n ± 1%  84.76n ± 2%  -99.54% (p=0.000 n=10)
MemclrUnaligned/4_4096-4  292904.0n ± 1%  553.7n ± 0%  -99.81% (p=0.000 n=10)
MemclrUnaligned/4_65536-4  4716.12µ ± 0%  24.38µ ± 0%  -99.48% (p=0.000 n=10)
MemclrUnaligned/7_5-4  29.39n ± 0%  29.39n ± 0%  ~ (p=1.000 n=10)
MemclrUnaligned/7_16-4  636.80n ± 1%  48.33n ± 5%  -92.41% (p=0.000 n=10)
MemclrUnaligned/7_64-4  4094.00n ± 1%  58.88n ± 3%  -98.56% (p=0.000 n=10)
MemclrUnaligned/7_256-4  17869.00n ± 2%  82.70n ± 3%  -99.54% (p=0.000 n=10)
MemclrUnaligned/7_4096-4  294110.5n ± 1%  554.6n ± 1%  -99.81% (p=0.000 n=10)
MemclrUnaligned/7_65536-4  4735.00µ ± 1%  24.28µ ± 0%  -99.49% (p=0.000 n=10)
MemclrUnaligned/0_1M-4  447.8µ ± 0%  450.0µ ± 1%  +0.51% (p=0.000 n=10)
MemclrUnaligned/0_4M-4  12.68m ± 1%  12.64m ± 2%  -0.33% (p=0.015 n=10)
MemclrUnaligned/0_8M-4  29.76m ± 0%  29.79m ± 2%  ~ (p=0.075 n=10)
MemclrUnaligned/0_16M-4  60.34m ± 1%  60.49m ± 1%  ~ (p=0.353 n=10)
MemclrUnaligned/0_64M-4  241.3m ± 0%  241.4m ± 0%  ~ (p=0.247 n=10)
MemclrUnaligned/1_1M-4  75937.3µ ± 1%  449.9µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/1_4M-4  313.96m ± 2%  12.69m ± 0%  -95.96% (p=0.000 n=10)
MemclrUnaligned/1_8M-4  630.97m ± 1%  29.76m ± 0%  -95.28% (p=0.000 n=10)
MemclrUnaligned/1_16M-4  1263.47m ± 1%  60.35m ± 2%  -95.22% (p=0.000 n=10)
MemclrUnaligned/1_64M-4  5053.5m ± 0%  241.3m ± 0%  -95.23% (p=0.000 n=10)
MemclrUnaligned/4_1M-4  75880.5µ ± 2%  446.5µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/4_4M-4  314.00m ± 1%  12.71m ± 2%  -95.95% (p=0.000 n=10)
MemclrUnaligned/4_8M-4  630.63m ± 1%  29.77m ± 2%  -95.28% (p=0.000 n=10)
MemclrUnaligned/4_16M-4  1257.80m ± 0%  60.34m ± 2%  -95.20% (p=0.000 n=10)
MemclrUnaligned/4_64M-4  5041.3m ± 1%  241.2m ± 0%  -95.21% (p=0.000 n=10)
MemclrUnaligned/7_1M-4  75866.2µ ± 1%  446.9µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/7_4M-4  309.86m ± 1%  12.70m ± 1%  -95.90% (p=0.000 n=10)
MemclrUnaligned/7_8M-4  626.67m ± 1%  29.75m ± 2%  -95.25% (p=0.000 n=10)
MemclrUnaligned/7_16M-4  1252.84m ± 1%  60.31m ± 0%  -95.19% (p=0.000 n=10)
MemclrUnaligned/7_64M-4  5015.8m ± 1%  241.4m ± 0%  -95.19% (p=0.000 n=10)
geomean  339.1µ  35.83µ  -89.43%

Change-Id: I3b958a1d8e8f5ef205052e6b985a5ce21e92ef85
Reviewed-on: https://go-review.googlesource.com/c/go/+/496455
Run-TryBot: Joel Sing
Reviewed-by: Joel Sing
Reviewed-by: Keith Randall
Reviewed-by: Matthew Dempsky
Reviewed-by: M Zhuo
Reviewed-by: Keith Randall
TryBot-Result: Gopher Robot
---
 src/runtime/memclr_riscv64.s |  5 +++--
 src/runtime/memmove_test.go  | 26 ++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/runtime/memclr_riscv64.s b/src/runtime/memclr_riscv64.s
index d12b545b1e..1c1e6ab54d 100644
--- a/src/runtime/memclr_riscv64.s
+++ b/src/runtime/memclr_riscv64.s
@@ -16,10 +16,11 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
 	BLT	X11, X9, check4
 
 	// Check alignment
-	AND	$3, X10, X5
+	AND	$7, X10, X5
 	BEQZ	X5, aligned
 
 	// Zero one byte at a time until we reach 8 byte alignment.
+	SUB	X5, X9, X5
 	SUB	X5, X11, X11
 align:
 	ADD	$-1, X5
@@ -28,7 +29,7 @@ align:
 	BNEZ	X5, align
 
 aligned:
-	MOV	$8, X9
+	// X9 already contains $8
 	BLT	X11, X9, check4
 	MOV	$16, X9
 	BLT	X11, X9, zero8
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 73895becd8..f0c9a82bb6 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -400,6 +400,32 @@ func BenchmarkMemclr(b *testing.B) {
 	}
 }
 
+func BenchmarkMemclrUnaligned(b *testing.B) {
+	for _, off := range []int{0, 1, 4, 7} {
+		for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
+			x := make([]byte, n+off)
+			b.Run(fmt.Sprint(off, n), func(b *testing.B) {
+				b.SetBytes(int64(n))
+				for i := 0; i < b.N; i++ {
+					MemclrBytes(x[off:])
+				}
+			})
+		}
+	}
+
+	for _, off := range []int{0, 1, 4, 7} {
+		for _, m := range []int{1, 4, 8, 16, 64} {
+			x := make([]byte, (m<<20)+off)
+			b.Run(fmt.Sprint(off, m, "M"), func(b *testing.B) {
+				b.SetBytes(int64(m << 20))
+				for i := 0; i < b.N; i++ {
+					MemclrBytes(x[off:])
+				}
+			})
+		}
+	}
+}
+
 func BenchmarkGoMemclr(b *testing.B) {
 	benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
 		x := make([]byte, n)
-- 
2.48.1