]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: fix alignment code in memclr_riscv64.s
authorMark Ryan <markdryan@rivosinc.com>
Fri, 19 May 2023 12:00:10 +0000 (14:00 +0200)
committerJoel Sing <joel@sing.id.au>
Wed, 24 May 2023 19:23:45 +0000 (19:23 +0000)
The existing code incorrectly determines whether the pointer passed to
memclrNoHeapPointers is 8 byte aligned (it currently checks to see whether
it's 4 byte aligned).

In addition, the code that aligns the pointer, by individually filling
the first few bytes of the buffer with zeros, is also incorrect.  It adjusts
the pointer by the wrong number of bytes, resulting in most cases, in
an unaligned pointer.

This commit fixes both of these issues by anding the pointer with 7
rather than 3 to determine its alignment, and by individually filling
the first (8 - (pointer & 7)) bytes with 0 to align the buffer, rather
than the first (pointer & 3) bytes.

We also remove an unnecessary immediate MOV instruction.

A new benchmark is added to test the performance of memclrNoHeapPointers
on non-aligned pointers.  Results of the existing and the new benchmark
on a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04
are presented below.

Memclr/5-4                     21.98n ± 7%   22.66n ± 9%        ~ (p=0.079 n=10)
Memclr/16-4                    20.85n ± 3%   21.09n ± 5%        ~ (p=0.796 n=10)
Memclr/64-4                    28.20n ± 4%   27.50n ± 3%        ~ (p=0.093 n=10)
Memclr/256-4                   53.66n ± 8%   53.44n ± 8%        ~ (p=0.280 n=10)
Memclr/4096-4                  522.6n ± 1%   523.4n ± 1%        ~ (p=0.240 n=10)
Memclr/65536-4                 24.17µ ± 0%   24.13µ ± 0%   -0.19% (p=0.029 n=10)
Memclr/1M-4                    446.9µ ± 0%   446.9µ ± 0%        ~ (p=0.684 n=10)
Memclr/4M-4                    12.69m ± 2%   12.79m ± 3%   +0.78% (p=0.043 n=10)
Memclr/8M-4                    29.75m ± 0%   29.76m ± 0%   +0.03% (p=0.015 n=10)
Memclr/16M-4                   60.34m ± 0%   60.32m ± 0%        ~ (p=0.247 n=10)
Memclr/64M-4                   241.2m ± 0%   241.3m ± 0%        ~ (p=0.247 n=10)
MemclrUnaligned/0_5-4          27.71n ± 0%   27.72n ± 1%        ~ (p=0.142 n=10)
MemclrUnaligned/0_16-4         26.95n ± 0%   26.04n ± 0%   -3.38% (p=0.000 n=10)
MemclrUnaligned/0_64-4         38.27n ± 4%   40.15n ± 6%   +4.89% (p=0.005 n=10)
MemclrUnaligned/0_256-4        63.95n ± 3%   64.19n ± 2%        ~ (p=0.971 n=10)
MemclrUnaligned/0_4096-4       532.6n ± 1%   530.9n ± 1%        ~ (p=0.324 n=10)
MemclrUnaligned/0_65536-4      24.30µ ± 0%   24.22µ ± 0%   -0.32% (p=0.023 n=10)
MemclrUnaligned/1_5-4          29.40n ± 0%   29.39n ± 0%        ~ (p=0.060 n=10)
MemclrUnaligned/1_16-4        632.65n ± 1%   63.80n ± 2%  -89.92% (p=0.000 n=10)
MemclrUnaligned/1_64-4       4091.00n ± 1%   73.23n ± 1%  -98.21% (p=0.000 n=10)
MemclrUnaligned/1_256-4     17803.50n ± 1%   92.03n ± 1%  -99.48% (p=0.000 n=10)
MemclrUnaligned/1_4096-4    294150.0n ± 1%   561.9n ± 1%  -99.81% (p=0.000 n=10)
MemclrUnaligned/1_65536-4    4692.80µ ± 1%   24.44µ ± 0%  -99.48% (p=0.000 n=10)
MemclrUnaligned/4_5-4          27.71n ± 0%   27.71n ± 0%        ~ (p=0.308 n=10)
MemclrUnaligned/4_16-4       1187.00n ± 1%   50.74n ± 3%  -95.72% (p=0.000 n=10)
MemclrUnaligned/4_64-4       4617.00n ± 1%   59.89n ± 2%  -98.70% (p=0.000 n=10)
MemclrUnaligned/4_256-4     18472.50n ± 1%   84.76n ± 2%  -99.54% (p=0.000 n=10)
MemclrUnaligned/4_4096-4    292904.0n ± 1%   553.7n ± 0%  -99.81% (p=0.000 n=10)
MemclrUnaligned/4_65536-4    4716.12µ ± 0%   24.38µ ± 0%  -99.48% (p=0.000 n=10)
MemclrUnaligned/7_5-4          29.39n ± 0%   29.39n ± 0%        ~ (p=1.000 n=10)
MemclrUnaligned/7_16-4        636.80n ± 1%   48.33n ± 5%  -92.41% (p=0.000 n=10)
MemclrUnaligned/7_64-4       4094.00n ± 1%   58.88n ± 3%  -98.56% (p=0.000 n=10)
MemclrUnaligned/7_256-4     17869.00n ± 2%   82.70n ± 3%  -99.54% (p=0.000 n=10)
MemclrUnaligned/7_4096-4    294110.5n ± 1%   554.6n ± 1%  -99.81% (p=0.000 n=10)
MemclrUnaligned/7_65536-4    4735.00µ ± 1%   24.28µ ± 0%  -99.49% (p=0.000 n=10)
MemclrUnaligned/0_1M-4         447.8µ ± 0%   450.0µ ± 1%   +0.51% (p=0.000 n=10)
MemclrUnaligned/0_4M-4         12.68m ± 1%   12.64m ± 2%   -0.33% (p=0.015 n=10)
MemclrUnaligned/0_8M-4         29.76m ± 0%   29.79m ± 2%        ~ (p=0.075 n=10)
MemclrUnaligned/0_16M-4        60.34m ± 1%   60.49m ± 1%        ~ (p=0.353 n=10)
MemclrUnaligned/0_64M-4        241.3m ± 0%   241.4m ± 0%        ~ (p=0.247 n=10)
MemclrUnaligned/1_1M-4       75937.3µ ± 1%   449.9µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/1_4M-4        313.96m ± 2%   12.69m ± 0%  -95.96% (p=0.000 n=10)
MemclrUnaligned/1_8M-4        630.97m ± 1%   29.76m ± 0%  -95.28% (p=0.000 n=10)
MemclrUnaligned/1_16M-4      1263.47m ± 1%   60.35m ± 2%  -95.22% (p=0.000 n=10)
MemclrUnaligned/1_64M-4       5053.5m ± 0%   241.3m ± 0%  -95.23% (p=0.000 n=10)
MemclrUnaligned/4_1M-4       75880.5µ ± 2%   446.5µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/4_4M-4        314.00m ± 1%   12.71m ± 2%  -95.95% (p=0.000 n=10)
MemclrUnaligned/4_8M-4        630.63m ± 1%   29.77m ± 2%  -95.28% (p=0.000 n=10)
MemclrUnaligned/4_16M-4      1257.80m ± 0%   60.34m ± 2%  -95.20% (p=0.000 n=10)
MemclrUnaligned/4_64M-4       5041.3m ± 1%   241.2m ± 0%  -95.21% (p=0.000 n=10)
MemclrUnaligned/7_1M-4       75866.2µ ± 1%   446.9µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/7_4M-4        309.86m ± 1%   12.70m ± 1%  -95.90% (p=0.000 n=10)
MemclrUnaligned/7_8M-4        626.67m ± 1%   29.75m ± 2%  -95.25% (p=0.000 n=10)
MemclrUnaligned/7_16M-4      1252.84m ± 1%   60.31m ± 0%  -95.19% (p=0.000 n=10)
MemclrUnaligned/7_64M-4       5015.8m ± 1%   241.4m ± 0%  -95.19% (p=0.000 n=10)
geomean                        339.1µ        35.83µ       -89.43%

Change-Id: I3b958a1d8e8f5ef205052e6b985a5ce21e92ef85
Reviewed-on: https://go-review.googlesource.com/c/go/+/496455
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/runtime/memclr_riscv64.s
src/runtime/memmove_test.go

index d12b545b1e3c7be7ee1a5104745fdb3437c553eb..1c1e6ab54dfcdd312c0dd5e307ebca535a202e63 100644 (file)
@@ -16,10 +16,11 @@ TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
        BLT     X11, X9, check4
 
        // Check alignment
-       AND     $3, X10, X5
+       AND     $7, X10, X5
        BEQZ    X5, aligned
 
        // Zero one byte at a time until we reach 8 byte alignment.
+       SUB     X5, X9, X5
        SUB     X5, X11, X11
 align:
        ADD     $-1, X5
@@ -28,7 +29,7 @@ align:
        BNEZ    X5, align
 
 aligned:
-       MOV     $8, X9
+       // X9 already contains $8
        BLT     X11, X9, check4
        MOV     $16, X9
        BLT     X11, X9, zero8
index 73895becd84212561843439dc03e60b54a990cff..f0c9a82bb629767d38792641349cf97e8f137c33 100644 (file)
@@ -400,6 +400,32 @@ func BenchmarkMemclr(b *testing.B) {
        }
 }
 
+func BenchmarkMemclrUnaligned(b *testing.B) {
+       for _, off := range []int{0, 1, 4, 7} {
+               for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
+                       x := make([]byte, n+off)
+                       b.Run(fmt.Sprint(off, n), func(b *testing.B) {
+                               b.SetBytes(int64(n))
+                               for i := 0; i < b.N; i++ {
+                                       MemclrBytes(x[off:])
+                               }
+                       })
+               }
+       }
+
+       for _, off := range []int{0, 1, 4, 7} {
+               for _, m := range []int{1, 4, 8, 16, 64} {
+                       x := make([]byte, (m<<20)+off)
+                       b.Run(fmt.Sprint(off, m, "M"), func(b *testing.B) {
+                               b.SetBytes(int64(m << 20))
+                               for i := 0; i < b.N; i++ {
+                                       MemclrBytes(x[off:])
+                               }
+                       })
+               }
+       }
+}
+
 func BenchmarkGoMemclr(b *testing.B) {
        benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
                x := make([]byte, n)