]> Cypherpunks repositories - gostls13.git/commitdiff
internal/runtime/gc/scan: add AVX512 impl of filterNil.
author Junyang Shao <shaojunyang@google.com>
Wed, 19 Nov 2025 20:13:31 +0000 (20:13 +0000)
committer Junyang Shao <shaojunyang@google.com>
Fri, 21 Nov 2025 19:45:42 +0000 (11:45 -0800)
Benchmark results:
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=60-48          6.356Gi ± ∞ ¹    7.332Gi ± ∞ ¹        ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=70-48          6.756Gi ± ∞ ¹    8.302Gi ± ∞ ¹        ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=80-48          7.018Gi ± ∞ ¹    8.658Gi ± ∞ ¹        ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=90-48          7.313Gi ± ∞ ¹    9.055Gi ± ∞ ¹        ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=100-48         7.583Gi ± ∞ ¹    9.557Gi ± ∞ ¹        ~ (p=1.000 n=1) ²
geomean                                                              10.61Gi          14.83Gi        +39.81%

Almost a 40% improvement in throughput.

Change-Id: I6f31a0f0202ec7f3c9d2bbffca5d6e377306fc25
Reviewed-on: https://go-review.googlesource.com/c/go/+/722040
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/internal/runtime/gc/scan/filter.go
src/internal/runtime/gc/scan/filter_amd64.go [new file with mode: 0644]
src/internal/runtime/gc/scan/filter_amd64.s [new file with mode: 0644]
src/internal/runtime/gc/scan/filter_amd64_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/filter_test.go
src/internal/runtime/gc/scan/scan_amd64.go

index 63cee9abf066a9a2c052e8c3d5cba4b535a4c667..08b5fd5b5454028a29b0921070f2ae8af54ab677 100644 (file)
@@ -9,8 +9,6 @@ import "unsafe"
 // FilterNil packs non-nil (non-zero) values in bufp together
 // at the beginning of bufp, returning the length of the
 // packed buffer. It treats bufp as an array of size n.
-//
-// TODO(mknyszek): Add a faster SIMD-based implementation.
 func FilterNil(bufp *uintptr, n int32) int32 {
        buf := unsafe.Slice(bufp, int(n))
        lo := 0
diff --git a/src/internal/runtime/gc/scan/filter_amd64.go b/src/internal/runtime/gc/scan/filter_amd64.go
new file mode 100644 (file)
index 0000000..c750741
--- /dev/null
@@ -0,0 +1,9 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+// FilterNilAVX512 is the AVX-512 implementation of FilterNil.
+// It is implemented in assembly; see filter_amd64.s.
+func FilterNilAVX512(bufp *uintptr, n int32) int32
diff --git a/src/internal/runtime/gc/scan/filter_amd64.s b/src/internal/runtime/gc/scan/filter_amd64.s
new file mode 100644 (file)
index 0000000..47330b6
--- /dev/null
@@ -0,0 +1,64 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// func FilterNilAVX512(bufp *uintptr, n int32) int32
+//
+// FilterNilAVX512 packs the non-zero 8-byte words of bufp[0:n] to the
+// front of the buffer, preserving their order, and returns how many
+// words were kept. Words at or beyond the returned count are left in
+// an unspecified state.
+//
+// Register use:
+//     R8  = bufp
+//     R9  = n
+//     R10 = scanned, the index of the next word to read
+//     R11 = cnt, the number of non-zero words written so far
+//
+// cnt <= scanned holds throughout, so writes never overtake reads.
+TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20
+       // Load arguments
+       MOVQ bufp+0(FP), R8     // R8 = bufp (start of the 8-byte word array)
+       MOVL n+8(FP), R9        // R9 = n (total length)
+       XORL R10, R10           // R10 = 0 (scanned = 0)
+       XORL R11, R11           // R11 = 0 (cnt = 0)
+
+       MOVL R9, R12    // R12 = n
+       SUBL R10, R12   // R12 = n - scanned
+       CMPL R12, $8    // Compare (n - scanned) with 8
+       JLT scalar_loop // If (n - scanned) < 8, jump to the scalar cleanup
+
+vector_loop:
+       // Process 8 words per iteration while at least 8 remain.
+       LEAQ (R8)(R10*8), R13   // R13 = buf[scanned:] address
+       VMOVDQU64 (R13), Z1             // Z1 = v (Load 8 uint64s)
+       VPTESTMQ Z1, Z1, K1     // K1[i] = (v[i] & v[i]) != 0, i.e. v[i] != 0; needs no zero register.
+
+       // The store below always writes 8 words at buf[cnt:]. Since
+       // cnt <= scanned, it stays within buf[:scanned+8], which is in
+       // bounds and has already been loaded into Z1.
+       LEAQ (R8)(R11*8), R14   // R14 = buf[cnt:] address
+       VPCOMPRESSQ Z1, K1, Z1  // compress v: selected words move to the low lanes
+       VMOVDQU64 Z1, (R14)             // store v to buf[cnt:]
+
+       KMOVW K1, R15
+       POPCNTL R15, R15        // R15 = popcount(K1)
+
+       ADDL R15, R11   // cnt += popcount(K1)
+       ADDL $8, R10    // scanned += 8
+
+       MOVL R9, R12    // R12 = n
+       SUBL R10, R12   // R12 = n - scanned
+       CMPL R12, $8    // Compare (n - scanned) with 8
+       JGE vector_loop // If (n - scanned) >= 8, continue loop
+
+scalar_loop:
+       // Handle the remaining (fewer than 8) words one at a time.
+       CMPL R10, R9    // Compare scanned with n
+       JGE end                 // If scanned >= n, loop is done
+
+scalar_next_i:
+       LEAQ (R8)(R10*8), R13   // R13 = &buf[scanned]
+       MOVQ (R13), R14                 // R14 = buf[scanned]
+
+       CMPQ R14, $0
+       JE scalar_increment_i   // If buf[scanned] == 0, skip to increment scanned
+
+       LEAQ (R8)(R11*8), R15   // R15 = &buf[cnt]
+       MOVQ R14, (R15)                 // buf[cnt] = buf[scanned]
+
+       ADDL $1, R11    // cnt++
+
+scalar_increment_i:
+       ADDL $1, R10    // scanned++
+
+       CMPL R10, R9
+       JL scalar_next_i        // if scanned < n, continue
+
+end:
+       MOVL R11, ret+16(FP)    // return cnt
+       RET
diff --git a/src/internal/runtime/gc/scan/filter_amd64_test.go b/src/internal/runtime/gc/scan/filter_amd64_test.go
new file mode 100644 (file)
index 0000000..f542638
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+package scan_test
+
+import (
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+func TestFilterNilAVX512(t *testing.T) {
+       if !scan.CanAVX512() {
+               t.Skip("AVX512 is required for TestFilterNilAVX512")
+       }
+       runTestFilterNil(t, scan.FilterNilAVX512)
+}
index 115fbfb8bcf348db42c900b458919dd1f55ce981..7b5dd32db3bd86c117c89e7ecdecc4b76aec59cc 100644 (file)
@@ -10,56 +10,60 @@ import (
 )
 
 func TestFilterNil(t *testing.T) {
+       runTestFilterNil(t, scan.FilterNil) // run the shared cases against the Go implementation
+}
+
+func runTestFilterNil(t *testing.T, filterNil func(*uintptr, int32) int32) { // shared subtests; filterNil is the implementation under test
         t.Run("empty", func(t *testing.T) {
-               testFilterNil(t, []uintptr{}, []uintptr{})
+               testFilterNil(t, []uintptr{}, []uintptr{}, filterNil)
         })
         t.Run("one", func(t *testing.T) {
-               testFilterNil(t, []uintptr{4}, []uintptr{4})
+               testFilterNil(t, []uintptr{4}, []uintptr{4}, filterNil)
         })
         t.Run("elimOne", func(t *testing.T) {
-               testFilterNil(t, []uintptr{0}, []uintptr{})
+               testFilterNil(t, []uintptr{0}, []uintptr{}, filterNil)
         })
         t.Run("oneElimBegin", func(t *testing.T) {
-               testFilterNil(t, []uintptr{0, 4}, []uintptr{4})
+               testFilterNil(t, []uintptr{0, 4}, []uintptr{4}, filterNil)
         })
         t.Run("oneElimEnd", func(t *testing.T) {
-               testFilterNil(t, []uintptr{4, 0}, []uintptr{4})
+               testFilterNil(t, []uintptr{4, 0}, []uintptr{4}, filterNil)
         })
         t.Run("oneElimMultiBegin", func(t *testing.T) {
-               testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4})
+               testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4}, filterNil)
         })
         t.Run("oneElimMultiEnd", func(t *testing.T) {
-               testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4})
+               testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4}, filterNil)
         })
         t.Run("oneElimMulti", func(t *testing.T) {
-               testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4})
+               testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4}, filterNil)
         })
         t.Run("two", func(t *testing.T) {
-               testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12})
+               testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12}, filterNil)
         })
         t.Run("twoElimBegin", func(t *testing.T) {
-               testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12})
+               testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12}, filterNil)
         })
         t.Run("twoElimMid", func(t *testing.T) {
-               testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12})
+               testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12}, filterNil)
         })
         t.Run("twoElimEnd", func(t *testing.T) {
-               testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12})
+               testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12}, filterNil)
         })
         t.Run("twoElimMulti", func(t *testing.T) {
-               testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12})
+               testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12}, filterNil)
         })
         t.Run("Multi", func(t *testing.T) {
-               testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5})
+               testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5}, filterNil)
         })
 }
 
-func testFilterNil(t *testing.T, buf, want []uintptr) {
+func testFilterNil(t *testing.T, buf, want []uintptr, filterNil func(*uintptr, int32) int32) {
        var bufp *uintptr
        if len(buf) != 0 {
                bufp = &buf[0]
        }
-       n := scan.FilterNil(bufp, int32(len(buf)))
+       n := filterNil(bufp, int32(len(buf)))
        if n > int32(len(buf)) {
                t.Errorf("bogus new length returned: %d > %d", n, len(buf))
                return
index 2ac181f97e5b661b0faf02e89c98a7c57f27fa9b..0151804fa537fcf9cd9352f0891734ed4b290002 100644 (file)
@@ -28,7 +28,7 @@ func CanAVX512() bool {
 }
 
 func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
-       return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
+       return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)) // pack the non-nil entries of bufp and return the new length
 }
 
 //go:noescape