Cypherpunks repositories - gostls13.git/commitdiff
internal/runtime/gc/scan: import scan kernel from gclab [green tea]
author Michael Anthony Knyszek <mknyszek@google.com>
Wed, 5 Mar 2025 20:12:47 +0000 (20:12 +0000)
committer Gopher Robot <gobot@golang.org>
Tue, 12 Aug 2025 18:23:02 +0000 (11:23 -0700)
This change imports the AVX512 GC scanning kernel from CL 593938 into a
new package, internal/runtime/gc/scan. Credit to Austin Clements for
most of this work. I did some cleanup, added support for more size
classes to the expanders, and added more testing. I also restructured
the code to make it easier and clearer to add new scan kernels for new
architectures.

For #73581.

Change-Id: I76bcbc889fa6cad73ba0084620fae084a5912e6b
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64_avx512,gotip-linux-amd64_avx512-greenteagc
Reviewed-on: https://go-review.googlesource.com/c/go/+/655280
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
34 files changed:
src/cmd/internal/objabi/pkgspecial.go
src/go/build/deps_test.go
src/internal/cpu/cpu.go
src/internal/cpu/cpu_x86.go
src/internal/cpu/datacache_unsupported.go [new file with mode: 0644]
src/internal/cpu/datacache_x86.go [new file with mode: 0644]
src/internal/cpu/datacache_x86_test.go [new file with mode: 0644]
src/internal/goarch/goarch.go
src/internal/runtime/gc/internal/gen/gen.go [new file with mode: 0644]
src/internal/runtime/gc/internal/gen/gp.go [new file with mode: 0644]
src/internal/runtime/gc/internal/gen/regalloc.go [new file with mode: 0644]
src/internal/runtime/gc/internal/gen/simd.go [new file with mode: 0644]
src/internal/runtime/gc/internal/gen/val.go [new file with mode: 0644]
src/internal/runtime/gc/malloc.go
src/internal/runtime/gc/mksizeclasses.go
src/internal/runtime/gc/scan/expand_amd64.go [new file with mode: 0644]
src/internal/runtime/gc/scan/expand_amd64.s [new file with mode: 0644]
src/internal/runtime/gc/scan/expand_amd64_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/expand_reference.go [new file with mode: 0644]
src/internal/runtime/gc/scan/expand_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/filter.go [new file with mode: 0644]
src/internal/runtime/gc/scan/filter_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/mem_nounix_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/mem_unix_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/mkasm.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_amd64.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_amd64.s [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_amd64_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_generic.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_generic_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_go.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_reference.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_test.go [new file with mode: 0644]
src/internal/runtime/gc/sizeclasses.go

index fe510160b31a05268fd7af5fd6b7fe2958fed5f5..94efa6883bd3e34942fd8711e3ccd7d49cc04bca 100644 (file)
@@ -52,6 +52,7 @@ var runtimePkgs = []string{
        "internal/runtime/cgroup",
        "internal/runtime/exithook",
        "internal/runtime/gc",
+       "internal/runtime/gc/scan",
        "internal/runtime/maps",
        "internal/runtime/math",
        "internal/runtime/strconv",
index 00e6e562e541789345560d2d9da0e5e34df3df23..41dde20bf9cc9b3234210823ae874fcd50e20361 100644 (file)
@@ -100,6 +100,7 @@ var depsRules = `
        < internal/runtime/maps
        < internal/runtime/strconv
        < internal/runtime/cgroup
+       < internal/runtime/gc/scan
        < runtime
        < sync/atomic
        < internal/sync
@@ -797,6 +798,20 @@ var depsRules = `
 
        FMT, testing < internal/cgrouptest;
        C, CGO < internal/runtime/cgobench;
+
+       # Generate-only packages can have anything they want
+       container/heap,
+       encoding/binary,
+       fmt,
+       hash/maphash,
+       io,
+       log,
+       math/bits,
+       os,
+       reflect,
+       strings,
+       sync
+       < internal/runtime/gc/internal/gen;
 `
 
 // listStdPkgs returns the same list of packages as "go list std".
index fca38532dc518f53bbbb21c13e36602a2ee59bd6..e92c1851a214675318722be750be57b578a1c97e 100644 (file)
@@ -34,15 +34,19 @@ var X86 struct {
        HasAVX512           bool // Virtual feature: F+CD+BW+DQ+VL
        HasAVX512F          bool
        HasAVX512CD         bool
+       HasAVX512BITALG     bool
        HasAVX512BW         bool
        HasAVX512DQ         bool
        HasAVX512VL         bool
        HasAVX512VPCLMULQDQ bool
+       HasAVX512VBMI       bool
+       HasAVX512VBMI2      bool
        HasBMI1             bool
        HasBMI2             bool
        HasERMS             bool
        HasFSRM             bool
        HasFMA              bool
+       HasGFNI             bool
        HasOSXSAVE          bool
        HasPCLMULQDQ        bool
        HasPOPCNT           bool
index 315c26b0ddb735a76ae4d6f0d8452616ef8e6832..6fa30b776310b3e4bc3c927be03826c1c7b727af 100644 (file)
@@ -18,7 +18,7 @@ func xgetbv() (eax, edx uint32)
 func getGOAMD64level() int32
 
 const (
-       // ecx bits
+       // Bits returned in ECX for CPUID EAX=0x1 ECX=0x0
        cpuid_SSE3      = 1 << 0
        cpuid_PCLMULQDQ = 1 << 1
        cpuid_SSSE3     = 1 << 9
@@ -30,7 +30,7 @@ const (
        cpuid_OSXSAVE   = 1 << 27
        cpuid_AVX       = 1 << 28
 
-       // ebx bits
+       // "Extended Feature Flag" bits returned in EBX for CPUID EAX=0x7 ECX=0x0
        cpuid_BMI1     = 1 << 3
        cpuid_AVX2     = 1 << 5
        cpuid_BMI2     = 1 << 8
@@ -43,8 +43,12 @@ const (
        cpuid_AVX512BW = 1 << 30
        cpuid_AVX512VL = 1 << 31
 
-       // ecx bits
+       // "Extended Feature Flag" bits returned in ECX for CPUID EAX=0x7 ECX=0x0
+       cpuid_AVX512_VBMI      = 1 << 1
+       cpuid_AVX512_VBMI2     = 1 << 6
+       cpuid_GFNI             = 1 << 8
        cpuid_AVX512VPCLMULQDQ = 1 << 10
+       cpuid_AVX512_BITALG    = 1 << 12
 
        // edx bits
        cpuid_FSRM = 1 << 4
@@ -163,6 +167,10 @@ func doinit() {
                X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ)
                X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
                X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
+               X86.HasAVX512VBMI = isSet(ecx7, cpuid_AVX512_VBMI)
+               X86.HasAVX512VBMI2 = isSet(ecx7, cpuid_AVX512_VBMI2)
+               X86.HasGFNI = isSet(ecx7, cpuid_GFNI)
+               X86.HasAVX512BITALG = isSet(ecx7, cpuid_AVX512_BITALG)
        }
 
        X86.HasFSRM = isSet(edx7, cpuid_FSRM)
diff --git a/src/internal/cpu/datacache_unsupported.go b/src/internal/cpu/datacache_unsupported.go
new file mode 100644 (file)
index 0000000..44544aa
--- /dev/null
@@ -0,0 +1,11 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64
+
+package cpu
+
+func DataCacheSizes() []uintptr {
+       return nil
+}
diff --git a/src/internal/cpu/datacache_x86.go b/src/internal/cpu/datacache_x86.go
new file mode 100644 (file)
index 0000000..eb7b93b
--- /dev/null
@@ -0,0 +1,121 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64
+
+package cpu
+
+// DataCacheSizes returns the size of each data cache from lowest
+// level in the hierarchy to highest.
+//
+// Unlike other parts of this package's public API, it is not safe
+// to reference early in runtime initialization because it allocates.
+// It's intended for testing only.
+func DataCacheSizes() []uintptr {
+       maxFunctionInformation, ebx0, ecx0, edx0 := cpuid(0, 0)
+       if maxFunctionInformation < 1 {
+               return nil
+       }
+
+       switch {
+       // Check for "GenuineIntel"
+       case ebx0 == 0x756E6547 && ecx0 == 0x6C65746E && edx0 == 0x49656E69:
+               return getDataCacheSizesIntel(maxFunctionInformation)
+       // Check for "AuthenticAMD"
+       case ebx0 == 0x68747541 && ecx0 == 0x444D4163 && edx0 == 0x69746E65:
+               return getDataCacheSizesAMD()
+       }
+       return nil
+}
+
+func extractBits(arg uint32, l int, r int) uint32 {
+       if l > r {
+               panic("bad bit range")
+       }
+       return (arg >> l) & ((1 << (r - l + 1)) - 1)
+}
+
+func getDataCacheSizesIntel(maxID uint32) []uintptr {
+       // Constants for cache types
+       const (
+               noCache          = 0
+               dataCache        = 1
+               instructionCache = 2
+               unifiedCache     = 3
+       )
+       if maxID < 4 {
+               return nil
+       }
+
+       // Iterate through CPUID leaf 4 (deterministic cache parameters)
+       var caches []uintptr
+       for i := uint32(0); i < 0xFFFF; i++ {
+               eax, ebx, ecx, _ := cpuid(4, i)
+
+               cacheType := eax & 0xF // EAX bits 4-0: Cache Type
+               if cacheType == 0 {
+                       break
+               }
+
+               // Report only data caches.
+               if !(cacheType == dataCache || cacheType == unifiedCache) {
+                       continue
+               }
+
+               // Guaranteed to always start counting from 1.
+               level := (eax >> 5) & 0x7
+
+               lineSize := extractBits(ebx, 0, 11) + 1         // Bits 11-0: Line size in bytes - 1
+               partitions := extractBits(ebx, 12, 21) + 1      // Bits 21-12: Physical line partitions - 1
+               ways := extractBits(ebx, 22, 31) + 1            // Bits 31-22: Ways of associativity - 1
+               sets := uint64(ecx) + 1                         // Number of sets - 1
+               size := uint64(ways*partitions*lineSize) * sets // Calculate cache size in bytes
+
+               caches = append(caches, uintptr(size))
+
+               // If we see more than one cache described per level, or they appear
+               // out of order, crash.
+               //
+               // Going by the SDM, it's not clear whether this is actually possible,
+               // so this code is purely defensive.
+               if level != uint32(len(caches)) {
+                       panic("expected levels to be in order and for there to be one data/unified cache per level")
+               }
+       }
+       return caches
+}
+
+func getDataCacheSizesAMD() []uintptr {
+       maxExtendedFunctionInformation, _, _, _ := cpuid(0x80000000, 0)
+       if maxExtendedFunctionInformation < 0x80000006 {
+               return nil
+       }
+
+       var caches []uintptr
+
+       _, _, ecx5, _ := cpuid(0x80000005, 0)
+       _, _, ecx6, edx6 := cpuid(0x80000006, 0)
+
+       // The size is returned in KB; convert it to bytes.
+       l1dSize := uintptr(extractBits(ecx5, 24, 31) << 10)
+       caches = append(caches, l1dSize)
+
+       // Check that L2 cache is present.
+       if l2Assoc := extractBits(ecx6, 12, 15); l2Assoc == 0 {
+               return caches
+       }
+       l2Size := uintptr(extractBits(ecx6, 16, 31) << 10)
+       caches = append(caches, l2Size)
+
+       // Check that L3 cache is present.
+       if l3Assoc := extractBits(edx6, 12, 15); l3Assoc == 0 {
+               return caches
+       }
+       // Specifies the L3 cache size is within the following range:
+       // (L3Size[31:18] * 512KB) <= L3 cache size < ((L3Size[31:18]+1) * 512KB).
+       l3Size := uintptr(extractBits(edx6, 18, 31) * (512 << 10))
+       caches = append(caches, l3Size)
+
+       return caches
+}
diff --git a/src/internal/cpu/datacache_x86_test.go b/src/internal/cpu/datacache_x86_test.go
new file mode 100644 (file)
index 0000000..425c525
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64
+
+package cpu_test
+
+import (
+       "internal/cpu"
+       "testing"
+)
+
+// Tests fetching data cache sizes. This test only checks that DataCacheSizes
+// won't explode. Otherwise it's just informational, and dumps the current
+// data cache sizes.
+func TestDataCacheSizes(t *testing.T) {
+       // N.B. Don't try to check these values because we don't know what
+       // kind of environment we're running in. We don't want this test to
+       // fail on some random x86 chip that happens to not support the right
+       // CPUID bits for some reason.
+       caches := cpu.DataCacheSizes()
+       for i, size := range caches {
+               t.Logf("L%d: %d", i+1, size)
+       }
+}
index f52fe6c42ec0fccf1429a7eb0db40477d35b8264..4da56dda9dadc120738c29a603a887d3d53b395d 100644 (file)
@@ -34,6 +34,9 @@ const (
 // It is also the size of the machine's native word size (that is, 4 on 32-bit systems, 8 on 64-bit).
 const PtrSize = 4 << (^uintptr(0) >> 63)
 
+// PtrBits is the bit width of a pointer.
+const PtrBits = PtrSize * 8
+
 // ArchFamily is the architecture family (AMD64, ARM, ...)
 const ArchFamily ArchFamilyType = _ArchFamily
 
diff --git a/src/internal/runtime/gc/internal/gen/gen.go b/src/internal/runtime/gc/internal/gen/gen.go
new file mode 100644 (file)
index 0000000..0758f9b
--- /dev/null
@@ -0,0 +1,537 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gen
+
+import (
+       "container/heap"
+       "encoding/binary"
+       "fmt"
+       "hash/maphash"
+       "io"
+       "log"
+       "os"
+       "reflect"
+       "strings"
+)
+
+const logCompile = true
+
+func fatalf(f string, args ...any) {
+       panic(fmt.Sprintf(f, args...))
+}
+
+type File struct {
+       w      io.Writer
+       funcs  []*Func
+       consts []fileConst
+}
+
+func NewFile(w io.Writer) *File {
+       return &File{w: w}
+}
+
+func (f *File) AddFunc(fn *Func) {
+       f.funcs = append(f.funcs, fn)
+}
+
+type fileConst struct {
+       name string
+       data any
+}
+
+func (f *File) AddConst(name string, data any) {
+       // TODO: It would be nice if this were unified with "const" ops, but the
+       // reason I added this was for []*Func consts, which would take an overhaul
+       // to represent in "const" ops.
+       f.consts = append(f.consts, fileConst{name, data})
+}
+
+type Func struct {
+       name  string
+       nArgs int
+       idGen int
+       ops   []*op
+}
+
+func NewFunc(name string) *Func {
+       fn := &Func{name: name}
+       return fn
+}
+
+// attach adds x to fn's op list. If x has any unattached arguments, this adds
+// those first (recursively).
+func (fn *Func) attach(x *op) {
+       // Make sure the arguments are attached to the function.
+       for _, arg := range x.args {
+               argFn := arg.fn
+               if argFn == nil {
+                       fn.attach(arg)
+               } else if argFn != fn {
+                       panic("ops from different functions")
+               }
+       }
+
+       x.fn = fn
+       x.id = fn.idGen
+       fn.idGen++
+       fn.ops = append(fn.ops, x)
+}
+
+func Arg[W wrap[T], T Word](fn *Func) T {
+       loc := locReg{cls: regClassGP, reg: fn.nArgs}
+       fn.nArgs++
+       var x W
+       o := &op{op: "arg", kind: x.kind(), c: loc}
+       fn.attach(o)
+       return x.wrap(o)
+}
+
+func Return(results ...Value) {
+       args := make([]*op, len(results))
+       for i, res := range results {
+               args[i] = res.getOp()
+       }
+       var x void
+       x.initOp(&op{op: "return", kind: voidKind, args: args})
+}
+
+type op struct {
+       op   string
+       kind *kind
+       args []*op
+
+       id int
+       fn *Func
+
+       // c depends on "op".
+       //
+       // arg locReg - The register containing the argument value
+       // const any  - The constant value
+       // deref int  - Byte offset from args[0]
+       c    any
+       name string
+}
+
+func (o *op) String() string {
+       return fmt.Sprintf("v%02d", o.id)
+}
+
+func imm(val any) *op {
+       return &op{op: "imm", c: val}
+}
+
+func (o *op) equalNoName(o2 *op) bool {
+       if o.op != o2.op || o.c != o2.c || len(o.args) != len(o2.args) {
+               return false
+       }
+       for i, arg := range o.args {
+               if o2.args[i] != arg {
+                       return false
+               }
+       }
+       return true
+}
+
+func (o *op) write(w io.Writer) {
+       fmt.Fprintf(w, "v%02d = %s", o.id, o.op)
+       for _, arg := range o.args {
+               fmt.Fprintf(w, " v%02d", arg.id)
+       }
+       if o.c != nil {
+               fmt.Fprintf(w, " %v", o.c)
+       }
+       if o.name != "" {
+               fmt.Fprintf(w, " %q", o.name)
+       }
+       if o.kind != nil {
+               fmt.Fprintf(w, " [%s]", o.kind.typ)
+       }
+       fmt.Fprintf(w, "\n")
+}
+
+func (fn *Func) write(w io.Writer) {
+       fmt.Fprintf(w, "FUNC %s\n", fn.name)
+       for _, op := range fn.ops {
+               op.write(w)
+       }
+}
+
+func (f *File) Compile() {
+       // TODO: CSE constants across the whole file
+
+       fmt.Fprintf(f.w, `#include "go_asm.h"
+#include "textflag.h"
+
+`)
+
+       for _, c := range f.consts {
+               f.emitConst(c.name, c.data)
+       }
+
+       trace := func(fn *Func, step string) {
+               if !logCompile {
+                       return
+               }
+               log.Printf("## Compiling %s: %s", fn.name, step)
+               fn.write(os.Stderr)
+       }
+
+       for _, fn := range f.funcs {
+               trace(fn, "initial")
+
+               for {
+                       if fn.cse() {
+                               trace(fn, "post cse")
+                               continue
+                       }
+                       if fn.deadcode() {
+                               trace(fn, "post deadcode")
+                               continue
+                       }
+                       break
+               }
+               fn.addLoads()
+               trace(fn, "post addLoads")
+
+               // Assigning locations requires ops to be in dependency order.
+               fn.schedule()
+               trace(fn, "post schedule")
+
+               locs := fn.assignLocs()
+
+               fn.emit(f, locs)
+       }
+}
+
+// cse performs common subexpression elimination.
+func (fn *Func) cse() bool {
+       // Compute structural hashes
+       hashes := make(map[*op]uint64)
+       var h maphash.Hash
+       var bbuf [8]byte
+       for _, op := range fn.ops {
+               // We ignore the name for canonicalization.
+               h.Reset()
+               h.WriteString(op.op)
+               // TODO: Ideally we would hash op.c, but we don't have a good way to do that.
+               for _, arg := range op.args {
+                       if _, ok := hashes[arg]; !ok {
+                               panic("ops not in dependency order")
+                       }
+                       binary.NativeEndian.PutUint64(bbuf[:], hashes[arg])
+                       h.Write(bbuf[:])
+               }
+               hashes[op] = h.Sum64()
+       }
+
+       canon := make(map[uint64][]*op)
+       lookup := func(o *op) *op {
+               hash := hashes[o]
+               for _, o2 := range canon[hash] {
+                       if o.equalNoName(o2) {
+                               return o2
+                       }
+               }
+               canon[hash] = append(canon[hash], o)
+               return o
+       }
+
+       // Canonicalize ops.
+       dirty := false
+       for _, op := range fn.ops {
+               for i, arg := range op.args {
+                       newArg := lookup(arg)
+                       if arg != newArg {
+                               dirty = true
+                               op.args[i] = newArg
+                       }
+               }
+       }
+       return dirty
+}
+
+// deadcode eliminates unused ops.
+func (fn *Func) deadcode() bool {
+       marks := make(map[*op]bool)
+       var mark func(o *op)
+       mark = func(o *op) {
+               if marks[o] {
+                       return
+               }
+               marks[o] = true
+               for _, arg := range o.args {
+                       mark(arg)
+               }
+       }
+       // Mark operations that have a side-effect.
+       for _, op := range fn.ops {
+               switch op.op {
+               case "return":
+                       mark(op)
+               }
+       }
+       // Discard unmarked operations
+       if len(marks) == len(fn.ops) {
+               return false
+       }
+       newOps := make([]*op, 0, len(marks))
+       for _, op := range fn.ops {
+               if marks[op] {
+                       newOps = append(newOps, op)
+               }
+       }
+       fn.ops = newOps
+       return true
+}
+
+// canMem is a map from operation to a bitmap of which arguments can use a
+// direct memory reference.
+var canMem = map[string]uint64{
+       "VPERMB":         1 << 0,
+       "VPERMI2B":       1 << 0,
+       "VPERMT2B":       1 << 0,
+       "VGF2P8AFFINEQB": 1 << 0,
+       "VPORQ":          1 << 0,
+       "VPSUBQ":         1 << 0,
+       "VPSHUFBITQMB":   1 << 0,
+}
+
+// addLoads inserts load ops for ops that can't take memory inputs directly.
+func (fn *Func) addLoads() {
+       // A lot of operations can directly take memory locations. If there's only a
+       // single reference to a deref operation, and the operation can do the deref
+       // itself, eliminate the deref. If there's more than one reference, then we
+       // leave the load so we can share the value in the register.
+       nRefs := fn.opRefs()
+       loads := make(map[*op]*op) // deref -> load
+       for _, o := range fn.ops {
+               canMask := canMem[o.op]
+               for i, arg := range o.args {
+                       // TODO: Many AVX-512 operations that support memory operands also
+                       // support a ".BCST" suffix that performs a broadcasting memory
+                       // load. If the const can be broadcast and all uses support
+                       // broadcast load, it would be nice to use .BCST. I'm not sure if
+                       // that belongs in this pass or a different one.
+                       if arg.op == "deref" || arg.op == "const" {
+                               // These produce memory locations.
+                               if canMask&(1<<i) == 0 || nRefs[arg] > 1 {
+                                       // This argument needs to be loaded into a register.
+                                       load, ok := loads[arg]
+                                       if !ok {
+                                               load = makeLoad(arg)
+                                               fn.attach(load)
+                                               loads[arg] = load
+                                       }
+                                       o.args[i] = load
+                               }
+                       }
+               }
+       }
+}
+
+func (fn *Func) opRefs() map[*op]int {
+       refs := make(map[*op]int)
+       for _, o1 := range fn.ops {
+               for _, arg := range o1.args {
+                       refs[arg]++
+               }
+       }
+       return refs
+}
+
+func makeLoad(deref *op) *op {
+       var inst string
+       switch deref.kind.reg {
+       default:
+               fatalf("don't know how to load %v", deref.kind.reg)
+       case regClassGP:
+               inst = "MOVQ"
+       case regClassZ:
+               inst = "VMOVDQU64"
+       }
+       // The load references deref rather than deref.args[0] because when we
+       // assign locations, the deref op gets the memory location to load from,
+       // while its argument has some other location (like a register). Also, the
+       // offset to deref is attached to the deref op.
+       return &op{op: inst, kind: deref.kind, args: []*op{deref}}
+}
+
+type opHeap []*op
+
+func (h opHeap) Len() int      { return len(h) }
+func (h opHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+func (h opHeap) Less(i, j int) bool {
+       priority := func(o *op) int {
+               if o.op == "deref" || o.op == "const" {
+                       // Input to memory load
+                       return 1
+               }
+               if len(o.args) > 0 && (o.args[0].op == "deref" || o.args[0].op == "const") {
+                       // Memory load
+                       return 2
+               }
+               return 100
+       }
+       if p1, p2 := priority(h[i]), priority(h[j]); p1 != p2 {
+               return p1 < p2
+       }
+       return h[i].id < h[j].id
+}
+
+func (h *opHeap) Push(x any) {
+       *h = append(*h, x.(*op))
+}
+
+func (h *opHeap) Pop() any {
+       old := *h
+       n := len(old)
+       x := old[n-1]
+       *h = old[0 : n-1]
+       return x
+}
+
+// schedule ensures fn's ops are in dependency order.
+func (fn *Func) schedule() {
+       // TODO: This tends to generate a huge amount of register pressure, mostly
+       // because it floats loads as early as possible and partly because it has no
+       // concept of rematerialization and CSE can make rematerializable values
+       // live for a very long time. In some sense it doesn't matter because we
+       // don't run out of registers for anything we need.
+
+       missing := make(map[*op]int)
+       uses := make(map[*op][]*op)
+       var h opHeap
+       for _, op := range fn.ops {
+               if len(op.args) == 0 {
+                       h = append(h, op)
+               } else {
+                       missing[op] = len(op.args)
+               }
+               for _, arg := range op.args {
+                       uses[arg] = append(uses[arg], op)
+               }
+       }
+       heap.Init(&h)
+
+       newOps := make([]*op, 0, len(fn.ops))
+       for len(h) > 0 {
+               if false {
+                       log.Printf("schedule: %s", h)
+               }
+               top := h[0]
+               newOps = append(newOps, top)
+               heap.Pop(&h)
+               for _, o := range uses[top] {
+                       missing[o]--
+                       if missing[o] == 0 {
+                               heap.Push(&h, o)
+                       }
+               }
+       }
+       if len(newOps) != len(fn.ops) {
+               log.Print("schedule didn't schedule all ops")
+               log.Print("before:")
+               fn.write(os.Stderr)
+               fn.ops = newOps
+               log.Print("after:")
+               fn.write(os.Stderr)
+               log.Fatal("bad schedule")
+       }
+
+       fn.ops = newOps
+}
+
+func (fn *Func) emit(f *File, locs map[*op]loc) {
+       w := f.w
+
+       // Emit constants first
+       for _, o := range fn.ops {
+               if o.op == "const" {
+                       name := locs[o].(locMem).name
+                       f.emitConst(name, o.c)
+               }
+       }
+
+       fmt.Fprintf(w, "TEXT %s(SB), NOSPLIT, $0-0\n", fn.name)
+
+       // Emit body
+       for _, o := range fn.ops {
+               switch o.op {
+               case "const", "arg", "return", "deref", "imm":
+                       // Does not produce code
+                       continue
+               }
+               switch o.op {
+               case "addConst":
+                       fatalf("addConst not lowered")
+               }
+
+               opName := o.op
+               // A ".mask" suffix is used to distinguish AVX-512 ops that use the same
+               // mnemonic for regular and masked mode.
+               opName = strings.TrimSuffix(opName, ".mask")
+
+               fmt.Fprintf(w, "\t%s", opName)
+               if o.op == "VGF2P8AFFINEQB" {
+                       // Hidden immediate, but always 0
+                       //
+                       // TODO: Replace this with an imm input.
+                       fmt.Fprintf(w, " $0,")
+               }
+               for i, arg := range o.args {
+                       if i == 0 {
+                               fmt.Fprintf(w, " ")
+                       } else {
+                               fmt.Fprintf(w, ", ")
+                       }
+                       if arg.op == "imm" {
+                               fmt.Fprintf(w, "$0x%x", arg.c)
+                       } else {
+                               fmt.Fprint(w, locs[arg].LocString())
+                       }
+               }
+               if _, ok := opRMW[o.op]; ok {
+                       // Read-modify-write instructions, so the output is already in the
+                       // arguments above.
+               } else {
+                       fmt.Fprintf(w, ", %s", locs[o].LocString())
+               }
+               fmt.Fprintf(w, "\n")
+       }
+       fmt.Fprintf(w, "\tRET\n")
+       fmt.Fprintf(w, "\n")
+}
+
+func (f *File) emitConst(name string, data any) {
+       switch data := data.(type) {
+       case []*Func:
+               fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, len(data)*8)
+               for i, fn := range data {
+                       fmt.Fprintf(f.w, "DATA  %s+%#02x(SB)/8, ", name, 8*i)
+                       if fn == nil {
+                               fmt.Fprintf(f.w, "$0\n")
+                       } else {
+                               fmt.Fprintf(f.w, "$%s(SB)\n", fn.name)
+                       }
+               }
+               fmt.Fprintf(f.w, "\n")
+               return
+       }
+
+       // Assume it's a numeric slice or array
+       rv := reflect.ValueOf(data)
+       sz := int(rv.Type().Elem().Size())
+       fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, rv.Len()*sz)
+       for wi := 0; wi < sz*rv.Len()/8; wi++ { // Iterate over words
+               var word uint64
+               for j := 0; j < 8/sz; j++ { // Iterate over elements in this word
+                       d := rv.Index(wi*8/sz + j).Uint()
+                       word |= d << (j * sz * 8)
+               }
+               fmt.Fprintf(f.w, "DATA  %s+%#02x(SB)/8, $%#016x\n", name, 8*wi, word)
+       }
+
+       fmt.Fprintf(f.w, "\n")
+}
diff --git a/src/internal/runtime/gc/internal/gen/gp.go b/src/internal/runtime/gc/internal/gen/gp.go
new file mode 100644 (file)
index 0000000..390d6e5
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gen
+
+type Uint64 struct {
+       valGP
+}
+
+var kindUint64 = &kind{typ: "Uint64", reg: regClassGP}
+
+func ConstUint64(c uint64, name string) (y Uint64) {
+       y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
+       return y
+}
+
+func (Uint64) kind() *kind {
+       return kindUint64
+}
+
+func (Uint64) wrap(x *op) Uint64 {
+       var y Uint64
+       y.initOp(x)
+       return y
+}
diff --git a/src/internal/runtime/gc/internal/gen/regalloc.go b/src/internal/runtime/gc/internal/gen/regalloc.go
new file mode 100644 (file)
index 0000000..424a295
--- /dev/null
@@ -0,0 +1,338 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gen
+
+import (
+       "fmt"
+       "log"
+       "math/bits"
+       "strings"
+)
+
+// traceRegAlloc enables log output describing register allocation decisions
+// (see regSet.used, regSet.free and Func.assignLocs).
+const traceRegAlloc = true
+
+// regClass identifies a class of registers. Values below numRegClasses are
+// used as indexes into regSet.inUse.
+type regClass uint8
+
+const (
+       // regClassFixed covers the registers named in fixedRegs.
+       regClassFixed regClass = iota
+       // regClassGP covers the general-purpose registers named in gpRegs.
+       regClassGP
+       // regClassZ covers registers spelled "Z<n>".
+       regClassZ
+       // regClassK covers registers spelled "K<n>".
+       regClassK
+
+       // numRegClasses is the number of real register classes above.
+       numRegClasses
+
+       // regClassNone is a sentinel (all bits set) for kinds that are not
+       // register-allocated. It is not a valid index into regSet.inUse.
+       regClassNone = ^regClass(0)
+)
+
+// locReg is a location that is a single register: register number reg within
+// class cls.
+type locReg struct {
+       cls regClass
+       reg int
+}
+
+func (l locReg) LocString() string {
+       switch l.cls {
+       case regClassFixed:
+               return fixedRegs[l.reg]
+       case regClassGP:
+               return gpRegs[l.reg]
+       case regClassZ:
+               return fmt.Sprintf("Z%d", l.reg)
+       case regClassK:
+               return fmt.Sprintf("K%d", l.reg)
+       }
+       panic("bad register class")
+}
+
+// Deref treats the register as an address and returns the unnamed memory
+// location at offset off from it. It never fails.
+func (l locReg) Deref(off int) (loc, error) {
+       return locMem{l, off, ""}, nil
+}
+
+// Reg returns the register itself; a locReg always occupies a register.
+func (l locReg) Reg() (locReg, bool) {
+       return l, true
+}
+
+// locMem is a memory location: offset off from register base, optionally
+// qualified by a symbol name (see LocString for how the three are rendered).
+type locMem struct {
+       base locReg
+       off  int
+       name string
+}
+
+func (l locMem) LocString() string {
+       if l.base.cls == regClassFixed && l.base.reg == regSB && l.off == 0 {
+               return l.name + "(SB)"
+       }
+       if l.name != "" {
+               return fmt.Sprintf("%s+%d(%s)", l.name, l.off, l.base.LocString())
+       }
+       if l.off != 0 {
+               return fmt.Sprintf("%d(%s)", l.off, l.base.LocString())
+       }
+       return "(" + l.base.LocString() + ")"
+}
+
+// Deref always fails: a memory location cannot be dereferenced a second time.
+// The off argument is ignored.
+func (l locMem) Deref(off int) (loc, error) {
+       return nil, fmt.Errorf("cannot dereference already memory address %s", l.LocString())
+}
+
+func (l locMem) Reg() (locReg, bool) {
+       if l.base.cls == regClassFixed {
+               return locReg{}, false
+       }
+       return l.base, true
+}
+
+// loc is where an op's value lives. In this file it is implemented by locReg
+// (a register) and locMem (a memory operand).
+type loc interface {
+       LocString() string          // Return the assembly syntax for this location
+       Deref(off int) (loc, error) // Treat this location as an address and return a location with the contents of memory at that address
+       Reg() (locReg, bool)        // Register used by this location
+}
+
+// opRMW lists the read-modify-write ops. The value is the index into op.args
+// of the argument that the op overwrites; assignLocs gives the op the same
+// location as that argument and rejects any later use of it.
+var opRMW = map[string]int{
+       "VPERMI2B":          2, // Overwrites third argument
+       "VPERMI2B.Z":        3, // Overwrites fourth argument
+       "VPERMI2B.mask":     3, // Overwrites fourth argument
+       "VPERMT2B":          1, // Overwrites second argument TODO: Check this. Unused for now.
+       "VPBROADCASTQ.mask": 2, // Overwrites last argument
+}
+
+// TODO: Should we have a general rule that all ".mask" instructions overwrite
+// their last argument?
+
+// Register numbers within regClassFixed; they index fixedRegs.
+const (
+       regSB = iota
+       regFP
+)
+
+// fixedRegs and gpRegs give the assembly names of the registers in
+// regClassFixed and regClassGP, indexed by register number. The length of
+// gpRegs bounds how many GP registers assignLocs may hand out.
+var fixedRegs = []string{regSB: "SB", regFP: "FP"}
+var gpRegs = []string{"AX", "BX", "CX", "DI", "SI", "R8", "R9", "R10", "R11"} // ABI argument order
+
+// regSet records which registers are occupied: bit n of inUse[cls] is set
+// when register n of class cls is in use.
+type regSet struct {
+       inUse [numRegClasses]uint32
+}
+
+// used marks the register behind l as occupied at op o. A nil location, or
+// one that reports no register to track, is ignored. Marking a register that
+// is already occupied is reported through fatalf.
+func (s *regSet) used(o *op, l loc) {
+       var (
+               reg locReg
+               ok  bool
+       )
+       if l != nil {
+               reg, ok = l.Reg()
+       }
+       if !ok {
+               return
+       }
+       if traceRegAlloc {
+               log.Printf("  alloc %s @ v%02d", reg.LocString(), o.id)
+       }
+
+       bit := uint32(1) << reg.reg
+       if s.inUse[reg.cls]&bit != 0 {
+               fatalf("register %s already used", reg.LocString())
+       }
+       s.inUse[reg.cls] |= bit
+}
+
+// free releases the register behind l. A nil location, or one that reports
+// no register to track, is ignored. Releasing a register that is not
+// currently occupied is reported through fatalf.
+func (s *regSet) free(l loc) {
+       var (
+               reg locReg
+               ok  bool
+       )
+       if l != nil {
+               reg, ok = l.Reg()
+       }
+       if !ok {
+               return
+       }
+       if traceRegAlloc {
+               log.Printf("  free %s", reg.LocString())
+       }
+
+       bit := uint32(1) << reg.reg
+       if s.inUse[reg.cls]&bit == 0 {
+               fatalf("register %s is not in use", reg.LocString())
+       }
+       s.inUse[reg.cls] &^= bit
+}
+
+// assignLocs chooses a location for every op in fn.ops and returns the
+// resulting map. Ops that have no location ("return", "imm") map to nil.
+//
+// The work is done in phases:
+//
+//  1. Canonicalize: read-modify-write ops (opRMW) and deref ops are mapped to
+//     the earlier op whose live range they share.
+//  2. Compute the last use of every canonical op.
+//  3. Assign fixed locations: "arg" ops take the register stored in op.c,
+//     "const" ops become SB-relative symbols, and the arguments of "return"
+//     are pinned to result registers.
+//  4. Walk the remaining ops in order and give each canonical op the
+//     lowest-numbered register of its class that is free over its live range.
+//
+// Inconsistencies (use of an overwritten input, double assignment, unknown
+// register class) are reported through fatalf; running out of registers
+// panics.
+func (fn *Func) assignLocs() map[*op]loc {
+       // Remove static indicator on name, if any. We'll add it back.
+       nameBase := strings.TrimSuffix(fn.name, "<>")
+
+       // Create map from op -> fn.ops index
+       opIndexes := make(map[*op]int, len(fn.ops))
+       for i, o := range fn.ops {
+               opIndexes[o] = i
+       }
+
+       // Read-modify-write operations share a location with one of their inputs.
+       // Likewise, deref ops extend the lifetime of their input (but in a shared
+       // way, unlike RMW ops).
+       //
+       // Compute a map from each op to the earliest "canonical" op whose live
+       // range we'll use.
+       canon := make(map[*op]*op)
+       overwritten := make(map[*op]bool)
+       for _, o := range fn.ops {
+               // Check that this op doesn't use any overwritten inputs.
+               for _, arg := range o.args {
+                       if overwritten[arg] {
+                               // TODO: The solution to this is to insert copy ops.
+                               fatalf("op %+v uses overwritten input %+v", o, arg)
+                       }
+               }
+
+               // Record canonical op.
+               rmw, ok := opRMW[o.op]
+               if ok {
+                       canon[o] = canon[o.args[rmw]]
+                       // Record that the input is dead now and must not be referenced.
+                       overwritten[o.args[rmw]] = true
+               } else if o.op == "deref" {
+                       canon[o] = canon[o.args[0]]
+               } else {
+                       canon[o] = o
+               }
+       }
+
+       // Compute live ranges of each canonical op.
+       //
+       // First, find the last use of each op.
+       lastUses := make(map[*op]*op) // Canonical creation op -> last use op
+       for _, op := range fn.ops {
+               for _, arg := range op.args {
+                       lastUses[canon[arg]] = op
+               }
+       }
+       // Invert the last uses map to get a map from op to the (canonical) values
+       // that die at that op.
+       lastUseMap := make(map[*op][]*op) // op of last use -> (canonical) creation ops
+       for def, lastUse := range lastUses {
+               lastUseMap[lastUse] = append(lastUseMap[lastUse], def)
+       }
+
+       // Prepare for assignments
+       regUsed := make([]regSet, len(fn.ops)) // In-use registers at each op
+       for i := range regUsed {
+               // X15/Y15/Z15 is reserved by the Go ABI
+               regUsed[i].inUse[regClassZ] |= 1 << 15
+               // K0 is contextual (if used as an opmask, it means no mask). Too
+               // complicated, so just ignore it.
+               regUsed[i].inUse[regClassK] |= 1 << 0
+       }
+       locs := make(map[*op]loc)
+       // assign records l as the location of o. For a canonical op it also marks
+       // l's register as in use at every op index in the half-open range
+       // [definition, last use), so the register is available again to the op at
+       // the last-use index. An op with no recorded last use gets an empty range.
+       assign := func(o *op, l loc) {
+               if have, ok := locs[o]; ok {
+                       fatalf("op %+v already assigned location %v (new %v)", o, have, l)
+                       return
+               }
+               if o == canon[o] {
+                       // Mark this location used over o's live range
+                       for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ {
+                               regUsed[i].used(fn.ops[i], l)
+                       }
+               }
+               locs[o] = l
+       }
+
+       // Assign fixed locations
+       id := 0 // Counter used to generate names for unnamed constants
+       for _, o := range fn.ops {
+               switch o.op {
+               case "arg":
+                       if traceRegAlloc {
+                               log.Printf("fixed op %+v", o)
+                       }
+                       assign(o, o.c.(locReg))
+               case "const":
+                       if traceRegAlloc {
+                               log.Printf("fixed op %+v", o)
+                       }
+                       // An empty name gets a generated static symbol; a leading '*'
+                       // is replaced by the function's base name.
+                       name := o.name
+                       if name == "" {
+                               name = fmt.Sprintf("%s_%d<>", nameBase, id)
+                               id++
+                       } else if name[0] == '*' {
+                               name = nameBase + name[1:]
+                       }
+                       assign(o, locMem{locReg{cls: regClassFixed, reg: regSB}, 0, name})
+               case "return":
+                       if traceRegAlloc {
+                               log.Printf("fixed op %+v", o)
+                       }
+                       assign(o, nil) // no location
+                       // TODO: argZ should start at 0.
+                       argGP, argZ := 0, 1
+                       for _, arg := range o.args {
+                               switch arg.kind.reg {
+                               default:
+                                       fatalf("bad register class for return value")
+                               case regClassGP:
+                                       assign(canon[arg], locReg{regClassGP, argGP})
+                                       argGP++
+                               case regClassZ:
+                                       assign(canon[arg], locReg{regClassZ, argZ})
+                                       argZ++
+                               }
+                       }
+               case "imm":
+                       assign(o, nil) // no location
+               }
+       }
+
+       // Assign locations.
+       for _, o := range fn.ops {
+               if traceRegAlloc {
+                       log.Printf("assign %+v", o)
+               }
+
+               if _, ok := locs[o]; ok {
+                       // Already assigned a fixed location above.
+                       continue
+               }
+
+               if o.op == "deref" {
+                       loc, err := locs[o.args[0]].Deref(o.c.(int))
+                       if err != nil {
+                               fatalf("%v", err)
+                       }
+                       // We don't "assign" here because we've already processed the
+                       // canonical op, which marked loc's register as in-use for the whole
+                       // live range.
+                       locs[o] = loc
+                       continue
+               }
+
+               if canon[o] != o {
+                       // Copy the canonical op's location.
+                       locs[o] = locs[canon[o]]
+                       continue
+               }
+               // Below here we know that o is already a canonical op.
+
+               if _, ok := opRMW[o.op]; ok {
+                       fatalf("read-modify-write op not canonicalized")
+               }
+
+               // Find a free register of the right class.
+               cls := o.kind.reg
+               var used uint32
+               for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ {
+                       used |= regUsed[i].inUse[cls]
+               }
+
+               // Assign result location.
+               // The lowest clear bit of used is the lowest-numbered free register.
+               num := bits.TrailingZeros32(^used)
+               switch cls {
+               default:
+                       fatalf("unknown reg class %v", cls)
+               case regClassGP:
+                       if num >= len(gpRegs) {
+                               panic("out of GP regs")
+                       }
+               case regClassZ:
+                       if num >= 32 {
+                               panic("out of Z regs")
+                       }
+               case regClassK:
+                       if num >= 8 {
+                               panic("out of K regs")
+                       }
+               }
+               loc := locReg{cls, num}
+               assign(o, loc)
+       }
+
+       return locs
+}
diff --git a/src/internal/runtime/gc/internal/gen/simd.go b/src/internal/runtime/gc/internal/gen/simd.go
new file mode 100644 (file)
index 0000000..0360aa4
--- /dev/null
@@ -0,0 +1,246 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gen
+
+// Uint8x64 is a generator value whose kind (kindUint8x64) places it in the Z
+// register class. Its constants are [64]uint8 (see ConstUint8x64).
+type Uint8x64 struct {
+       valAny
+}
+
+// kindUint8x64 is the single kind shared by all Uint8x64 values.
+var kindUint8x64 = &kind{typ: "Uint8x64", reg: regClassZ}
+
+// ConstUint8x64 creates a Uint8x64 backed by a "const" op that carries the
+// 64 bytes c and the symbol name name.
+func ConstUint8x64(c [64]uint8, name string) (y Uint8x64) {
+       y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
+       return y
+}
+
+// kind reports the kind of a Uint8x64, which is always kindUint8x64.
+func (Uint8x64) kind() *kind {
+       return kindUint8x64
+}
+
+// wrap returns a new Uint8x64 initialized with op x. The receiver is not
+// used; the method only exists so that generic code can construct a Uint8x64.
+func (Uint8x64) wrap(x *op) Uint8x64 {
+       var y Uint8x64
+       y.initOp(x)
+       return y
+}
+
+// ToUint64x8 reinterprets x as a Uint64x8. No new op is created: the result
+// shares x's op.
+func (x Uint8x64) ToUint64x8() (z Uint64x8) {
+       z.op = x.op
+       return z
+}
+
+func (x Uint8x64) Shuffle(shuf Uint8x64) (y Uint8x64) {
+       if shuf.op.op == "const" {
+               // TODO: There are often patterns we can take advantage of here. Sometimes
+               // we can do a broadcast. Sometimes we can at least do a quadword
+               // permutation instead of a full byte permutation.
+
+               // Range check the shuffle
+               for i, inp := range shuf.op.c.([64]uint8) {
+                       // 0xff is a special "don't care" value
+                       if !(inp == 0xff || inp < 64) {
+                               fatalf("shuffle[%d] = %d out of range [0, %d) or 0xff", i, inp, 64)
+                       }
+               }
+       }
+
+       args := []*op{x.op, shuf.op}
+       y.initOp(&op{op: "VPERMB", kind: y.kind(), args: args})
+       return y
+}
+
+// ShuffleZeroed is the masked form of Shuffle: it returns the value of a
+// VPERMB.Z op with arguments x, shuf and mask. Unlike Shuffle, it performs no
+// range check on shuf.
+// NOTE(review): ".Z" presumably selects zeroing of unselected lanes; confirm
+// against the code that emits the instruction.
+func (x Uint8x64) ShuffleZeroed(shuf Uint8x64, mask Mask64) (y Uint8x64) {
+       args := []*op{x.op, shuf.op, mask.op}
+       y.initOp(&op{op: "VPERMB.Z", kind: y.kind(), args: args})
+       return y
+}
+
+// ShuffleMasked is the masked form of Shuffle: it returns the value of a
+// VPERMB.mask op with arguments x, shuf and mask. Unlike Shuffle, it performs
+// no range check on shuf.
+// NOTE(review): VPERMB.mask is not listed in opRMW, so no existing value is
+// tied to the destination; confirm what unselected lanes are expected to hold.
+func (x Uint8x64) ShuffleMasked(shuf Uint8x64, mask Mask64) (y Uint8x64) {
+       args := []*op{x.op, shuf.op, mask.op}
+       y.initOp(&op{op: "VPERMB.mask", kind: y.kind(), args: args})
+       return y
+}
+
+// TODO: The two-argument shuffle is a little weird. You almost want the
+// receiver to be the shuffle and the two arguments to be the two inputs, but
+// that's almost certainly *not* what you want for the single input shuffle.
+
+// Shuffle2 returns the value of a VPERMI2B op over the two inputs x and y,
+// controlled by shuf. The op's arguments are recorded as y, x, shuf.
+//
+// VPERMI2B is a read-modify-write op (see opRMW): the result takes over the
+// location of shuf, so shuf must not be used again afterwards.
+func (x Uint8x64) Shuffle2(y Uint8x64, shuf Uint8x64) (z Uint8x64) {
+       // Confusingly, the inputs are in the opposite order from what you'd expect.
+       args := []*op{y.op, x.op, shuf.op}
+       z.initOp(&op{op: "VPERMI2B", kind: z.kind(), args: args})
+       return z
+}
+
+// Shuffle2Zeroed is the masked form of Shuffle2 using the VPERMI2B.Z op. The
+// op's arguments are recorded as y, x, mask, shuf.
+//
+// VPERMI2B.Z is a read-modify-write op (see opRMW): the result takes over the
+// location of shuf, so shuf must not be used again afterwards.
+func (x Uint8x64) Shuffle2Zeroed(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) {
+       // Confusingly, the inputs are in the opposite order from what you'd expect.
+       args := []*op{y.op, x.op, mask.op, shuf.op}
+       z.initOp(&op{op: "VPERMI2B.Z", kind: z.kind(), args: args})
+       return z
+}
+
+// Shuffle2Masked is the masked form of Shuffle2 using the VPERMI2B.mask op.
+// The op's arguments are recorded as y, x, mask, shuf.
+//
+// VPERMI2B.mask is a read-modify-write op (see opRMW): the result takes over
+// the location of shuf, so shuf must not be used again afterwards.
+func (x Uint8x64) Shuffle2Masked(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) {
+       // Confusingly, the inputs are in the opposite order from what you'd expect.
+       args := []*op{y.op, x.op, mask.op, shuf.op}
+       z.initOp(&op{op: "VPERMI2B.mask", kind: z.kind(), args: args})
+       return z
+}
+
+// Uint64x8 is a generator value whose kind (kindUint64x8) places it in the Z
+// register class. Its constants are [8]uint64 (see ConstUint64x8).
+type Uint64x8 struct {
+       valAny
+}
+
+// kindUint64x8 is the single kind shared by all Uint64x8 values.
+var kindUint64x8 = &kind{typ: "Uint64x8", reg: regClassZ}
+
+// ConstUint64x8 creates a Uint64x8 backed by a "const" op that carries the
+// eight words c and the symbol name name.
+func ConstUint64x8(c [8]uint64, name string) (y Uint64x8) {
+       // TODO: Sometimes these can be optimized into broadcast loads.
+       y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name})
+       return y
+}
+
+// BroadcastUint64x8Zeroed returns the value of a VPBROADCASTQ.Z op with
+// arguments src and mask.
+// NOTE(review): ".Z" presumably zeroes the lanes not selected by mask;
+// confirm against the code that emits the instruction.
+func BroadcastUint64x8Zeroed(src Uint64, mask Mask8) (z Uint64x8) {
+       z.initOp(&op{op: "VPBROADCASTQ.Z", kind: z.kind(), args: []*op{src.op, mask.op}})
+       return z
+}
+
+// BroadcastMasked returns the value of a VPBROADCASTQ.mask op with arguments
+// src, mask and x.
+//
+// VPBROADCASTQ.mask is a read-modify-write op (see opRMW): the result takes
+// over the location of x, so x must not be used again afterwards.
+func (x Uint64x8) BroadcastMasked(src Uint64, mask Mask8) (z Uint64x8) {
+       z.initOp(&op{op: "VPBROADCASTQ.mask", kind: z.kind(), args: []*op{src.op, mask.op, x.op}})
+       return z
+}
+
+// kind reports the kind of a Uint64x8, which is always kindUint64x8.
+func (Uint64x8) kind() *kind {
+       return kindUint64x8
+}
+
+// wrap returns a new Uint64x8 initialized with op x. The receiver is not
+// used; the method only exists so that generic code can construct a Uint64x8.
+func (Uint64x8) wrap(x *op) Uint64x8 {
+       var y Uint64x8
+       y.initOp(x)
+       return y
+}
+
+// Or returns the value of a VPORQ op over x and y. The op's arguments are
+// recorded as y, x.
+func (x Uint64x8) Or(y Uint64x8) (z Uint64x8) {
+       z.initOp(&op{op: "VPORQ", kind: z.kind(), args: []*op{y.op, x.op}})
+       return z
+}
+
+// Sub returns the value of a VPSUBQ op over x and y. The op's arguments are
+// recorded as y, x because the instruction takes them in reverse order.
+func (x Uint64x8) Sub(y Uint64x8) (z Uint64x8) {
+       // Arguments are backwards
+       z.initOp(&op{op: "VPSUBQ", kind: z.kind(), args: []*op{y.op, x.op}})
+       return z
+}
+
+// ToUint8x64 reinterprets x as a Uint8x64. No new op is created: the result
+// shares x's op, including any [8]uint64 constant payload it carries.
+func (x Uint64x8) ToUint8x64() (z Uint8x64) {
+       z.op = x.op
+       return z
+}
+
+// GF2P8Affine returns the Uint8x64 value of a VGF2P8AFFINEQB op, with x as
+// the matrix operand and y as the vector operand.
+func (x Uint64x8) GF2P8Affine(y Uint8x64) (z Uint8x64) {
+       // matrix, vector
+       z.initOp(&op{op: "VGF2P8AFFINEQB", kind: z.kind(), args: []*op{x.op, y.op}})
+       return z
+}
+
+// ShuffleBits returns the Mask64 value of a VPSHUFBITQMB op. The op's
+// arguments are recorded as y, x.
+func (x Uint64x8) ShuffleBits(y Uint8x64) (z Mask64) {
+       z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op}})
+       return z
+}
+
+// ShuffleBitsMasked is ShuffleBits with an additional mask argument. The op
+// is still named VPSHUFBITQMB; its arguments are recorded as y, x, mask.
+func (x Uint64x8) ShuffleBitsMasked(y Uint8x64, mask Mask64) (z Mask64) {
+       // This is always zeroing if the mask is provided.
+       z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op, mask.op}})
+       return z
+}
+
+// Mask8 is a generator value whose kind (kindMask8) places it in the K
+// register class.
+type Mask8 struct {
+       valAny
+}
+
+// kindMask8 is the single kind shared by all Mask8 values.
+var kindMask8 = &kind{typ: "Mask8", reg: regClassK}
+
+// ConstMask8 builds a Mask8 holding the constant c. The constant is first
+// materialized in a Uint64 by a MOVQ of the immediate and then moved into
+// the mask with KMOVB.
+func ConstMask8(c uint8) (y Mask8) {
+       var gp Uint64
+       load := &op{op: "MOVQ", kind: gp.kind(), args: []*op{imm(c)}}
+       gp.initOp(load)
+
+       move := &op{op: "KMOVB", kind: y.kind(), args: []*op{gp.op}}
+       y.initOp(move)
+       return y
+}
+
+// kind reports the kind of a Mask8, which is always kindMask8.
+func (Mask8) kind() *kind {
+       return kindMask8
+}
+
+// wrap returns a new Mask8 initialized with op x. The receiver is not used;
+// the method only exists so that generic code can construct a Mask8.
+func (Mask8) wrap(x *op) Mask8 {
+       var y Mask8
+       y.initOp(x)
+       return y
+}
+
+// ToUint8 returns the value of a KMOVB op that moves the mask x into a
+// general-purpose value. Note that the result type is Uint64.
+func (x Mask8) ToUint8() (z Uint64) {
+       z.initOp(&op{op: "KMOVB", kind: z.kind(), args: []*op{x.op}})
+       return z
+}
+
+// Or returns the value of a KORQ op over x and y. The op's arguments are
+// recorded as y, x.
+// NOTE(review): this uses the 64-bit KORQ even though the other Mask8 ops use
+// byte-sized instructions (KMOVB, KSHIFTLB); presumably intentional, confirm.
+func (x Mask8) Or(y Mask8) (z Mask8) {
+       z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}})
+       return z
+}
+
+func (x Mask8) ShiftLeft(c uint8) (z Mask8) {
+       if c == 0 {
+               z = x
+       } else {
+               z.initOp(&op{op: "KSHIFTLB", kind: z.kind(), args: []*op{imm(c), x.op}})
+       }
+       return z
+}
+
+// Mask64 is a generator value whose kind (kindMask64) places it in the K
+// register class.
+type Mask64 struct {
+       valAny
+}
+
+// kindMask64 is the single kind shared by all Mask64 values.
+var kindMask64 = &kind{typ: "Mask64", reg: regClassK}
+
+// ConstMask64 builds a Mask64 holding the constant c. The constant is first
+// materialized in a Uint64 by a MOVQ of the immediate and then moved into
+// the mask with KMOVQ.
+func ConstMask64(c uint64) (y Mask64) {
+       var gp Uint64
+       load := &op{op: "MOVQ", kind: gp.kind(), args: []*op{imm(c)}}
+       gp.initOp(load)
+
+       move := &op{op: "KMOVQ", kind: y.kind(), args: []*op{gp.op}}
+       y.initOp(move)
+       return y
+}
+
+// kind reports the kind of a Mask64, which is always kindMask64.
+func (Mask64) kind() *kind {
+       return kindMask64
+}
+
+// wrap returns a new Mask64 initialized with op x. The receiver is not used;
+// the method only exists so that generic code can construct a Mask64.
+func (Mask64) wrap(x *op) Mask64 {
+       var y Mask64
+       y.initOp(x)
+       return y
+}
+
+// ToUint64 returns the value of a KMOVQ op that moves the mask x into a
+// Uint64.
+func (x Mask64) ToUint64() (z Uint64) {
+       z.initOp(&op{op: "KMOVQ", kind: z.kind(), args: []*op{x.op}})
+       return z
+}
+
+// Or returns the value of a KORQ op over x and y. The op's arguments are
+// recorded as y, x.
+func (x Mask64) Or(y Mask64) (z Mask64) {
+       z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}})
+       return z
+}
+
+func (x Mask64) ShiftLeft(c uint8) (z Mask64) {
+       if c == 0 {
+               z = x
+       } else {
+               z.initOp(&op{op: "KSHIFTLQ", kind: z.kind(), args: []*op{imm(c), x.op}})
+       }
+       return z
+}
+
+func (x Mask64) ShiftRight(c uint8) (z Mask64) {
+       if c == 0 {
+               z = x
+       } else {
+               z.initOp(&op{op: "KSHIFTRQ", kind: z.kind(), args: []*op{imm(c), x.op}})
+       }
+       return z
+}
diff --git a/src/internal/runtime/gc/internal/gen/val.go b/src/internal/runtime/gc/internal/gen/val.go
new file mode 100644 (file)
index 0000000..24a843a
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gen
+
+import "sync"
+
+// Value is implemented by every typed value handle in this package. It
+// exposes the value's kind and the op that defines it.
+type Value interface {
+       kind() *kind
+       getOp() *op
+}
+
+// Word is a Value that also carries the isWord marker method. In this
+// package the marker is provided by valGP, so the types embedding valGP
+// (such as Ptr) are Words.
+type Word interface {
+       Value
+       isWord()
+}
+
+// wrap is an unfortunate necessity so that we can pass Value types around as
+// values (not pointers), but still have generic functions that can construct a
+// new Value. Ideally we would just have a method on Value to initialize its op,
+// but that needs to have a non-pointer receiver to satisfy the interface and
+// then it can't mutate the Value.
+//
+// Implementations ignore their receiver and return a fresh T initialized
+// with x.
+type wrap[T Value] interface {
+       Value
+       wrap(x *op) T
+}
+
+// kind describes a value type: typ is its name (for example "Uint64" or
+// "Ptr[Uint8x64]") and reg is the register class its values are allocated
+// from.
+type kind struct {
+       typ string
+       reg regClass
+}
+
+// void is a Value type with no register class. In this file it is only used
+// as a type argument (see the Ptr[void] assertion on Word).
+type void struct {
+       valAny
+}
+
+// voidKind is the single kind shared by all void values.
+var voidKind = &kind{typ: "void", reg: regClassNone}
+
+// kind reports the kind of a void, which is always voidKind.
+func (void) kind() *kind { return voidKind }
+
+// Ptr is a pointer to a value of type T. It embeds valGP, so it is allocated
+// from the general-purpose register class.
+type Ptr[T Value] struct {
+       valGP
+}
+
+// Ptr is a Word
+var _ Word = Ptr[void]{}
+
+// ptrKinds caches the kind of Ptr[T], keyed by the kind of T, so that
+// repeated calls to Ptr[T].kind can return the same *kind.
+var ptrKinds = sync.Map{} // *kind -> *kind
+
+// kind returns the kind of Ptr[T]. The kind is created on first use and
+// cached in ptrKinds under the kind of T; if two callers race to create it,
+// both get the one that was stored first.
+func (Ptr[T]) kind() *kind {
+       var elem T
+       elemKind := elem.kind()
+       if cached, ok := ptrKinds.Load(elemKind); ok {
+               return cached.(*kind)
+       }
+
+       fresh := &kind{typ: "Ptr[" + elemKind.typ + "]", reg: regClassGP}
+       stored, _ := ptrKinds.LoadOrStore(elemKind, fresh)
+       return stored.(*kind)
+}
+
+// wrap returns a new Ptr[T] initialized with op x. The receiver is not used;
+// the method only exists so that generic code can construct a Ptr[T].
+func (Ptr[T]) wrap(x *op) Ptr[T] {
+       var y Ptr[T]
+       y.initOp(x)
+       return y
+}
+
+func (x Ptr[T]) AddConst(off int) (y Ptr[T]) {
+       base := x.op
+       for base.op == "addConst" {
+               off += base.args[1].c.(int)
+               base = base.args[0]
+       }
+       y.initOp(&op{op: "addConst", kind: y.kind(), args: []*op{base, imm(off)}})
+       return y
+}
+
+// Deref returns the value that ptr points to, as a "deref" op. Any chain of
+// "addConst" ops behind ptr is folded away: the deref refers directly to the
+// innermost base pointer and stores the accumulated offset in the op's c
+// field.
+func Deref[W wrap[T], T Value](ptr Ptr[W]) T {
+       total := 0
+       root := ptr.op
+       for ; root.op == "addConst"; root = root.args[0] {
+               total += root.args[1].c.(int)
+       }
+
+       var elem W
+       load := &op{op: "deref", kind: elem.kind(), args: []*op{root}, c: total}
+       return elem.wrap(load)
+}
+
+// Array is a value holding a sequence of T. Its kind has no register class
+// (see Array.kind).
+type Array[T Value] struct {
+       valAny
+}
+
+// ConstArray creates an Array[T] backed by a "const" op that carries the
+// slice vals and the symbol name name. See the TODO below: emitting such a
+// constant is not known to work.
+func ConstArray[T Value](vals []T, name string) (y Array[T]) {
+       // TODO: This probably doesn't actually work because emitConst won't
+       // understand vals.
+       y.initOp(&op{op: "const", kind: y.kind(), c: vals, name: name})
+       return y
+}
+
+// arrayKinds caches the kind of Array[T], keyed by the kind of T.
+var arrayKinds sync.Map // *kind -> *kind
+
+// kind returns the kind of Array[T]. The kind is created on first use and
+// cached in arrayKinds, so every call for the same element kind returns the
+// same *kind. This matters for Ptr[Array[T]]: ptrKinds is keyed by the
+// element's *kind, and a fresh pointer per call would never hit that cache
+// and would grow it without bound.
+func (Array[T]) kind() *kind {
+       var x T
+       xk := x.kind()
+       if ak, ok := arrayKinds.Load(xk); ok {
+               return ak.(*kind)
+       }
+       k := &kind{typ: "Array[" + xk.typ + "]", reg: regClassNone}
+       ak, _ := arrayKinds.LoadOrStore(xk, k)
+       return ak.(*kind)
+}
+
+// valGP is the base for value types that are Words. It adds the isWord
+// marker method to valAny.
+type valGP struct {
+       valAny
+}
+
+// isWord marks valGP, and every type embedding it, as a Word.
+func (valGP) isWord() {}
+
+// valAny is the base of every value type: it holds the op that defines the
+// value. The op is nil until initOp is called.
+type valAny struct {
+       *op
+}
+
+func (v *valAny) initOp(x *op) {
+       if v.op != nil {
+               panic("double init of val")
+       }
+       if x.kind == nil {
+               panic("val missing kind")
+       }
+       v.op = x
+
+       // Figure out this value's function.
+       for _, arg := range x.args {
+               if fn := arg.fn; fn != nil {
+                       fn.attach(x)
+                       break
+               }
+       }
+}
+
+// getOp returns the op that defines v, or nil if v has not been initialized.
+func (v valAny) getOp() *op {
+       return v.op
+}
index bb54fff6869f9c6a4b1e7d4b687b2bea79824949..7c36a6bfbe942aaac1bbcba19223f330ce78fa79 100644 (file)
@@ -7,7 +7,8 @@ package gc
 import "internal/goarch"
 
 const (
-       ptrBits = 8 * goarch.PtrSize
+       // PageWords is the number of pointer-words per page.
+       PageWords = PageSize / goarch.PtrSize
 
        // A malloc header is functionally a single type pointer, but
        // we need to use 8 here to ensure 8-byte alignment of allocations
@@ -43,7 +44,7 @@ const (
        // would not be invariant to size-class rounding. Eschewing this property means a
        // more complex check or possibly storing additional state to determine whether a
        // span has malloc headers.
-       MinSizeForMallocHeader = goarch.PtrSize * ptrBits
+       MinSizeForMallocHeader = goarch.PtrSize * goarch.PtrBits
 
        // PageSize is the increment in which spans are managed.
        PageSize = 1 << PageShift
index ea48cda469c5203b3577c1eb879a54b5e41a64b1..e7b848af023a08ad84497da460edf06f08ec4e78 100644 (file)
@@ -52,7 +52,7 @@ func main() {
        fmt.Fprintln(&b, "// Code generated by mksizeclasses.go; DO NOT EDIT.")
        fmt.Fprintln(&b, "//go:generate go run mksizeclasses.go")
        fmt.Fprintln(&b)
-       fmt.Fprintln(&b, "package runtime")
+       fmt.Fprintln(&b, "package gc")
        classes := makeClasses()
 
        printComment(&b, classes)
@@ -287,6 +287,14 @@ func maxObjsPerSpan(classes []class) int {
        return most
 }
 
+func maxNPages(classes []class) int {
+       most := 0
+       for _, c := range classes[1:] {
+               most = max(most, c.npages)
+       }
+       return most
+}
+
 func printClasses(w io.Writer, classes []class) {
        fmt.Fprintln(w, "const (")
        fmt.Fprintf(w, "MinHeapAlign = %d\n", minHeapAlign)
@@ -297,6 +305,7 @@ func printClasses(w io.Writer, classes []class) {
        fmt.Fprintf(w, "NumSizeClasses = %d\n", len(classes))
        fmt.Fprintf(w, "PageShift = %d\n", pageShift)
        fmt.Fprintf(w, "MaxObjsPerSpan = %d\n", maxObjsPerSpan(classes))
+       fmt.Fprintf(w, "MaxSizeClassNPages = %d\n", maxNPages(classes))
        fmt.Fprintln(w, ")")
 
        fmt.Fprint(w, "var SizeClassToSize = [NumSizeClasses]uint16 {")
diff --git a/src/internal/runtime/gc/scan/expand_amd64.go b/src/internal/runtime/gc/scan/expand_amd64.go
new file mode 100644 (file)
index 0000000..9bea471
--- /dev/null
@@ -0,0 +1,22 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import "internal/runtime/gc"
+
+// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
+// where f is the size, in words, of objects in sizeClass.
+//
+// This is a testing entrypoint to the expanders used by scanSpanPacked*.
+//
+// The function has no Go body; its implementation is provided outside this
+// file.
+//
+//go:noescape
+func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
+
+// gcExpandersAVX512 holds the PCs of the expander functions, with one slot
+// per size class. The functions cannot be called directly as they don't
+// follow the Go ABI, but a slot can be compared against 0 to check whether
+// an expander exists for that size class.
+//
+// It is defined in assembly.
+var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
diff --git a/src/internal/runtime/gc/scan/expand_amd64.s b/src/internal/runtime/gc/scan/expand_amd64.s
new file mode 100644 (file)
index 0000000..6b0be44
--- /dev/null
@@ -0,0 +1,2631 @@
+// Code generated by mkasm.go. DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// gcExpandersAVX512 is the table of expander entry points declared on the Go
+// side as a [len(gc.SizeClassToSize)]uintptr: 0x220 bytes, i.e. 68 slots of
+// 8 bytes each. A slot holding 0 has no expander.
+// NOTE(review): the numeric suffix of each routine is presumably the object
+// size in words for that slot's size class -- confirm against mkasm.go.
+GLOBL ·gcExpandersAVX512(SB), RODATA, $0x220
+DATA  ·gcExpandersAVX512+0x00(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x08(SB)/8, $expandAVX512_1<>(SB)
+DATA  ·gcExpandersAVX512+0x10(SB)/8, $expandAVX512_2<>(SB)
+DATA  ·gcExpandersAVX512+0x18(SB)/8, $expandAVX512_3<>(SB)
+DATA  ·gcExpandersAVX512+0x20(SB)/8, $expandAVX512_4<>(SB)
+DATA  ·gcExpandersAVX512+0x28(SB)/8, $expandAVX512_6<>(SB)
+DATA  ·gcExpandersAVX512+0x30(SB)/8, $expandAVX512_8<>(SB)
+DATA  ·gcExpandersAVX512+0x38(SB)/8, $expandAVX512_10<>(SB)
+DATA  ·gcExpandersAVX512+0x40(SB)/8, $expandAVX512_12<>(SB)
+DATA  ·gcExpandersAVX512+0x48(SB)/8, $expandAVX512_14<>(SB)
+DATA  ·gcExpandersAVX512+0x50(SB)/8, $expandAVX512_16<>(SB)
+DATA  ·gcExpandersAVX512+0x58(SB)/8, $expandAVX512_18<>(SB)
+DATA  ·gcExpandersAVX512+0x60(SB)/8, $expandAVX512_20<>(SB)
+DATA  ·gcExpandersAVX512+0x68(SB)/8, $expandAVX512_22<>(SB)
+DATA  ·gcExpandersAVX512+0x70(SB)/8, $expandAVX512_24<>(SB)
+DATA  ·gcExpandersAVX512+0x78(SB)/8, $expandAVX512_26<>(SB)
+DATA  ·gcExpandersAVX512+0x80(SB)/8, $expandAVX512_28<>(SB)
+DATA  ·gcExpandersAVX512+0x88(SB)/8, $expandAVX512_30<>(SB)
+DATA  ·gcExpandersAVX512+0x90(SB)/8, $expandAVX512_32<>(SB)
+DATA  ·gcExpandersAVX512+0x98(SB)/8, $expandAVX512_36<>(SB)
+DATA  ·gcExpandersAVX512+0xa0(SB)/8, $expandAVX512_40<>(SB)
+DATA  ·gcExpandersAVX512+0xa8(SB)/8, $expandAVX512_44<>(SB)
+DATA  ·gcExpandersAVX512+0xb0(SB)/8, $expandAVX512_48<>(SB)
+DATA  ·gcExpandersAVX512+0xb8(SB)/8, $expandAVX512_52<>(SB)
+DATA  ·gcExpandersAVX512+0xc0(SB)/8, $expandAVX512_56<>(SB)
+DATA  ·gcExpandersAVX512+0xc8(SB)/8, $expandAVX512_60<>(SB)
+DATA  ·gcExpandersAVX512+0xd0(SB)/8, $expandAVX512_64<>(SB)
+DATA  ·gcExpandersAVX512+0xd8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0xe0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0xe8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0xf0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0xf8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x100(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x108(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x110(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x118(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x120(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x128(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x130(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x138(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x140(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x148(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x150(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x158(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x160(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x168(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x170(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x178(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x180(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x188(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x190(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x198(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1a0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1a8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1b0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1b8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1c0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1c8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1d0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1d8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1e0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1e8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1f0(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x1f8(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x200(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x208(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x210(SB)/8, $0
+DATA  ·gcExpandersAVX512+0x218(SB)/8, $0
+
+// expandAVX512_1 performs no bit manipulation: it loads the 128 bytes at
+// (AX) unchanged, bytes 0-63 into Z1 and bytes 64-127 into Z2.
+// NOTE(review): input pointer in AX and results in Z1/Z2 looks like the
+// private convention shared by all expanders -- confirm against the caller.
+TEXT expandAVX512_1<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 (AX), Z1
+       VMOVDQU64 64(AX), Z2
+       RET
+
+GLOBL expandAVX512_2_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_2_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512_2_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512_2_inShuf0<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_2_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_2_inShuf0<>+0x20(SB)/8, $0x1716151413121110
+DATA  expandAVX512_2_inShuf0<>+0x28(SB)/8, $0x1716151413121110
+DATA  expandAVX512_2_inShuf0<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_2_inShuf0<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918
+
+GLOBL expandAVX512_2_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_2_mat0<>+0x00(SB)/8, $0x0101020204040808
+DATA  expandAVX512_2_mat0<>+0x08(SB)/8, $0x1010202040408080
+DATA  expandAVX512_2_mat0<>+0x10(SB)/8, $0x0101020204040808
+DATA  expandAVX512_2_mat0<>+0x18(SB)/8, $0x1010202040408080
+DATA  expandAVX512_2_mat0<>+0x20(SB)/8, $0x0101020204040808
+DATA  expandAVX512_2_mat0<>+0x28(SB)/8, $0x1010202040408080
+DATA  expandAVX512_2_mat0<>+0x30(SB)/8, $0x0101020204040808
+DATA  expandAVX512_2_mat0<>+0x38(SB)/8, $0x1010202040408080
+
+GLOBL expandAVX512_2_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_2_inShuf1<>+0x00(SB)/8, $0x2726252423222120
+DATA  expandAVX512_2_inShuf1<>+0x08(SB)/8, $0x2726252423222120
+DATA  expandAVX512_2_inShuf1<>+0x10(SB)/8, $0x2f2e2d2c2b2a2928
+DATA  expandAVX512_2_inShuf1<>+0x18(SB)/8, $0x2f2e2d2c2b2a2928
+DATA  expandAVX512_2_inShuf1<>+0x20(SB)/8, $0x3736353433323130
+DATA  expandAVX512_2_inShuf1<>+0x28(SB)/8, $0x3736353433323130
+DATA  expandAVX512_2_inShuf1<>+0x30(SB)/8, $0x3f3e3d3c3b3a3938
+DATA  expandAVX512_2_inShuf1<>+0x38(SB)/8, $0x3f3e3d3c3b3a3938
+
+GLOBL expandAVX512_2_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_2_outShufLo+0x00(SB)/8, $0x0b030a0209010800
+DATA  expandAVX512_2_outShufLo+0x08(SB)/8, $0x0f070e060d050c04
+DATA  expandAVX512_2_outShufLo+0x10(SB)/8, $0x1b131a1219111810
+DATA  expandAVX512_2_outShufLo+0x18(SB)/8, $0x1f171e161d151c14
+DATA  expandAVX512_2_outShufLo+0x20(SB)/8, $0x2b232a2229212820
+DATA  expandAVX512_2_outShufLo+0x28(SB)/8, $0x2f272e262d252c24
+DATA  expandAVX512_2_outShufLo+0x30(SB)/8, $0x3b333a3239313830
+DATA  expandAVX512_2_outShufLo+0x38(SB)/8, $0x3f373e363d353c34
+
+// expandAVX512_2 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Each result is built in three steps: a byte gather (inShuf*), a per-byte
+// GF(2) affine transform with the shared matrix mat0, and a final byte
+// permutation (outShufLo). Z0, Z3 and Z4 are clobbered.
+TEXT expandAVX512_2<>(SB), NOSPLIT, $0-0
+       // Load the constant tables and the input.
+       VMOVDQU64 expandAVX512_2_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_2_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512_2_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_2_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       // First half: gather input bytes by the indices in Z0, then transform with Z1.
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       // Second half: same, using the inShuf1 indices in Z2.
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       // Reorder both halves with outShufLo. The first write to Z1 replaces the
+       // matrix, which is no longer needed.
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512_3_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_3_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512_3_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512_3_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512_3_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_3_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_3_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_3_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_3_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_3_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_3_mat0<>+0x00(SB)/8, $0x0101010202020404
+DATA  expandAVX512_3_mat0<>+0x08(SB)/8, $0x0408080810101020
+DATA  expandAVX512_3_mat0<>+0x10(SB)/8, $0x2020404040808080
+DATA  expandAVX512_3_mat0<>+0x18(SB)/8, $0x0101010202020404
+DATA  expandAVX512_3_mat0<>+0x20(SB)/8, $0x0408080810101020
+DATA  expandAVX512_3_mat0<>+0x28(SB)/8, $0x2020404040808080
+DATA  expandAVX512_3_mat0<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_3_mat0<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_3_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_3_inShuf1<>+0x00(SB)/8, $0x1716151413121110
+DATA  expandAVX512_3_inShuf1<>+0x08(SB)/8, $0x1716151413121110
+DATA  expandAVX512_3_inShuf1<>+0x10(SB)/8, $0x1716151413121110
+DATA  expandAVX512_3_inShuf1<>+0x18(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_3_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_3_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_3_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_3_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_3_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_3_inShuf2<>+0x00(SB)/8, $0x2726252423222120
+DATA  expandAVX512_3_inShuf2<>+0x08(SB)/8, $0x2726252423222120
+DATA  expandAVX512_3_inShuf2<>+0x10(SB)/8, $0x2726252423222120
+DATA  expandAVX512_3_inShuf2<>+0x18(SB)/8, $0xffffffffff2a2928
+DATA  expandAVX512_3_inShuf2<>+0x20(SB)/8, $0xffffffffff2a2928
+DATA  expandAVX512_3_inShuf2<>+0x28(SB)/8, $0xffffffffffff2928
+DATA  expandAVX512_3_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_3_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_3_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_3_outShufLo+0x00(SB)/8, $0x0a02110901100800
+DATA  expandAVX512_3_outShufLo+0x08(SB)/8, $0x05140c04130b0312
+DATA  expandAVX512_3_outShufLo+0x10(SB)/8, $0x170f07160e06150d
+DATA  expandAVX512_3_outShufLo+0x18(SB)/8, $0x221a292119282018
+DATA  expandAVX512_3_outShufLo+0x20(SB)/8, $0x1d2c241c2b231b2a
+DATA  expandAVX512_3_outShufLo+0x28(SB)/8, $0x2f271f2e261e2d25
+DATA  expandAVX512_3_outShufLo+0x30(SB)/8, $0x4a42514941504840
+DATA  expandAVX512_3_outShufLo+0x38(SB)/8, $0x45544c44534b4352
+
+GLOBL expandAVX512_3_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_3_outShufHi+0x00(SB)/8, $0x170f07160e06150d
+DATA  expandAVX512_3_outShufHi+0x08(SB)/8, $0x221a292119282018
+DATA  expandAVX512_3_outShufHi+0x10(SB)/8, $0x1d2c241c2b231b2a
+DATA  expandAVX512_3_outShufHi+0x18(SB)/8, $0x2f271f2e261e2d25
+DATA  expandAVX512_3_outShufHi+0x20(SB)/8, $0x4a42514941504840
+DATA  expandAVX512_3_outShufHi+0x28(SB)/8, $0x45544c44534b4352
+DATA  expandAVX512_3_outShufHi+0x30(SB)/8, $0x574f47564e46554d
+DATA  expandAVX512_3_outShufHi+0x38(SB)/8, $0x625a696159686058
+
+// expandAVX512_3 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Three intermediate vectors are produced (byte gather via inShuf0..2, then a
+// GF(2) affine transform with the shared matrix mat0) and are then merged by
+// two-table byte permutes driven by outShufLo and outShufHi.
+// Z0 and Z3-Z6 are clobbered.
+TEXT expandAVX512_3<>(SB), NOSPLIT, $0-0
+       // Load the constant tables; Z1/Z2 start out holding the output indices.
+       VMOVDQU64 expandAVX512_3_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_3_mat0<>(SB), Z3
+       VMOVDQU64 expandAVX512_3_inShuf1<>(SB), Z4
+       VMOVDQU64 expandAVX512_3_inShuf2<>(SB), Z5
+       VMOVDQU64 expandAVX512_3_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_3_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z6
+       // Intermediate 0 -> Z0, intermediate 1 -> Z4.
+       VPERMB Z6, Z0, Z0
+       VGF2P8AFFINEQB $0, Z3, Z0, Z0
+       VPERMB Z6, Z4, Z4
+       VGF2P8AFFINEQB $0, Z3, Z4, Z4
+       // Intermediate 2 lands in Z3, overwriting the matrix on its last use.
+       VPERMB Z6, Z5, Z5
+       VGF2P8AFFINEQB $0, Z3, Z5, Z3
+       // VPERMI2B overwrites its index register with bytes picked from the two
+       // table registers: Z1 from Z0/Z4, Z2 from Z4/Z3.
+       VPERMI2B Z4, Z0, Z1
+       VPERMI2B Z3, Z4, Z2
+       RET
+
+GLOBL expandAVX512_4_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_4_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512_4_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512_4_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512_4_inShuf0<>+0x18(SB)/8, $0x0706050403020100
+DATA  expandAVX512_4_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_4_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_4_inShuf0<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_4_inShuf0<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908
+
+GLOBL expandAVX512_4_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_4_mat0<>+0x00(SB)/8, $0x0101010102020202
+DATA  expandAVX512_4_mat0<>+0x08(SB)/8, $0x0404040408080808
+DATA  expandAVX512_4_mat0<>+0x10(SB)/8, $0x1010101020202020
+DATA  expandAVX512_4_mat0<>+0x18(SB)/8, $0x4040404080808080
+DATA  expandAVX512_4_mat0<>+0x20(SB)/8, $0x0101010102020202
+DATA  expandAVX512_4_mat0<>+0x28(SB)/8, $0x0404040408080808
+DATA  expandAVX512_4_mat0<>+0x30(SB)/8, $0x1010101020202020
+DATA  expandAVX512_4_mat0<>+0x38(SB)/8, $0x4040404080808080
+
+GLOBL expandAVX512_4_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_4_inShuf1<>+0x00(SB)/8, $0x1716151413121110
+DATA  expandAVX512_4_inShuf1<>+0x08(SB)/8, $0x1716151413121110
+DATA  expandAVX512_4_inShuf1<>+0x10(SB)/8, $0x1716151413121110
+DATA  expandAVX512_4_inShuf1<>+0x18(SB)/8, $0x1716151413121110
+DATA  expandAVX512_4_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_4_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_4_inShuf1<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_4_inShuf1<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918
+
+GLOBL expandAVX512_4_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_4_outShufLo+0x00(SB)/8, $0x1911090118100800
+DATA  expandAVX512_4_outShufLo+0x08(SB)/8, $0x1b130b031a120a02
+DATA  expandAVX512_4_outShufLo+0x10(SB)/8, $0x1d150d051c140c04
+DATA  expandAVX512_4_outShufLo+0x18(SB)/8, $0x1f170f071e160e06
+DATA  expandAVX512_4_outShufLo+0x20(SB)/8, $0x3931292138302820
+DATA  expandAVX512_4_outShufLo+0x28(SB)/8, $0x3b332b233a322a22
+DATA  expandAVX512_4_outShufLo+0x30(SB)/8, $0x3d352d253c342c24
+DATA  expandAVX512_4_outShufLo+0x38(SB)/8, $0x3f372f273e362e26
+
+// expandAVX512_4 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Each result is built in three steps: a byte gather (inShuf*), a per-byte
+// GF(2) affine transform with the shared matrix mat0, and a final byte
+// permutation (outShufLo). Z0, Z3 and Z4 are clobbered.
+TEXT expandAVX512_4<>(SB), NOSPLIT, $0-0
+       // Load the constant tables and the input.
+       VMOVDQU64 expandAVX512_4_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_4_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512_4_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_4_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       // First half: gather input bytes by the indices in Z0, then transform with Z1.
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       // Second half: same, using the inShuf1 indices in Z2.
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       // Reorder both halves with outShufLo. The first write to Z1 replaces the
+       // matrix, which is no longer needed.
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512_6_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_6_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512_6_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512_6_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512_6_inShuf0<>+0x18(SB)/8, $0x0706050403020100
+DATA  expandAVX512_6_inShuf0<>+0x20(SB)/8, $0x0706050403020100
+DATA  expandAVX512_6_inShuf0<>+0x28(SB)/8, $0x0706050403020100
+DATA  expandAVX512_6_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_6_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_6_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_6_mat0<>+0x00(SB)/8, $0x0101010101010202
+DATA  expandAVX512_6_mat0<>+0x08(SB)/8, $0x0202020204040404
+DATA  expandAVX512_6_mat0<>+0x10(SB)/8, $0x0404080808080808
+DATA  expandAVX512_6_mat0<>+0x18(SB)/8, $0x1010101010102020
+DATA  expandAVX512_6_mat0<>+0x20(SB)/8, $0x2020202040404040
+DATA  expandAVX512_6_mat0<>+0x28(SB)/8, $0x4040808080808080
+DATA  expandAVX512_6_mat0<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_6_mat0<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_6_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_6_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_6_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_6_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_6_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_6_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_6_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_6_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_6_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_6_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_6_inShuf2<>+0x00(SB)/8, $0xffff151413121110
+DATA  expandAVX512_6_inShuf2<>+0x08(SB)/8, $0xffff151413121110
+DATA  expandAVX512_6_inShuf2<>+0x10(SB)/8, $0xffffff1413121110
+DATA  expandAVX512_6_inShuf2<>+0x18(SB)/8, $0xffffff1413121110
+DATA  expandAVX512_6_inShuf2<>+0x20(SB)/8, $0xffffff1413121110
+DATA  expandAVX512_6_inShuf2<>+0x28(SB)/8, $0xffffff1413121110
+DATA  expandAVX512_6_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_6_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_6_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_6_outShufLo+0x00(SB)/8, $0x0901282018100800
+DATA  expandAVX512_6_outShufLo+0x08(SB)/8, $0x1a120a0229211911
+DATA  expandAVX512_6_outShufLo+0x10(SB)/8, $0x2b231b130b032a22
+DATA  expandAVX512_6_outShufLo+0x18(SB)/8, $0x0d052c241c140c04
+DATA  expandAVX512_6_outShufLo+0x20(SB)/8, $0x1e160e062d251d15
+DATA  expandAVX512_6_outShufLo+0x28(SB)/8, $0x2f271f170f072e26
+DATA  expandAVX512_6_outShufLo+0x30(SB)/8, $0x4941686058504840
+DATA  expandAVX512_6_outShufLo+0x38(SB)/8, $0x5a524a4269615951
+
+GLOBL expandAVX512_6_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_6_outShufHi+0x00(SB)/8, $0x2b231b130b032a22
+DATA  expandAVX512_6_outShufHi+0x08(SB)/8, $0x0d052c241c140c04
+DATA  expandAVX512_6_outShufHi+0x10(SB)/8, $0x1e160e062d251d15
+DATA  expandAVX512_6_outShufHi+0x18(SB)/8, $0x2f271f170f072e26
+DATA  expandAVX512_6_outShufHi+0x20(SB)/8, $0x4941686058504840
+DATA  expandAVX512_6_outShufHi+0x28(SB)/8, $0x5a524a4269615951
+DATA  expandAVX512_6_outShufHi+0x30(SB)/8, $0x6b635b534b436a62
+DATA  expandAVX512_6_outShufHi+0x38(SB)/8, $0x4d456c645c544c44
+
+// expandAVX512_6 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Three intermediate vectors are produced (byte gather via inShuf0..2, then a
+// GF(2) affine transform with the shared matrix mat0) and are then merged by
+// two-table byte permutes driven by outShufLo and outShufHi.
+// Z0 and Z3-Z6 are clobbered.
+TEXT expandAVX512_6<>(SB), NOSPLIT, $0-0
+       // Load the constant tables; Z1/Z2 start out holding the output indices.
+       VMOVDQU64 expandAVX512_6_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_6_mat0<>(SB), Z3
+       VMOVDQU64 expandAVX512_6_inShuf1<>(SB), Z4
+       VMOVDQU64 expandAVX512_6_inShuf2<>(SB), Z5
+       VMOVDQU64 expandAVX512_6_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_6_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z6
+       // Intermediate 0 -> Z0, intermediate 1 -> Z4.
+       VPERMB Z6, Z0, Z0
+       VGF2P8AFFINEQB $0, Z3, Z0, Z0
+       VPERMB Z6, Z4, Z4
+       VGF2P8AFFINEQB $0, Z3, Z4, Z4
+       // Intermediate 2 lands in Z3, overwriting the matrix on its last use.
+       VPERMB Z6, Z5, Z5
+       VGF2P8AFFINEQB $0, Z3, Z5, Z3
+       // VPERMI2B overwrites its index register with bytes picked from the two
+       // table registers: Z1 from Z0/Z4, Z2 from Z4/Z3.
+       VPERMI2B Z4, Z0, Z1
+       VPERMI2B Z3, Z4, Z2
+       RET
+
+GLOBL expandAVX512_8_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_8_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x18(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x20(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x28(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x30(SB)/8, $0x0706050403020100
+DATA  expandAVX512_8_inShuf0<>+0x38(SB)/8, $0x0706050403020100
+
+GLOBL expandAVX512_8_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_8_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_8_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_8_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_8_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_8_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_8_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_8_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_8_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_8_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_8_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_8_inShuf1<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908
+
+GLOBL expandAVX512_8_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_8_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512_8_outShufLo+0x08(SB)/8, $0x3931292119110901
+DATA  expandAVX512_8_outShufLo+0x10(SB)/8, $0x3a322a221a120a02
+DATA  expandAVX512_8_outShufLo+0x18(SB)/8, $0x3b332b231b130b03
+DATA  expandAVX512_8_outShufLo+0x20(SB)/8, $0x3c342c241c140c04
+DATA  expandAVX512_8_outShufLo+0x28(SB)/8, $0x3d352d251d150d05
+DATA  expandAVX512_8_outShufLo+0x30(SB)/8, $0x3e362e261e160e06
+DATA  expandAVX512_8_outShufLo+0x38(SB)/8, $0x3f372f271f170f07
+
+// expandAVX512_8 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Each result is built in three steps: a byte gather (inShuf*), a per-byte
+// GF(2) affine transform with the shared matrix mat0, and a final byte
+// permutation (outShufLo). Z0, Z3 and Z4 are clobbered.
+TEXT expandAVX512_8<>(SB), NOSPLIT, $0-0
+       // Load the constant tables and the input.
+       VMOVDQU64 expandAVX512_8_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_8_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512_8_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_8_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       // First half: gather input bytes by the indices in Z0, then transform with Z1.
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       // Second half: same, using the inShuf1 indices in Z2.
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       // Reorder both halves with outShufLo. The first write to Z1 replaces the
+       // matrix, which is no longer needed.
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512_10_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_10_inShuf0<>+0x00(SB)/8, $0xff06050403020100
+DATA  expandAVX512_10_inShuf0<>+0x08(SB)/8, $0xff06050403020100
+DATA  expandAVX512_10_inShuf0<>+0x10(SB)/8, $0xff06050403020100
+DATA  expandAVX512_10_inShuf0<>+0x18(SB)/8, $0xff06050403020100
+DATA  expandAVX512_10_inShuf0<>+0x20(SB)/8, $0xffff050403020100
+DATA  expandAVX512_10_inShuf0<>+0x28(SB)/8, $0xffff050403020100
+DATA  expandAVX512_10_inShuf0<>+0x30(SB)/8, $0xffff050403020100
+DATA  expandAVX512_10_inShuf0<>+0x38(SB)/8, $0xffff050403020100
+
+GLOBL expandAVX512_10_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_10_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_10_mat0<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512_10_mat0<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512_10_mat0<>+0x18(SB)/8, $0x0404040404040808
+DATA  expandAVX512_10_mat0<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512_10_mat0<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512_10_mat0<>+0x30(SB)/8, $0x1010202020202020
+DATA  expandAVX512_10_mat0<>+0x38(SB)/8, $0x2020202040404040
+
+GLOBL expandAVX512_10_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_10_inShuf1<>+0x00(SB)/8, $0xffff050403020100
+DATA  expandAVX512_10_inShuf1<>+0x08(SB)/8, $0xffff050403020100
+DATA  expandAVX512_10_inShuf1<>+0x10(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512_10_inShuf1<>+0x18(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512_10_inShuf1<>+0x20(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512_10_inShuf1<>+0x28(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512_10_inShuf1<>+0x30(SB)/8, $0xffff0b0a09080706
+DATA  expandAVX512_10_inShuf1<>+0x38(SB)/8, $0xffff0b0a09080706
+
+GLOBL expandAVX512_10_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_10_mat1<>+0x00(SB)/8, $0x4040404040408080
+DATA  expandAVX512_10_mat1<>+0x08(SB)/8, $0x8080808080808080
+DATA  expandAVX512_10_mat1<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512_10_mat1<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512_10_mat1<>+0x20(SB)/8, $0x1010202020202020
+DATA  expandAVX512_10_mat1<>+0x28(SB)/8, $0x2020202040404040
+DATA  expandAVX512_10_mat1<>+0x30(SB)/8, $0x4040404040408080
+DATA  expandAVX512_10_mat1<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_10_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_10_inShuf2<>+0x00(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512_10_inShuf2<>+0x08(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512_10_inShuf2<>+0x10(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512_10_inShuf2<>+0x18(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512_10_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_10_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_10_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_10_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_10_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_10_mat2<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_10_mat2<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512_10_mat2<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512_10_mat2<>+0x18(SB)/8, $0x0404040404040808
+DATA  expandAVX512_10_mat2<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_10_mat2<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_10_mat2<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_10_mat2<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_10_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_10_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512_10_outShufLo+0x08(SB)/8, $0x2921191109014840
+DATA  expandAVX512_10_outShufLo+0x10(SB)/8, $0x1a120a0249413931
+DATA  expandAVX512_10_outShufLo+0x18(SB)/8, $0x0b034a423a322a22
+DATA  expandAVX512_10_outShufLo+0x20(SB)/8, $0x4b433b332b231b13
+DATA  expandAVX512_10_outShufLo+0x28(SB)/8, $0x3c342c241c140c04
+DATA  expandAVX512_10_outShufLo+0x30(SB)/8, $0x2d251d150d054c44
+DATA  expandAVX512_10_outShufLo+0x38(SB)/8, $0x1e160e064d453d35
+
+GLOBL expandAVX512_10_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_10_outShufHi+0x00(SB)/8, $0x4840383028201810
+DATA  expandAVX512_10_outShufHi+0x08(SB)/8, $0x3931292119115850
+DATA  expandAVX512_10_outShufHi+0x10(SB)/8, $0x2a221a1259514941
+DATA  expandAVX512_10_outShufHi+0x18(SB)/8, $0x1b135a524a423a32
+DATA  expandAVX512_10_outShufHi+0x20(SB)/8, $0x5b534b433b332b23
+DATA  expandAVX512_10_outShufHi+0x28(SB)/8, $0x4c443c342c241c14
+DATA  expandAVX512_10_outShufHi+0x30(SB)/8, $0x3d352d251d155c54
+DATA  expandAVX512_10_outShufHi+0x38(SB)/8, $0x2e261e165d554d45
+
+// expandAVX512_10 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Three intermediate vectors are produced, each by a byte gather (inShuf0..2)
+// followed by a GF(2) affine transform with its own matrix (mat0..2, used
+// directly as memory operands). They are then merged by two-table byte
+// permutes driven by outShufLo and outShufHi. Z0 and Z3-Z5 are clobbered.
+TEXT expandAVX512_10<>(SB), NOSPLIT, $0-0
+       // Load the gather indices; Z1/Z2 start out holding the output indices.
+       VMOVDQU64 expandAVX512_10_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_10_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512_10_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512_10_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_10_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       // Intermediate 0 -> Z0.
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512_10_mat0<>(SB), Z0, Z0
+       // Intermediate 1 -> Z3.
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512_10_mat1<>(SB), Z3, Z3
+       // Intermediate 2 -> Z4.
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_10_mat2<>(SB), Z4, Z4
+       // VPERMI2B overwrites its index register with bytes picked from the two
+       // table registers: Z1 from Z0/Z3, Z2 from Z3/Z4.
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512_12_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_12_inShuf0<>+0x00(SB)/8, $0xffff050403020100
+DATA  expandAVX512_12_inShuf0<>+0x08(SB)/8, $0xffff050403020100
+DATA  expandAVX512_12_inShuf0<>+0x10(SB)/8, $0xffff050403020100
+DATA  expandAVX512_12_inShuf0<>+0x18(SB)/8, $0xffff050403020100
+DATA  expandAVX512_12_inShuf0<>+0x20(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf0<>+0x28(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf0<>+0x30(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf0<>+0x38(SB)/8, $0xffffff0403020100
+
+GLOBL expandAVX512_12_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_12_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_12_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_12_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_12_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_12_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_12_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_12_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_12_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_12_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_12_inShuf1<>+0x00(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf1<>+0x08(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf1<>+0x10(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf1<>+0x18(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_12_inShuf1<>+0x20(SB)/8, $0xffff0a0908070605
+DATA  expandAVX512_12_inShuf1<>+0x28(SB)/8, $0xffff0a0908070605
+DATA  expandAVX512_12_inShuf1<>+0x30(SB)/8, $0xffff0a0908070605
+DATA  expandAVX512_12_inShuf1<>+0x38(SB)/8, $0xffff0a0908070605
+
+GLOBL expandAVX512_12_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_12_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_12_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_12_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_12_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_12_mat1<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_12_mat1<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_12_mat1<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_12_mat1<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_12_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_12_inShuf2<>+0x00(SB)/8, $0xffffff0908070605
+DATA  expandAVX512_12_inShuf2<>+0x08(SB)/8, $0xffffff0908070605
+DATA  expandAVX512_12_inShuf2<>+0x10(SB)/8, $0xffffff0908070605
+DATA  expandAVX512_12_inShuf2<>+0x18(SB)/8, $0xffffff0908070605
+DATA  expandAVX512_12_inShuf2<>+0x20(SB)/8, $0xffffff0a09080706
+DATA  expandAVX512_12_inShuf2<>+0x28(SB)/8, $0xffffff0a09080706
+DATA  expandAVX512_12_inShuf2<>+0x30(SB)/8, $0xffffff0a09080706
+DATA  expandAVX512_12_inShuf2<>+0x38(SB)/8, $0xffffff0a09080706
+
+GLOBL expandAVX512_12_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_12_mat2<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_12_mat2<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_12_mat2<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_12_mat2<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_12_mat2<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_12_mat2<>+0x28(SB)/8, $0x0101010102020202
+DATA  expandAVX512_12_mat2<>+0x30(SB)/8, $0x0202020202020202
+DATA  expandAVX512_12_mat2<>+0x38(SB)/8, $0x0404040404040404
+
+GLOBL expandAVX512_12_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_12_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512_12_outShufLo+0x08(SB)/8, $0x1911090158504840
+DATA  expandAVX512_12_outShufLo+0x10(SB)/8, $0x5951494139312921
+DATA  expandAVX512_12_outShufLo+0x18(SB)/8, $0x3a322a221a120a02
+DATA  expandAVX512_12_outShufLo+0x20(SB)/8, $0x1b130b035a524a42
+DATA  expandAVX512_12_outShufLo+0x28(SB)/8, $0x5b534b433b332b23
+DATA  expandAVX512_12_outShufLo+0x30(SB)/8, $0x3c342c241c140c04
+DATA  expandAVX512_12_outShufLo+0x38(SB)/8, $0x1d150d055c544c44
+
+GLOBL expandAVX512_12_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_12_outShufHi+0x00(SB)/8, $0x5850484038302820
+DATA  expandAVX512_12_outShufHi+0x08(SB)/8, $0x3931292178706860
+DATA  expandAVX512_12_outShufHi+0x10(SB)/8, $0x7971696159514941
+DATA  expandAVX512_12_outShufHi+0x18(SB)/8, $0x5a524a423a322a22
+DATA  expandAVX512_12_outShufHi+0x20(SB)/8, $0x3b332b237a726a62
+DATA  expandAVX512_12_outShufHi+0x28(SB)/8, $0x7b736b635b534b43
+DATA  expandAVX512_12_outShufHi+0x30(SB)/8, $0x5c544c443c342c24
+DATA  expandAVX512_12_outShufHi+0x38(SB)/8, $0x3d352d257c746c64
+
+// expandAVX512_12 reads 64 bytes at (AX) and leaves its results in Z1 and Z2.
+// Three intermediate vectors are produced, each by a byte gather (inShuf0..2)
+// followed by a GF(2) affine transform with its own matrix (mat0..2, used
+// directly as memory operands). They are then merged by two-table byte
+// permutes driven by outShufLo and outShufHi. Z0 and Z3-Z5 are clobbered.
+TEXT expandAVX512_12<>(SB), NOSPLIT, $0-0
+       // Load the gather indices; Z1/Z2 start out holding the output indices.
+       VMOVDQU64 expandAVX512_12_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_12_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512_12_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512_12_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_12_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       // Intermediate 0 -> Z0.
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512_12_mat0<>(SB), Z0, Z0
+       // Intermediate 1 -> Z3.
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512_12_mat1<>(SB), Z3, Z3
+       // Intermediate 2 -> Z4.
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_12_mat2<>(SB), Z4, Z4
+       // VPERMI2B overwrites its index register with bytes picked from the two
+       // table registers: Z1 from Z0/Z3, Z2 from Z3/Z4.
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512_14_inShuf0<>(SB), RODATA, $0x40 // 64 read-only bytes: VPERMB indices selecting the input bytes for stage 0 of expandAVX512_14
+DATA  expandAVX512_14_inShuf0<>+0x00(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x08(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x10(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x18(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x20(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x28(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x30(SB)/8, $0xffffff0403020100
+DATA  expandAVX512_14_inShuf0<>+0x38(SB)/8, $0xffffff0403020100
+
+GLOBL expandAVX512_14_mat0<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices (one 8x8 bit matrix per quadword) applied to the stage 0 bytes
+DATA  expandAVX512_14_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_14_mat0<>+0x08(SB)/8, $0x0101010101010202
+DATA  expandAVX512_14_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_14_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512_14_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512_14_mat0<>+0x28(SB)/8, $0x0404080808080808
+DATA  expandAVX512_14_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512_14_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512_14_inShuf1<>(SB), RODATA, $0x40 // VPERMB input indices for stage 1 of expandAVX512_14
+DATA  expandAVX512_14_inShuf1<>+0x00(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_14_inShuf1<>+0x08(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_14_inShuf1<>+0x10(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_14_inShuf1<>+0x18(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_14_inShuf1<>+0x20(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_14_inShuf1<>+0x28(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_14_inShuf1<>+0x30(SB)/8, $0xffffff0807060504
+DATA  expandAVX512_14_inShuf1<>+0x38(SB)/8, $0xffffff0807060504
+
+GLOBL expandAVX512_14_mat1<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 1
+DATA  expandAVX512_14_mat1<>+0x00(SB)/8, $0x1010101010102020
+DATA  expandAVX512_14_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512_14_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512_14_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_14_mat1<>+0x20(SB)/8, $0x4040808080808080
+DATA  expandAVX512_14_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_14_mat1<>+0x30(SB)/8, $0x1010101010102020
+DATA  expandAVX512_14_mat1<>+0x38(SB)/8, $0x2020202020202020
+
+GLOBL expandAVX512_14_inShuf2<>(SB), RODATA, $0x40 // VPERMB input indices for stage 2 of expandAVX512_14
+DATA  expandAVX512_14_inShuf2<>+0x00(SB)/8, $0xffffff0807060504
+DATA  expandAVX512_14_inShuf2<>+0x08(SB)/8, $0xffffff0807060504
+DATA  expandAVX512_14_inShuf2<>+0x10(SB)/8, $0xffffff0807060504
+DATA  expandAVX512_14_inShuf2<>+0x18(SB)/8, $0xffffff0807060504
+DATA  expandAVX512_14_inShuf2<>+0x20(SB)/8, $0xffffff0908070605
+DATA  expandAVX512_14_inShuf2<>+0x28(SB)/8, $0xffffff0908070605
+DATA  expandAVX512_14_inShuf2<>+0x30(SB)/8, $0xffffffff08070605
+DATA  expandAVX512_14_inShuf2<>+0x38(SB)/8, $0xffffffff08070605
+
+GLOBL expandAVX512_14_mat2<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 2
+DATA  expandAVX512_14_mat2<>+0x00(SB)/8, $0x2020202040404040
+DATA  expandAVX512_14_mat2<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_14_mat2<>+0x10(SB)/8, $0x4040808080808080
+DATA  expandAVX512_14_mat2<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_14_mat2<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_14_mat2<>+0x28(SB)/8, $0x0101010101010202
+DATA  expandAVX512_14_mat2<>+0x30(SB)/8, $0x0202020202020202
+DATA  expandAVX512_14_mat2<>+0x38(SB)/8, $0x0202020204040404
+
+GLOBL expandAVX512_14_inShuf3<>(SB), RODATA, $0x40 // VPERMB input indices for stage 3; the upper four quadwords are 0xff filler paired with zero matrices in mat3
+DATA  expandAVX512_14_inShuf3<>+0x00(SB)/8, $0xffffffff08070605
+DATA  expandAVX512_14_inShuf3<>+0x08(SB)/8, $0xffffffff08070605
+DATA  expandAVX512_14_inShuf3<>+0x10(SB)/8, $0xffffffff08070605
+DATA  expandAVX512_14_inShuf3<>+0x18(SB)/8, $0xffffffff08070605
+DATA  expandAVX512_14_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_14_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_14_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_14_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_14_mat3<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 3; the all-zero upper quadwords make those result bytes 0
+DATA  expandAVX512_14_mat3<>+0x00(SB)/8, $0x0404040404040404
+DATA  expandAVX512_14_mat3<>+0x08(SB)/8, $0x0404080808080808
+DATA  expandAVX512_14_mat3<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512_14_mat3<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512_14_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_14_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_14_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_14_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_14_outShufLo(SB), RODATA, $0x40 // VPERMI2B indices building Z1 from stage 0 (0x00-0x3f) and stage 1 (0x40-0x7f). NOTE(review): declared without <>, so unlike the inShuf/mat tables this symbol is not file-local
+DATA  expandAVX512_14_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512_14_outShufLo+0x08(SB)/8, $0x0901686058504840
+DATA  expandAVX512_14_outShufLo+0x10(SB)/8, $0x4941393129211911
+DATA  expandAVX512_14_outShufLo+0x18(SB)/8, $0x1a120a0269615951
+DATA  expandAVX512_14_outShufLo+0x20(SB)/8, $0x5a524a423a322a22
+DATA  expandAVX512_14_outShufLo+0x28(SB)/8, $0x2b231b130b036a62
+DATA  expandAVX512_14_outShufLo+0x30(SB)/8, $0x6b635b534b433b33
+DATA  expandAVX512_14_outShufLo+0x38(SB)/8, $0x3c342c241c140c04
+
+GLOBL expandAVX512_14_outShufHi0(SB), RODATA, $0x40 // VPERMI2B.Z indices for the Z2 lanes taken from stages 1 and 2; the 0xff entries sit in lanes cleared by K1 mask 0xff0ffc3ff0ffc3ff. Also not file-local (no <>)
+DATA  expandAVX512_14_outShufHi0+0x00(SB)/8, $0x6860585048403830
+DATA  expandAVX512_14_outShufHi0+0x08(SB)/8, $0x3931ffffffff7870
+DATA  expandAVX512_14_outShufHi0+0x10(SB)/8, $0x7971696159514941
+DATA  expandAVX512_14_outShufHi0+0x18(SB)/8, $0x4a423a32ffffffff
+DATA  expandAVX512_14_outShufHi0+0x20(SB)/8, $0xffff7a726a625a52
+DATA  expandAVX512_14_outShufHi0+0x28(SB)/8, $0x5b534b433b33ffff
+DATA  expandAVX512_14_outShufHi0+0x30(SB)/8, $0xffffffff7b736b63
+DATA  expandAVX512_14_outShufHi0+0x38(SB)/8, $0x6c645c544c443c34
+
+GLOBL expandAVX512_14_outShufHi1(SB), RODATA, $0x40 // VPERMB.Z indices for the Z2 lanes taken from stage 3 (K1 mask 0xf003c00f003c00); 0xff in every other lane. Also not file-local (no <>)
+DATA  expandAVX512_14_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_14_outShufHi1+0x08(SB)/8, $0xffff18100800ffff
+DATA  expandAVX512_14_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_14_outShufHi1+0x18(SB)/8, $0xffffffff19110901
+DATA  expandAVX512_14_outShufHi1+0x20(SB)/8, $0x0a02ffffffffffff
+DATA  expandAVX512_14_outShufHi1+0x28(SB)/8, $0xffffffffffff1a12
+DATA  expandAVX512_14_outShufHi1+0x30(SB)/8, $0x1b130b03ffffffff
+DATA  expandAVX512_14_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
+
+TEXT expandAVX512_14<>(SB), NOSPLIT, $0-0 // Runs the 64 bytes at (AX) through four VPERMB+VGF2P8AFFINEQB stages and packs the results into Z1 and Z2; clobbers AX, K1, Z0-Z7. Presumably the bitmap expander for factor 14 -- TODO confirm against the caller.
+       VMOVDQU64 expandAVX512_14_inShuf0<>(SB), Z0 // Z0 = stage 0 input indices
+       VMOVDQU64 expandAVX512_14_inShuf1<>(SB), Z2 // Z2 = stage 1 input indices
+       VMOVDQU64 expandAVX512_14_inShuf2<>(SB), Z3 // Z3 = stage 2 input indices
+       VMOVDQU64 expandAVX512_14_inShuf3<>(SB), Z4 // Z4 = stage 3 input indices
+       VMOVDQU64 expandAVX512_14_outShufLo(SB), Z1 // Z1 = output indices; VPERMI2B overwrites its index register with the result
+       VMOVDQU64 expandAVX512_14_outShufHi0(SB), Z5 // Z5 = output indices for the Z2 lanes fed by stages 1 and 2
+       VMOVDQU64 expandAVX512_14_outShufHi1(SB), Z6 // Z6 = output indices for the Z2 lanes fed by stage 3
+       VMOVDQU64 (AX), Z7 // Z7 = the 64 input bytes
+       VPERMB Z7, Z0, Z0 // stage 0: gather input bytes by index ...
+       VGF2P8AFFINEQB $0, expandAVX512_14_mat0<>(SB), Z0, Z0 // ... then apply each quadword's 8x8 bit matrix to its bytes (constant term 0)
+       VPERMB Z7, Z2, Z2 // stage 1
+       VGF2P8AFFINEQB $0, expandAVX512_14_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3 // stage 2
+       VGF2P8AFFINEQB $0, expandAVX512_14_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4 // stage 3
+       VGF2P8AFFINEQB $0, expandAVX512_14_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (indices 0x00-0x3f) and Z2 (0x40-0x7f)
+       MOVQ $0xff0ffc3ff0ffc3ff, AX // AX is reused as scratch: the input pointer is gone from here on
+       KMOVQ AX, K1 // K1 = byte lanes of the second result that come from stages 1 and 2
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = bytes picked from Z2/Z3; lanes whose K1 bit is clear are zeroed
+       MOVQ $0xf003c00f003c00, AX // exact bitwise complement of the previous mask
+       KMOVQ AX, K1 // K1 = the remaining byte lanes, which come from stage 3
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = bytes picked from Z4 in those lanes, zero elsewhere
+       VPORQ Z0, Z5, Z2 // Z2 = Z5 | Z0; the two masks are disjoint, so this merges the halves
+       RET // results are left in Z1 and Z2
+
+GLOBL expandAVX512_16_inShuf0<>(SB), RODATA, $0x40 // 64 read-only bytes: VPERMB indices selecting the input bytes for stage 0 of expandAVX512_16
+DATA  expandAVX512_16_inShuf0<>+0x00(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x08(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x10(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x18(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x20(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x28(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x30(SB)/8, $0x0303020201010000
+DATA  expandAVX512_16_inShuf0<>+0x38(SB)/8, $0x0303020201010000
+
+GLOBL expandAVX512_16_mat0<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices (one 8x8 bit matrix per quadword); expandAVX512_16 uses this one table for both stages
+DATA  expandAVX512_16_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_16_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_16_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_16_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_16_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_16_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_16_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_16_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_16_inShuf1<>(SB), RODATA, $0x40 // VPERMB input indices for stage 1 of expandAVX512_16
+DATA  expandAVX512_16_inShuf1<>+0x00(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x08(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x10(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x18(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x20(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x28(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x30(SB)/8, $0x0707060605050404
+DATA  expandAVX512_16_inShuf1<>+0x38(SB)/8, $0x0707060605050404
+
+GLOBL expandAVX512_16_outShufLo(SB), RODATA, $0x40 // VPERMB output indices, applied to both stage results in expandAVX512_16. NOTE(review): declared without <>, so unlike the inShuf/mat tables this symbol is not file-local
+DATA  expandAVX512_16_outShufLo+0x00(SB)/8, $0x1918111009080100
+DATA  expandAVX512_16_outShufLo+0x08(SB)/8, $0x3938313029282120
+DATA  expandAVX512_16_outShufLo+0x10(SB)/8, $0x1b1a13120b0a0302
+DATA  expandAVX512_16_outShufLo+0x18(SB)/8, $0x3b3a33322b2a2322
+DATA  expandAVX512_16_outShufLo+0x20(SB)/8, $0x1d1c15140d0c0504
+DATA  expandAVX512_16_outShufLo+0x28(SB)/8, $0x3d3c35342d2c2524
+DATA  expandAVX512_16_outShufLo+0x30(SB)/8, $0x1f1e17160f0e0706
+DATA  expandAVX512_16_outShufLo+0x38(SB)/8, $0x3f3e37362f2e2726
+
+TEXT expandAVX512_16<>(SB), NOSPLIT, $0-0 // Runs the 64 bytes at (AX) through two VPERMB+VGF2P8AFFINEQB stages and reorders each result into Z1 and Z2; clobbers Z0-Z4, leaves AX intact. Presumably the bitmap expander for factor 16 -- TODO confirm against the caller.
+       VMOVDQU64 expandAVX512_16_inShuf0<>(SB), Z0 // Z0 = stage 0 input indices
+       VMOVDQU64 expandAVX512_16_mat0<>(SB), Z1 // Z1 = bit matrices, kept in a register because both stages use them
+       VMOVDQU64 expandAVX512_16_inShuf1<>(SB), Z2 // Z2 = stage 1 input indices
+       VMOVDQU64 expandAVX512_16_outShufLo(SB), Z3 // Z3 = output indices, shared by both results
+       VMOVDQU64 (AX), Z4 // Z4 = the 64 input bytes
+       VPERMB Z4, Z0, Z0 // stage 0: gather input bytes by index ...
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0 // ... then apply each quadword's 8x8 bit matrix to its bytes (constant term 0)
+       VPERMB Z4, Z2, Z2 // stage 1
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1 // Z1 = stage 0 bytes reordered by Z3; the matrices in Z1 are no longer needed
+       VPERMB Z2, Z3, Z2 // Z2 = stage 1 bytes reordered by the same indices
+       RET // results are left in Z1 and Z2
+
+GLOBL expandAVX512_18_inShuf0<>(SB), RODATA, $0x40 // 64 read-only bytes: VPERMB indices selecting the input bytes for stage 0 of expandAVX512_18
+DATA  expandAVX512_18_inShuf0<>+0x00(SB)/8, $0x0303020201010000
+DATA  expandAVX512_18_inShuf0<>+0x08(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_18_inShuf0<>+0x10(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_18_inShuf0<>+0x18(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_18_inShuf0<>+0x20(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_18_inShuf0<>+0x28(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_18_inShuf0<>+0x30(SB)/8, $0x0303020201010000
+DATA  expandAVX512_18_inShuf0<>+0x38(SB)/8, $0xff03020201010000
+
+GLOBL expandAVX512_18_mat0<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices (one 8x8 bit matrix per quadword) applied to the stage 0 bytes
+DATA  expandAVX512_18_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_18_mat0<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512_18_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_18_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512_18_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512_18_mat0<>+0x28(SB)/8, $0x0404040404040808
+DATA  expandAVX512_18_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512_18_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512_18_inShuf1<>(SB), RODATA, $0x40 // VPERMB input indices for stage 1 of expandAVX512_18
+DATA  expandAVX512_18_inShuf1<>+0x00(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_18_inShuf1<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_18_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_18_inShuf1<>+0x18(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_18_inShuf1<>+0x20(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_18_inShuf1<>+0x28(SB)/8, $0xffff020201010000
+DATA  expandAVX512_18_inShuf1<>+0x30(SB)/8, $0xff06060505040403
+DATA  expandAVX512_18_inShuf1<>+0x38(SB)/8, $0xffffffff06050403
+
+GLOBL expandAVX512_18_mat1<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 1
+DATA  expandAVX512_18_mat1<>+0x00(SB)/8, $0x1010202020202020
+DATA  expandAVX512_18_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512_18_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512_18_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_18_mat1<>+0x20(SB)/8, $0x4040404040408080
+DATA  expandAVX512_18_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_18_mat1<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_18_mat1<>+0x38(SB)/8, $0x1010202020202020
+
+GLOBL expandAVX512_18_inShuf2<>(SB), RODATA, $0x40 // VPERMB input indices for stage 2 of expandAVX512_18
+DATA  expandAVX512_18_inShuf2<>+0x00(SB)/8, $0xffffffff06050403
+DATA  expandAVX512_18_inShuf2<>+0x08(SB)/8, $0xffffffff06050403
+DATA  expandAVX512_18_inShuf2<>+0x10(SB)/8, $0xffffffff06050403
+DATA  expandAVX512_18_inShuf2<>+0x18(SB)/8, $0xffffffff06050403
+DATA  expandAVX512_18_inShuf2<>+0x20(SB)/8, $0x0606050504040303
+DATA  expandAVX512_18_inShuf2<>+0x28(SB)/8, $0x0707060605050404
+DATA  expandAVX512_18_inShuf2<>+0x30(SB)/8, $0xffffffffff060504
+DATA  expandAVX512_18_inShuf2<>+0x38(SB)/8, $0xffffffffff060504
+
+GLOBL expandAVX512_18_mat2<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 2
+DATA  expandAVX512_18_mat2<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_18_mat2<>+0x08(SB)/8, $0x2020202040404040
+DATA  expandAVX512_18_mat2<>+0x10(SB)/8, $0x4040404040404040
+DATA  expandAVX512_18_mat2<>+0x18(SB)/8, $0x4040404040408080
+DATA  expandAVX512_18_mat2<>+0x20(SB)/8, $0x8080808080808080
+DATA  expandAVX512_18_mat2<>+0x28(SB)/8, $0x0101010101010101
+DATA  expandAVX512_18_mat2<>+0x30(SB)/8, $0x0101020202020202
+DATA  expandAVX512_18_mat2<>+0x38(SB)/8, $0x0202020202020202
+
+GLOBL expandAVX512_18_inShuf3<>(SB), RODATA, $0x40 // VPERMB input indices for stage 3; the upper four quadwords are 0xff filler paired with zero matrices in mat3
+DATA  expandAVX512_18_inShuf3<>+0x00(SB)/8, $0xffffffffff060504
+DATA  expandAVX512_18_inShuf3<>+0x08(SB)/8, $0xffffffffff060504
+DATA  expandAVX512_18_inShuf3<>+0x10(SB)/8, $0xffffffffff060504
+DATA  expandAVX512_18_inShuf3<>+0x18(SB)/8, $0xffff060605050404
+DATA  expandAVX512_18_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_18_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_18_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_18_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_18_mat3<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 3; the all-zero upper quadwords make those result bytes 0
+DATA  expandAVX512_18_mat3<>+0x00(SB)/8, $0x0202020204040404
+DATA  expandAVX512_18_mat3<>+0x08(SB)/8, $0x0404040404040404
+DATA  expandAVX512_18_mat3<>+0x10(SB)/8, $0x0404040404040808
+DATA  expandAVX512_18_mat3<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_18_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_18_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_18_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_18_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_18_outShufLo(SB), RODATA, $0x40 // VPERMI2B indices building Z1 from stage 0 (0x00-0x3f) and stage 1 (0x40-0x7f). NOTE(review): declared without <>, so unlike the inShuf/mat tables this symbol is not file-local
+DATA  expandAVX512_18_outShufLo+0x00(SB)/8, $0x3028201810080100
+DATA  expandAVX512_18_outShufLo+0x08(SB)/8, $0x6058504840393831
+DATA  expandAVX512_18_outShufLo+0x10(SB)/8, $0x2119110903026968
+DATA  expandAVX512_18_outShufLo+0x18(SB)/8, $0x5149413b3a333229
+DATA  expandAVX512_18_outShufLo+0x20(SB)/8, $0x120a05046b6a6159
+DATA  expandAVX512_18_outShufLo+0x28(SB)/8, $0x423d3c35342a221a
+DATA  expandAVX512_18_outShufLo+0x30(SB)/8, $0x07066d6c625a524a
+DATA  expandAVX512_18_outShufLo+0x38(SB)/8, $0x3e37362b231b130b
+
+GLOBL expandAVX512_18_outShufHi0(SB), RODATA, $0x40 // VPERMI2B.Z indices for the Z2 lanes taken from stages 1 and 2; the 0xff entries sit in lanes cleared by K1 mask 0xffe0fff83ffe0fff. Also not file-local (no <>)
+DATA  expandAVX512_18_outShufHi0+0x00(SB)/8, $0x6160585048403830
+DATA  expandAVX512_18_outShufHi0+0x08(SB)/8, $0xffffffff78706968
+DATA  expandAVX512_18_outShufHi0+0x10(SB)/8, $0x59514941393231ff
+DATA  expandAVX512_18_outShufHi0+0x18(SB)/8, $0xffff79716b6a6362
+DATA  expandAVX512_18_outShufHi0+0x20(SB)/8, $0x4a423a3433ffffff
+DATA  expandAVX512_18_outShufHi0+0x28(SB)/8, $0x7a726d6c65645a52
+DATA  expandAVX512_18_outShufHi0+0x30(SB)/8, $0x3b3635ffffffffff
+DATA  expandAVX512_18_outShufHi0+0x38(SB)/8, $0x6f6e67665b534b43
+
+GLOBL expandAVX512_18_outShufHi1(SB), RODATA, $0x40 // VPERMB.Z indices for the Z2 lanes taken from stage 3 (K1 mask 0x1f0007c001f000); 0xff in every other lane. Also not file-local (no <>)
+DATA  expandAVX512_18_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_18_outShufHi1+0x08(SB)/8, $0x18100800ffffffff
+DATA  expandAVX512_18_outShufHi1+0x10(SB)/8, $0xffffffffffffff19
+DATA  expandAVX512_18_outShufHi1+0x18(SB)/8, $0x0901ffffffffffff
+DATA  expandAVX512_18_outShufHi1+0x20(SB)/8, $0xffffffffff1b1a11
+DATA  expandAVX512_18_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_18_outShufHi1+0x30(SB)/8, $0xffffff1d1c120a02
+DATA  expandAVX512_18_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
+
+TEXT expandAVX512_18<>(SB), NOSPLIT, $0-0 // Runs the 64 bytes at (AX) through four VPERMB+VGF2P8AFFINEQB stages and packs the results into Z1 and Z2; clobbers AX, K1, Z0-Z7. Presumably the bitmap expander for factor 18 -- TODO confirm against the caller.
+       VMOVDQU64 expandAVX512_18_inShuf0<>(SB), Z0 // Z0 = stage 0 input indices
+       VMOVDQU64 expandAVX512_18_inShuf1<>(SB), Z2 // Z2 = stage 1 input indices
+       VMOVDQU64 expandAVX512_18_inShuf2<>(SB), Z3 // Z3 = stage 2 input indices
+       VMOVDQU64 expandAVX512_18_inShuf3<>(SB), Z4 // Z4 = stage 3 input indices
+       VMOVDQU64 expandAVX512_18_outShufLo(SB), Z1 // Z1 = output indices; VPERMI2B overwrites its index register with the result
+       VMOVDQU64 expandAVX512_18_outShufHi0(SB), Z5 // Z5 = output indices for the Z2 lanes fed by stages 1 and 2
+       VMOVDQU64 expandAVX512_18_outShufHi1(SB), Z6 // Z6 = output indices for the Z2 lanes fed by stage 3
+       VMOVDQU64 (AX), Z7 // Z7 = the 64 input bytes
+       VPERMB Z7, Z0, Z0 // stage 0: gather input bytes by index ...
+       VGF2P8AFFINEQB $0, expandAVX512_18_mat0<>(SB), Z0, Z0 // ... then apply each quadword's 8x8 bit matrix to its bytes (constant term 0)
+       VPERMB Z7, Z2, Z2 // stage 1
+       VGF2P8AFFINEQB $0, expandAVX512_18_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3 // stage 2
+       VGF2P8AFFINEQB $0, expandAVX512_18_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4 // stage 3
+       VGF2P8AFFINEQB $0, expandAVX512_18_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (indices 0x00-0x3f) and Z2 (0x40-0x7f)
+       MOVQ $0xffe0fff83ffe0fff, AX // AX is reused as scratch: the input pointer is gone from here on
+       KMOVQ AX, K1 // K1 = byte lanes of the second result that come from stages 1 and 2
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = bytes picked from Z2/Z3; lanes whose K1 bit is clear are zeroed
+       MOVQ $0x1f0007c001f000, AX // exact bitwise complement of the previous mask
+       KMOVQ AX, K1 // K1 = the remaining byte lanes, which come from stage 3
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = bytes picked from Z4 in those lanes, zero elsewhere
+       VPORQ Z0, Z5, Z2 // Z2 = Z5 | Z0; the two masks are disjoint, so this merges the halves
+       RET // results are left in Z1 and Z2
+
+GLOBL expandAVX512_20_inShuf0<>(SB), RODATA, $0x40 // 64 read-only bytes: VPERMB indices selecting the input bytes for stage 0 of expandAVX512_20
+DATA  expandAVX512_20_inShuf0<>+0x00(SB)/8, $0x0303020201010000
+DATA  expandAVX512_20_inShuf0<>+0x08(SB)/8, $0xffffffff03020100
+DATA  expandAVX512_20_inShuf0<>+0x10(SB)/8, $0xff03020201010000
+DATA  expandAVX512_20_inShuf0<>+0x18(SB)/8, $0xffff020201010000
+DATA  expandAVX512_20_inShuf0<>+0x20(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_20_inShuf0<>+0x28(SB)/8, $0xffff020201010000
+DATA  expandAVX512_20_inShuf0<>+0x30(SB)/8, $0xffff020201010000
+DATA  expandAVX512_20_inShuf0<>+0x38(SB)/8, $0xffffffffff020100
+
+GLOBL expandAVX512_20_mat0<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices (one 8x8 bit matrix per quadword) applied to the stage 0 bytes
+DATA  expandAVX512_20_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_20_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_20_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_20_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_20_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_20_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_20_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_20_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_20_inShuf1<>(SB), RODATA, $0x40 // VPERMB input indices for stage 1 of expandAVX512_20
+DATA  expandAVX512_20_inShuf1<>+0x00(SB)/8, $0xffff020201010000
+DATA  expandAVX512_20_inShuf1<>+0x08(SB)/8, $0xffff020201010000
+DATA  expandAVX512_20_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_20_inShuf1<>+0x18(SB)/8, $0xffff020201010000
+DATA  expandAVX512_20_inShuf1<>+0x20(SB)/8, $0xff06060505040403
+DATA  expandAVX512_20_inShuf1<>+0x28(SB)/8, $0x0606050504040303
+DATA  expandAVX512_20_inShuf1<>+0x30(SB)/8, $0xffffffff06050403
+DATA  expandAVX512_20_inShuf1<>+0x38(SB)/8, $0xffff050504040303
+
+GLOBL expandAVX512_20_mat1<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 1
+DATA  expandAVX512_20_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_20_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_20_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_20_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_20_mat1<>+0x20(SB)/8, $0x0202020202020202
+DATA  expandAVX512_20_mat1<>+0x28(SB)/8, $0x0404040404040404
+DATA  expandAVX512_20_mat1<>+0x30(SB)/8, $0x0404040408080808
+DATA  expandAVX512_20_mat1<>+0x38(SB)/8, $0x0808080808080808
+
+GLOBL expandAVX512_20_inShuf2<>(SB), RODATA, $0x40 // VPERMB input indices for stage 2 of expandAVX512_20
+DATA  expandAVX512_20_inShuf2<>+0x00(SB)/8, $0xffff050504040303
+DATA  expandAVX512_20_inShuf2<>+0x08(SB)/8, $0xffffffffff050403
+DATA  expandAVX512_20_inShuf2<>+0x10(SB)/8, $0xffff050504040303
+DATA  expandAVX512_20_inShuf2<>+0x18(SB)/8, $0xffff050504040303
+DATA  expandAVX512_20_inShuf2<>+0x20(SB)/8, $0xffffffffff050403
+DATA  expandAVX512_20_inShuf2<>+0x28(SB)/8, $0xffff050504040303
+DATA  expandAVX512_20_inShuf2<>+0x30(SB)/8, $0xffff060605050404
+DATA  expandAVX512_20_inShuf2<>+0x38(SB)/8, $0xffffffffff060504
+
+GLOBL expandAVX512_20_mat2<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 2
+DATA  expandAVX512_20_mat2<>+0x00(SB)/8, $0x1010101010101010
+DATA  expandAVX512_20_mat2<>+0x08(SB)/8, $0x1010101020202020
+DATA  expandAVX512_20_mat2<>+0x10(SB)/8, $0x2020202020202020
+DATA  expandAVX512_20_mat2<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_20_mat2<>+0x20(SB)/8, $0x4040404080808080
+DATA  expandAVX512_20_mat2<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_20_mat2<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512_20_mat2<>+0x38(SB)/8, $0x0101010102020202
+
+GLOBL expandAVX512_20_outShufLo(SB), RODATA, $0x40 // VPERMI2B indices building Z1 from stage 0 (0x00-0x3f) and stage 1 (0x40-0x7f). NOTE(review): declared without <>, so unlike the inShuf/mat tables this symbol is not file-local
+DATA  expandAVX512_20_outShufLo+0x00(SB)/8, $0x2019181110080100
+DATA  expandAVX512_20_outShufLo+0x08(SB)/8, $0x4841403831302928
+DATA  expandAVX512_20_outShufLo+0x10(SB)/8, $0x1209030259585049
+DATA  expandAVX512_20_outShufLo+0x18(SB)/8, $0x33322b2a211b1a13
+DATA  expandAVX512_20_outShufLo+0x20(SB)/8, $0x5b5a514b4a434239
+DATA  expandAVX512_20_outShufLo+0x28(SB)/8, $0x221d1c15140a0504
+DATA  expandAVX512_20_outShufLo+0x30(SB)/8, $0x4c45443a35342d2c
+DATA  expandAVX512_20_outShufLo+0x38(SB)/8, $0x160b07065d5c524d
+
+GLOBL expandAVX512_20_outShufHi(SB), RODATA, $0x40 // VPERMI2B indices building Z2 from stage 1 (0x00-0x3f) and stage 2 (0x40-0x7f). Also not file-local (no <>)
+DATA  expandAVX512_20_outShufHi+0x00(SB)/8, $0x4140393830292820
+DATA  expandAVX512_20_outShufHi+0x08(SB)/8, $0x6968605958515048
+DATA  expandAVX512_20_outShufHi+0x10(SB)/8, $0x312b2a2221787170
+DATA  expandAVX512_20_outShufHi+0x18(SB)/8, $0x5a53524943423b3a
+DATA  expandAVX512_20_outShufHi+0x20(SB)/8, $0x237973726b6a615b
+DATA  expandAVX512_20_outShufHi+0x28(SB)/8, $0x45443d3c322d2c24
+DATA  expandAVX512_20_outShufHi+0x30(SB)/8, $0x6d6c625d5c55544a
+DATA  expandAVX512_20_outShufHi+0x38(SB)/8, $0x332f2e26257a7574
+
+TEXT expandAVX512_20<>(SB), NOSPLIT, $0-0 // Runs the 64 bytes at (AX) through three VPERMB+VGF2P8AFFINEQB stages and packs the results into Z1 and Z2; clobbers Z0-Z5, leaves AX intact. Presumably the bitmap expander for factor 20 -- TODO confirm against the caller.
+       VMOVDQU64 expandAVX512_20_inShuf0<>(SB), Z0 // Z0 = stage 0 input indices
+       VMOVDQU64 expandAVX512_20_inShuf1<>(SB), Z3 // Z3 = stage 1 input indices
+       VMOVDQU64 expandAVX512_20_inShuf2<>(SB), Z4 // Z4 = stage 2 input indices
+       VMOVDQU64 expandAVX512_20_outShufLo(SB), Z1 // Z1 = output indices for the first result; VPERMI2B overwrites them with it
+       VMOVDQU64 expandAVX512_20_outShufHi(SB), Z2 // Z2 = output indices for the second result
+       VMOVDQU64 (AX), Z5 // Z5 = the 64 input bytes
+       VPERMB Z5, Z0, Z0 // stage 0: gather input bytes by index ...
+       VGF2P8AFFINEQB $0, expandAVX512_20_mat0<>(SB), Z0, Z0 // ... then apply each quadword's 8x8 bit matrix to its bytes (constant term 0)
+       VPERMB Z5, Z3, Z3 // stage 1
+       VGF2P8AFFINEQB $0, expandAVX512_20_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4 // stage 2
+       VGF2P8AFFINEQB $0, expandAVX512_20_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1 // Z1 = bytes picked from Z0 (indices 0x00-0x3f) and Z3 (0x40-0x7f)
+       VPERMI2B Z4, Z3, Z2 // Z2 = bytes picked from Z3 (indices 0x00-0x3f) and Z4 (0x40-0x7f)
+       RET // results are left in Z1 and Z2
+
+GLOBL expandAVX512_22_inShuf0<>(SB), RODATA, $0x40 // 64 read-only bytes: VPERMB indices selecting the input bytes for stage 0 of expandAVX512_22
+DATA  expandAVX512_22_inShuf0<>+0x00(SB)/8, $0xffff020201010000
+DATA  expandAVX512_22_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_22_inShuf0<>+0x10(SB)/8, $0xffff020201010000
+DATA  expandAVX512_22_inShuf0<>+0x18(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_22_inShuf0<>+0x20(SB)/8, $0xffff020201010000
+DATA  expandAVX512_22_inShuf0<>+0x28(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_22_inShuf0<>+0x30(SB)/8, $0xffff020201010000
+DATA  expandAVX512_22_inShuf0<>+0x38(SB)/8, $0xffff020201010000
+
+GLOBL expandAVX512_22_mat0<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices (one 8x8 bit matrix per quadword) applied to the stage 0 bytes
+DATA  expandAVX512_22_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_22_mat0<>+0x08(SB)/8, $0x0101010101010202
+DATA  expandAVX512_22_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_22_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512_22_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512_22_mat0<>+0x28(SB)/8, $0x0404080808080808
+DATA  expandAVX512_22_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512_22_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512_22_inShuf1<>(SB), RODATA, $0x40 // VPERMB input indices for stage 1 of expandAVX512_22
+DATA  expandAVX512_22_inShuf1<>+0x00(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_22_inShuf1<>+0x08(SB)/8, $0xffff020201010000
+DATA  expandAVX512_22_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_22_inShuf1<>+0x18(SB)/8, $0xffff020201010000
+DATA  expandAVX512_22_inShuf1<>+0x20(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_22_inShuf1<>+0x28(SB)/8, $0xffffffff01010000
+DATA  expandAVX512_22_inShuf1<>+0x30(SB)/8, $0xffff040403030202
+DATA  expandAVX512_22_inShuf1<>+0x38(SB)/8, $0xffff050504040303
+
+GLOBL expandAVX512_22_mat1<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 1
+DATA  expandAVX512_22_mat1<>+0x00(SB)/8, $0x1010101010102020
+DATA  expandAVX512_22_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512_22_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512_22_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_22_mat1<>+0x20(SB)/8, $0x4040808080808080
+DATA  expandAVX512_22_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_22_mat1<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512_22_mat1<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512_22_inShuf2<>(SB), RODATA, $0x40 // VPERMB input indices for stage 2 of expandAVX512_22
+DATA  expandAVX512_22_inShuf2<>+0x00(SB)/8, $0xffffffffff050403
+DATA  expandAVX512_22_inShuf2<>+0x08(SB)/8, $0xffff050504040303
+DATA  expandAVX512_22_inShuf2<>+0x10(SB)/8, $0xffffffffff050403
+DATA  expandAVX512_22_inShuf2<>+0x18(SB)/8, $0xffff050504040303
+DATA  expandAVX512_22_inShuf2<>+0x20(SB)/8, $0xffffffffff050403
+DATA  expandAVX512_22_inShuf2<>+0x28(SB)/8, $0xffff050504040303
+DATA  expandAVX512_22_inShuf2<>+0x30(SB)/8, $0xffff050504040303
+DATA  expandAVX512_22_inShuf2<>+0x38(SB)/8, $0xffffffffff050403
+
+GLOBL expandAVX512_22_mat2<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 2
+DATA  expandAVX512_22_mat2<>+0x00(SB)/8, $0x0101010101010202
+DATA  expandAVX512_22_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_22_mat2<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512_22_mat2<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_22_mat2<>+0x20(SB)/8, $0x0404080808080808
+DATA  expandAVX512_22_mat2<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_22_mat2<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_22_mat2<>+0x38(SB)/8, $0x1010101010102020
+
+GLOBL expandAVX512_22_inShuf3<>(SB), RODATA, $0x40 // VPERMB input indices for stage 3; the upper four quadwords are 0xff filler paired with zero matrices in mat3
+DATA  expandAVX512_22_inShuf3<>+0x00(SB)/8, $0xffff050504040303
+DATA  expandAVX512_22_inShuf3<>+0x08(SB)/8, $0xffffffffff050403
+DATA  expandAVX512_22_inShuf3<>+0x10(SB)/8, $0xffffff0504040303
+DATA  expandAVX512_22_inShuf3<>+0x18(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512_22_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_22_mat3<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices for stage 3; the all-zero upper quadwords make those result bytes 0
+DATA  expandAVX512_22_mat3<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_22_mat3<>+0x08(SB)/8, $0x2020202040404040
+DATA  expandAVX512_22_mat3<>+0x10(SB)/8, $0x4040404040404040
+DATA  expandAVX512_22_mat3<>+0x18(SB)/8, $0x4040808080808080
+DATA  expandAVX512_22_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_22_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_22_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_22_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_22_outShufLo(SB), RODATA, $0x40 // VPERMI2B indices building Z1 from stage 0 (0x00-0x3f) and stage 1 (0x40-0x7f). NOTE(review): declared without <>, so unlike the inShuf/mat tables this symbol is not file-local
+DATA  expandAVX512_22_outShufLo+0x00(SB)/8, $0x2120181110080100
+DATA  expandAVX512_22_outShufLo+0x08(SB)/8, $0x4948403938313028
+DATA  expandAVX512_22_outShufLo+0x10(SB)/8, $0x0302696860595850
+DATA  expandAVX512_22_outShufLo+0x18(SB)/8, $0x3229232219131209
+DATA  expandAVX512_22_outShufLo+0x20(SB)/8, $0x5a514b4a413b3a33
+DATA  expandAVX512_22_outShufLo+0x28(SB)/8, $0x140a05046b6a615b
+DATA  expandAVX512_22_outShufLo+0x30(SB)/8, $0x3c35342a25241a15
+DATA  expandAVX512_22_outShufLo+0x38(SB)/8, $0x625d5c524d4c423d
+
+GLOBL expandAVX512_22_outShufHi0(SB), RODATA, $0x40 // VPERMI2B.Z indices for the Z2 lanes taken from stages 1 and 2; the 0xff entries sit in lanes cleared by K1 mask 0xffff03fffc0ffff. Also not file-local (no <>)
+DATA  expandAVX512_22_outShufHi0+0x00(SB)/8, $0x5049484039383130
+DATA  expandAVX512_22_outShufHi0+0x08(SB)/8, $0x7871706968605958
+DATA  expandAVX512_22_outShufHi0+0x10(SB)/8, $0x3332ffffffffffff
+DATA  expandAVX512_22_outShufHi0+0x18(SB)/8, $0x5b5a514b4a413b3a
+DATA  expandAVX512_22_outShufHi0+0x20(SB)/8, $0xffff7973726b6a61
+DATA  expandAVX512_22_outShufHi0+0x28(SB)/8, $0x3d3c3534ffffffff
+DATA  expandAVX512_22_outShufHi0+0x30(SB)/8, $0x6c625d5c524d4c42
+DATA  expandAVX512_22_outShufHi0+0x38(SB)/8, $0xffffffff7a75746d
+
+GLOBL expandAVX512_22_outShufHi1(SB), RODATA, $0x40 // VPERMB.Z indices for the Z2 lanes taken from stage 3 (K1 mask 0xf0000fc0003f0000); 0xff in every other lane. Also not file-local (no <>)
+DATA  expandAVX512_22_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_outShufHi1+0x10(SB)/8, $0xffff181110080100
+DATA  expandAVX512_22_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_outShufHi1+0x20(SB)/8, $0x0302ffffffffffff
+DATA  expandAVX512_22_outShufHi1+0x28(SB)/8, $0xffffffff19131209
+DATA  expandAVX512_22_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_22_outShufHi1+0x38(SB)/8, $0x140a0504ffffffff
+
+TEXT expandAVX512_22<>(SB), NOSPLIT, $0-0 // Runs the 64 bytes at (AX) through four VPERMB+VGF2P8AFFINEQB stages and packs the results into Z1 and Z2; clobbers AX, K1, Z0-Z7. Presumably the bitmap expander for factor 22 -- TODO confirm against the caller.
+       VMOVDQU64 expandAVX512_22_inShuf0<>(SB), Z0 // Z0 = stage 0 input indices
+       VMOVDQU64 expandAVX512_22_inShuf1<>(SB), Z2 // Z2 = stage 1 input indices
+       VMOVDQU64 expandAVX512_22_inShuf2<>(SB), Z3 // Z3 = stage 2 input indices
+       VMOVDQU64 expandAVX512_22_inShuf3<>(SB), Z4 // Z4 = stage 3 input indices
+       VMOVDQU64 expandAVX512_22_outShufLo(SB), Z1 // Z1 = output indices; VPERMI2B overwrites its index register with the result
+       VMOVDQU64 expandAVX512_22_outShufHi0(SB), Z5 // Z5 = output indices for the Z2 lanes fed by stages 1 and 2
+       VMOVDQU64 expandAVX512_22_outShufHi1(SB), Z6 // Z6 = output indices for the Z2 lanes fed by stage 3
+       VMOVDQU64 (AX), Z7 // Z7 = the 64 input bytes
+       VPERMB Z7, Z0, Z0 // stage 0: gather input bytes by index ...
+       VGF2P8AFFINEQB $0, expandAVX512_22_mat0<>(SB), Z0, Z0 // ... then apply each quadword's 8x8 bit matrix to its bytes (constant term 0)
+       VPERMB Z7, Z2, Z2 // stage 1
+       VGF2P8AFFINEQB $0, expandAVX512_22_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3 // stage 2
+       VGF2P8AFFINEQB $0, expandAVX512_22_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4 // stage 3
+       VGF2P8AFFINEQB $0, expandAVX512_22_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (indices 0x00-0x3f) and Z2 (0x40-0x7f)
+       MOVQ $0xffff03fffc0ffff, AX // AX is reused as scratch: the input pointer is gone from here on
+       KMOVQ AX, K1 // K1 = byte lanes of the second result that come from stages 1 and 2
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = bytes picked from Z2/Z3; lanes whose K1 bit is clear are zeroed
+       MOVQ $0xf0000fc0003f0000, AX // exact bitwise complement of the previous mask
+       KMOVQ AX, K1 // K1 = the remaining byte lanes, which come from stage 3
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = bytes picked from Z4 in those lanes, zero elsewhere
+       VPORQ Z0, Z5, Z2 // Z2 = Z5 | Z0; the two masks are disjoint, so this merges the halves
+       RET // results are left in Z1 and Z2
+
+GLOBL expandAVX512_24_inShuf0<>(SB), RODATA, $0x40 // 64 read-only bytes: VPERMB index table; expandAVX512_24 loads it into Z0 and permutes the input bytes with it
+DATA  expandAVX512_24_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512_24_inShuf0<>+0x08(SB)/8, $0x0202010101000000
+DATA  expandAVX512_24_inShuf0<>+0x10(SB)/8, $0x0202010101000000
+DATA  expandAVX512_24_inShuf0<>+0x18(SB)/8, $0x0202010101000000
+DATA  expandAVX512_24_inShuf0<>+0x20(SB)/8, $0x0202010101000000
+DATA  expandAVX512_24_inShuf0<>+0x28(SB)/8, $0xff02010101000000
+DATA  expandAVX512_24_inShuf0<>+0x30(SB)/8, $0xffff010101000000
+DATA  expandAVX512_24_inShuf0<>+0x38(SB)/8, $0xffff010101000000
+
+GLOBL expandAVX512_24_mat0<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrices (one 8x8 bit matrix per quadword); expandAVX512_24 loads them into Z2 and applies them to the inShuf0 result
+DATA  expandAVX512_24_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_24_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_24_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_24_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_24_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_24_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_24_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_24_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_24_inShuf1<>(SB), RODATA, $0x40 // VPERMB index table; expandAVX512_24 loads it into Z3. NOTE(review): no _mat1 table is defined alongside it; mat0 is presumably reused -- confirm in expandAVX512_24
+DATA  expandAVX512_24_inShuf1<>+0x00(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_24_inShuf1<>+0x08(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_24_inShuf1<>+0x10(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_24_inShuf1<>+0x18(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_24_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_24_inShuf1<>+0x28(SB)/8, $0x0404040303030202
+DATA  expandAVX512_24_inShuf1<>+0x30(SB)/8, $0x0404030303020202
+DATA  expandAVX512_24_inShuf1<>+0x38(SB)/8, $0x0404030303020202
+
+GLOBL expandAVX512_24_inShuf2<>(SB), RODATA, $0x40 // loaded into Z4 by expandAVX512_24; presumably the VPERMB input indices for stage 2 -- TODO confirm
+DATA  expandAVX512_24_inShuf2<>+0x00(SB)/8, $0x0505040404030303
+DATA  expandAVX512_24_inShuf2<>+0x08(SB)/8, $0x0505040404030303
+DATA  expandAVX512_24_inShuf2<>+0x10(SB)/8, $0x0505040404030303
+DATA  expandAVX512_24_inShuf2<>+0x18(SB)/8, $0xffff040404030303
+DATA  expandAVX512_24_inShuf2<>+0x20(SB)/8, $0xffff040404030303
+DATA  expandAVX512_24_inShuf2<>+0x28(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512_24_inShuf2<>+0x30(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512_24_inShuf2<>+0x38(SB)/8, $0xffffffffffffff05
+
+GLOBL expandAVX512_24_mat2<>(SB), RODATA, $0x40 // presumably the VGF2P8AFFINEQB matrices paired with inShuf2 -- TODO confirm in expandAVX512_24
+DATA  expandAVX512_24_mat2<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_24_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_24_mat2<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_24_mat2<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_24_mat2<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_24_mat2<>+0x28(SB)/8, $0x4040404040404040
+DATA  expandAVX512_24_mat2<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512_24_mat2<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512_24_inShuf3<>(SB), RODATA, $0x40 // loaded into Z5 by expandAVX512_24; only the lowest byte is not 0xff
+DATA  expandAVX512_24_inShuf3<>+0x00(SB)/8, $0xffffffffffffff05
+DATA  expandAVX512_24_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_24_mat3<>(SB), RODATA, $0x40 // presumably the VGF2P8AFFINEQB matrices paired with inShuf3 (only the first quadword is non-zero) -- TODO confirm in expandAVX512_24
+DATA  expandAVX512_24_mat3<>+0x00(SB)/8, $0x0202020202020202
+DATA  expandAVX512_24_mat3<>+0x08(SB)/8, $0x0000000000000000
+DATA  expandAVX512_24_mat3<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512_24_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512_24_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_24_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_24_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_24_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_24_outShufLo(SB), RODATA, $0x40 // output shuffle indices, loaded into Z1 by expandAVX512_24. NOTE(review): declared without <>, so unlike the inShuf/mat tables this symbol is not file-local
+DATA  expandAVX512_24_outShufLo+0x00(SB)/8, $0x11100a0908020100
+DATA  expandAVX512_24_outShufLo+0x08(SB)/8, $0x282221201a191812
+DATA  expandAVX512_24_outShufLo+0x10(SB)/8, $0x3a39383231302a29
+DATA  expandAVX512_24_outShufLo+0x18(SB)/8, $0x14130d0c0b050403
+DATA  expandAVX512_24_outShufLo+0x20(SB)/8, $0x2b2524231d1c1b15
+DATA  expandAVX512_24_outShufLo+0x28(SB)/8, $0x3d3c3b3534332d2c
+DATA  expandAVX512_24_outShufLo+0x30(SB)/8, $0x1716480f0e400706
+DATA  expandAVX512_24_outShufLo+0x38(SB)/8, $0x2e602726581f1e50
+
+GLOBL expandAVX512_24_outShufHi0(SB), RODATA, $0x40 // output shuffle indices, loaded into Z6 by expandAVX512_24; the single 0xff entry (byte 61) is presumably a masked-off lane -- TODO confirm. Also not file-local (no <>)
+DATA  expandAVX512_24_outShufHi0+0x00(SB)/8, $0x3a39383231302928
+DATA  expandAVX512_24_outShufHi0+0x08(SB)/8, $0x51504a4948424140
+DATA  expandAVX512_24_outShufHi0+0x10(SB)/8, $0x2a6261605a595852
+DATA  expandAVX512_24_outShufHi0+0x18(SB)/8, $0x3d3c3b3534332c2b
+DATA  expandAVX512_24_outShufHi0+0x20(SB)/8, $0x54534d4c4b454443
+DATA  expandAVX512_24_outShufHi0+0x28(SB)/8, $0x2d6564635d5c5b55
+DATA  expandAVX512_24_outShufHi0+0x30(SB)/8, $0x703f3e6837362f2e
+DATA  expandAVX512_24_outShufHi0+0x38(SB)/8, $0x5756ff4f4e784746
+
+GLOBL expandAVX512_24_outShufHi1(SB), RODATA, $0x40 // output shuffle indices, loaded into Z7 by expandAVX512_24; only byte 61 (the 0xff lane of outShufHi0) holds a non-0xff index. Also not file-local (no <>)
+DATA  expandAVX512_24_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_24_outShufHi1+0x38(SB)/8, $0xffff00ffffffffff
+
+TEXT expandAVX512_24<>(SB), NOSPLIT, $0-0 // In: AX = address of 64 input bytes. Z1 and Z2 are written last and not consumed here, so they are presumably the outputs (confirm with caller). Overwrites AX, K1, Z0-Z8.
+       VMOVDQU64 expandAVX512_24_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_24_mat0<>(SB), Z2 // mat0 is kept in a register because it is used as the matrix for both the inShuf0 and inShuf1 stages
+       VMOVDQU64 expandAVX512_24_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512_24_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512_24_inShuf3<>(SB), Z5
+       VMOVDQU64 expandAVX512_24_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_24_outShufHi0(SB), Z6
+       VMOVDQU64 expandAVX512_24_outShufHi1(SB), Z7
+       VMOVDQU64 (AX), Z8 // load the 64 input bytes
+       VPERMB Z8, Z0, Z0 // stage 0: gather input bytes using the inShuf0 byte indices
+       VGF2P8AFFINEQB $0, Z2, Z0, Z0 // stage 0: transform each byte by the 8x8 bit matrix in the matching qword of mat0
+       VPERMB Z8, Z3, Z3 // stage 1: gather using inShuf1
+       VGF2P8AFFINEQB $0, Z2, Z3, Z2 // stage 1: same matrix (mat0); the result overwrites Z2, so mat0 is gone after this
+       VPERMB Z8, Z4, Z3 // stage 2: gather using inShuf2 (Z3 is reused as the destination)
+       VGF2P8AFFINEQB $0, expandAVX512_24_mat2<>(SB), Z3, Z3 // stage 2: matrix read directly from memory
+       VPERMB Z8, Z5, Z4 // stage 3: gather using inShuf3
+       VGF2P8AFFINEQB $0, expandAVX512_24_mat3<>(SB), Z4, Z4 // stage 3: matrix read directly from memory
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (index bit 6 clear) or Z2 (bit 6 set) using the outShufLo indices held in Z1
+       MOVQ $0xdfffffffffffffff, AX // AX no longer holds the input pointer from here on
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z6 // Z6 = bytes picked from Z2/Z3 via outShufHi0; byte lanes whose K1 bit is 0 are zeroed
+       MOVQ $0x2000000000000000, AX // bitwise complement of the previous mask
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z7, K1, Z0 // Z0 = bytes picked from Z4 via outShufHi1, only in the lanes the previous permute zeroed
+       VPORQ Z0, Z6, Z2 // merge the two disjointly-masked halves into Z2
+       RET
+
+GLOBL expandAVX512_26_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512_26_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_26_inShuf0<>+0x10(SB)/8, $0xffff020201010000
+DATA  expandAVX512_26_inShuf0<>+0x18(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_26_inShuf0<>+0x20(SB)/8, $0xffff020201010000
+DATA  expandAVX512_26_inShuf0<>+0x28(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_26_inShuf0<>+0x30(SB)/8, $0x0202010101000000
+DATA  expandAVX512_26_inShuf0<>+0x38(SB)/8, $0xffff010101000000
+
+GLOBL expandAVX512_26_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_26_mat0<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512_26_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_26_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512_26_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512_26_mat0<>+0x28(SB)/8, $0x0404040404040808
+DATA  expandAVX512_26_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512_26_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512_26_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_26_inShuf1<>+0x08(SB)/8, $0xffffffff01010000
+DATA  expandAVX512_26_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_26_inShuf1<>+0x18(SB)/8, $0xffffffff01010000
+DATA  expandAVX512_26_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_26_inShuf1<>+0x28(SB)/8, $0xffff010101000000
+DATA  expandAVX512_26_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_26_inShuf1<>+0x38(SB)/8, $0xff04040403030302
+
+GLOBL expandAVX512_26_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_mat1<>+0x00(SB)/8, $0x1010202020202020
+DATA  expandAVX512_26_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512_26_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512_26_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_26_mat1<>+0x20(SB)/8, $0x4040404040408080
+DATA  expandAVX512_26_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_26_mat1<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512_26_mat1<>+0x38(SB)/8, $0x0808080808080808
+
+GLOBL expandAVX512_26_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_inShuf2<>+0x00(SB)/8, $0x0404030303020202
+DATA  expandAVX512_26_inShuf2<>+0x08(SB)/8, $0xffffffffff040302
+DATA  expandAVX512_26_inShuf2<>+0x10(SB)/8, $0xffff040403030202
+DATA  expandAVX512_26_inShuf2<>+0x18(SB)/8, $0xffffffffff040302
+DATA  expandAVX512_26_inShuf2<>+0x20(SB)/8, $0xffff040403030202
+DATA  expandAVX512_26_inShuf2<>+0x28(SB)/8, $0xffffffffff040302
+DATA  expandAVX512_26_inShuf2<>+0x30(SB)/8, $0xff04030303020202
+DATA  expandAVX512_26_inShuf2<>+0x38(SB)/8, $0xffff040404030303
+
+GLOBL expandAVX512_26_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_mat2<>+0x00(SB)/8, $0x1010101010101010
+DATA  expandAVX512_26_mat2<>+0x08(SB)/8, $0x1010202020202020
+DATA  expandAVX512_26_mat2<>+0x10(SB)/8, $0x2020202020202020
+DATA  expandAVX512_26_mat2<>+0x18(SB)/8, $0x2020202040404040
+DATA  expandAVX512_26_mat2<>+0x20(SB)/8, $0x4040404040404040
+DATA  expandAVX512_26_mat2<>+0x28(SB)/8, $0x4040404040408080
+DATA  expandAVX512_26_mat2<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512_26_mat2<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512_26_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512_26_inShuf3<>+0x08(SB)/8, $0xffffffff04040303
+DATA  expandAVX512_26_inShuf3<>+0x10(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512_26_inShuf3<>+0x18(SB)/8, $0xffffffff04040303
+DATA  expandAVX512_26_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512_26_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512_26_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_26_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_26_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512_26_mat3<>+0x00(SB)/8, $0x0101020202020202
+DATA  expandAVX512_26_mat3<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_26_mat3<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512_26_mat3<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_26_mat3<>+0x20(SB)/8, $0x0404040404040808
+DATA  expandAVX512_26_mat3<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512_26_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_26_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_26_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_26_outShufLo+0x00(SB)/8, $0x2018111008020100
+DATA  expandAVX512_26_outShufLo+0x08(SB)/8, $0x3a39383231302821
+DATA  expandAVX512_26_outShufLo+0x10(SB)/8, $0x6860595850494840
+DATA  expandAVX512_26_outShufLo+0x18(SB)/8, $0x1312090504036a69
+DATA  expandAVX512_26_outShufLo+0x20(SB)/8, $0x3b35343329232219
+DATA  expandAVX512_26_outShufLo+0x28(SB)/8, $0x5b5a514b4a413d3c
+DATA  expandAVX512_26_outShufLo+0x30(SB)/8, $0x0a7007066d6c6b61
+DATA  expandAVX512_26_outShufLo+0x38(SB)/8, $0x37362a25241a1514
+
+GLOBL expandAVX512_26_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512_26_outShufHi0+0x00(SB)/8, $0x5851504842414038
+DATA  expandAVX512_26_outShufHi0+0x08(SB)/8, $0x7978727170686160
+DATA  expandAVX512_26_outShufHi0+0x10(SB)/8, $0xffffffffffffff7a
+DATA  expandAVX512_26_outShufHi0+0x18(SB)/8, $0x52494544433b3a39
+DATA  expandAVX512_26_outShufHi0+0x20(SB)/8, $0x7574736963625953
+DATA  expandAVX512_26_outShufHi0+0x28(SB)/8, $0xffffffffff7d7c7b
+DATA  expandAVX512_26_outShufHi0+0x30(SB)/8, $0xff47463e3d3cffff
+DATA  expandAVX512_26_outShufHi0+0x38(SB)/8, $0x766a65645a55544a
+
+GLOBL expandAVX512_26_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512_26_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_26_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_26_outShufHi1+0x10(SB)/8, $0x20191810090800ff
+DATA  expandAVX512_26_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_26_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_26_outShufHi1+0x28(SB)/8, $0x1a110b0a01ffffff
+DATA  expandAVX512_26_outShufHi1+0x30(SB)/8, $0x28ffffffffff211b
+DATA  expandAVX512_26_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
+
+TEXT expandAVX512_26<>(SB), NOSPLIT, $0-0 // In: AX = address of 64 input bytes. Z1 and Z2 are written last and not consumed here, so they are presumably the outputs (confirm with caller). Overwrites AX, K1, Z0-Z7.
+       VMOVDQU64 expandAVX512_26_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_26_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_26_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_26_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_26_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_26_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_26_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7 // load the 64 input bytes
+       VPERMB Z7, Z0, Z0 // stage 0: gather input bytes using the inShuf0 byte indices
+       VGF2P8AFFINEQB $0, expandAVX512_26_mat0<>(SB), Z0, Z0 // stage 0: transform each byte by the 8x8 bit matrix in the matching qword of mat0
+       VPERMB Z7, Z2, Z2 // stage 1: gather using inShuf1
+       VGF2P8AFFINEQB $0, expandAVX512_26_mat1<>(SB), Z2, Z2 // stage 1: apply mat1
+       VPERMB Z7, Z3, Z3 // stage 2: gather using inShuf2
+       VGF2P8AFFINEQB $0, expandAVX512_26_mat2<>(SB), Z3, Z3 // stage 2: apply mat2
+       VPERMB Z7, Z4, Z4 // stage 3: gather using inShuf3
+       VGF2P8AFFINEQB $0, expandAVX512_26_mat3<>(SB), Z4, Z4 // stage 3: apply mat3
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (index bit 6 clear) or Z2 (bit 6 set) using the outShufLo indices held in Z1
+       MOVQ $0xff7c07ffff01ffff, AX // AX no longer holds the input pointer from here on
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = bytes picked from Z2/Z3 via outShufHi0; byte lanes whose K1 bit is 0 are zeroed
+       MOVQ $0x83f80000fe0000, AX // bitwise complement of the previous mask
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = bytes picked from Z4 via outShufHi1, only in the lanes the previous permute zeroed
+       VPORQ Z0, Z5, Z2 // merge the two disjointly-masked halves into Z2
+       RET
+
+GLOBL expandAVX512_28_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512_28_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_28_inShuf0<>+0x10(SB)/8, $0x0202010101000000
+DATA  expandAVX512_28_inShuf0<>+0x18(SB)/8, $0xff02010101000000
+DATA  expandAVX512_28_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_28_inShuf0<>+0x28(SB)/8, $0xffff010101000000
+DATA  expandAVX512_28_inShuf0<>+0x30(SB)/8, $0xffff010101000000
+DATA  expandAVX512_28_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100
+
+GLOBL expandAVX512_28_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_28_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_28_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_28_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_28_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_28_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_28_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_28_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_28_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_inShuf1<>+0x00(SB)/8, $0xffff010101000000
+DATA  expandAVX512_28_inShuf1<>+0x08(SB)/8, $0xffff010101000000
+DATA  expandAVX512_28_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_28_inShuf1<>+0x18(SB)/8, $0xffff010101000000
+DATA  expandAVX512_28_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_28_inShuf1<>+0x28(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_28_inShuf1<>+0x30(SB)/8, $0x0404040303030202
+DATA  expandAVX512_28_inShuf1<>+0x38(SB)/8, $0xffffffffff040302
+
+GLOBL expandAVX512_28_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_28_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_28_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_28_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_28_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_28_mat1<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512_28_mat1<>+0x30(SB)/8, $0x0404040404040404
+DATA  expandAVX512_28_mat1<>+0x38(SB)/8, $0x0404040408080808
+
+GLOBL expandAVX512_28_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_inShuf2<>+0x00(SB)/8, $0x0404030303020202
+DATA  expandAVX512_28_inShuf2<>+0x08(SB)/8, $0x0404030303020202
+DATA  expandAVX512_28_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_28_inShuf2<>+0x18(SB)/8, $0xffff030303020202
+DATA  expandAVX512_28_inShuf2<>+0x20(SB)/8, $0xffff030303020202
+DATA  expandAVX512_28_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_28_inShuf2<>+0x30(SB)/8, $0xffff030303020202
+DATA  expandAVX512_28_inShuf2<>+0x38(SB)/8, $0xffff040404030303
+
+GLOBL expandAVX512_28_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_mat2<>+0x00(SB)/8, $0x0808080808080808
+DATA  expandAVX512_28_mat2<>+0x08(SB)/8, $0x1010101010101010
+DATA  expandAVX512_28_mat2<>+0x10(SB)/8, $0x1010101020202020
+DATA  expandAVX512_28_mat2<>+0x18(SB)/8, $0x2020202020202020
+DATA  expandAVX512_28_mat2<>+0x20(SB)/8, $0x4040404040404040
+DATA  expandAVX512_28_mat2<>+0x28(SB)/8, $0x4040404080808080
+DATA  expandAVX512_28_mat2<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512_28_mat2<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512_28_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512_28_inShuf3<>+0x08(SB)/8, $0xffff040404030303
+DATA  expandAVX512_28_inShuf3<>+0x10(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512_28_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_28_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512_28_mat3<>+0x00(SB)/8, $0x0101010102020202
+DATA  expandAVX512_28_mat3<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_28_mat3<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512_28_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512_28_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_28_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_28_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_28_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_28_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_28_outShufLo+0x00(SB)/8, $0x1812111008020100
+DATA  expandAVX512_28_outShufLo+0x08(SB)/8, $0x31302a2928201a19
+DATA  expandAVX512_28_outShufLo+0x10(SB)/8, $0x4a49484241403832
+DATA  expandAVX512_28_outShufLo+0x18(SB)/8, $0x090504035a595850
+DATA  expandAVX512_28_outShufLo+0x20(SB)/8, $0x2b211d1c1b151413
+DATA  expandAVX512_28_outShufLo+0x28(SB)/8, $0x4443393534332d2c
+DATA  expandAVX512_28_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b45
+DATA  expandAVX512_28_outShufLo+0x38(SB)/8, $0x1e6817160a600706
+
+GLOBL expandAVX512_28_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512_28_outShufHi0+0x00(SB)/8, $0x4948424140383130
+DATA  expandAVX512_28_outShufHi0+0x08(SB)/8, $0x6261605a5958504a
+DATA  expandAVX512_28_outShufHi0+0x10(SB)/8, $0xff7a797872717068
+DATA  expandAVX512_28_outShufHi0+0x18(SB)/8, $0x4339343332ffffff
+DATA  expandAVX512_28_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b4544
+DATA  expandAVX512_28_outShufHi0+0x28(SB)/8, $0x757473696564635d
+DATA  expandAVX512_28_outShufHi0+0x30(SB)/8, $0x35ffffffff7d7c7b
+DATA  expandAVX512_28_outShufHi0+0x38(SB)/8, $0x4f4eff47463a3736
+
+GLOBL expandAVX512_28_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512_28_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_outShufHi1+0x10(SB)/8, $0x00ffffffffffffff
+DATA  expandAVX512_28_outShufHi1+0x18(SB)/8, $0xffffffffff0a0908
+DATA  expandAVX512_28_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_28_outShufHi1+0x30(SB)/8, $0xff0d0c0b01ffffff
+DATA  expandAVX512_28_outShufHi1+0x38(SB)/8, $0xffff10ffffffffff
+
+TEXT expandAVX512_28<>(SB), NOSPLIT, $0-0 // In: AX = address of 64 input bytes. Z1 and Z2 are written last and not consumed here, so they are presumably the outputs (confirm with caller). Overwrites AX, K1, Z0-Z7.
+       VMOVDQU64 expandAVX512_28_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_28_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_28_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_28_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_28_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_28_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_28_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7 // load the 64 input bytes
+       VPERMB Z7, Z0, Z0 // stage 0: gather input bytes using the inShuf0 byte indices
+       VGF2P8AFFINEQB $0, expandAVX512_28_mat0<>(SB), Z0, Z0 // stage 0: transform each byte by the 8x8 bit matrix in the matching qword of mat0
+       VPERMB Z7, Z2, Z2 // stage 1: gather using inShuf1
+       VGF2P8AFFINEQB $0, expandAVX512_28_mat1<>(SB), Z2, Z2 // stage 1: apply mat1
+       VPERMB Z7, Z3, Z3 // stage 2: gather using inShuf2
+       VGF2P8AFFINEQB $0, expandAVX512_28_mat2<>(SB), Z3, Z3 // stage 2: apply mat2
+       VPERMB Z7, Z4, Z4 // stage 3: gather using inShuf3
+       VGF2P8AFFINEQB $0, expandAVX512_28_mat3<>(SB), Z4, Z4 // stage 3: apply mat3
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (index bit 6 clear) or Z2 (bit 6 set) using the outShufLo indices held in Z1
+       MOVQ $0xdf87fffff87fffff, AX // AX no longer holds the input pointer from here on
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = bytes picked from Z2/Z3 via outShufHi0; byte lanes whose K1 bit is 0 are zeroed
+       MOVQ $0x2078000007800000, AX // bitwise complement of the previous mask
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = bytes picked from Z4 via outShufHi1, only in the lanes the previous permute zeroed
+       VPORQ Z0, Z5, Z2 // merge the two disjointly-masked halves into Z2
+       RET
+
+GLOBL expandAVX512_30_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512_30_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512_30_inShuf0<>+0x10(SB)/8, $0xffff010101000000
+DATA  expandAVX512_30_inShuf0<>+0x18(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_30_inShuf0<>+0x20(SB)/8, $0xffff010101000000
+DATA  expandAVX512_30_inShuf0<>+0x28(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_30_inShuf0<>+0x30(SB)/8, $0xffff010101000000
+DATA  expandAVX512_30_inShuf0<>+0x38(SB)/8, $0xffff010101000000
+
+GLOBL expandAVX512_30_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_30_mat0<>+0x08(SB)/8, $0x0101010101010202
+DATA  expandAVX512_30_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_30_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512_30_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512_30_mat0<>+0x28(SB)/8, $0x0404080808080808
+DATA  expandAVX512_30_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512_30_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512_30_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_30_inShuf1<>+0x08(SB)/8, $0xffff010101000000
+DATA  expandAVX512_30_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_30_inShuf1<>+0x18(SB)/8, $0xffff010101000000
+DATA  expandAVX512_30_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_30_inShuf1<>+0x28(SB)/8, $0xffff010101000000
+DATA  expandAVX512_30_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_30_inShuf1<>+0x38(SB)/8, $0x0404030303020202
+
+GLOBL expandAVX512_30_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_mat1<>+0x00(SB)/8, $0x1010101010102020
+DATA  expandAVX512_30_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512_30_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512_30_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_30_mat1<>+0x20(SB)/8, $0x4040808080808080
+DATA  expandAVX512_30_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_30_mat1<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512_30_mat1<>+0x38(SB)/8, $0x0202020202020202
+
+GLOBL expandAVX512_30_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_inShuf2<>+0x00(SB)/8, $0xffffffffff040302
+DATA  expandAVX512_30_inShuf2<>+0x08(SB)/8, $0xffff030303020202
+DATA  expandAVX512_30_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_30_inShuf2<>+0x18(SB)/8, $0xffff030303020202
+DATA  expandAVX512_30_inShuf2<>+0x20(SB)/8, $0xffff030303020202
+DATA  expandAVX512_30_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_30_inShuf2<>+0x30(SB)/8, $0xffff030303020202
+DATA  expandAVX512_30_inShuf2<>+0x38(SB)/8, $0xffffffffffff0302
+
+GLOBL expandAVX512_30_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_mat2<>+0x00(SB)/8, $0x0202020204040404
+DATA  expandAVX512_30_mat2<>+0x08(SB)/8, $0x0404040404040404
+DATA  expandAVX512_30_mat2<>+0x10(SB)/8, $0x0404080808080808
+DATA  expandAVX512_30_mat2<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_30_mat2<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_30_mat2<>+0x28(SB)/8, $0x1010101010102020
+DATA  expandAVX512_30_mat2<>+0x30(SB)/8, $0x2020202020202020
+DATA  expandAVX512_30_mat2<>+0x38(SB)/8, $0x2020202040404040
+
+GLOBL expandAVX512_30_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_inShuf3<>+0x00(SB)/8, $0xffff030303020202
+DATA  expandAVX512_30_inShuf3<>+0x08(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_30_inShuf3<>+0x10(SB)/8, $0xffff030303020202
+DATA  expandAVX512_30_inShuf3<>+0x18(SB)/8, $0xffff040404030303
+DATA  expandAVX512_30_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512_30_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512_30_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_30_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_30_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512_30_mat3<>+0x00(SB)/8, $0x4040404040404040
+DATA  expandAVX512_30_mat3<>+0x08(SB)/8, $0x4040808080808080
+DATA  expandAVX512_30_mat3<>+0x10(SB)/8, $0x8080808080808080
+DATA  expandAVX512_30_mat3<>+0x18(SB)/8, $0x0101010101010101
+DATA  expandAVX512_30_mat3<>+0x20(SB)/8, $0x0101010101010202
+DATA  expandAVX512_30_mat3<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512_30_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_30_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_30_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_30_outShufLo+0x00(SB)/8, $0x1812111008020100
+DATA  expandAVX512_30_outShufLo+0x08(SB)/8, $0x3832313028222120
+DATA  expandAVX512_30_outShufLo+0x10(SB)/8, $0x58504a4948403a39
+DATA  expandAVX512_30_outShufLo+0x18(SB)/8, $0x04036a6968605a59
+DATA  expandAVX512_30_outShufLo+0x20(SB)/8, $0x2423191514130905
+DATA  expandAVX512_30_outShufLo+0x28(SB)/8, $0x3d3c3b3534332925
+DATA  expandAVX512_30_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b41
+DATA  expandAVX512_30_outShufLo+0x38(SB)/8, $0x0a7007066d6c6b61
+
+GLOBL expandAVX512_30_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512_30_outShufHi0+0x00(SB)/8, $0x504a4948403a3938
+DATA  expandAVX512_30_outShufHi0+0x08(SB)/8, $0x70686261605a5958
+DATA  expandAVX512_30_outShufHi0+0x10(SB)/8, $0xffffffffff787271
+DATA  expandAVX512_30_outShufHi0+0x18(SB)/8, $0x3c3bffffffffffff
+DATA  expandAVX512_30_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b413d
+DATA  expandAVX512_30_outShufHi0+0x28(SB)/8, $0x757473696564635d
+DATA  expandAVX512_30_outShufHi0+0x30(SB)/8, $0xffffffffffffff79
+DATA  expandAVX512_30_outShufHi0+0x38(SB)/8, $0x42ff3f3effffffff
+
+GLOBL expandAVX512_30_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512_30_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_30_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_30_outShufHi1+0x10(SB)/8, $0x1008020100ffffff
+DATA  expandAVX512_30_outShufHi1+0x18(SB)/8, $0xffff201a19181211
+DATA  expandAVX512_30_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_30_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_30_outShufHi1+0x30(SB)/8, $0x15141309050403ff
+DATA  expandAVX512_30_outShufHi1+0x38(SB)/8, $0xff28ffff211d1c1b
+
+TEXT expandAVX512_30<>(SB), NOSPLIT, $0-0 // In: AX = address of 64 input bytes. Z1 and Z2 are written last and not consumed here, so they are presumably the outputs (confirm with caller). Overwrites AX, K1, Z0-Z7.
+       VMOVDQU64 expandAVX512_30_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_30_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_30_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_30_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_30_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_30_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_30_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7 // load the 64 input bytes
+       VPERMB Z7, Z0, Z0 // stage 0: gather input bytes using the inShuf0 byte indices
+       VGF2P8AFFINEQB $0, expandAVX512_30_mat0<>(SB), Z0, Z0 // stage 0: transform each byte by the 8x8 bit matrix in the matching qword of mat0
+       VPERMB Z7, Z2, Z2 // stage 1: gather using inShuf1
+       VGF2P8AFFINEQB $0, expandAVX512_30_mat1<>(SB), Z2, Z2 // stage 1: apply mat1
+       VPERMB Z7, Z3, Z3 // stage 2: gather using inShuf2
+       VGF2P8AFFINEQB $0, expandAVX512_30_mat2<>(SB), Z3, Z3 // stage 2: apply mat2
+       VPERMB Z7, Z4, Z4 // stage 3: gather using inShuf3
+       VGF2P8AFFINEQB $0, expandAVX512_30_mat3<>(SB), Z4, Z4 // stage 3: apply mat3
+       VPERMI2B Z2, Z0, Z1 // Z1 = bytes picked from Z0 (index bit 6 clear) or Z2 (bit 6 set) using the outShufLo indices held in Z1
+       MOVQ $0xb001ffffc007ffff, AX // AX no longer holds the input pointer from here on
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = bytes picked from Z2/Z3 via outShufHi0; byte lanes whose K1 bit is 0 are zeroed
+       MOVQ $0x4ffe00003ff80000, AX // bitwise complement of the previous mask
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = bytes picked from Z4 via outShufHi1, only in the lanes the previous permute zeroed
+       VPORQ Z0, Z5, Z2 // merge the two disjointly-masked halves into Z2
+       RET
+
+GLOBL expandAVX512_32_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_32_inShuf0<>+0x00(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x08(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x10(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x18(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x20(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x28(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x30(SB)/8, $0x0101010100000000
+DATA  expandAVX512_32_inShuf0<>+0x38(SB)/8, $0x0101010100000000
+
+GLOBL expandAVX512_32_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_32_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_32_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_32_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_32_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_32_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_32_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_32_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_32_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_32_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_32_inShuf1<>+0x00(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x08(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x10(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x18(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x20(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x28(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x30(SB)/8, $0x0303030302020202
+DATA  expandAVX512_32_inShuf1<>+0x38(SB)/8, $0x0303030302020202
+
+GLOBL expandAVX512_32_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_32_outShufLo+0x00(SB)/8, $0x0b0a090803020100
+DATA  expandAVX512_32_outShufLo+0x08(SB)/8, $0x1b1a191813121110
+DATA  expandAVX512_32_outShufLo+0x10(SB)/8, $0x2b2a292823222120
+DATA  expandAVX512_32_outShufLo+0x18(SB)/8, $0x3b3a393833323130
+DATA  expandAVX512_32_outShufLo+0x20(SB)/8, $0x0f0e0d0c07060504
+DATA  expandAVX512_32_outShufLo+0x28(SB)/8, $0x1f1e1d1c17161514
+DATA  expandAVX512_32_outShufLo+0x30(SB)/8, $0x2f2e2d2c27262524
+DATA  expandAVX512_32_outShufLo+0x38(SB)/8, $0x3f3e3d3c37363534
+
+TEXT expandAVX512_32<>(SB), NOSPLIT, $0-0 // In: AX = address of 64 input bytes (AX is not modified). Z1 and Z2 are written last and not consumed here, so they are presumably the outputs (confirm with caller). Overwrites Z0-Z4.
+       VMOVDQU64 expandAVX512_32_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_32_mat0<>(SB), Z1 // the single matrix table is shared by both stages
+       VMOVDQU64 expandAVX512_32_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_32_outShufLo(SB), Z3 // the single output shuffle is shared by both results
+       VMOVDQU64 (AX), Z4 // load the 64 input bytes
+       VPERMB Z4, Z0, Z0 // stage 0: gather input bytes using the inShuf0 byte indices
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0 // stage 0: transform each byte by the 8x8 bit matrix in the matching qword of mat0
+       VPERMB Z4, Z2, Z2 // stage 1: gather using inShuf1
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2 // stage 1: same matrix (mat0)
+       VPERMB Z0, Z3, Z1 // Z1 = stage-0 bytes reordered by outShufLo (mat0 in Z1 is no longer needed)
+       VPERMB Z2, Z3, Z2 // Z2 = stage-1 bytes reordered by the same outShufLo
+       RET
+
+GLOBL expandAVX512_36_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_36_inShuf0<>+0x00(SB)/8, $0x0101010100000000
+DATA  expandAVX512_36_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_36_inShuf0<>+0x10(SB)/8, $0x0101010100000000
+DATA  expandAVX512_36_inShuf0<>+0x18(SB)/8, $0x0101010100000000
+DATA  expandAVX512_36_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_36_inShuf0<>+0x28(SB)/8, $0x0101010100000000
+DATA  expandAVX512_36_inShuf0<>+0x30(SB)/8, $0x0101010100000000
+DATA  expandAVX512_36_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100
+
+GLOBL expandAVX512_36_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_36_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_36_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_36_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_36_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_36_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_36_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_36_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_36_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_36_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_36_inShuf1<>+0x00(SB)/8, $0x0101010100000000
+DATA  expandAVX512_36_inShuf1<>+0x08(SB)/8, $0xffffff0100000000
+DATA  expandAVX512_36_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_36_inShuf1<>+0x18(SB)/8, $0xffffffff00000000
+DATA  expandAVX512_36_inShuf1<>+0x20(SB)/8, $0xff02020202010101
+DATA  expandAVX512_36_inShuf1<>+0x28(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512_36_inShuf1<>+0x30(SB)/8, $0x0202020201010101
+DATA  expandAVX512_36_inShuf1<>+0x38(SB)/8, $0x0303030302020202
+
+GLOBL expandAVX512_36_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_36_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_36_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_36_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_36_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_36_mat1<>+0x20(SB)/8, $0x4040404040404040
+DATA  expandAVX512_36_mat1<>+0x28(SB)/8, $0x4040404080808080
+DATA  expandAVX512_36_mat1<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512_36_mat1<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512_36_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_36_inShuf2<>+0x00(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_36_inShuf2<>+0x08(SB)/8, $0x0303030302020202
+DATA  expandAVX512_36_inShuf2<>+0x10(SB)/8, $0x0303030302020202
+DATA  expandAVX512_36_inShuf2<>+0x18(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512_36_inShuf2<>+0x20(SB)/8, $0x0303030302020202
+DATA  expandAVX512_36_inShuf2<>+0x28(SB)/8, $0xffff030302020202
+DATA  expandAVX512_36_inShuf2<>+0x30(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_36_inShuf2<>+0x38(SB)/8, $0xffffffff02020202
+
+GLOBL expandAVX512_36_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_36_mat2<>+0x00(SB)/8, $0x0101010102020202
+DATA  expandAVX512_36_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_36_mat2<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_36_mat2<>+0x18(SB)/8, $0x0404040408080808
+DATA  expandAVX512_36_mat2<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512_36_mat2<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512_36_mat2<>+0x30(SB)/8, $0x1010101020202020
+DATA  expandAVX512_36_mat2<>+0x38(SB)/8, $0x2020202020202020
+
+GLOBL expandAVX512_36_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_36_outShufLo+0x00(SB)/8, $0x1211100803020100
+DATA  expandAVX512_36_outShufLo+0x08(SB)/8, $0x2928201b1a191813
+DATA  expandAVX512_36_outShufLo+0x10(SB)/8, $0x4038333231302b2a
+DATA  expandAVX512_36_outShufLo+0x18(SB)/8, $0x504b4a4948434241
+DATA  expandAVX512_36_outShufLo+0x20(SB)/8, $0x070605045b5a5958
+DATA  expandAVX512_36_outShufLo+0x28(SB)/8, $0x1e1d1c1716151409
+DATA  expandAVX512_36_outShufLo+0x30(SB)/8, $0x35342f2e2d2c211f
+DATA  expandAVX512_36_outShufLo+0x38(SB)/8, $0x4c47464544393736
+
+GLOBL expandAVX512_36_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_36_outShufHi+0x00(SB)/8, $0x3332313028222120
+DATA  expandAVX512_36_outShufHi+0x08(SB)/8, $0x4a4948403b3a3938
+DATA  expandAVX512_36_outShufHi+0x10(SB)/8, $0x616058535251504b
+DATA  expandAVX512_36_outShufHi+0x18(SB)/8, $0x78706b6a69686362
+DATA  expandAVX512_36_outShufHi+0x20(SB)/8, $0x29262524237b7a79
+DATA  expandAVX512_36_outShufHi+0x28(SB)/8, $0x3f3e3d3c37363534
+DATA  expandAVX512_36_outShufHi+0x30(SB)/8, $0x5655544f4e4d4c41
+DATA  expandAVX512_36_outShufHi+0x38(SB)/8, $0x6d6c676665645957
+
+TEXT expandAVX512_36<>(SB), NOSPLIT, $0-0 // In: AX = address of 64 input bytes (AX is not modified). Z1 and Z2 are written last and not consumed here, so they are presumably the outputs (confirm with caller). Overwrites Z0-Z5.
+       VMOVDQU64 expandAVX512_36_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_36_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512_36_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512_36_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_36_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5 // load the 64 input bytes
+       VPERMB Z5, Z0, Z0 // stage 0: gather input bytes using the inShuf0 byte indices
+       VGF2P8AFFINEQB $0, expandAVX512_36_mat0<>(SB), Z0, Z0 // stage 0: transform each byte by the 8x8 bit matrix in the matching qword of mat0
+       VPERMB Z5, Z3, Z3 // stage 1: gather using inShuf1
+       VGF2P8AFFINEQB $0, expandAVX512_36_mat1<>(SB), Z3, Z3 // stage 1: apply mat1
+       VPERMB Z5, Z4, Z4 // stage 2: gather using inShuf2
+       VGF2P8AFFINEQB $0, expandAVX512_36_mat2<>(SB), Z4, Z4 // stage 2: apply mat2
+       VPERMI2B Z3, Z0, Z1 // Z1 = bytes picked from Z0 (index bit 6 clear) or Z3 (bit 6 set) using the outShufLo indices held in Z1
+       VPERMI2B Z4, Z3, Z2 // Z2 = bytes picked from Z3 (index bit 6 clear) or Z4 (bit 6 set) using the outShufHi indices held in Z2
+       RET
+
+// Constant tables for expandAVX512_40. Each table is 0x40 bytes of RODATA.
+//
+// inShufN: loaded as the VPERMB index vector of stage N; each byte picks
+// one byte of the 64-byte input.
+// NOTE(review): VPERMB uses only the low 6 bits of an index, so 0xff
+// entries still select a byte; they look like don't-care lanes -- confirm
+// against the generator.
+//
+// matN: the matrix operand of stage N's VGF2P8AFFINEQB, one 8x8 bit matrix
+// per qword. Every matrix byte is one-hot or zero, so each output bit is a
+// copy of a single input bit (or zero).
+GLOBL expandAVX512_40_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_inShuf0<>+0x00(SB)/8, $0x0101010000000000
+DATA  expandAVX512_40_inShuf0<>+0x08(SB)/8, $0x0101010000000000
+DATA  expandAVX512_40_inShuf0<>+0x10(SB)/8, $0x0101010000000000
+DATA  expandAVX512_40_inShuf0<>+0x18(SB)/8, $0x0101010000000000
+DATA  expandAVX512_40_inShuf0<>+0x20(SB)/8, $0x0101010000000000
+DATA  expandAVX512_40_inShuf0<>+0x28(SB)/8, $0xffffff0000000000
+DATA  expandAVX512_40_inShuf0<>+0x30(SB)/8, $0xffffff0000000000
+DATA  expandAVX512_40_inShuf0<>+0x38(SB)/8, $0xffffff0000000000
+
+GLOBL expandAVX512_40_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_40_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_40_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_40_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_40_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_40_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_40_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_40_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_40_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_inShuf1<>+0x00(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_40_inShuf1<>+0x08(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_40_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_40_inShuf1<>+0x18(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_40_inShuf1<>+0x20(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512_40_inShuf1<>+0x28(SB)/8, $0xffff020202020201
+DATA  expandAVX512_40_inShuf1<>+0x30(SB)/8, $0x0202020101010101
+DATA  expandAVX512_40_inShuf1<>+0x38(SB)/8, $0x0202020101010101
+
+GLOBL expandAVX512_40_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_mat1<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_40_mat1<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_40_mat1<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_40_mat1<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_40_mat1<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_40_mat1<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512_40_mat1<>+0x30(SB)/8, $0x2020202020202020
+DATA  expandAVX512_40_mat1<>+0x38(SB)/8, $0x4040404040404040
+
+GLOBL expandAVX512_40_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_inShuf2<>+0x00(SB)/8, $0x0202020101010101
+DATA  expandAVX512_40_inShuf2<>+0x08(SB)/8, $0x0303030202020202
+DATA  expandAVX512_40_inShuf2<>+0x10(SB)/8, $0x0303030202020202
+DATA  expandAVX512_40_inShuf2<>+0x18(SB)/8, $0xffffff0202020202
+DATA  expandAVX512_40_inShuf2<>+0x20(SB)/8, $0xffffff0202020202
+DATA  expandAVX512_40_inShuf2<>+0x28(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_40_inShuf2<>+0x30(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_40_inShuf2<>+0x38(SB)/8, $0xffffffffffff0202
+
+GLOBL expandAVX512_40_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_mat2<>+0x00(SB)/8, $0x8080808080808080
+DATA  expandAVX512_40_mat2<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512_40_mat2<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_40_mat2<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_40_mat2<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512_40_mat2<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_40_mat2<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_40_mat2<>+0x38(SB)/8, $0x8080808080808080
+
+// Stage 3 only populates its first qword; the remaining qwords pair 0xff
+// selectors with all-zero matrices in mat3, which yield zero bytes.
+GLOBL expandAVX512_40_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_inShuf3<>+0x00(SB)/8, $0xffffffffffff0303
+DATA  expandAVX512_40_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_40_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512_40_mat3<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_40_mat3<>+0x08(SB)/8, $0x0000000000000000
+DATA  expandAVX512_40_mat3<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512_40_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512_40_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_40_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_40_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_40_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+// Output shuffles. These symbols are declared without the <> suffix, so
+// they are not file-local. outShufLo is the VPERMI2B index vector over
+// stages 0 and 1.
+GLOBL expandAVX512_40_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_40_outShufLo+0x00(SB)/8, $0x0a09080403020100
+DATA  expandAVX512_40_outShufLo+0x08(SB)/8, $0x1814131211100c0b
+DATA  expandAVX512_40_outShufLo+0x10(SB)/8, $0x232221201c1b1a19
+DATA  expandAVX512_40_outShufLo+0x18(SB)/8, $0x31302c2b2a292824
+DATA  expandAVX512_40_outShufLo+0x20(SB)/8, $0x3c3b3a3938343332
+DATA  expandAVX512_40_outShufLo+0x28(SB)/8, $0x0f0e0d4140070605
+DATA  expandAVX512_40_outShufLo+0x30(SB)/8, $0x1d51501716154948
+DATA  expandAVX512_40_outShufLo+0x38(SB)/8, $0x6027262559581f1e
+
+// outShufHi0 is the masked VPERMI2B index vector over stages 1 and 2. Its
+// 0xff entries sit in the lanes cleared by the routine's first K1 mask.
+GLOBL expandAVX512_40_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512_40_outShufHi0+0x00(SB)/8, $0x3938343332313028
+DATA  expandAVX512_40_outShufHi0+0x08(SB)/8, $0x44434241403c3b3a
+DATA  expandAVX512_40_outShufHi0+0x10(SB)/8, $0x5251504c4b4a4948
+DATA  expandAVX512_40_outShufHi0+0x18(SB)/8, $0x605c5b5a59585453
+DATA  expandAVX512_40_outShufHi0+0x20(SB)/8, $0x2c2b2a2964636261
+DATA  expandAVX512_40_outShufHi0+0x28(SB)/8, $0x3e3d69683736352d
+DATA  expandAVX512_40_outShufHi0+0x30(SB)/8, $0x797847464571703f
+DATA  expandAVX512_40_outShufHi0+0x38(SB)/8, $0x575655ffff4f4e4d
+
+// outShufHi1 is the masked VPERMB index vector over stage 3; only the
+// lanes kept by the routine's second K1 mask hold real indices.
+GLOBL expandAVX512_40_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512_40_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_40_outShufHi1+0x38(SB)/8, $0xffffff0100ffffff
+
+// expandAVX512_40 loads 64 bytes from the address in AX and runs them
+// through four VPERMB + VGF2P8AFFINEQB stages (Z0, Z2, Z3, Z4). Z1 is
+// produced by one VPERMI2B over stages 0 and 1. Z2 is assembled from two
+// zero-masked permutes whose K1 masks are complementary, ORed together.
+// Results are left in Z1 and Z2 (nothing is stored to memory); AX, K1, Z0
+// and Z3-Z7 are clobbered.
+// NOTE(review): Z1/Z2 are presumably the low/high 64 bytes of the expanded
+// output, going by the outShufLo/outShufHi names -- confirm with the caller.
+TEXT expandAVX512_40<>(SB), NOSPLIT, $0-0
+       // Load the per-stage input byte selectors and the output shuffles.
+       VMOVDQU64 expandAVX512_40_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_40_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_40_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_40_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_40_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_40_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_40_outShufHi1(SB), Z6
+       // Z7 = 64 input bytes.
+       VMOVDQU64 (AX), Z7
+       // Stages 0-3: gather input bytes by inShufN, then apply the
+       // per-qword bit matrices in matN (constant term $0).
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512_40_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512_40_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512_40_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_40_mat3<>(SB), Z4, Z4
+       // Z1 (holding outShufLo) is overwritten with bytes picked from Z0/Z2.
+       VPERMI2B Z2, Z0, Z1
+       // AX is no longer needed as the input pointer; reuse it to build K1.
+       // This mask keeps every lane except 59 and 60.
+       MOVQ $0xe7ffffffffffffff, AX
+       KMOVQ AX, K1
+       // Z5 (holding outShufHi0) = bytes picked from Z2/Z3, masked-off lanes zeroed.
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       // Complement of the previous mask: only lanes 59 and 60.
+       MOVQ $0x1800000000000000, AX
+       KMOVQ AX, K1
+       // Z0 (stage 0 output, already consumed) = bytes of Z4 picked by
+       // outShufHi1 in those lanes, zero elsewhere.
+       VPERMB.Z Z4, Z6, K1, Z0
+       // The two partial results cover disjoint lanes, so OR merges them.
+       VPORQ Z0, Z5, Z2
+       RET
+
+// Constant tables for expandAVX512_44. Each table is 0x40 bytes of RODATA.
+//
+// inShufN: loaded as the VPERMB index vector of stage N; each byte picks
+// one byte of the 64-byte input.
+// NOTE(review): VPERMB uses only the low 6 bits of an index, so 0xff
+// entries still select a byte; they look like don't-care lanes -- confirm
+// against the generator.
+//
+// matN: the matrix operand of stage N's VGF2P8AFFINEQB, one 8x8 bit matrix
+// per qword. Every matrix byte is one-hot or zero, so each output bit is a
+// copy of a single input bit (or zero).
+GLOBL expandAVX512_44_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_inShuf0<>+0x00(SB)/8, $0x0101010000000000
+DATA  expandAVX512_44_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_44_inShuf0<>+0x10(SB)/8, $0x0101010000000000
+DATA  expandAVX512_44_inShuf0<>+0x18(SB)/8, $0x0101010000000000
+DATA  expandAVX512_44_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_44_inShuf0<>+0x28(SB)/8, $0x0101010000000000
+DATA  expandAVX512_44_inShuf0<>+0x30(SB)/8, $0xffffff0000000000
+DATA  expandAVX512_44_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
+
+GLOBL expandAVX512_44_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_44_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_44_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_44_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_44_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_44_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_44_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_44_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_44_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_inShuf1<>+0x00(SB)/8, $0xffffff0000000000
+DATA  expandAVX512_44_inShuf1<>+0x08(SB)/8, $0xffffff0000000000
+DATA  expandAVX512_44_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_44_inShuf1<>+0x18(SB)/8, $0xffffff0000000000
+DATA  expandAVX512_44_inShuf1<>+0x20(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_44_inShuf1<>+0x28(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_44_inShuf1<>+0x30(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_44_inShuf1<>+0x38(SB)/8, $0xff02020202020101
+
+GLOBL expandAVX512_44_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_44_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_44_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_44_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_44_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_44_mat1<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512_44_mat1<>+0x30(SB)/8, $0x0404040404040404
+DATA  expandAVX512_44_mat1<>+0x38(SB)/8, $0x0808080808080808
+
+GLOBL expandAVX512_44_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_inShuf2<>+0x00(SB)/8, $0x0202020101010101
+DATA  expandAVX512_44_inShuf2<>+0x08(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512_44_inShuf2<>+0x10(SB)/8, $0x0202020101010101
+DATA  expandAVX512_44_inShuf2<>+0x18(SB)/8, $0x0202020101010101
+DATA  expandAVX512_44_inShuf2<>+0x20(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512_44_inShuf2<>+0x28(SB)/8, $0xffff020101010101
+DATA  expandAVX512_44_inShuf2<>+0x30(SB)/8, $0xffffff0202020202
+DATA  expandAVX512_44_inShuf2<>+0x38(SB)/8, $0xffffffffffffff02
+
+GLOBL expandAVX512_44_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_mat2<>+0x00(SB)/8, $0x1010101010101010
+DATA  expandAVX512_44_mat2<>+0x08(SB)/8, $0x1010101020202020
+DATA  expandAVX512_44_mat2<>+0x10(SB)/8, $0x2020202020202020
+DATA  expandAVX512_44_mat2<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512_44_mat2<>+0x20(SB)/8, $0x4040404080808080
+DATA  expandAVX512_44_mat2<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512_44_mat2<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512_44_mat2<>+0x38(SB)/8, $0x0101010102020202
+
+// The last two qwords of stage 3 pair 0xff selectors with all-zero
+// matrices in mat3, which yield zero bytes.
+GLOBL expandAVX512_44_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_inShuf3<>+0x00(SB)/8, $0xffffff0202020202
+DATA  expandAVX512_44_inShuf3<>+0x08(SB)/8, $0xffffff0202020202
+DATA  expandAVX512_44_inShuf3<>+0x10(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_44_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_44_inShuf3<>+0x20(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_44_inShuf3<>+0x28(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_44_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_44_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_44_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512_44_mat3<>+0x00(SB)/8, $0x0202020202020202
+DATA  expandAVX512_44_mat3<>+0x08(SB)/8, $0x0404040404040404
+DATA  expandAVX512_44_mat3<>+0x10(SB)/8, $0x0404040408080808
+DATA  expandAVX512_44_mat3<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512_44_mat3<>+0x20(SB)/8, $0x2020202020202020
+DATA  expandAVX512_44_mat3<>+0x28(SB)/8, $0x4040404040404040
+DATA  expandAVX512_44_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_44_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+// Output shuffles. These symbols are declared without the <> suffix, so
+// they are not file-local. outShufLo is the VPERMI2B index vector over
+// stages 0 and 1.
+GLOBL expandAVX512_44_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_44_outShufLo+0x00(SB)/8, $0x1110080403020100
+DATA  expandAVX512_44_outShufLo+0x08(SB)/8, $0x1c1b1a1918141312
+DATA  expandAVX512_44_outShufLo+0x10(SB)/8, $0x31302c2b2a292820
+DATA  expandAVX512_44_outShufLo+0x18(SB)/8, $0x4342414038343332
+DATA  expandAVX512_44_outShufLo+0x20(SB)/8, $0x58504c4b4a494844
+DATA  expandAVX512_44_outShufLo+0x28(SB)/8, $0x600706055c5b5a59
+DATA  expandAVX512_44_outShufLo+0x30(SB)/8, $0x1d69681716150961
+DATA  expandAVX512_44_outShufLo+0x38(SB)/8, $0x2f2e2d2171701f1e
+
+// outShufHi0 is the masked VPERMI2B index vector over stages 1 and 2. Its
+// 0xff entries sit in the lanes cleared by the routine's first K1 mask.
+GLOBL expandAVX512_44_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512_44_outShufHi0+0x00(SB)/8, $0x4844434241403938
+DATA  expandAVX512_44_outShufHi0+0x08(SB)/8, $0x5a59585453525150
+DATA  expandAVX512_44_outShufHi0+0x10(SB)/8, $0x6c6b6a6968605c5b
+DATA  expandAVX512_44_outShufHi0+0x18(SB)/8, $0xffff787473727170
+DATA  expandAVX512_44_outShufHi0+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_44_outShufHi0+0x28(SB)/8, $0x46453e3d3c3b3aff
+DATA  expandAVX512_44_outShufHi0+0x30(SB)/8, $0xff57565549ffff47
+DATA  expandAVX512_44_outShufHi0+0x38(SB)/8, $0x6d61ffff5f5e5dff
+
+// outShufHi1 is the masked VPERMB index vector over stage 3; only the
+// lanes kept by the routine's second K1 mask hold real indices.
+GLOBL expandAVX512_44_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512_44_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_44_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_44_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_44_outShufHi1+0x18(SB)/8, $0x0100ffffffffffff
+DATA  expandAVX512_44_outShufHi1+0x20(SB)/8, $0x0c0b0a0908040302
+DATA  expandAVX512_44_outShufHi1+0x28(SB)/8, $0xffffffffffffff10
+DATA  expandAVX512_44_outShufHi1+0x30(SB)/8, $0x20ffffffff1918ff
+DATA  expandAVX512_44_outShufHi1+0x38(SB)/8, $0xffff2928ffffff21
+
+// expandAVX512_44 loads 64 bytes from the address in AX and runs them
+// through four VPERMB + VGF2P8AFFINEQB stages (Z0, Z2, Z3, Z4). Z1 is
+// produced by one VPERMI2B over stages 0 and 1. Z2 is assembled from two
+// zero-masked permutes whose K1 masks are complementary, ORed together.
+// Results are left in Z1 and Z2 (nothing is stored to memory); AX, K1, Z0
+// and Z3-Z7 are clobbered.
+// NOTE(review): Z1/Z2 are presumably the low/high 64 bytes of the expanded
+// output, going by the outShufLo/outShufHi names -- confirm with the caller.
+TEXT expandAVX512_44<>(SB), NOSPLIT, $0-0
+       // Load the per-stage input byte selectors and the output shuffles.
+       VMOVDQU64 expandAVX512_44_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_44_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_44_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_44_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_44_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_44_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_44_outShufHi1(SB), Z6
+       // Z7 = 64 input bytes.
+       VMOVDQU64 (AX), Z7
+       // Stages 0-3: gather input bytes by inShufN, then apply the
+       // per-qword bit matrices in matN (constant term $0).
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512_44_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512_44_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512_44_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_44_mat3<>(SB), Z4, Z4
+       // Z1 (holding outShufLo) is overwritten with bytes picked from Z0/Z2.
+       VPERMI2B Z2, Z0, Z1
+       // AX is no longer needed as the input pointer; reuse it to build K1.
+       // Lanes set in this mask take their byte from stages 1/2.
+       MOVQ $0xce79fe003fffffff, AX
+       KMOVQ AX, K1
+       // Z5 (holding outShufHi0) = bytes picked from Z2/Z3, masked-off lanes zeroed.
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       // Bitwise complement of the previous mask: lanes fed by stage 3.
+       MOVQ $0x318601ffc0000000, AX
+       KMOVQ AX, K1
+       // Z0 (stage 0 output, already consumed) = bytes of Z4 picked by
+       // outShufHi1 in those lanes, zero elsewhere.
+       VPERMB.Z Z4, Z6, K1, Z0
+       // The two partial results cover disjoint lanes, so OR merges them.
+       VPORQ Z0, Z5, Z2
+       RET
+
+// Constant tables for expandAVX512_48. Each table is 0x40 bytes of RODATA.
+//
+// inShufN: loaded as the VPERMB index vector of stage N; each byte picks
+// one byte of the 64-byte input.
+// NOTE(review): VPERMB uses only the low 6 bits of an index, so 0xff
+// entries still select a byte; they look like don't-care lanes -- confirm
+// against the generator.
+//
+// matN: the matrix operand of stage N's VGF2P8AFFINEQB, one 8x8 bit matrix
+// per qword. Every matrix byte is one-hot or zero, so each output bit is a
+// copy of a single input bit (or zero).
+GLOBL expandAVX512_48_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_48_inShuf0<>+0x00(SB)/8, $0x0101000000000000
+DATA  expandAVX512_48_inShuf0<>+0x08(SB)/8, $0x0101000000000000
+DATA  expandAVX512_48_inShuf0<>+0x10(SB)/8, $0x0101000000000000
+DATA  expandAVX512_48_inShuf0<>+0x18(SB)/8, $0xffff000000000000
+DATA  expandAVX512_48_inShuf0<>+0x20(SB)/8, $0xffff000000000000
+DATA  expandAVX512_48_inShuf0<>+0x28(SB)/8, $0xffff000000000000
+DATA  expandAVX512_48_inShuf0<>+0x30(SB)/8, $0xffff000000000000
+DATA  expandAVX512_48_inShuf0<>+0x38(SB)/8, $0xffff000000000000
+
+GLOBL expandAVX512_48_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_48_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_48_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_48_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_48_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_48_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_48_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_48_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_48_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_48_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_48_inShuf1<>+0x00(SB)/8, $0xffffffff01010101
+DATA  expandAVX512_48_inShuf1<>+0x08(SB)/8, $0xffffffff01010101
+DATA  expandAVX512_48_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512_48_inShuf1<>+0x18(SB)/8, $0x0202020202020101
+DATA  expandAVX512_48_inShuf1<>+0x20(SB)/8, $0x0202010101010101
+DATA  expandAVX512_48_inShuf1<>+0x28(SB)/8, $0x0202010101010101
+DATA  expandAVX512_48_inShuf1<>+0x30(SB)/8, $0x0202010101010101
+DATA  expandAVX512_48_inShuf1<>+0x38(SB)/8, $0xffff010101010101
+
+GLOBL expandAVX512_48_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_48_mat1<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_48_mat1<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_48_mat1<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_48_mat1<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_48_mat1<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512_48_mat1<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512_48_mat1<>+0x30(SB)/8, $0x2020202020202020
+DATA  expandAVX512_48_mat1<>+0x38(SB)/8, $0x4040404040404040
+
+// The last three qwords of stage 2 pair 0xff selectors with all-zero
+// matrices in mat2, which yield zero bytes.
+GLOBL expandAVX512_48_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_48_inShuf2<>+0x00(SB)/8, $0xffff010101010101
+DATA  expandAVX512_48_inShuf2<>+0x08(SB)/8, $0xffff020202020202
+DATA  expandAVX512_48_inShuf2<>+0x10(SB)/8, $0xffff020202020202
+DATA  expandAVX512_48_inShuf2<>+0x18(SB)/8, $0xffffffff02020202
+DATA  expandAVX512_48_inShuf2<>+0x20(SB)/8, $0xffffffff02020202
+DATA  expandAVX512_48_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_48_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_48_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_48_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_48_mat2<>+0x00(SB)/8, $0x8080808080808080
+DATA  expandAVX512_48_mat2<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512_48_mat2<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_48_mat2<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_48_mat2<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_48_mat2<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_48_mat2<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_48_mat2<>+0x38(SB)/8, $0x0000000000000000
+
+// Output shuffles. These symbols are declared without the <> suffix, so
+// they are not file-local. outShufLo is the VPERMI2B index vector over
+// stages 0 and 1; outShufHi is the one over stages 1 and 2.
+GLOBL expandAVX512_48_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_48_outShufLo+0x00(SB)/8, $0x0908050403020100
+DATA  expandAVX512_48_outShufLo+0x08(SB)/8, $0x131211100d0c0b0a
+DATA  expandAVX512_48_outShufLo+0x10(SB)/8, $0x1d1c1b1a19181514
+DATA  expandAVX512_48_outShufLo+0x18(SB)/8, $0x2928252423222120
+DATA  expandAVX512_48_outShufLo+0x20(SB)/8, $0x333231302d2c2b2a
+DATA  expandAVX512_48_outShufLo+0x28(SB)/8, $0x3d3c3b3a39383534
+DATA  expandAVX512_48_outShufLo+0x30(SB)/8, $0x0f0e434241400706
+DATA  expandAVX512_48_outShufLo+0x38(SB)/8, $0x515017164b4a4948
+
+GLOBL expandAVX512_48_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_48_outShufHi+0x00(SB)/8, $0x2524232221201918
+DATA  expandAVX512_48_outShufHi+0x08(SB)/8, $0x31302d2c2b2a2928
+DATA  expandAVX512_48_outShufHi+0x10(SB)/8, $0x3b3a393835343332
+DATA  expandAVX512_48_outShufHi+0x18(SB)/8, $0x4544434241403d3c
+DATA  expandAVX512_48_outShufHi+0x20(SB)/8, $0x51504d4c4b4a4948
+DATA  expandAVX512_48_outShufHi+0x28(SB)/8, $0x1d1c1b1a55545352
+DATA  expandAVX512_48_outShufHi+0x30(SB)/8, $0x5b5a595827261f1e
+DATA  expandAVX512_48_outShufHi+0x38(SB)/8, $0x3736636261602f2e
+
+// expandAVX512_48 loads 64 bytes from the address in AX, runs them through
+// three VPERMB + VGF2P8AFFINEQB stages, and merges the stage outputs with
+// two VPERMI2B shuffles. Results are left in Z1 and Z2 (nothing is stored
+// to memory); Z0 and Z3-Z5 are clobbered.
+// NOTE(review): Z1/Z2 are presumably the low/high 64 bytes of the expanded
+// output, going by the outShufLo/outShufHi names -- confirm with the caller.
+TEXT expandAVX512_48<>(SB), NOSPLIT, $0-0
+       // Load the per-stage input byte selectors and the output shuffles.
+       VMOVDQU64 expandAVX512_48_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_48_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512_48_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512_48_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_48_outShufHi(SB), Z2
+       // Z5 = 64 input bytes.
+       VMOVDQU64 (AX), Z5
+       // Stages 0-2: gather input bytes by inShufN, then apply the
+       // per-qword bit matrices in matN (constant term $0).
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512_48_mat0<>(SB), Z0, Z0
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512_48_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_48_mat2<>(SB), Z4, Z4
+       // Z1 (holding outShufLo) is overwritten with bytes picked from Z0/Z3.
+       VPERMI2B Z3, Z0, Z1
+       // Z2 (holding outShufHi) is overwritten with bytes picked from Z3/Z4.
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+// Constant tables for expandAVX512_52. Each table is 0x40 bytes of RODATA.
+//
+// inShufN: loaded as the VPERMB index vector of stage N; each byte picks
+// one byte of the 64-byte input.
+// NOTE(review): VPERMB uses only the low 6 bits of an index, so 0xff
+// entries still select a byte; they look like don't-care lanes -- confirm
+// against the generator.
+//
+// matN: the matrix operand of stage N's VGF2P8AFFINEQB, one 8x8 bit matrix
+// per qword. Every matrix byte is one-hot or zero, so each output bit is a
+// copy of a single input bit (or zero).
+GLOBL expandAVX512_52_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_inShuf0<>+0x00(SB)/8, $0x0101000000000000
+DATA  expandAVX512_52_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512_52_inShuf0<>+0x10(SB)/8, $0x0101000000000000
+DATA  expandAVX512_52_inShuf0<>+0x18(SB)/8, $0xffff000000000000
+DATA  expandAVX512_52_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_52_inShuf0<>+0x28(SB)/8, $0xffff000000000000
+DATA  expandAVX512_52_inShuf0<>+0x30(SB)/8, $0xffff000000000000
+DATA  expandAVX512_52_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
+
+GLOBL expandAVX512_52_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_52_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_52_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_52_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_52_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_52_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_52_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_52_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_52_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_inShuf1<>+0x00(SB)/8, $0xffff000000000000
+DATA  expandAVX512_52_inShuf1<>+0x08(SB)/8, $0xffff000000000000
+DATA  expandAVX512_52_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_52_inShuf1<>+0x18(SB)/8, $0xffff000000000000
+DATA  expandAVX512_52_inShuf1<>+0x20(SB)/8, $0xffffffff01010101
+DATA  expandAVX512_52_inShuf1<>+0x28(SB)/8, $0xffffffffff010101
+DATA  expandAVX512_52_inShuf1<>+0x30(SB)/8, $0xff02020202020201
+DATA  expandAVX512_52_inShuf1<>+0x38(SB)/8, $0x0202010101010101
+
+GLOBL expandAVX512_52_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_52_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_52_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_52_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_52_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_52_mat1<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512_52_mat1<>+0x30(SB)/8, $0x0202020202020202
+DATA  expandAVX512_52_mat1<>+0x38(SB)/8, $0x0404040404040404
+
+GLOBL expandAVX512_52_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_inShuf2<>+0x00(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512_52_inShuf2<>+0x08(SB)/8, $0x0202010101010101
+DATA  expandAVX512_52_inShuf2<>+0x10(SB)/8, $0xffff010101010101
+DATA  expandAVX512_52_inShuf2<>+0x18(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512_52_inShuf2<>+0x20(SB)/8, $0xffff010101010101
+DATA  expandAVX512_52_inShuf2<>+0x28(SB)/8, $0xffff010101010101
+DATA  expandAVX512_52_inShuf2<>+0x30(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512_52_inShuf2<>+0x38(SB)/8, $0xffff010101010101
+
+GLOBL expandAVX512_52_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_mat2<>+0x00(SB)/8, $0x0404040408080808
+DATA  expandAVX512_52_mat2<>+0x08(SB)/8, $0x0808080808080808
+DATA  expandAVX512_52_mat2<>+0x10(SB)/8, $0x1010101010101010
+DATA  expandAVX512_52_mat2<>+0x18(SB)/8, $0x1010101020202020
+DATA  expandAVX512_52_mat2<>+0x20(SB)/8, $0x2020202020202020
+DATA  expandAVX512_52_mat2<>+0x28(SB)/8, $0x4040404040404040
+DATA  expandAVX512_52_mat2<>+0x30(SB)/8, $0x4040404080808080
+DATA  expandAVX512_52_mat2<>+0x38(SB)/8, $0x8080808080808080
+
+// The last four qwords of stage 3 pair 0xff selectors with all-zero
+// matrices in mat3, which yield zero bytes.
+GLOBL expandAVX512_52_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_inShuf3<>+0x00(SB)/8, $0xffff020202020202
+DATA  expandAVX512_52_inShuf3<>+0x08(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_52_inShuf3<>+0x10(SB)/8, $0xffffffff02020202
+DATA  expandAVX512_52_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_52_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_52_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512_52_mat3<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_52_mat3<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_52_mat3<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_52_mat3<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_52_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_52_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_52_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_52_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+// Output shuffles. These symbols are declared without the <> suffix, so
+// they are not file-local. outShufLo is the VPERMI2B index vector over
+// stages 0 and 1.
+GLOBL expandAVX512_52_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_52_outShufLo+0x00(SB)/8, $0x1008050403020100
+DATA  expandAVX512_52_outShufLo+0x08(SB)/8, $0x1a19181514131211
+DATA  expandAVX512_52_outShufLo+0x10(SB)/8, $0x2b2a2928201d1c1b
+DATA  expandAVX512_52_outShufLo+0x18(SB)/8, $0x3534333231302d2c
+DATA  expandAVX512_52_outShufLo+0x20(SB)/8, $0x4845444342414038
+DATA  expandAVX512_52_outShufLo+0x28(SB)/8, $0x5958504d4c4b4a49
+DATA  expandAVX512_52_outShufLo+0x30(SB)/8, $0x616007065d5c5b5a
+DATA  expandAVX512_52_outShufLo+0x38(SB)/8, $0x6a69681716096362
+
+// outShufHi0 is the masked VPERMI2B index vector over stages 1 and 2. Its
+// 0xff entries sit in the lanes cleared by the routine's first K1 mask.
+GLOBL expandAVX512_52_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512_52_outShufHi0+0x00(SB)/8, $0x403d3c3b3a393830
+DATA  expandAVX512_52_outShufHi0+0x08(SB)/8, $0x51504d4c4b4a4948
+DATA  expandAVX512_52_outShufHi0+0x10(SB)/8, $0x6261605855545352
+DATA  expandAVX512_52_outShufHi0+0x18(SB)/8, $0x6c6b6a6968656463
+DATA  expandAVX512_52_outShufHi0+0x20(SB)/8, $0x7d7c7b7a7978706d
+DATA  expandAVX512_52_outShufHi0+0x28(SB)/8, $0x31ffffffffffffff
+DATA  expandAVX512_52_outShufHi0+0x30(SB)/8, $0xff3f3e3635343332
+DATA  expandAVX512_52_outShufHi0+0x38(SB)/8, $0xffff4f4e41ffffff
+
+// outShufHi1 is the masked VPERMB index vector over stage 3; only the
+// lanes kept by the routine's second K1 mask hold real indices.
+GLOBL expandAVX512_52_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512_52_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_52_outShufHi1+0x28(SB)/8, $0xff08050403020100
+DATA  expandAVX512_52_outShufHi1+0x30(SB)/8, $0x10ffffffffffffff
+DATA  expandAVX512_52_outShufHi1+0x38(SB)/8, $0x1918ffffff131211
+
+// expandAVX512_52 loads 64 bytes from the address in AX and runs them
+// through four VPERMB + VGF2P8AFFINEQB stages (Z0, Z2, Z3, Z4). Z1 is
+// produced by one VPERMI2B over stages 0 and 1. Z2 is assembled from two
+// zero-masked permutes whose K1 masks are complementary, ORed together.
+// Results are left in Z1 and Z2 (nothing is stored to memory); AX, K1, Z0
+// and Z3-Z7 are clobbered.
+// NOTE(review): Z1/Z2 are presumably the low/high 64 bytes of the expanded
+// output, going by the outShufLo/outShufHi names -- confirm with the caller.
+TEXT expandAVX512_52<>(SB), NOSPLIT, $0-0
+       // Load the per-stage input byte selectors and the output shuffles.
+       VMOVDQU64 expandAVX512_52_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_52_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_52_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_52_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_52_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_52_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_52_outShufHi1(SB), Z6
+       // Z7 = 64 input bytes.
+       VMOVDQU64 (AX), Z7
+       // Stages 0-3: gather input bytes by inShufN, then apply the
+       // per-qword bit matrices in matN (constant term $0).
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512_52_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512_52_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512_52_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_52_mat3<>(SB), Z4, Z4
+       // Z1 (holding outShufLo) is overwritten with bytes picked from Z0/Z2.
+       VPERMI2B Z2, Z0, Z1
+       // AX is no longer needed as the input pointer; reuse it to build K1.
+       // Lanes set in this mask take their byte from stages 1/2.
+       MOVQ $0x387f80ffffffffff, AX
+       KMOVQ AX, K1
+       // Z5 (holding outShufHi0) = bytes picked from Z2/Z3, masked-off lanes zeroed.
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       // Bitwise complement of the previous mask: lanes fed by stage 3.
+       MOVQ $0xc7807f0000000000, AX
+       KMOVQ AX, K1
+       // Z0 (stage 0 output, already consumed) = bytes of Z4 picked by
+       // outShufHi1 in those lanes, zero elsewhere.
+       VPERMB.Z Z4, Z6, K1, Z0
+       // The two partial results cover disjoint lanes, so OR merges them.
+       VPORQ Z0, Z5, Z2
+       RET
+
+// Constant tables for expandAVX512_56. Each table is 0x40 bytes of RODATA.
+//
+// inShufN: loaded as the VPERMB index vector of stage N; each byte picks
+// one byte of the 64-byte input.
+// NOTE(review): VPERMB uses only the low 6 bits of an index, so 0xff
+// entries still select a byte; they look like don't-care lanes -- confirm
+// against the generator.
+//
+// matN: the matrix operand of VGF2P8AFFINEQB, one 8x8 bit matrix per
+// qword. Every matrix byte is one-hot or zero, so each output bit is a
+// copy of a single input bit (or zero). There is no mat1 table here:
+// expandAVX512_56 applies mat0 to both stage 0 and stage 1.
+GLOBL expandAVX512_56_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512_56_inShuf0<>+0x00(SB)/8, $0x0100000000000000
+DATA  expandAVX512_56_inShuf0<>+0x08(SB)/8, $0x0100000000000000
+DATA  expandAVX512_56_inShuf0<>+0x10(SB)/8, $0xff00000000000000
+DATA  expandAVX512_56_inShuf0<>+0x18(SB)/8, $0xff00000000000000
+DATA  expandAVX512_56_inShuf0<>+0x20(SB)/8, $0xff00000000000000
+DATA  expandAVX512_56_inShuf0<>+0x28(SB)/8, $0xff00000000000000
+DATA  expandAVX512_56_inShuf0<>+0x30(SB)/8, $0xff00000000000000
+DATA  expandAVX512_56_inShuf0<>+0x38(SB)/8, $0xff00000000000000
+
+GLOBL expandAVX512_56_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512_56_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_56_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_56_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_56_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_56_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_56_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_56_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_56_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_56_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512_56_inShuf1<>+0x00(SB)/8, $0xffff010101010101
+DATA  expandAVX512_56_inShuf1<>+0x08(SB)/8, $0x0202010101010101
+DATA  expandAVX512_56_inShuf1<>+0x10(SB)/8, $0x0201010101010101
+DATA  expandAVX512_56_inShuf1<>+0x18(SB)/8, $0xff01010101010101
+DATA  expandAVX512_56_inShuf1<>+0x20(SB)/8, $0xff01010101010101
+DATA  expandAVX512_56_inShuf1<>+0x28(SB)/8, $0xff01010101010101
+DATA  expandAVX512_56_inShuf1<>+0x30(SB)/8, $0xff01010101010101
+DATA  expandAVX512_56_inShuf1<>+0x38(SB)/8, $0xff01010101010101
+
+// The last five qwords of stage 2 pair 0xff selectors with all-zero
+// matrices in mat2, which yield zero bytes.
+GLOBL expandAVX512_56_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512_56_inShuf2<>+0x00(SB)/8, $0xff02020202020202
+DATA  expandAVX512_56_inShuf2<>+0x08(SB)/8, $0xffffff0202020202
+DATA  expandAVX512_56_inShuf2<>+0x10(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512_56_inShuf2<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_56_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_56_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_56_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_56_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_56_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512_56_mat2<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_56_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_56_mat2<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_56_mat2<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512_56_mat2<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_56_mat2<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_56_mat2<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_56_mat2<>+0x38(SB)/8, $0x0000000000000000
+
+// Output shuffles. These symbols are declared without the <> suffix, so
+// they are not file-local. outShufLo is the VPERMI2B index vector over
+// stages 0 and 1; outShufHi is the one over stages 1 and 2.
+GLOBL expandAVX512_56_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512_56_outShufLo+0x00(SB)/8, $0x0806050403020100
+DATA  expandAVX512_56_outShufLo+0x08(SB)/8, $0x11100e0d0c0b0a09
+DATA  expandAVX512_56_outShufLo+0x10(SB)/8, $0x1a19181615141312
+DATA  expandAVX512_56_outShufLo+0x18(SB)/8, $0x232221201e1d1c1b
+DATA  expandAVX512_56_outShufLo+0x20(SB)/8, $0x2c2b2a2928262524
+DATA  expandAVX512_56_outShufLo+0x28(SB)/8, $0x3534333231302e2d
+DATA  expandAVX512_56_outShufLo+0x30(SB)/8, $0x3e3d3c3b3a393836
+DATA  expandAVX512_56_outShufLo+0x38(SB)/8, $0x0f45444342414007
+
+GLOBL expandAVX512_56_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512_56_outShufHi+0x00(SB)/8, $0x11100d0c0b0a0908
+DATA  expandAVX512_56_outShufHi+0x08(SB)/8, $0x1a19181615141312
+DATA  expandAVX512_56_outShufHi+0x10(SB)/8, $0x232221201e1d1c1b
+DATA  expandAVX512_56_outShufHi+0x18(SB)/8, $0x2c2b2a2928262524
+DATA  expandAVX512_56_outShufHi+0x20(SB)/8, $0x3534333231302e2d
+DATA  expandAVX512_56_outShufHi+0x28(SB)/8, $0x3e3d3c3b3a393836
+DATA  expandAVX512_56_outShufHi+0x30(SB)/8, $0x0e46454443424140
+DATA  expandAVX512_56_outShufHi+0x38(SB)/8, $0x50174c4b4a49480f
+
+// expandAVX512_56 loads 64 bytes from the address in AX, runs them through
+// three VPERMB + VGF2P8AFFINEQB stages, and merges the stage outputs with
+// two VPERMI2B shuffles. mat0 is loaded into Z3 once and used as the matrix
+// for both stage 0 and stage 1. Results are left in Z1 and Z2 (nothing is
+// stored to memory); Z0 and Z3-Z6 are clobbered.
+// NOTE(review): Z1/Z2 are presumably the low/high 64 bytes of the expanded
+// output, going by the outShufLo/outShufHi names -- confirm with the caller.
+TEXT expandAVX512_56<>(SB), NOSPLIT, $0-0
+       // Load the input byte selectors, the shared matrix and the output
+       // shuffles.
+       VMOVDQU64 expandAVX512_56_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_56_mat0<>(SB), Z3
+       VMOVDQU64 expandAVX512_56_inShuf1<>(SB), Z4
+       VMOVDQU64 expandAVX512_56_inShuf2<>(SB), Z5
+       VMOVDQU64 expandAVX512_56_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_56_outShufHi(SB), Z2
+       // Z6 = 64 input bytes.
+       VMOVDQU64 (AX), Z6
+       // Stage 0: gather by inShuf0, apply the matrices in Z3 (mat0).
+       VPERMB Z6, Z0, Z0
+       VGF2P8AFFINEQB $0, Z3, Z0, Z0
+       // Stage 1: gather by inShuf1 into Z4, apply the same matrices. The
+       // result overwrites Z3, so the matrix is gone after this point.
+       VPERMB Z6, Z4, Z4
+       VGF2P8AFFINEQB $0, Z3, Z4, Z3
+       // Stage 2: gather by inShuf2 (Z5) into Z4, apply mat2 from memory.
+       VPERMB Z6, Z5, Z4
+       VGF2P8AFFINEQB $0, expandAVX512_56_mat2<>(SB), Z4, Z4
+       // Z1 (holding outShufLo) is overwritten with bytes picked from Z0/Z3.
+       VPERMI2B Z3, Z0, Z1
+       // Z2 (holding outShufHi) is overwritten with bytes picked from Z3/Z4.
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512_60_inShuf0<>(SB), RODATA, $0x40 // VPERMB indices selecting the input byte for each lane of product 0; mkasm.go fills unused lanes with 0xff ("don't care")
+DATA  expandAVX512_60_inShuf0<>+0x00(SB)/8, $0x0100000000000000
+DATA  expandAVX512_60_inShuf0<>+0x08(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_60_inShuf0<>+0x10(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf0<>+0x18(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_60_inShuf0<>+0x28(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf0<>+0x30(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
+
+GLOBL expandAVX512_60_mat0<>(SB), RODATA, $0x40 // eight 8x8 bit matrices, one per quadword; the VGF2P8AFFINEQB matrix operand for product 0
+DATA  expandAVX512_60_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_60_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512_60_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512_60_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512_60_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512_60_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512_60_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512_60_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512_60_inShuf1<>(SB), RODATA, $0x40 // VPERMB indices selecting the input byte for each lane of product 1; 0xff lanes are "don't care"
+DATA  expandAVX512_60_inShuf1<>+0x00(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf1<>+0x08(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512_60_inShuf1<>+0x18(SB)/8, $0xff00000000000000
+DATA  expandAVX512_60_inShuf1<>+0x20(SB)/8, $0xffffffffff010101
+DATA  expandAVX512_60_inShuf1<>+0x28(SB)/8, $0x0202020202010101
+DATA  expandAVX512_60_inShuf1<>+0x30(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512_60_inShuf1<>+0x38(SB)/8, $0xff01010101010101
+
+GLOBL expandAVX512_60_mat1<>(SB), RODATA, $0x40 // eight 8x8 bit matrices, one per quadword; the VGF2P8AFFINEQB matrix operand for product 1
+DATA  expandAVX512_60_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512_60_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512_60_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512_60_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512_60_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_60_mat1<>+0x28(SB)/8, $0x0101010101010101
+DATA  expandAVX512_60_mat1<>+0x30(SB)/8, $0x0101010102020202
+DATA  expandAVX512_60_mat1<>+0x38(SB)/8, $0x0202020202020202
+
+GLOBL expandAVX512_60_inShuf2<>(SB), RODATA, $0x40 // VPERMB indices selecting the input byte for each lane of product 2; 0xff lanes are "don't care"
+DATA  expandAVX512_60_inShuf2<>+0x00(SB)/8, $0xff01010101010101
+DATA  expandAVX512_60_inShuf2<>+0x08(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512_60_inShuf2<>+0x10(SB)/8, $0xff01010101010101
+DATA  expandAVX512_60_inShuf2<>+0x18(SB)/8, $0xff01010101010101
+DATA  expandAVX512_60_inShuf2<>+0x20(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512_60_inShuf2<>+0x28(SB)/8, $0xff01010101010101
+DATA  expandAVX512_60_inShuf2<>+0x30(SB)/8, $0xff01010101010101
+DATA  expandAVX512_60_inShuf2<>+0x38(SB)/8, $0xffffffffffffff01
+
+GLOBL expandAVX512_60_mat2<>(SB), RODATA, $0x40 // eight 8x8 bit matrices, one per quadword; the VGF2P8AFFINEQB matrix operand for product 2
+DATA  expandAVX512_60_mat2<>+0x00(SB)/8, $0x0404040404040404
+DATA  expandAVX512_60_mat2<>+0x08(SB)/8, $0x0404040408080808
+DATA  expandAVX512_60_mat2<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512_60_mat2<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512_60_mat2<>+0x20(SB)/8, $0x1010101020202020
+DATA  expandAVX512_60_mat2<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_60_mat2<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_60_mat2<>+0x38(SB)/8, $0x4040404080808080
+
+GLOBL expandAVX512_60_inShuf3<>(SB), RODATA, $0x40 // VPERMB indices for product 3; only the first two quadwords select input bytes, every other lane is 0xff ("don't care")
+DATA  expandAVX512_60_inShuf3<>+0x00(SB)/8, $0xff01010101010101
+DATA  expandAVX512_60_inShuf3<>+0x08(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512_60_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512_60_mat3<>(SB), RODATA, $0x40 // VGF2P8AFFINEQB matrix operand for product 3; only the first two of the eight matrices are populated, the rest are zero
+DATA  expandAVX512_60_mat3<>+0x00(SB)/8, $0x8080808080808080
+DATA  expandAVX512_60_mat3<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512_60_mat3<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512_60_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512_60_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_60_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_60_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_60_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_60_outShufLo(SB), RODATA, $0x40 // 64 byte indices; loaded into Z1 by expandAVX512_60 and consumed by the VPERMI2B that builds the low output half from products 0 and 1
+DATA  expandAVX512_60_outShufLo+0x00(SB)/8, $0x0806050403020100
+DATA  expandAVX512_60_outShufLo+0x08(SB)/8, $0x1816151413121110
+DATA  expandAVX512_60_outShufLo+0x10(SB)/8, $0x28201e1d1c1b1a19
+DATA  expandAVX512_60_outShufLo+0x18(SB)/8, $0x31302e2d2c2b2a29
+DATA  expandAVX512_60_outShufLo+0x20(SB)/8, $0x4140383635343332
+DATA  expandAVX512_60_outShufLo+0x28(SB)/8, $0x4a49484645444342
+DATA  expandAVX512_60_outShufLo+0x30(SB)/8, $0x5a5958504e4d4c4b
+DATA  expandAVX512_60_outShufLo+0x38(SB)/8, $0x626160075e5d5c5b
+
+GLOBL expandAVX512_60_outShufHi0(SB), RODATA, $0x40 // byte indices for the masked VPERMI2B over products 1 and 2; the 0xff lanes are exactly the bytes cleared in its mask 0x9f01ffffffffffff
+DATA  expandAVX512_60_outShufHi0+0x00(SB)/8, $0x3b3a3938302a2928
+DATA  expandAVX512_60_outShufHi0+0x08(SB)/8, $0x44434241403e3d3c
+DATA  expandAVX512_60_outShufHi0+0x10(SB)/8, $0x5453525150484645
+DATA  expandAVX512_60_outShufHi0+0x18(SB)/8, $0x5d5c5b5a59585655
+DATA  expandAVX512_60_outShufHi0+0x20(SB)/8, $0x6d6c6b6a6968605e
+DATA  expandAVX512_60_outShufHi0+0x28(SB)/8, $0x767574737271706e
+DATA  expandAVX512_60_outShufHi0+0x30(SB)/8, $0xffffffffffffff78
+DATA  expandAVX512_60_outShufHi0+0x38(SB)/8, $0x31ffff2f2e2d2c2b
+
+GLOBL expandAVX512_60_outShufHi1(SB), RODATA, $0x40 // byte indices for the masked VPERMB over product 3; only the lanes set in its mask 0x60fe000000000000 hold real indices, the rest are 0xff
+DATA  expandAVX512_60_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512_60_outShufHi1+0x30(SB)/8, $0x06050403020100ff
+DATA  expandAVX512_60_outShufHi1+0x38(SB)/8, $0xff0908ffffffffff
+
+TEXT expandAVX512_60<>(SB), NOSPLIT, $0-0 // in: AX = pointer to packed mark bits; out: Z1/Z2 = low/high 512 bits with each mark bit expanded to 60 bits
+       VMOVDQU64 expandAVX512_60_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_60_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_60_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512_60_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512_60_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512_60_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512_60_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7 // Z7 = 64 bytes of packed mark bits
+       VPERMB Z7, Z0, Z0 // gather the input bytes selected by inShuf0
+       VGF2P8AFFINEQB $0, expandAVX512_60_mat0<>(SB), Z0, Z0 // Z0 = product 0 (mat0 applied to the gathered bytes)
+       VPERMB Z7, Z2, Z2 // gather the input bytes selected by inShuf1
+       VGF2P8AFFINEQB $0, expandAVX512_60_mat1<>(SB), Z2, Z2 // Z2 = product 1
+       VPERMB Z7, Z3, Z3 // gather the input bytes selected by inShuf2
+       VGF2P8AFFINEQB $0, expandAVX512_60_mat2<>(SB), Z3, Z3 // Z3 = product 2
+       VPERMB Z7, Z4, Z4 // gather the input bytes selected by inShuf3
+       VGF2P8AFFINEQB $0, expandAVX512_60_mat3<>(SB), Z4, Z4 // Z4 = product 3
+       VPERMI2B Z2, Z0, Z1 // Z1 = low output half: bytes picked from Z0 and Z2 by the outShufLo indices
+       MOVQ $0x9f01ffffffffffff, AX // AX is overwritten here; the input has already been loaded into Z7
+       KMOVQ AX, K1 // K1 = byte mask of the high-half bytes that come from products 1 and 2
+       VPERMI2B.Z Z3, Z2, K1, Z5 // Z5 = those bytes picked from Z2 and Z3 by outShufHi0; unselected bytes are zeroed
+       MOVQ $0x60fe000000000000, AX
+       KMOVQ AX, K1 // K1 = the exact complement of the previous mask (bytes that come from product 3)
+       VPERMB.Z Z4, Z6, K1, Z0 // Z0 = those bytes picked from Z4 by outShufHi1; unselected bytes are zeroed
+       VPORQ Z0, Z5, Z2 // Z2 = high output half: OR of the two disjoint partial results
+       RET
+
+GLOBL expandAVX512_64_inShuf0<>(SB), RODATA, $0x40 // VPERMB indices that are all zero: every lane of the first gather reads input byte 0
+DATA  expandAVX512_64_inShuf0<>+0x00(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x08(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512_64_inShuf0<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512_64_mat0<>(SB), RODATA, $0x40 // eight 8x8 bit matrices, one per quadword; matrix k has every row equal to 1<<k, and the table is shared by both VGF2P8AFFINEQB ops in expandAVX512_64
+DATA  expandAVX512_64_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512_64_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512_64_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512_64_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512_64_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512_64_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512_64_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512_64_inShuf1<>(SB), RODATA, $0x40 // VPERMB indices that are all one: every lane of the second gather reads input byte 1
+DATA  expandAVX512_64_inShuf1<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x10(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x18(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x28(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512_64_inShuf1<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512_64_outShufLo(SB), RODATA, $0x40 // output byte indices 0x00..0x3f in order, i.e. the identity permutation; used for both output halves in expandAVX512_64
+DATA  expandAVX512_64_outShufLo+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512_64_outShufLo+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512_64_outShufLo+0x10(SB)/8, $0x1716151413121110
+DATA  expandAVX512_64_outShufLo+0x18(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512_64_outShufLo+0x20(SB)/8, $0x2726252423222120
+DATA  expandAVX512_64_outShufLo+0x28(SB)/8, $0x2f2e2d2c2b2a2928
+DATA  expandAVX512_64_outShufLo+0x30(SB)/8, $0x3736353433323130
+DATA  expandAVX512_64_outShufLo+0x38(SB)/8, $0x3f3e3d3c3b3a3938
+
+TEXT expandAVX512_64<>(SB), NOSPLIT, $0-0 // in: AX = pointer to packed mark bits; out: Z1/Z2 = low/high 512 bits with each mark bit expanded to 64 bits
+       VMOVDQU64 expandAVX512_64_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512_64_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512_64_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512_64_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4 // Z4 = 64 bytes of packed mark bits
+       VPERMB Z4, Z0, Z0 // inShuf0 is all zeros, so every byte of Z0 becomes input byte 0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0 // Z0 = product 0 (mat0 applied to the replicated byte)
+       VPERMB Z4, Z2, Z2 // inShuf1 is all ones, so every byte of Z2 becomes input byte 1
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2 // Z2 = product 1, reusing the same matrices
+       VPERMB Z0, Z3, Z1 // Z1 = low output half; outShufLo is the identity, so this copies Z0 (and overwrites the matrices in Z1)
+       VPERMB Z2, Z3, Z2 // Z2 = high output half; same identity shuffle applied to Z2
+       RET
+
diff --git a/src/internal/runtime/gc/scan/expand_amd64_test.go b/src/internal/runtime/gc/scan/expand_amd64_test.go
new file mode 100644 (file)
index 0000000..a8f5b88
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+package scan_test
+
+import (
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+func TestExpandAVX512(t *testing.T) {
+       if !scan.CanAVX512() {
+               t.Skip("no AVX512")
+       }
+       testExpand(t, scan.ExpandAVX512)
+}
diff --git a/src/internal/runtime/gc/scan/expand_reference.go b/src/internal/runtime/gc/scan/expand_reference.go
new file mode 100644 (file)
index 0000000..4544652
--- /dev/null
@@ -0,0 +1,39 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+       "internal/goarch"
+       "internal/runtime/gc"
+)
+
+// ExpandReference is a reference implementation of an expander function
+// that translates object mark bits into a bitmap of one bit per word of
+// marked object, assuming the object is of the provided size class.
+func ExpandReference(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
+       // Look up the size and derive the number of objects in a span.
+       // We're only concerned with small objects in single-page spans,
+       // and gc.PtrMask enforces this by being statically sized to
+       // accomodate only such spans.
+       size := uintptr(gc.SizeClassToSize[sizeClass])
+       nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
+
+       // f is the expansion factor. For example, if our objects are of size 48,
+       // then each mark bit will translate into 6 (48/8 = 6) set bits in the
+       // pointer bitmap.
+       f := size / goarch.PtrSize
+       for i := range nObj {
+               // Check if the object is marked.
+               if packed[i/goarch.PtrBits]&(uintptr(1)<<(i%goarch.PtrBits)) == 0 {
+                       continue
+               }
+               // Propagate that mark into the destination into one bit per the
+               // expansion factor f, offset to the object's offset within the span.
+               for j := range f {
+                       b := i*f + j // i*f is the start bit for the object, j indexes into each corresponding word after.
+                       unpacked[b/goarch.PtrBits] |= uintptr(1) << (b % goarch.PtrBits)
+               }
+       }
+}
diff --git a/src/internal/runtime/gc/scan/expand_test.go b/src/internal/runtime/gc/scan/expand_test.go
new file mode 100644 (file)
index 0000000..692817d
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan_test
+
+import (
+       "internal/goarch"
+       "internal/runtime/gc"
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+type expandFunc func(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) // signature of an expander under test; testExpand calls it with the same arguments as scan.ExpandReference
+
+// testExpand checks the expander expF against scan.ExpandReference. For each
+// size class and object mask supplied by testObjs, it expands the mask with
+// both functions into zeroed pointer masks and reports every word on which
+// the two results disagree, logging both words in binary.
+func testExpand(t *testing.T, expF expandFunc) {
+       expR := scan.ExpandReference
+
+       testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
+               var want, got gc.PtrMask
+               expR(sizeClass, objs, &want)
+               expF(sizeClass, objs, &got)
+
+               for i := range want {
+                       if got[i] == want[i] {
+                               continue
+                       }
+                       // i indexes words of the mask and each word holds
+                       // goarch.PtrBits bits, so the first bit of the mismatching
+                       // word is i*goarch.PtrBits (not i*goarch.PtrSize).
+                       t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrBits)
+                       // Zero-pad to the full word width so the two lines line up.
+                       if goarch.PtrSize == 4 {
+                               t.Logf("got:  %032b", got[i])
+                               t.Logf("want: %032b", want[i])
+                       } else {
+                               t.Logf("got:  %064b", got[i])
+                               t.Logf("want: %064b", want[i])
+                       }
+               }
+       })
+}
diff --git a/src/internal/runtime/gc/scan/filter.go b/src/internal/runtime/gc/scan/filter.go
new file mode 100644 (file)
index 0000000..63cee9a
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import "unsafe"
+
+// FilterNil packs non-nil (non-zero) values in bufp together
+// at the beginning of bufp, returning the length of the
+// packed buffer. It treats bufp as an array of size n.
+//
+// TODO(mknyszek): Add a faster SIMD-based implementation.
+func FilterNil(bufp *uintptr, n int32) int32 {
+       buf := unsafe.Slice(bufp, int(n))
+       lo := 0
+       hi := len(buf) - 1
+       for lo < hi {
+               for lo < hi && buf[hi] == 0 {
+                       hi--
+               }
+               for lo < hi && buf[lo] != 0 {
+                       lo++
+               }
+               if lo >= hi {
+                       break
+               }
+               buf[lo] = buf[hi]
+               hi--
+       }
+       if hi >= 0 && buf[hi] == 0 {
+               hi--
+       }
+       return int32(hi) + 1
+}
diff --git a/src/internal/runtime/gc/scan/filter_test.go b/src/internal/runtime/gc/scan/filter_test.go
new file mode 100644 (file)
index 0000000..115fbfb
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan_test
+
+import (
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+func TestFilterNil(t *testing.T) {
+       t.Run("empty", func(t *testing.T) {
+               testFilterNil(t, []uintptr{}, []uintptr{})
+       })
+       t.Run("one", func(t *testing.T) {
+               testFilterNil(t, []uintptr{4}, []uintptr{4})
+       })
+       t.Run("elimOne", func(t *testing.T) {
+               testFilterNil(t, []uintptr{0}, []uintptr{})
+       })
+       t.Run("oneElimBegin", func(t *testing.T) {
+               testFilterNil(t, []uintptr{0, 4}, []uintptr{4})
+       })
+       t.Run("oneElimEnd", func(t *testing.T) {
+               testFilterNil(t, []uintptr{4, 0}, []uintptr{4})
+       })
+       t.Run("oneElimMultiBegin", func(t *testing.T) {
+               testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4})
+       })
+       t.Run("oneElimMultiEnd", func(t *testing.T) {
+               testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4})
+       })
+       t.Run("oneElimMulti", func(t *testing.T) {
+               testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4})
+       })
+       t.Run("two", func(t *testing.T) {
+               testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12})
+       })
+       t.Run("twoElimBegin", func(t *testing.T) {
+               testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12})
+       })
+       t.Run("twoElimMid", func(t *testing.T) {
+               testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12})
+       })
+       t.Run("twoElimEnd", func(t *testing.T) {
+               testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12})
+       })
+       t.Run("twoElimMulti", func(t *testing.T) {
+               testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12})
+       })
+       t.Run("Multi", func(t *testing.T) {
+               testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5})
+       })
+}
+
+// testFilterNil runs scan.FilterNil on buf and checks that the surviving
+// prefix holds exactly the values in want. The two are compared as
+// multisets, so the order of the surviving values does not matter. buf is
+// modified in place. Both slices are logged at the end for reference.
+func testFilterNil(t *testing.T, buf, want []uintptr) {
+       t.Helper()
+
+       // An empty buffer has no first element to point at; pass nil instead.
+       var bufp *uintptr
+       if len(buf) != 0 {
+               bufp = &buf[0]
+       }
+       n := scan.FilterNil(bufp, int32(len(buf)))
+       // Validate the length before slicing with it: a negative n would
+       // otherwise panic in buf[:n] instead of producing a test failure.
+       if n < 0 || n > int32(len(buf)) {
+               t.Errorf("bogus new length returned: %d not in [0, %d]", n, len(buf))
+               return
+       }
+       buf = buf[:n]
+       if len(buf) != len(want) {
+               t.Errorf("lengths differ: got %d, want %d", len(buf), len(want))
+       }
+
+       // Count the occurrences of each value on both sides.
+       wantMap := make(map[uintptr]int)
+       gotMap := make(map[uintptr]int)
+       for _, p := range want {
+               wantMap[p]++
+       }
+       for _, p := range buf {
+               gotMap[p]++
+       }
+       // Every wanted value must be present the right number of times...
+       for p, nWant := range wantMap {
+               if nGot, ok := gotMap[p]; !ok {
+                       t.Errorf("want %d, but missing from output", p)
+               } else if nGot != nWant {
+                       t.Errorf("want %d copies of %d, but got %d", nWant, p, nGot)
+               }
+       }
+       // ...and nothing unwanted may appear.
+       for p := range gotMap {
+               if _, ok := wantMap[p]; !ok {
+                       t.Errorf("got %d, but didn't want it", p)
+               }
+       }
+       t.Logf("got:  %v", buf)
+       t.Logf("want: %v", want)
+}
diff --git a/src/internal/runtime/gc/scan/mem_nounix_test.go b/src/internal/runtime/gc/scan/mem_nounix_test.go
new file mode 100644 (file)
index 0000000..f4d21d8
--- /dev/null
@@ -0,0 +1,16 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !unix
+
+package scan_test
+
+import (
+       "testing"
+)
+
+func makeMem(t testing.TB, nPages int) ([]uintptr, func()) { // stub for builds under the !unix constraint: no memory is mapped, the test is skipped
+       t.Skip("mmap unsupported")
+       return nil, nil // placeholder results so the function satisfies its signature
+}
diff --git a/src/internal/runtime/gc/scan/mem_unix_test.go b/src/internal/runtime/gc/scan/mem_unix_test.go
new file mode 100644 (file)
index 0000000..03f0bd5
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix
+
+package scan_test
+
+import (
+       "internal/runtime/gc"
+       "syscall"
+       "testing"
+       "unsafe"
+)
+
+// makeMem maps nPages pages (gc.PageSize bytes each) of anonymous, private,
+// read-write memory and returns it as a slice of uintptr words, together
+// with a function that unmaps it again. If the mapping cannot be created,
+// the test fails immediately.
+func makeMem(t testing.TB, nPages int) ([]uintptr, func()) {
+       mem, err := syscall.Mmap(-1, 0, int(gc.PageSize*nPages), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE|syscall.MAP_ANON)
+       if err != nil {
+               t.Fatalf("mmap failed: %s", err)
+       }
+       free := func() {
+               // The Munmap error is deliberately ignored, as before.
+               syscall.Munmap(mem)
+       }
+       // Derive the word count from the real size of a uintptr instead of
+       // assuming 8-byte words, so the slice spans the whole mapping on
+       // 32-bit platforms as well.
+       const wordSize = int(unsafe.Sizeof(uintptr(0)))
+       return unsafe.Slice((*uintptr)(unsafe.Pointer(unsafe.SliceData(mem))), len(mem)/wordSize), free
+}
diff --git a/src/internal/runtime/gc/scan/mkasm.go b/src/internal/runtime/gc/scan/mkasm.go
new file mode 100644 (file)
index 0000000..e36defb
--- /dev/null
@@ -0,0 +1,412 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ignore
+
+package main
+
+import (
+       "bytes"
+       "fmt"
+       "io"
+       "log"
+       "os"
+       "slices"
+       "strconv"
+
+       "internal/runtime/gc"
+       "internal/runtime/gc/internal/gen"
+)
+
+const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n" // written by generate to the output just before file.Compile is called
+
+func main() {
+       generate("expand_amd64.s", genExpanders) // write the expanders built by genExpanders to expand_amd64.s
+}
+
+// generate builds a gen.File with genFunc and writes the result to fileName.
+//
+// It creates the gen.File on top of a writer that collects the output in
+// memory and echoes it to standard output, lets genFunc populate the file,
+// writes the "Code generated" header, and then calls file.Compile. The
+// collected bytes are finally stored in fileName, which is created or
+// truncated. Any error storing the file terminates the program.
+func generate(fileName string, genFunc func(*gen.File)) {
+       var buf bytes.Buffer
+       tee := io.MultiWriter(&buf, os.Stdout)
+
+       file := gen.NewFile(tee)
+
+       genFunc(file)
+
+       // header is plain text, not a format string, so print it verbatim.
+       fmt.Fprint(tee, header)
+       file.Compile()
+
+       // os.WriteFile creates or truncates the file just like os.Create, but
+       // also reports an error from closing it. The previous deferred Close
+       // dropped that error and was skipped entirely when log.Fatal exited.
+       if err := os.WriteFile(fileName, buf.Bytes(), 0o666); err != nil {
+               log.Fatal(err)
+       }
+}
+
+func genExpanders(file *gen.File) {
+       gcExpandersAVX512 := make([]*gen.Func, len(gc.SizeClassToSize))
+       for sc, ob := range gc.SizeClassToSize {
+               if gc.SizeClassToNPages[sc] != 1 {
+                       // These functions all produce a bitmap that covers exactly one
+                       // page.
+                       continue
+               }
+               if ob > gc.MinSizeForMallocHeader {
+                       // This size class is too big to have a packed pointer/scalar bitmap.
+                       break
+               }
+
+               xf := int(ob) / 8
+               log.Printf("size class %d bytes, expansion %dx", ob, xf)
+
+               fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf))
+               ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn)
+
+               if xf == 1 {
+                       expandIdentity(ptrObjBits)
+               } else {
+                       ok := gfExpander(xf, ptrObjBits)
+                       if !ok {
+                               log.Printf("failed to generate expander for size class %d", sc)
+                       }
+               }
+               file.AddFunc(fn)
+               gcExpandersAVX512[sc] = fn
+       }
+
+       // Generate table mapping size class to expander PC
+       file.AddConst("·gcExpandersAVX512", gcExpandersAVX512)
+}
+
+// mat8x8 is an 8x8 bit matrix.
+type mat8x8 struct {
+       mat [8]uint8 // mat[i] is row i of the matrix, stored as an 8-bit mask
+}
+
+// matGroupToVec packs a group of eight 8x8 bit matrices into eight 64-bit
+// words, one word per matrix. Within a word, row 0 of the matrix lands in
+// the most significant byte and row 7 in the least significant byte.
+func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
+       var vec [8]uint64
+       for m := range mats {
+               var word uint64
+               for r, bits := range mats[m].mat {
+                       // For some reason, Intel flips the rows, so row r is placed
+                       // 7-r bytes up from the bottom of the word.
+                       shift := 56 - 8*r
+                       word |= uint64(bits) << shift
+               }
+               vec[m] = word
+       }
+       return vec
+}
+
+// expandIdentity implements 1x expansion (that is, no expansion).
+func expandIdentity(ptrObjBits gen.Ptr[gen.Uint8x64]) {
+       objBitsLo := gen.Deref(ptrObjBits)
+       objBitsHi := gen.Deref(ptrObjBits.AddConst(64))
+       gen.Return(objBitsLo, objBitsHi)
+}
+
+// gfExpander produces a function that expands each bit in an input bitmap into
+// f consecutive bits in an output bitmap.
+//
+// The input is
+//
+//     AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
+//
+// The output is
+//
+//     Z1 [64]uint8  = The bottom 512 bits of the expanded bitmap
+//     Z2 [64]uint8  = The top 512 bits of the expanded bitmap
+//
+// TODO(austin): This should be Z0/Z1.
+func gfExpander(f int, ptrObjBits gen.Ptr[gen.Uint8x64]) bool {
+       // TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
+
+       // TODO(austin): For f >= 8, I suspect there are better ways to do this.
+       //
+       // For example, we could use a mask expansion to get a full byte for each
+       // input bit, and separately create the bytes that blend adjacent bits, then
+       // shuffle those bytes together. Certainly for f >= 16 this makes sense
+       // because each of those bytes will be used, possibly more than once.
+
+       objBits := gen.Deref(ptrObjBits)
+
+       type term struct {
+               iByte, oByte int
+               mat          mat8x8
+       }
+       var terms []term
+
+       // Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
+       // the output byte from the appropriate input byte. Gather all of these into
+       // "terms".
+       for oByte := 0; oByte < 1024/8; oByte++ {
+               var byteMat mat8x8
+               iByte := -1
+               for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
+                       iBit := oBit / f
+                       if iByte == -1 {
+                               iByte = iBit / 8
+                       } else if iByte != iBit/8 {
+                               log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
+                               return false
+                       }
+                       // One way to view this is that the i'th row of the matrix will be
+                       // ANDed with the input byte, and the parity of the result will set
+                       // the i'th bit in the output. We use a simple 1 bit mask, so the
+                       // parity is irrelevant beyond selecting out that one bit.
+                       byteMat.mat[oBit%8] = 1 << (iBit % 8)
+               }
+               terms = append(terms, term{iByte, oByte, byteMat})
+       }
+
+       if false {
+               // Print input byte -> output byte as a matrix
+               maxIByte, maxOByte := 0, 0
+               for _, term := range terms {
+                       maxIByte = max(maxIByte, term.iByte)
+                       maxOByte = max(maxOByte, term.oByte)
+               }
+               iToO := make([][]rune, maxIByte+1)
+               for i := range iToO {
+                       iToO[i] = make([]rune, maxOByte+1)
+               }
+               matMap := make(map[mat8x8]int)
+               for _, term := range terms {
+                       i, ok := matMap[term.mat]
+                       if !ok {
+                               i = len(matMap)
+                               matMap[term.mat] = i
+                       }
+                       iToO[term.iByte][term.oByte] = 'A' + rune(i)
+               }
+               for o := range maxOByte + 1 {
+                       fmt.Printf("%d", o)
+                       for i := range maxIByte + 1 {
+                               fmt.Printf(",")
+                               if mat := iToO[i][o]; mat != 0 {
+                                       fmt.Printf("%c", mat)
+                               }
+                       }
+                       fmt.Println()
+               }
+       }
+
+       // In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
+       // and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
+       //
+       //  abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
+       //    mat0     mat1     mat2     mat3     mat4     mat5     mat6     mat7
+
+       // Group the terms by matrix, but limit each group to 8 terms.
+       const termsPerGroup = 8       // Number of terms we can multiply by the same matrix.
+       const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
+
+       matMap := make(map[mat8x8]int)
+       allMats := make(map[mat8x8]bool)
+       var termGroups [][]term
+       for _, term := range terms {
+               allMats[term.mat] = true
+
+               i, ok := matMap[term.mat]
+               if ok && f > groupsPerSuperGroup {
+                       // The output is ultimately produced in two [64]uint8 registers.
+                       // Getting every byte in the right place of each of these requires a
+                       // final permutation that often requires more than one source.
+                       //
+                       // Up to 8x expansion, we can get a really nice grouping so we can use
+                       // the same 8 matrix vector several times, without producing
+                       // permutations that require more than two sources.
+                       //
+                       // Above 8x, however, we can't get nice matrixes anyway, so we
+                       // instead prefer reducing the complexity of the permutations we
+                       // need to produce the final outputs. To do this, avoid grouping
+                       // together terms that are split across the two registers.
+                       outRegister := termGroups[i][0].oByte / 64
+                       if term.oByte/64 != outRegister {
+                               ok = false
+                       }
+               }
+               if !ok {
+                       // Start a new term group.
+                       i = len(termGroups)
+                       matMap[term.mat] = i
+                       termGroups = append(termGroups, nil)
+               }
+
+               termGroups[i] = append(termGroups[i], term)
+
+               if len(termGroups[i]) == termsPerGroup {
+                       // This term group is full.
+                       delete(matMap, term.mat)
+               }
+       }
+
+       for i, termGroup := range termGroups {
+               log.Printf("term group %d:", i)
+               for _, term := range termGroup {
+                       log.Printf("  %+v", term)
+               }
+       }
+
+       // We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
+       // as many term groups as we can into each super-group to minimize the
+       // number of matrix multiplies.
+       //
+       // Ideally, we use the same matrix in each super-group, which might mean
+       // doing fewer than 8 multiplies at a time. That's fine because it never
+       // increases the total number of matrix multiplies.
+       //
+       // TODO: Packing the matrixes less densely may let us use more broadcast
+       // loads instead of general permutations, though. That replaces a load of
+       // the permutation with a load of the matrix, but is probably still slightly
+       // better.
+       var sgSize, nSuperGroups int
+       oneMatVec := f <= groupsPerSuperGroup
+       if oneMatVec {
+               // We can use the same matrix in each multiply by doing sgSize
+               // multiplies at a time.
+               sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
+               nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
+       } else {
+               // We can't use the same matrix for each multiply. Just do as many at a
+               // time as we can.
+               //
+               // TODO: This is going to produce several distinct matrixes, when we
+               // probably only need two. Be smarter about how we create super-groups
+               // in this case. Maybe we build up an array of super-groups and then the
+               // loop below just turns them into ops?
+               sgSize = 8
+               nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
+       }
+
+       // Construct each super-group.
+       var matGroup [8]mat8x8
+       var matMuls []gen.Uint8x64
+       var perm [128]int
+       for sgi := range nSuperGroups {
+               var iperm [64]uint8
+               for i := range iperm {
+                       iperm[i] = 0xff // "Don't care"
+               }
+               // Pick off sgSize term groups.
+               superGroup := termGroups[:min(len(termGroups), sgSize)]
+               termGroups = termGroups[len(superGroup):]
+               // Build the matrix and permutations for this super-group.
+               var thisMatGroup [8]mat8x8
+               for i, termGroup := range superGroup {
+                       // All terms in this group have the same matrix. Pick one.
+                       thisMatGroup[i] = termGroup[0].mat
+                       for j, term := range termGroup {
+                               // Build the input permutation.
+                               iperm[i*termsPerGroup+j] = uint8(term.iByte)
+                               // Build the output permutation.
+                               perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
+                       }
+               }
+               log.Printf("input permutation %d: %v", sgi, iperm)
+
+               // Check that we're not making more distinct matrixes than expected.
+               if oneMatVec {
+                       if sgi == 0 {
+                               matGroup = thisMatGroup
+                       } else if matGroup != thisMatGroup {
+                               log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
+                               return false
+                       }
+               }
+
+               // Emit matrix op.
+               matConst := gen.ConstUint64x8(matGroupToVec(&thisMatGroup), fmt.Sprintf("*_mat%d<>", sgi))
+               inOp := objBits.Shuffle(gen.ConstUint8x64(iperm, fmt.Sprintf("*_inShuf%d<>", sgi)))
+               matMul := matConst.GF2P8Affine(inOp)
+               matMuls = append(matMuls, matMul)
+       }
+
+       log.Printf("output permutation: %v", perm)
+
+       outLo, ok := genShuffle("*_outShufLo", (*[64]int)(perm[:64]), matMuls...)
+       if !ok {
+               log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+               return false
+       }
+       outHi, ok := genShuffle("*_outShufHi", (*[64]int)(perm[64:]), matMuls...)
+       if !ok {
+               log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+               return false
+       }
+       gen.Return(outLo, outHi)
+
+       return true
+}
+
+// genShuffle emits the shuffle ops that assemble one 64-byte output vector
+// from args according to perm: output byte i is taken from byte perm[i]%64 of
+// args[perm[i]/64].
+//
+// name is passed to gen.ConstUint8x64 as the name of the emitted index
+// constant; when two constants are needed, "0" and "1" are appended to it.
+//
+// The permutation may draw from at most four distinct inputs. With one or two
+// inputs a single shuffle suffices. With three or four, two zero-masked
+// shuffles are OR'd together. With more than four, genShuffle returns false
+// and the returned vector must not be used.
+func genShuffle(name string, perm *[64]int, args ...gen.Uint8x64) (gen.Uint8x64, bool) {
+       // Construct flattened permutation.
+       var vperm [64]byte
+
+       // Get the inputs used by this permutation. Each index byte encodes the
+       // source byte in its low 6 bits and the position of the source vector
+       // within inputs in its high 2 bits.
+       var inputs []int
+       for i, src := range perm {
+               inputIdx := slices.Index(inputs, src/64)
+               if inputIdx == -1 {
+                       inputIdx = len(inputs)
+                       inputs = append(inputs, src/64)
+               }
+               vperm[i] = byte(src%64 | (inputIdx << 6))
+       }
+
+       // Emit instructions for easy cases.
+       switch len(inputs) {
+       case 1:
+               constOp := gen.ConstUint8x64(vperm, name)
+               return args[inputs[0]].Shuffle(constOp), true
+       case 2:
+               constOp := gen.ConstUint8x64(vperm, name)
+               return args[inputs[0]].Shuffle2(args[inputs[1]], constOp), true
+       }
+
+       // Harder case, we need to shuffle in from up to 2 more tables.
+       //
+       // Perform two shuffles. One shuffle will get its data from the first
+       // two inputs, the other shuffle will get its data from the other one
+       // or two inputs. Every lane that a given shuffle doesn't care about
+       // will be zeroed, so the two results can simply be OR'd together.
+       var vperms [2][64]byte
+       var masks [2]uint64
+       for j, idx := range vperm {
+               for i := range vperms {
+                       vperms[i][j] = 0xff // "Don't care"
+               }
+               // Every lane is owned by exactly one of the two shuffles. Note
+               // that 0xff is itself a valid index here (byte 63 of the fourth
+               // input), so it must not be special-cased as "don't care":
+               // doing so would leave the lane in neither mask and trip the
+               // validation below.
+               vperms[idx/128][j] = idx % 128
+               masks[idx/128] |= uint64(1) << j
+       }
+
+       // Validate that the masks are fully disjoint and together cover all
+       // 64 lanes.
+       if masks[0]^masks[1] != ^uint64(0) {
+               panic("bad shuffle!")
+       }
+
+       // Generate constants.
+       constOps := make([]gen.Uint8x64, len(vperms))
+       for i, v := range vperms {
+               constOps[i] = gen.ConstUint8x64(v, name+strconv.Itoa(i))
+       }
+
+       // Generate shuffles.
+       switch len(inputs) {
+       case 3:
+               r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
+               r1 := args[inputs[2]].ShuffleZeroed(constOps[1], gen.ConstMask64(masks[1]))
+               return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
+       case 4:
+               r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
+               r1 := args[inputs[2]].Shuffle2Zeroed(args[inputs[3]], constOps[1], gen.ConstMask64(masks[1]))
+               return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
+       }
+
+       // Too many inputs. To support more, we'd need to separate tables much earlier.
+       // Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
+       return args[0], false
+}
diff --git a/src/internal/runtime/gc/scan/scan_amd64.go b/src/internal/runtime/gc/scan/scan_amd64.go
new file mode 100644 (file)
index 0000000..2ac181f
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+       "internal/cpu"
+       "internal/runtime/gc"
+       "unsafe"
+)
+
+// ScanSpanPacked is the amd64 entry point for packed span scanning. It
+// forwards to ScanSpanPackedAVX512 when CanAVX512 reports true.
+//
+// There is no fallback on this platform: if the AVX512 kernel is unavailable
+// it panics, so callers should consult HasFastScanSpanPacked first.
+func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       if CanAVX512() {
+               return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
+       }
+       panic("not implemented")
+}
+
+// HasFastScanSpanPacked reports whether ScanSpanPacked is usable on this
+// machine, i.e. whether the CPU features needed by the AVX512 kernel are
+// all present.
+func HasFastScanSpanPacked() bool {
+       return avx512ScanPackedReqsMet
+}
+
+// -- AVX512 --
+
+// CanAVX512 reports whether ScanSpanPackedAVX512 may be called, i.e. whether
+// every CPU feature listed in avx512ScanPackedReqsMet is present.
+func CanAVX512() bool {
+       return avx512ScanPackedReqsMet
+}
+
+// ScanSpanPackedAVX512 runs the assembly AVX512 scan kernel and then passes
+// its output buffer and count through FilterNil, returning FilterNil's result.
+// NOTE(review): FilterNil is defined elsewhere; presumably it drops nil
+// pointers from the buffer and returns the new count — confirm.
+//
+// It must only be called when CanAVX512 reports true.
+func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
+}
+
+// scanSpanPackedAVX512 is the AVX512 scan kernel, implemented in assembly in
+// scan_amd64.s. It returns the number of words it wrote to bufp.
+//
+//go:noescape
+func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
+
+// avx512ScanPackedReqsMet records whether the CPU reports every instruction
+// set extension the AVX512 scan kernel and its expanders rely on. It is
+// evaluated once, at package initialization.
+var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
+       cpu.X86.HasAVX512BW &&
+       cpu.X86.HasGFNI &&
+       cpu.X86.HasAVX512BITALG &&
+       cpu.X86.HasAVX512VBMI
diff --git a/src/internal/runtime/gc/scan/scan_amd64.s b/src/internal/runtime/gc/scan/scan_amd64.s
new file mode 100644 (file)
index 0000000..055995f
--- /dev/null
@@ -0,0 +1,103 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Test-only.
+//
+// ExpandAVX512 runs the expander for the given size class on the packed
+// object mask and stores the 128-byte result at unpacked.
+//
+// Arguments: sizeClass+0(FP), packed+8(FP), unpacked+16(FP).
+TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
+       MOVQ sizeClass+0(FP), CX
+       MOVQ packed+8(FP), AX
+
+       // Call the expander for this size class
+       LEAQ ·gcExpandersAVX512(SB), BX
+       CALL (BX)(CX*8)
+
+       // The expander leaves its result in Z1 (low 64 bytes) and Z2 (high 64 bytes).
+       MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
+       VMOVDQU64 Z1, 0(DI)
+       VMOVDQU64 Z2, 64(DI)
+       VZEROUPPER
+       RET
+
+// scanSpanPackedAVX512 expands the object mark mask to a per-word mask, ANDs
+// it with the pointer mask, and then, for each 64-byte frame of the span,
+// compress-stores the selected words into the scan buffer.
+//
+// Arguments: mem+0(FP), bufp+8(FP), objMarks+16(FP), sizeClass+24(FP),
+// ptrMask+32(FP). Result: count+40(FP), the number of words written to bufp.
+//
+// The 256-byte frame holds the 128-byte scan mask followed by the 128
+// per-frame word counts.
+TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
+       // Z1+Z2 = Expand the grey object mask into a grey word mask
+       MOVQ objMarks+16(FP), AX
+       MOVQ sizeClass+24(FP), CX
+       LEAQ ·gcExpandersAVX512(SB), BX
+       CALL (BX)(CX*8)
+
+       // Z3+Z4 = Load the pointer mask
+       MOVQ ptrMask+32(FP), AX
+       VMOVDQU64 0(AX), Z3
+       VMOVDQU64 64(AX), Z4
+
+       // Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
+       VPANDQ Z1, Z3, Z1
+       VPANDQ Z2, Z4, Z2
+
+       // Now each bit of Z1+Z2 represents one word of the span.
+       // Thus, each byte covers 64 bytes of memory, which is also how
+       // much we can fit in a Z register.
+       //
+       // We do a load/compress for each 64 byte frame.
+       //
+       // Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
+       VPOPCNTB Z1, Z3 // Requires BITALG
+       VPOPCNTB Z2, Z4
+
+       // Store the scan mask and word counts at 0(SP) and 128(SP).
+       //
+       // TODO: Is it better to read directly from the registers?
+       VMOVDQU64 Z1, 0(SP)
+       VMOVDQU64 Z2, 64(SP)
+       VMOVDQU64 Z3, 128(SP)
+       VMOVDQU64 Z4, 192(SP)
+
+       // SI = Current address in span
+       MOVQ mem+0(FP), SI
+       // DI = Scan buffer base
+       MOVQ bufp+8(FP), DI
+       // DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
+       MOVQ $0, DX
+
+       // AX = address in scan mask, 128(AX) = address in popcount
+       LEAQ 0(SP), AX
+
+       // Loop over the 64 byte frames in this span.
+       // BX = 1 past the end of the scan mask
+       LEAQ 128(SP), BX
+
+       // Align loop to a cache line so that performance is less sensitive
+       // to how this function ends up laid out in memory. This is a hot
+       // function in the GC, and this is a tight loop. We don't want
+       // performance to waver wildly due to unrelated changes.
+       PCALIGN $64
+loop:
+       // CX = Fetch the mask of words to load from this frame.
+       MOVBQZX 0(AX), CX
+       // Skip empty frames.
+       TESTQ CX, CX
+       JZ skip
+
+       // Load the 64 byte frame.
+       KMOVB CX, K1
+       VMOVDQA64 0(SI), Z1
+
+       // Collect just the pointers from the greyed objects into the scan buffer,
+       // i.e., copy the word indices in the mask from Z1 into contiguous memory.
+       VPCOMPRESSQ Z1, K1, (DI)(DX*8)
+       // Advance the scan buffer position by the number of pointers.
+       MOVBQZX 128(AX), CX
+       ADDQ CX, DX
+
+skip:
+       // Advance to the next 64-byte frame and the next mask/count byte.
+       ADDQ $64, SI
+       ADDQ $1, AX
+       CMPQ AX, BX
+       JB loop
+
+end:
+       MOVL DX, count+40(FP)
+       VZEROUPPER
+       RET
diff --git a/src/internal/runtime/gc/scan/scan_amd64_test.go b/src/internal/runtime/gc/scan/scan_amd64_test.go
new file mode 100644 (file)
index 0000000..a914b4f
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+package scan_test
+
+import (
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+// TestScanSpanPackedAVX512 checks the AVX512 kernel against the reference
+// implementation via testScanSpanPacked. It is skipped on machines where
+// scan.CanAVX512 reports false.
+func TestScanSpanPackedAVX512(t *testing.T) {
+       if !scan.CanAVX512() {
+               t.Skip("no AVX512")
+       }
+       testScanSpanPacked(t, scan.ScanSpanPackedAVX512)
+}
diff --git a/src/internal/runtime/gc/scan/scan_generic.go b/src/internal/runtime/gc/scan/scan_generic.go
new file mode 100644 (file)
index 0000000..a4d5182
--- /dev/null
@@ -0,0 +1,23 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64
+
+package scan
+
+import (
+       "internal/runtime/gc"
+       "unsafe"
+)
+
+// HasFastScanSpanPacked reports whether ScanSpanPacked is worth using on this
+// platform. Without a platform-specific kernel it always reports false.
+func HasFastScanSpanPacked() bool {
+       // N.B. ScanSpanPackedGo, which backs ScanSpanPacked here, isn't actually fast enough to
+       // serve as a general-purpose implementation. The runtime's alternative of jumping between
+       // each object is still substantially better, even at relatively high object densities.
+       return false
+}
+
+// ScanSpanPacked is the entry point for packed span scanning on platforms
+// without a dedicated kernel. It simply forwards to ScanSpanPackedGo.
+func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
+}
diff --git a/src/internal/runtime/gc/scan/scan_generic_test.go b/src/internal/runtime/gc/scan/scan_generic_test.go
new file mode 100644 (file)
index 0000000..250135e
--- /dev/null
@@ -0,0 +1,14 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan_test
+
+import (
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+// TestScanSpanPackedGo checks the pure Go implementation against the
+// reference implementation via testScanSpanPacked.
+func TestScanSpanPackedGo(t *testing.T) {
+       testScanSpanPacked(t, scan.ScanSpanPackedGo)
+}
diff --git a/src/internal/runtime/gc/scan/scan_go.go b/src/internal/runtime/gc/scan/scan_go.go
new file mode 100644 (file)
index 0000000..9a2985a
--- /dev/null
@@ -0,0 +1,104 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+       "internal/goarch"
+       "internal/runtime/gc"
+       "internal/runtime/sys"
+       "unsafe"
+)
+
+// ScanSpanPackedGo is an optimized pure Go implementation of ScanSpanPacked.
+//
+// For every object whose bit is set in objMarks it walks the object's words
+// in mem and appends to bufp each word that is flagged in ptrMask and whose
+// value is at least 4096. It returns the number of words appended.
+//
+// It panics if gc.PageWords or more words were appended.
+func ScanSpanPackedGo(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       buf := newUnsafeBuf(bufp)
+       objBytes := uintptr(gc.SizeClassToSize[sizeClass])
+       // TODO(austin): Trim objMarks to the number of objects in this size class?
+       for markI, markWord := range objMarks {
+               // Iterate once per set bit, peeling off the lowest set bit each time.
+               for range sys.OnesCount64(uint64(markWord)) {
+                       bitI := sys.TrailingZeros64(uint64(markWord))
+                       markWord &^= 1 << bitI
+
+                       objIndex := markI*goarch.PtrBits + bitI
+
+                       // objStartInSpan is the index of the word from mem where the
+                       // object starts. objEndInSpan points to the next object, i.e.
+                       // it's an exclusive upper bound.
+                       objStartInSpan := objBytes * uintptr(objIndex) / goarch.PtrSize
+                       objEndInSpan := objStartInSpan + objBytes/goarch.PtrSize
+
+                       // TODO: Another way to do this would be to extract the pointer mask
+                       // for this object (it's at most 64 bits) and do a bit iteration
+                       // over that.
+
+                       for wordI := objStartInSpan; wordI < objEndInSpan; wordI++ {
+                               val := *(*uintptr)(unsafe.Add(mem, wordI*goarch.PtrSize))
+                               // Check if we should enqueue this word.
+                               //
+                               // We load the word before the check because, even though this
+                               // can lead to loading much more than necessary, it's faster.
+                               // Most likely this is because it warms up the hardware
+                               // prefetcher much better, and gives us more time before we need
+                               // the value.
+                               //
+                               // We discard values that can't possibly be useful pointers
+                               // here, too, because this filters out a lot of words and does
+                               // so with as little processing as possible.
+                               //
+                               // TODO: This is close to, but not entirely branchless.
+                               isPtr := bool2int(ptrMask[wordI/goarch.PtrBits]&(1<<(wordI%goarch.PtrBits)) != 0)
+                               isNonNil := bool2int(val >= 4096)
+                               pred := isPtr&isNonNil != 0
+                               buf.addIf(val, pred)
+                       }
+               }
+       }
+       // We don't know the true size of bufp, but we can at least catch obvious errors
+       // in this function by making sure we didn't write more than gc.PageWords pointers
+       // into the buffer.
+       buf.check(gc.PageWords)
+       return int32(buf.n)
+}
+
+// unsafeBuf allows for appending to a buffer without bounds-checks or branches.
+//
+// It does not know the capacity of the underlying memory; callers are
+// responsible for staying in bounds (see addIf and check).
+type unsafeBuf[T any] struct {
+       // base points at the first element of the buffer.
+       base *T
+       // n is the number of elements appended so far.
+       n    int
+}
+
+// newUnsafeBuf returns an empty unsafeBuf that appends starting at base.
+func newUnsafeBuf[T any](base *T) unsafeBuf[T] {
+       return unsafeBuf[T]{base, 0}
+}
+
+// addIf appends a value to the buffer if the predicate is true.
+//
+// addIf speculatively writes to the next index of the buffer, so the caller
+// must be certain that such a write will still be in-bounds with respect
+// to the buffer's true capacity.
+func (b *unsafeBuf[T]) addIf(val T, pred bool) {
+       // Always store val at index n; the store only "counts" if n is advanced.
+       *(*T)(unsafe.Add(unsafe.Pointer(b.base), b.n*int(unsafe.Sizeof(val)))) = val
+       // Advance by 0 or 1 instead of branching on pred.
+       b.n += bool2int(pred)
+}
+
+// check performs a bounds check on speculative writes into the buffer.
+// Calling this shortly after a series of addIf calls is important to
+// catch any misuse as fast as possible. Separating the bounds check from
+// the append is more efficient, but one check to cover several appends is
+// still efficient and much more memory safe.
+//
+// check panics if the number of appended elements has reached or exceeded cap.
+func (b unsafeBuf[T]) check(cap int) {
+       // We fail even if b.n == cap because addIf speculatively writes one past b.n.
+       if b.n >= cap {
+               panic("unsafeBuf overflow")
+       }
+}
+
+// bool2int returns 1 if x is true and 0 otherwise.
+func bool2int(x bool) int {
+       // This particular pattern gets optimized by the compiler.
+       var b int
+       if x {
+               b = 1
+       }
+       return b
+}
diff --git a/src/internal/runtime/gc/scan/scan_reference.go b/src/internal/runtime/gc/scan/scan_reference.go
new file mode 100644 (file)
index 0000000..05eca98
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+       "internal/goarch"
+       "internal/runtime/gc"
+       "unsafe"
+)
+
+// ScanSpanPackedReference is the reference implementation of ScanScanPacked. It prioritizes clarity over performance.
+//
+// Concretely, ScanScanPacked functions read pointers from mem, assumed to be gc.PageSize-aligned and gc.PageSize in size,
+// and writes them to bufp, which is large enough to guarantee that even if pointer-word of mem is a pointer, it will fit.
+// Therefore bufp, is always at least gc.PageSize in size.
+//
+// ScanSpanPacked is supposed to identify pointers by first filtering words by objMarks, where each bit of the mask
+// represents gc.SizeClassToSize[sizeClass] bytes of memory, and then filtering again by the bits in ptrMask.
+func ScanSpanPackedReference(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       buf := unsafe.Slice(bufp, gc.PageWords)
+       expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
+       for word := range gc.PageWords {
+               objI := uintptr(word) / expandBy
+               if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
+                       continue
+               }
+               if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
+                       continue
+               }
+               ptr := *(*uintptr)(unsafe.Add(mem, word*goarch.PtrSize))
+               if ptr == 0 {
+                       continue
+               }
+               buf[count] = ptr
+               count++
+       }
+       return count
+}
diff --git a/src/internal/runtime/gc/scan/scan_test.go b/src/internal/runtime/gc/scan/scan_test.go
new file mode 100644 (file)
index 0000000..9b57715
--- /dev/null
@@ -0,0 +1,254 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan_test
+
+import (
+       "fmt"
+       "internal/cpu"
+       "internal/goarch"
+       "internal/runtime/gc"
+       "internal/runtime/gc/scan"
+       "math/bits"
+       "math/rand/v2"
+       "slices"
+       "sync"
+       "testing"
+       "unsafe"
+)
+
+// scanFunc is the signature shared by all ScanSpanPacked implementations under test.
+type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
+
+// testScanSpanPacked checks scanF against scan.ScanSpanPackedReference: for
+// every object mask produced by testObjs, both must report the same count and
+// the same scanned words, using one page of fake memory and a fixed
+// pseudo-random pointer mask.
+func testScanSpanPacked(t *testing.T, scanF scanFunc) {
+       // One page of fake memory. Every word holds a distinct value above
+       // gc.PageSize, because a scan function is allowed to discard pointers
+       // smaller than that.
+       mem, free := makeMem(t, 1)
+       defer free()
+       for i := range mem {
+               mem[i] = uintptr(int(gc.PageSize) + i + 1)
+       }
+
+       // A random, but reproducible, pointer mask.
+       var ptrs gc.PtrMask
+       rnd := rand.New(rand.NewPCG(42, 42))
+       for i := range ptrs {
+               ptrs[i] = uintptr(rnd.Uint64())
+       }
+
+       got := make([]uintptr, gc.PageWords)
+       want := make([]uintptr, gc.PageWords)
+       testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
+               sc := uintptr(sizeClass)
+               nGot := scanF(unsafe.Pointer(&mem[0]), &got[0], objs, sc, &ptrs)
+               nWant := scan.ScanSpanPackedReference(unsafe.Pointer(&mem[0]), &want[0], objs, sc, &ptrs)
+
+               if nWant != nGot {
+                       t.Errorf("want %d count, got %d", nWant, nGot)
+                       return
+               }
+               if !slices.Equal(got[:nGot], want[:nWant]) {
+                       t.Errorf("want scanned pointers %d, got %d", want[:nWant], got[:nGot])
+               }
+       })
+}
+
+// testObjs calls f, as a subtest, once per object index for every non-zero
+// size class whose object size does not exceed gc.MinSizeForMallocHeader.
+// Each call receives an object mask with the pattern objMask shifted to
+// that index.
+func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) {
+       for sizeClass := range gc.NumSizeClasses {
+               if sizeClass == 0 {
+                       continue
+               }
+               size := uintptr(gc.SizeClassToSize[sizeClass])
+               if size > gc.MinSizeForMallocHeader {
+                       break // Pointer/scalar metadata is not packed for larger sizes.
+               }
+               t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) {
+                       // Scan a few objects near i to test boundary conditions.
+                       const objMask = 0x101
+                       nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
+                       // Stop early enough that the highest bit of objMask still
+                       // refers to an object inside the span.
+                       for i := range nObj - uintptr(bits.Len(objMask)-1) {
+                               t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) {
+                                       var objs gc.ObjMask
+                                       // NOTE(review): only one mask word is written, so when
+                                       // i%goarch.PtrBits is large the high bit of objMask is
+                                       // shifted out rather than carried into the next word.
+                                       objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits)
+                                       f(t, sizeClass, &objs)
+                               })
+                       }
+               })
+       }
+}
+
+// dataCacheSizes returns cpu.DataCacheSizes(). The first call also prints one
+// line per cache level; the result is cached for later calls.
+var dataCacheSizes = sync.OnceValue(func() []uintptr {
+       cs := cpu.DataCacheSizes()
+       for i, c := range cs {
+               fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize)
+       }
+       return cs
+})
+
+// BenchmarkScanSpanPacked benchmarks the scan implementations across the
+// working-set sizes chosen by benchmarkCacheSizes.
+func BenchmarkScanSpanPacked(b *testing.B) {
+       benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses)
+}
+
+func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) {
+       cacheSizes := dataCacheSizes()
+       b.Run("cache=tiny/pages=1", func(b *testing.B) {
+               fn(b, 1)
+       })
+       for i, cacheBytes := range cacheSizes {
+               pages := int(cacheBytes*3/4) / gc.PageSize
+               b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) {
+                       fn(b, pages)
+               })
+       }
+       ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize
+       b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) {
+               fn(b, ramPages)
+       })
+}
+
+func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) {
+       for sc := range gc.NumSizeClasses {
+               if sc == 0 {
+                       continue
+               }
+               if sc >= gc.MinSizeForMallocHeader {
+                       break
+               }
+               b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) {
+                       benchmarkScanSpanPacked(b, nPages, sc)
+               })
+       }
+}
+
+// benchmarkScanSpanPacked benchmarks the Reference, Go and (when
+// scan.HasFastScanSpanPacked reports true) platform implementations on
+// nPages pages of fake memory for one size class. It sweeps the fraction of
+// marked objects from 0% to 100% in 10% steps, one sub-benchmark per step,
+// and visits the pages in a fixed pseudo-random order.
+func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
+       rnd := rand.New(rand.NewPCG(42, 42))
+
+       // Construct a fake memory
+       mem, free := makeMem(b, nPages)
+       defer free()
+       for i := range mem {
+               // Use values > gc.PageSize because a scan function can discard
+               // pointers smaller than this.
+               mem[i] = uintptr(int(gc.PageSize) + i + 1)
+       }
+
+       // Construct a random pointer mask
+       ptrs := make([]gc.PtrMask, nPages)
+       for i := range ptrs {
+               for j := range ptrs[i] {
+                       ptrs[i][j] = uintptr(rnd.Uint64())
+               }
+       }
+
+       // Visit the pages in a random order
+       pageOrder := rnd.Perm(nPages)
+
+       // Create the scan buffer.
+       buf := make([]uintptr, gc.PageWords)
+
+       // Sweep from 0 marks to all marks. We'll use the same marks for each page
+       // because I don't think that predictability matters.
+       objBytes := uintptr(gc.SizeClassToSize[sizeClass])
+       nObj := gc.PageSize / objBytes
+       markOrder := rnd.Perm(int(nObj))
+       const steps = 11
+       for i := 0; i < steps; i++ {
+               frac := float64(i) / float64(steps-1)
+               // Set frac marks.
+               nMarks := int(float64(len(markOrder))*frac + 0.5)
+               var objMarks gc.ObjMask
+               for _, mark := range markOrder[:nMarks] {
+                       objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits)
+               }
+               // Total, over all pages, of the clusters a scan has to touch.
+               greyClusters := 0
+               for page := range ptrs {
+                       greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page])
+               }
+
+               // Report MB/s of how much memory they're actually hitting. This assumes
+               // 64 byte cache lines (TODO: Should it assume 128 byte cache lines?)
+               // and expands each access to the whole cache line. This is useful for
+               // comparing against memory bandwidth.
+               //
+               // TODO: Add a benchmark that just measures single core memory bandwidth
+               // for comparison. (See runtime memcpy benchmarks.)
+               //
+               // TODO: Should there be a separate measure where we don't expand to
+               // cache lines?
+               avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs))
+
+               b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) {
+                       b.Run("impl=Reference", func(b *testing.B) {
+                               b.SetBytes(avgBytes)
+                               for i := range b.N {
+                                       page := pageOrder[i%len(pageOrder)]
+                                       scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
+                               }
+                       })
+                       b.Run("impl=Go", func(b *testing.B) {
+                               b.SetBytes(avgBytes)
+                               for i := range b.N {
+                                       page := pageOrder[i%len(pageOrder)]
+                                       scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
+                               }
+                       })
+                       if scan.HasFastScanSpanPacked() {
+                               b.Run("impl=Platform", func(b *testing.B) {
+                                       b.SetBytes(avgBytes)
+                                       for i := range b.N {
+                                               page := pageOrder[i%len(pageOrder)]
+                                               scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
+                                       }
+                               })
+                       }
+               })
+       }
+}
+
+func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int {
+       clusters := 0
+       lastCluster := -1
+
+       expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
+       for word := range gc.PageWords {
+               objI := uintptr(word) / expandBy
+               if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
+                       continue
+               }
+               if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
+                       continue
+               }
+               c := word * 8 / goarch.PtrBits
+               if c != lastCluster {
+                       lastCluster = c
+                       clusters++
+               }
+       }
+       return clusters
+}
+
+// BenchmarkScanMaxBandwidth copies one page per iteration, in a fixed
+// pseudo-random page order, for each working-set size chosen by
+// benchmarkCacheSizes.
+func BenchmarkScanMaxBandwidth(b *testing.B) {
+       // Measure the theoretical "maximum" bandwidth of scanning by reproducing
+       // the memory access pattern of a full page scan, but using memcpy as the
+       // kernel instead of scanning.
+       benchmarkCacheSizes(b, func(b *testing.B, heapPages int) {
+               mem, free := makeMem(b, heapPages)
+               defer free()
+               for i := range mem {
+                       mem[i] = uintptr(int(gc.PageSize) + i + 1)
+               }
+               buf := make([]uintptr, gc.PageWords)
+
+               // Visit the pages in a random order
+               rnd := rand.New(rand.NewPCG(42, 42))
+               pageOrder := rnd.Perm(heapPages)
+
+               b.SetBytes(int64(gc.PageSize))
+
+               b.ResetTimer()
+               for i := range b.N {
+                       page := pageOrder[i%len(pageOrder)]
+                       // copy stops at len(buf), so exactly one page is copied.
+                       copy(buf, mem[gc.PageWords*page:])
+               }
+       })
+}
index d2cca1cef13b284167d63a9be802b8f0d3fb85d8..e5d562f943adae68ed0067b6fe72edd9ac028cd7 100644 (file)
@@ -82,14 +82,15 @@ package gc
 //      8192    13         32768
 
 const (
-       MinHeapAlign   = 8
-       MaxSmallSize   = 32768
-       SmallSizeDiv   = 8
-       SmallSizeMax   = 1024
-       LargeSizeDiv   = 128
-       NumSizeClasses = 68
-       PageShift      = 13
-       MaxObjsPerSpan = 1024
+       MinHeapAlign       = 8
+       MaxSmallSize       = 32768
+       SmallSizeDiv       = 8
+       SmallSizeMax       = 1024
+       LargeSizeDiv       = 128
+       NumSizeClasses     = 68
+       PageShift          = 13
+       MaxObjsPerSpan     = 1024
+       MaxSizeClassNPages = 10
 )
 
 var SizeClassToSize = [NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}