From 889ab74169db2c8758f51c1a199a90266b16669b Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 5 Mar 2025 20:12:47 +0000 Subject: [PATCH] internal/runtime/gc/scan: import scan kernel from gclab [green tea] This change imports the AVX512 GC scanning kernel from CL 593938 into a new package, internal/runtime/gc/scan. Credit to Austin Clements for most of this work. I did some cleanup, added support for more size classes to the expanders, and added more testing. I also restructured the code to make it easier and clearer to add new scan kernels for new architectures. For #73581. Change-Id: I76bcbc889fa6cad73ba0084620fae084a5912e6b Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64_avx512,gotip-linux-amd64_avx512-greenteagc Reviewed-on: https://go-review.googlesource.com/c/go/+/655280 LUCI-TryBot-Result: Go LUCI Auto-Submit: Michael Knyszek Reviewed-by: Michael Pratt --- src/cmd/internal/objabi/pkgspecial.go | 1 + src/go/build/deps_test.go | 15 + src/internal/cpu/cpu.go | 4 + src/internal/cpu/cpu_x86.go | 14 +- src/internal/cpu/datacache_unsupported.go | 11 + src/internal/cpu/datacache_x86.go | 121 + src/internal/cpu/datacache_x86_test.go | 26 + src/internal/goarch/goarch.go | 3 + src/internal/runtime/gc/internal/gen/gen.go | 537 ++++ src/internal/runtime/gc/internal/gen/gp.go | 26 + .../runtime/gc/internal/gen/regalloc.go | 338 +++ src/internal/runtime/gc/internal/gen/simd.go | 246 ++ src/internal/runtime/gc/internal/gen/val.go | 137 + src/internal/runtime/gc/malloc.go | 5 +- src/internal/runtime/gc/mksizeclasses.go | 11 +- src/internal/runtime/gc/scan/expand_amd64.go | 22 + src/internal/runtime/gc/scan/expand_amd64.s | 2631 +++++++++++++++++ .../runtime/gc/scan/expand_amd64_test.go | 19 + .../runtime/gc/scan/expand_reference.go | 39 + src/internal/runtime/gc/scan/expand_test.go | 37 + src/internal/runtime/gc/scan/filter.go | 35 + src/internal/runtime/gc/scan/filter_test.go | 94 + .../runtime/gc/scan/mem_nounix_test.go | 16 + 
src/internal/runtime/gc/scan/mem_unix_test.go | 25 + src/internal/runtime/gc/scan/mkasm.go | 412 +++ src/internal/runtime/gc/scan/scan_amd64.go | 41 + src/internal/runtime/gc/scan/scan_amd64.s | 103 + .../runtime/gc/scan/scan_amd64_test.go | 19 + src/internal/runtime/gc/scan/scan_generic.go | 23 + .../runtime/gc/scan/scan_generic_test.go | 14 + src/internal/runtime/gc/scan/scan_go.go | 104 + .../runtime/gc/scan/scan_reference.go | 40 + src/internal/runtime/gc/scan/scan_test.go | 254 ++ src/internal/runtime/gc/sizeclasses.go | 17 +- 34 files changed, 5426 insertions(+), 14 deletions(-) create mode 100644 src/internal/cpu/datacache_unsupported.go create mode 100644 src/internal/cpu/datacache_x86.go create mode 100644 src/internal/cpu/datacache_x86_test.go create mode 100644 src/internal/runtime/gc/internal/gen/gen.go create mode 100644 src/internal/runtime/gc/internal/gen/gp.go create mode 100644 src/internal/runtime/gc/internal/gen/regalloc.go create mode 100644 src/internal/runtime/gc/internal/gen/simd.go create mode 100644 src/internal/runtime/gc/internal/gen/val.go create mode 100644 src/internal/runtime/gc/scan/expand_amd64.go create mode 100644 src/internal/runtime/gc/scan/expand_amd64.s create mode 100644 src/internal/runtime/gc/scan/expand_amd64_test.go create mode 100644 src/internal/runtime/gc/scan/expand_reference.go create mode 100644 src/internal/runtime/gc/scan/expand_test.go create mode 100644 src/internal/runtime/gc/scan/filter.go create mode 100644 src/internal/runtime/gc/scan/filter_test.go create mode 100644 src/internal/runtime/gc/scan/mem_nounix_test.go create mode 100644 src/internal/runtime/gc/scan/mem_unix_test.go create mode 100644 src/internal/runtime/gc/scan/mkasm.go create mode 100644 src/internal/runtime/gc/scan/scan_amd64.go create mode 100644 src/internal/runtime/gc/scan/scan_amd64.s create mode 100644 src/internal/runtime/gc/scan/scan_amd64_test.go create mode 100644 src/internal/runtime/gc/scan/scan_generic.go create mode 100644 
src/internal/runtime/gc/scan/scan_generic_test.go create mode 100644 src/internal/runtime/gc/scan/scan_go.go create mode 100644 src/internal/runtime/gc/scan/scan_reference.go create mode 100644 src/internal/runtime/gc/scan/scan_test.go diff --git a/src/cmd/internal/objabi/pkgspecial.go b/src/cmd/internal/objabi/pkgspecial.go index fe510160b3..94efa6883b 100644 --- a/src/cmd/internal/objabi/pkgspecial.go +++ b/src/cmd/internal/objabi/pkgspecial.go @@ -52,6 +52,7 @@ var runtimePkgs = []string{ "internal/runtime/cgroup", "internal/runtime/exithook", "internal/runtime/gc", + "internal/runtime/gc/scan", "internal/runtime/maps", "internal/runtime/math", "internal/runtime/strconv", diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go index 00e6e562e5..41dde20bf9 100644 --- a/src/go/build/deps_test.go +++ b/src/go/build/deps_test.go @@ -100,6 +100,7 @@ var depsRules = ` < internal/runtime/maps < internal/runtime/strconv < internal/runtime/cgroup + < internal/runtime/gc/scan < runtime < sync/atomic < internal/sync @@ -797,6 +798,20 @@ var depsRules = ` FMT, testing < internal/cgrouptest; C, CGO < internal/runtime/cgobench; + + # Generate-only packages can have anything they want + container/heap, + encoding/binary, + fmt, + hash/maphash, + io, + log, + math/bits, + os, + reflect, + strings, + sync + < internal/runtime/gc/internal/gen; ` // listStdPkgs returns the same list of packages as "go list std". 
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index fca38532dc..e92c1851a2 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -34,15 +34,19 @@ var X86 struct { HasAVX512 bool // Virtual feature: F+CD+BW+DQ+VL HasAVX512F bool HasAVX512CD bool + HasAVX512BITALG bool HasAVX512BW bool HasAVX512DQ bool HasAVX512VL bool HasAVX512VPCLMULQDQ bool + HasAVX512VBMI bool + HasAVX512VBMI2 bool HasBMI1 bool HasBMI2 bool HasERMS bool HasFSRM bool HasFMA bool + HasGFNI bool HasOSXSAVE bool HasPCLMULQDQ bool HasPOPCNT bool diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index 315c26b0dd..6fa30b7763 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -18,7 +18,7 @@ func xgetbv() (eax, edx uint32) func getGOAMD64level() int32 const ( - // ecx bits + // Bits returned in ECX for CPUID EAX=0x1 ECX=0x0 cpuid_SSE3 = 1 << 0 cpuid_PCLMULQDQ = 1 << 1 cpuid_SSSE3 = 1 << 9 @@ -30,7 +30,7 @@ const ( cpuid_OSXSAVE = 1 << 27 cpuid_AVX = 1 << 28 - // ebx bits + // "Extended Feature Flag" bits returned in EBX for CPUID EAX=0x7 ECX=0x0 cpuid_BMI1 = 1 << 3 cpuid_AVX2 = 1 << 5 cpuid_BMI2 = 1 << 8 @@ -43,8 +43,12 @@ const ( cpuid_AVX512BW = 1 << 30 cpuid_AVX512VL = 1 << 31 - // ecx bits + // "Extended Feature Flag" bits returned in ECX for CPUID EAX=0x7 ECX=0x0 + cpuid_AVX512_VBMI = 1 << 1 + cpuid_AVX512_VBMI2 = 1 << 6 + cpuid_GFNI = 1 << 8 cpuid_AVX512VPCLMULQDQ = 1 << 10 + cpuid_AVX512_BITALG = 1 << 12 // edx bits cpuid_FSRM = 1 << 4 @@ -163,6 +167,10 @@ func doinit() { X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ) X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ) + X86.HasAVX512VBMI = isSet(ecx7, cpuid_AVX512_VBMI) + X86.HasAVX512VBMI2 = isSet(ecx7, cpuid_AVX512_VBMI2) + X86.HasGFNI = isSet(ecx7, cpuid_GFNI) + X86.HasAVX512BITALG = isSet(ecx7, cpuid_AVX512_BITALG) } X86.HasFSRM = isSet(edx7, cpuid_FSRM) diff --git a/src/internal/cpu/datacache_unsupported.go 
b/src/internal/cpu/datacache_unsupported.go new file mode 100644 index 0000000000..44544aa8c9 --- /dev/null +++ b/src/internal/cpu/datacache_unsupported.go @@ -0,0 +1,11 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !386 && !amd64 + +package cpu + +func DataCacheSizes() []uintptr { + return nil +} diff --git a/src/internal/cpu/datacache_x86.go b/src/internal/cpu/datacache_x86.go new file mode 100644 index 0000000000..eb7b93b0a2 --- /dev/null +++ b/src/internal/cpu/datacache_x86.go @@ -0,0 +1,121 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build 386 || amd64 + +package cpu + +// DataCacheSizes returns the size of each data cache from lowest +// level in the hierarchy to highest. +// +// Unlike other parts of this package's public API, it is not safe +// to reference early in runtime initialization because it allocates. +// It's intended for testing only. 
+func DataCacheSizes() []uintptr { + maxFunctionInformation, ebx0, ecx0, edx0 := cpuid(0, 0) + if maxFunctionInformation < 1 { + return nil + } + + switch { + // Check for "GenuineIntel" + case ebx0 == 0x756E6547 && ecx0 == 0x6C65746E && edx0 == 0x49656E69: + return getDataCacheSizesIntel(maxFunctionInformation) + // Check for "AuthenticAMD" + case ebx0 == 0x68747541 && ecx0 == 0x444D4163 && edx0 == 0x69746E65: + return getDataCacheSizesAMD() + } + return nil +} + +func extractBits(arg uint32, l int, r int) uint32 { + if l > r { + panic("bad bit range") + } + return (arg >> l) & ((1 << (r - l + 1)) - 1) +} + +func getDataCacheSizesIntel(maxID uint32) []uintptr { + // Constants for cache types + const ( + noCache = 0 + dataCache = 1 + instructionCache = 2 + unifiedCache = 3 + ) + if maxID < 4 { + return nil + } + + // Iterate through CPUID leaf 4 (deterministic cache parameters) + var caches []uintptr + for i := uint32(0); i < 0xFFFF; i++ { + eax, ebx, ecx, _ := cpuid(4, i) + + cacheType := eax & 0xF // EAX bits 4-0: Cache Type + if cacheType == 0 { + break + } + + // Report only data caches. + if !(cacheType == dataCache || cacheType == unifiedCache) { + continue + } + + // Guaranteed to always start counting from 1. + level := (eax >> 5) & 0x7 + + lineSize := extractBits(ebx, 0, 11) + 1 // Bits 11-0: Line size in bytes - 1 + partitions := extractBits(ebx, 12, 21) + 1 // Bits 21-12: Physical line partitions - 1 + ways := extractBits(ebx, 22, 31) + 1 // Bits 31-22: Ways of associativity - 1 + sets := uint64(ecx) + 1 // Number of sets - 1 + size := uint64(ways*partitions*lineSize) * sets // Calculate cache size in bytes + + caches = append(caches, uintptr(size)) + + // If we see more than one cache described per level, or they appear + // out of order, crash. + // + // Going by the SDM, it's not clear whether this is actually possible, + // so this code is purely defensive. 
+ if level != uint32(len(caches)) { + panic("expected levels to be in order and for there to be one data/unified cache per level") + } + } + return caches +} + +func getDataCacheSizesAMD() []uintptr { + maxExtendedFunctionInformation, _, _, _ := cpuid(0x80000000, 0) + if maxExtendedFunctionInformation < 0x80000006 { + return nil + } + + var caches []uintptr + + _, _, ecx5, _ := cpuid(0x80000005, 0) + _, _, ecx6, edx6 := cpuid(0x80000006, 0) + + // The size is returned in KB; convert it to bytes. + l1dSize := uintptr(extractBits(ecx5, 24, 31) << 10) + caches = append(caches, l1dSize) + + // Check that L2 cache is present. + if l2Assoc := extractBits(ecx6, 12, 15); l2Assoc == 0 { + return caches + } + l2Size := uintptr(extractBits(ecx6, 16, 31) << 10) + caches = append(caches, l2Size) + + // Check that L3 cache is present. + if l3Assoc := extractBits(edx6, 12, 15); l3Assoc == 0 { + return caches + } + // Specifies the L3 cache size is within the following range: + // (L3Size[31:18] * 512KB) <= L3 cache size < ((L3Size[31:18]+1) * 512KB). + l3Size := uintptr(extractBits(edx6, 18, 31) * (512 << 10)) + caches = append(caches, l3Size) + + return caches +} diff --git a/src/internal/cpu/datacache_x86_test.go b/src/internal/cpu/datacache_x86_test.go new file mode 100644 index 0000000000..425c525be0 --- /dev/null +++ b/src/internal/cpu/datacache_x86_test.go @@ -0,0 +1,26 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build 386 || amd64 + +package cpu_test + +import ( + "internal/cpu" + "testing" +) + +// Tests fetching data cache sizes. This test only checks that DataCacheSizes +// won't explode. Otherwise it's just informational, and dumps the current +// data cache sizes. +func TestDataCacheSizes(t *testing.T) { + // N.B. Don't try to check these values because we don't know what + // kind of environment we're running in. 
We don't want this test to + // fail on some random x86 chip that happens to not support the right + // CPUID bits for some reason. + caches := cpu.DataCacheSizes() + for i, size := range caches { + t.Logf("L%d: %d", i+1, size) + } +} diff --git a/src/internal/goarch/goarch.go b/src/internal/goarch/goarch.go index f52fe6c42e..4da56dda9d 100644 --- a/src/internal/goarch/goarch.go +++ b/src/internal/goarch/goarch.go @@ -34,6 +34,9 @@ const ( // It is also the size of the machine's native word size (that is, 4 on 32-bit systems, 8 on 64-bit). const PtrSize = 4 << (^uintptr(0) >> 63) +// PtrBits is the bit width of a pointer. +const PtrBits = PtrSize * 8 + // ArchFamily is the architecture family (AMD64, ARM, ...) const ArchFamily ArchFamilyType = _ArchFamily diff --git a/src/internal/runtime/gc/internal/gen/gen.go b/src/internal/runtime/gc/internal/gen/gen.go new file mode 100644 index 0000000000..0758f9b242 --- /dev/null +++ b/src/internal/runtime/gc/internal/gen/gen.go @@ -0,0 +1,537 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gen + +import ( + "container/heap" + "encoding/binary" + "fmt" + "hash/maphash" + "io" + "log" + "os" + "reflect" + "strings" +) + +const logCompile = true + +func fatalf(f string, args ...any) { + panic(fmt.Sprintf(f, args...)) +} + +type File struct { + w io.Writer + funcs []*Func + consts []fileConst +} + +func NewFile(w io.Writer) *File { + return &File{w: w} +} + +func (f *File) AddFunc(fn *Func) { + f.funcs = append(f.funcs, fn) +} + +type fileConst struct { + name string + data any +} + +func (f *File) AddConst(name string, data any) { + // TODO: It would be nice if this were unified with "const" ops, but the + // reason I added this was for []*Func consts, which would take an overhaul + // to represent in "const" ops. 
+ f.consts = append(f.consts, fileConst{name, data}) +} + +type Func struct { + name string + nArgs int + idGen int + ops []*op +} + +func NewFunc(name string) *Func { + fn := &Func{name: name} + return fn +} + +// attach adds x to fn's op list. If x has any unattached arguments, this adds +// those first (recursively). +func (fn *Func) attach(x *op) { + // Make sure the arguments are attached to the function. + for _, arg := range x.args { + argFn := arg.fn + if argFn == nil { + fn.attach(arg) + } else if argFn != fn { + panic("ops from different functions") + } + } + + x.fn = fn + x.id = fn.idGen + fn.idGen++ + fn.ops = append(fn.ops, x) +} + +func Arg[W wrap[T], T Word](fn *Func) T { + loc := locReg{cls: regClassGP, reg: fn.nArgs} + fn.nArgs++ + var x W + o := &op{op: "arg", kind: x.kind(), c: loc} + fn.attach(o) + return x.wrap(o) +} + +func Return(results ...Value) { + args := make([]*op, len(results)) + for i, res := range results { + args[i] = res.getOp() + } + var x void + x.initOp(&op{op: "return", kind: voidKind, args: args}) +} + +type op struct { + op string + kind *kind + args []*op + + id int + fn *Func + + // c depends on "op". 
+ // + // arg locReg - The register containing the argument value + // const any - The constant value + // deref int - Byte offset from args[0] + c any + name string +} + +func (o *op) String() string { + return fmt.Sprintf("v%02d", o.id) +} + +func imm(val any) *op { + return &op{op: "imm", c: val} +} + +func (o *op) equalNoName(o2 *op) bool { + if o.op != o2.op || o.c != o2.c || len(o.args) != len(o2.args) { + return false + } + for i, arg := range o.args { + if o2.args[i] != arg { + return false + } + } + return true +} + +func (o *op) write(w io.Writer) { + fmt.Fprintf(w, "v%02d = %s", o.id, o.op) + for _, arg := range o.args { + fmt.Fprintf(w, " v%02d", arg.id) + } + if o.c != nil { + fmt.Fprintf(w, " %v", o.c) + } + if o.name != "" { + fmt.Fprintf(w, " %q", o.name) + } + if o.kind != nil { + fmt.Fprintf(w, " [%s]", o.kind.typ) + } + fmt.Fprintf(w, "\n") +} + +func (fn *Func) write(w io.Writer) { + fmt.Fprintf(w, "FUNC %s\n", fn.name) + for _, op := range fn.ops { + op.write(w) + } +} + +func (f *File) Compile() { + // TODO: CSE constants across the whole file + + fmt.Fprintf(f.w, `#include "go_asm.h" +#include "textflag.h" + +`) + + for _, c := range f.consts { + f.emitConst(c.name, c.data) + } + + trace := func(fn *Func, step string) { + if !logCompile { + return + } + log.Printf("## Compiling %s: %s", fn.name, step) + fn.write(os.Stderr) + } + + for _, fn := range f.funcs { + trace(fn, "initial") + + for { + if fn.cse() { + trace(fn, "post cse") + continue + } + if fn.deadcode() { + trace(fn, "post deadcode") + continue + } + break + } + fn.addLoads() + trace(fn, "post addLoads") + + // Assigning locations requires ops to be in dependency order. + fn.schedule() + trace(fn, "post schedule") + + locs := fn.assignLocs() + + fn.emit(f, locs) + } +} + +// cse performs common subexpression elimination. 
+func (fn *Func) cse() bool { + // Compute structural hashes + hashes := make(map[*op]uint64) + var h maphash.Hash + var bbuf [8]byte + for _, op := range fn.ops { + // We ignore the name for canonicalization. + h.Reset() + h.WriteString(op.op) + // TODO: Ideally we would hash op.c, but we don't have a good way to do that. + for _, arg := range op.args { + if _, ok := hashes[arg]; !ok { + panic("ops not in dependency order") + } + binary.NativeEndian.PutUint64(bbuf[:], hashes[arg]) + h.Write(bbuf[:]) + } + hashes[op] = h.Sum64() + } + + canon := make(map[uint64][]*op) + lookup := func(o *op) *op { + hash := hashes[o] + for _, o2 := range canon[hash] { + if o.equalNoName(o2) { + return o2 + } + } + canon[hash] = append(canon[hash], o) + return o + } + + // Canonicalize ops. + dirty := false + for _, op := range fn.ops { + for i, arg := range op.args { + newArg := lookup(arg) + if arg != newArg { + dirty = true + op.args[i] = newArg + } + } + } + return dirty +} + +// deadcode eliminates unused ops. +func (fn *Func) deadcode() bool { + marks := make(map[*op]bool) + var mark func(o *op) + mark = func(o *op) { + if marks[o] { + return + } + marks[o] = true + for _, arg := range o.args { + mark(arg) + } + } + // Mark operations that have a side-effect. + for _, op := range fn.ops { + switch op.op { + case "return": + mark(op) + } + } + // Discard unmarked operations + if len(marks) == len(fn.ops) { + return false + } + newOps := make([]*op, 0, len(marks)) + for _, op := range fn.ops { + if marks[op] { + newOps = append(newOps, op) + } + } + fn.ops = newOps + return true +} + +// canMem is a map from operation to a bitmap of which arguments can use a +// direct memory reference. +var canMem = map[string]uint64{ + "VPERMB": 1 << 0, + "VPERMI2B": 1 << 0, + "VPERMT2B": 1 << 0, + "VGF2P8AFFINEQB": 1 << 0, + "VPORQ": 1 << 0, + "VPSUBQ": 1 << 0, + "VPSHUFBITQMB": 1 << 0, +} + +// addLoads inserts load ops for ops that can't take memory inputs directly. 
+func (fn *Func) addLoads() { + // A lot of operations can directly take memory locations. If there's only a + // single reference to a deref operation, and the operation can do the deref + // itself, eliminate the deref. If there's more than one reference, then we + // leave the load so we can share the value in the register. + nRefs := fn.opRefs() + loads := make(map[*op]*op) // deref -> load + for _, o := range fn.ops { + canMask := canMem[o.op] + for i, arg := range o.args { + // TODO: Many AVX-512 operations that support memory operands also + // support a ".BCST" suffix that performs a broadcasting memory + // load. If the const can be broadcast and all uses support + // broadcast load, it would be nice to use .BCST. I'm not sure if + // that belongs in this pass or a different one. + if arg.op == "deref" || arg.op == "const" { + // These produce memory locations. + if canMask&(1< 1 { + // This argument needs to be loaded into a register. + load, ok := loads[arg] + if !ok { + load = makeLoad(arg) + fn.attach(load) + loads[arg] = load + } + o.args[i] = load + } + } + } + } +} + +func (fn *Func) opRefs() map[*op]int { + refs := make(map[*op]int) + for _, o1 := range fn.ops { + for _, arg := range o1.args { + refs[arg]++ + } + } + return refs +} + +func makeLoad(deref *op) *op { + var inst string + switch deref.kind.reg { + default: + fatalf("don't know how to load %v", deref.kind.reg) + case regClassGP: + inst = "MOVQ" + case regClassZ: + inst = "VMOVDQU64" + } + // The load references deref rather than deref.args[0] because when we + // assign locations, the deref op gets the memory location to load from, + // while its argument has some other location (like a register). Also, the + // offset to deref is attached to the deref op. 
+ return &op{op: inst, kind: deref.kind, args: []*op{deref}} +} + +type opHeap []*op + +func (h opHeap) Len() int { return len(h) } +func (h opHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } +func (h opHeap) Less(i, j int) bool { + priority := func(o *op) int { + if o.op == "deref" || o.op == "const" { + // Input to memory load + return 1 + } + if len(o.args) > 0 && (o.args[0].op == "deref" || o.args[0].op == "const") { + // Memory load + return 2 + } + return 100 + } + if p1, p2 := priority(h[i]), priority(h[j]); p1 != p2 { + return p1 < p2 + } + return h[i].id < h[j].id +} + +func (h *opHeap) Push(x any) { + *h = append(*h, x.(*op)) +} + +func (h *opHeap) Pop() any { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// schedule ensures fn's ops are in dependency order. +func (fn *Func) schedule() { + // TODO: This tends to generate a huge amount of register pressure, mostly + // because it floats loads as early as possible and partly because it has no + // concept of rematerialization and CSE can make rematerializable values + // live for a very long time. In some sense it doesn't matter because we + // don't run out of registers for anything we need. 
+ + missing := make(map[*op]int) + uses := make(map[*op][]*op) + var h opHeap + for _, op := range fn.ops { + if len(op.args) == 0 { + h = append(h, op) + } else { + missing[op] = len(op.args) + } + for _, arg := range op.args { + uses[arg] = append(uses[arg], op) + } + } + heap.Init(&h) + + newOps := make([]*op, 0, len(fn.ops)) + for len(h) > 0 { + if false { + log.Printf("schedule: %s", h) + } + top := h[0] + newOps = append(newOps, top) + heap.Pop(&h) + for _, o := range uses[top] { + missing[o]-- + if missing[o] == 0 { + heap.Push(&h, o) + } + } + } + if len(newOps) != len(fn.ops) { + log.Print("schedule didn't schedule all ops") + log.Print("before:") + fn.write(os.Stderr) + fn.ops = newOps + log.Print("after:") + fn.write(os.Stderr) + log.Fatal("bad schedule") + } + + fn.ops = newOps +} + +func (fn *Func) emit(f *File, locs map[*op]loc) { + w := f.w + + // Emit constants first + for _, o := range fn.ops { + if o.op == "const" { + name := locs[o].(locMem).name + f.emitConst(name, o.c) + } + } + + fmt.Fprintf(w, "TEXT %s(SB), NOSPLIT, $0-0\n", fn.name) + + // Emit body + for _, o := range fn.ops { + switch o.op { + case "const", "arg", "return", "deref", "imm": + // Does not produce code + continue + } + switch o.op { + case "addConst": + fatalf("addConst not lowered") + } + + opName := o.op + // A ".mask" suffix is used to distinguish AVX-512 ops that use the same + // mnemonic for regular and masked mode. + opName = strings.TrimSuffix(opName, ".mask") + + fmt.Fprintf(w, "\t%s", opName) + if o.op == "VGF2P8AFFINEQB" { + // Hidden immediate, but always 0 + // + // TODO: Replace this with an imm input. 
+ fmt.Fprintf(w, " $0,") + } + for i, arg := range o.args { + if i == 0 { + fmt.Fprintf(w, " ") + } else { + fmt.Fprintf(w, ", ") + } + if arg.op == "imm" { + fmt.Fprintf(w, "$0x%x", arg.c) + } else { + fmt.Fprint(w, locs[arg].LocString()) + } + } + if _, ok := opRMW[o.op]; ok { + // Read-modify-write instructions, so the output is already in the + // arguments above. + } else { + fmt.Fprintf(w, ", %s", locs[o].LocString()) + } + fmt.Fprintf(w, "\n") + } + fmt.Fprintf(w, "\tRET\n") + fmt.Fprintf(w, "\n") +} + +func (f *File) emitConst(name string, data any) { + switch data := data.(type) { + case []*Func: + fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, len(data)*8) + for i, fn := range data { + fmt.Fprintf(f.w, "DATA %s+%#02x(SB)/8, ", name, 8*i) + if fn == nil { + fmt.Fprintf(f.w, "$0\n") + } else { + fmt.Fprintf(f.w, "$%s(SB)\n", fn.name) + } + } + fmt.Fprintf(f.w, "\n") + return + } + + // Assume it's a numeric slice or array + rv := reflect.ValueOf(data) + sz := int(rv.Type().Elem().Size()) + fmt.Fprintf(f.w, "GLOBL %s(SB), RODATA, $%#x\n", name, rv.Len()*sz) + for wi := 0; wi < sz*rv.Len()/8; wi++ { // Iterate over words + var word uint64 + for j := 0; j < 8/sz; j++ { // Iterate over elements in this word + d := rv.Index(wi*8/sz + j).Uint() + word |= d << (j * sz * 8) + } + fmt.Fprintf(f.w, "DATA %s+%#02x(SB)/8, $%#016x\n", name, 8*wi, word) + } + + fmt.Fprintf(f.w, "\n") +} diff --git a/src/internal/runtime/gc/internal/gen/gp.go b/src/internal/runtime/gc/internal/gen/gp.go new file mode 100644 index 0000000000..390d6e50ed --- /dev/null +++ b/src/internal/runtime/gc/internal/gen/gp.go @@ -0,0 +1,26 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package gen + +type Uint64 struct { + valGP +} + +var kindUint64 = &kind{typ: "Uint64", reg: regClassGP} + +func ConstUint64(c uint64, name string) (y Uint64) { + y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name}) + return y +} + +func (Uint64) kind() *kind { + return kindUint64 +} + +func (Uint64) wrap(x *op) Uint64 { + var y Uint64 + y.initOp(x) + return y +} diff --git a/src/internal/runtime/gc/internal/gen/regalloc.go b/src/internal/runtime/gc/internal/gen/regalloc.go new file mode 100644 index 0000000000..424a295afb --- /dev/null +++ b/src/internal/runtime/gc/internal/gen/regalloc.go @@ -0,0 +1,338 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gen + +import ( + "fmt" + "log" + "math/bits" + "strings" +) + +const traceRegAlloc = true + +type regClass uint8 + +const ( + regClassFixed regClass = iota + regClassGP + regClassZ + regClassK + + numRegClasses + + regClassNone = ^regClass(0) +) + +type locReg struct { + cls regClass + reg int +} + +func (l locReg) LocString() string { + switch l.cls { + case regClassFixed: + return fixedRegs[l.reg] + case regClassGP: + return gpRegs[l.reg] + case regClassZ: + return fmt.Sprintf("Z%d", l.reg) + case regClassK: + return fmt.Sprintf("K%d", l.reg) + } + panic("bad register class") +} + +func (l locReg) Deref(off int) (loc, error) { + return locMem{l, off, ""}, nil +} + +func (l locReg) Reg() (locReg, bool) { + return l, true +} + +type locMem struct { + base locReg + off int + name string +} + +func (l locMem) LocString() string { + if l.base.cls == regClassFixed && l.base.reg == regSB && l.off == 0 { + return l.name + "(SB)" + } + if l.name != "" { + return fmt.Sprintf("%s+%d(%s)", l.name, l.off, l.base.LocString()) + } + if l.off != 0 { + return fmt.Sprintf("%d(%s)", l.off, l.base.LocString()) + } + return "(" + l.base.LocString() + ")" +} + +func (l locMem) Deref(off int) (loc, 
error) { + return nil, fmt.Errorf("cannot dereference already memory address %s", l.LocString()) +} + +func (l locMem) Reg() (locReg, bool) { + if l.base.cls == regClassFixed { + return locReg{}, false + } + return l.base, true +} + +type loc interface { + LocString() string // Return the assembly syntax for this location + Deref(off int) (loc, error) // Treat this location as an address and return a location with the contents of memory at that address + Reg() (locReg, bool) // Register used by this location +} + +var opRMW = map[string]int{ + "VPERMI2B": 2, // Overwrites third argument + "VPERMI2B.Z": 3, // Overwrites fourth argument + "VPERMI2B.mask": 3, // Overwrites fourth argument + "VPERMT2B": 1, // Overwrites second argument TODO: Check this. Unused for now. + "VPBROADCASTQ.mask": 2, // Overwrites last argument +} + +// TODO: Should we have a general rule that all ".mask" instructions overwrite +// their last argument? + +const ( + regSB = iota + regFP +) + +var fixedRegs = []string{regSB: "SB", regFP: "FP"} +var gpRegs = []string{"AX", "BX", "CX", "DI", "SI", "R8", "R9", "R10", "R11"} // ABI argument order + +type regSet struct { + inUse [numRegClasses]uint32 +} + +func (s *regSet) used(o *op, l loc) { + if l == nil { + return + } + reg, ok := l.Reg() + if !ok { + return + } + if traceRegAlloc { + log.Printf(" alloc %s @ v%02d", reg.LocString(), o.id) + } + if s.inUse[reg.cls]&(1<") + + // Create map from op -> fn.ops index + opIndexes := make(map[*op]int, len(fn.ops)) + for i, o := range fn.ops { + opIndexes[o] = i + } + + // Read-modify-write operations share a location with one of their inputs. + // Likewise, deref ops extend the lifetime of their input (but in a shared + // way, unlike RMW ops). + // + // Compute a map from each op to the earliest "canonical" op whose live + // range we'll use. + canon := make(map[*op]*op) + overwritten := make(map[*op]bool) + for _, o := range fn.ops { + // Check that this op doesn't use any overwritten inputs. 
+ for _, arg := range o.args { + if overwritten[arg] { + // TODO: The solution to this is to insert copy ops. + fatalf("op %+v uses overwritten input %+v", o, arg) + } + } + + // Record canonical op. + rmw, ok := opRMW[o.op] + if ok { + canon[o] = canon[o.args[rmw]] + // Record that the input is dead now and must not be referenced. + overwritten[o.args[rmw]] = true + } else if o.op == "deref" { + canon[o] = canon[o.args[0]] + } else { + canon[o] = o + } + } + + // Compute live ranges of each canonical op. + // + // First, find the last use of each op. + lastUses := make(map[*op]*op) // Canonical creation op -> last use op + for _, op := range fn.ops { + for _, arg := range op.args { + lastUses[canon[arg]] = op + } + } + // Invert the last uses map to get a map from op to the (canonical) values + // that die at that op. + lastUseMap := make(map[*op][]*op) // op of last use -> (canonical) creation ops + for def, lastUse := range lastUses { + lastUseMap[lastUse] = append(lastUseMap[lastUse], def) + } + + // Prepare for assignments + regUsed := make([]regSet, len(fn.ops)) // In-use registers at each op + for i := range regUsed { + // X15/Y15/Z15 is reserved by the Go ABI + regUsed[i].inUse[regClassZ] |= 1 << 15 + // K0 is contextual (if used as an opmask, it means no mask). Too + // complicated, so just ignore it. 
+ regUsed[i].inUse[regClassK] |= 1 << 0 + } + locs := make(map[*op]loc) + assign := func(o *op, l loc) { + if have, ok := locs[o]; ok { + fatalf("op %+v already assigned location %v (new %v)", o, have, l) + return + } + if o == canon[o] { + // Mark this location used over o's live range + for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ { + regUsed[i].used(fn.ops[i], l) + } + } + locs[o] = l + } + + // Assign fixed locations + id := 0 + for _, o := range fn.ops { + switch o.op { + case "arg": + if traceRegAlloc { + log.Printf("fixed op %+v", o) + } + assign(o, o.c.(locReg)) + case "const": + if traceRegAlloc { + log.Printf("fixed op %+v", o) + } + name := o.name + if name == "" { + name = fmt.Sprintf("%s_%d<>", nameBase, id) + id++ + } else if name[0] == '*' { + name = nameBase + name[1:] + } + assign(o, locMem{locReg{cls: regClassFixed, reg: regSB}, 0, name}) + case "return": + if traceRegAlloc { + log.Printf("fixed op %+v", o) + } + assign(o, nil) // no location + // TODO: argZ should start at 0. + argGP, argZ := 0, 1 + for _, arg := range o.args { + switch arg.kind.reg { + default: + fatalf("bad register class for return value") + case regClassGP: + assign(canon[arg], locReg{regClassGP, argGP}) + argGP++ + case regClassZ: + assign(canon[arg], locReg{regClassZ, argZ}) + argZ++ + } + } + case "imm": + assign(o, nil) // no location + } + } + + // Assign locations. + for _, o := range fn.ops { + if traceRegAlloc { + log.Printf("assign %+v", o) + } + + if _, ok := locs[o]; ok { + // Already assigned a fixed location above. + continue + } + + if o.op == "deref" { + loc, err := locs[o.args[0]].Deref(o.c.(int)) + if err != nil { + fatalf("%v", err) + } + // We don't "assign" here because we've already processed the + // canonical op, which marked loc's register as in-use for the whole + // live range. + locs[o] = loc + continue + } + + if canon[o] != o { + // Copy the canonical op's location. 
+ locs[o] = locs[canon[o]] + continue + } + // Below here we know that o is already a canonical op. + + if _, ok := opRMW[o.op]; ok { + fatalf("read-modify-write op not canonicalized") + } + + // Find a free register of the right class. + cls := o.kind.reg + var used uint32 + for i := opIndexes[o]; i < opIndexes[lastUses[o]]; i++ { + used |= regUsed[i].inUse[cls] + } + + // Assign result location. + num := bits.TrailingZeros32(^used) + switch cls { + default: + fatalf("unknown reg class %v", cls) + case regClassGP: + if num >= len(gpRegs) { + panic("out of GP regs") + } + case regClassZ: + if num >= 32 { + panic("out of Z regs") + } + case regClassK: + if num >= 8 { + panic("out of K regs") + } + } + loc := locReg{cls, num} + assign(o, loc) + } + + return locs +} diff --git a/src/internal/runtime/gc/internal/gen/simd.go b/src/internal/runtime/gc/internal/gen/simd.go new file mode 100644 index 0000000000..0360aa4b06 --- /dev/null +++ b/src/internal/runtime/gc/internal/gen/simd.go @@ -0,0 +1,246 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gen + +type Uint8x64 struct { + valAny +} + +var kindUint8x64 = &kind{typ: "Uint8x64", reg: regClassZ} + +func ConstUint8x64(c [64]uint8, name string) (y Uint8x64) { + y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name}) + return y +} + +func (Uint8x64) kind() *kind { + return kindUint8x64 +} + +func (Uint8x64) wrap(x *op) Uint8x64 { + var y Uint8x64 + y.initOp(x) + return y +} + +func (x Uint8x64) ToUint64x8() (z Uint64x8) { + z.op = x.op + return z +} + +func (x Uint8x64) Shuffle(shuf Uint8x64) (y Uint8x64) { + if shuf.op.op == "const" { + // TODO: There are often patterns we can take advantage of here. Sometimes + // we can do a broadcast. Sometimes we can at least do a quadword + // permutation instead of a full byte permutation. 
+ + // Range check the shuffle + for i, inp := range shuf.op.c.([64]uint8) { + // 0xff is a special "don't care" value + if !(inp == 0xff || inp < 64) { + fatalf("shuffle[%d] = %d out of range [0, %d) or 0xff", i, inp, 64) + } + } + } + + args := []*op{x.op, shuf.op} + y.initOp(&op{op: "VPERMB", kind: y.kind(), args: args}) + return y +} + +func (x Uint8x64) ShuffleZeroed(shuf Uint8x64, mask Mask64) (y Uint8x64) { + args := []*op{x.op, shuf.op, mask.op} + y.initOp(&op{op: "VPERMB.Z", kind: y.kind(), args: args}) + return y +} + +func (x Uint8x64) ShuffleMasked(shuf Uint8x64, mask Mask64) (y Uint8x64) { + args := []*op{x.op, shuf.op, mask.op} + y.initOp(&op{op: "VPERMB.mask", kind: y.kind(), args: args}) + return y +} + +// TODO: The two-argument shuffle is a little weird. You almost want the +// receiver to be the shuffle and the two arguments to be the two inputs, but +// that's almost certainly *not* what you want for the single input shuffle. + +func (x Uint8x64) Shuffle2(y Uint8x64, shuf Uint8x64) (z Uint8x64) { + // Confusingly, the inputs are in the opposite order from what you'd expect. + args := []*op{y.op, x.op, shuf.op} + z.initOp(&op{op: "VPERMI2B", kind: z.kind(), args: args}) + return z +} + +func (x Uint8x64) Shuffle2Zeroed(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) { + // Confusingly, the inputs are in the opposite order from what you'd expect. + args := []*op{y.op, x.op, mask.op, shuf.op} + z.initOp(&op{op: "VPERMI2B.Z", kind: z.kind(), args: args}) + return z +} + +func (x Uint8x64) Shuffle2Masked(y Uint8x64, shuf Uint8x64, mask Mask64) (z Uint8x64) { + // Confusingly, the inputs are in the opposite order from what you'd expect. 
+ args := []*op{y.op, x.op, mask.op, shuf.op} + z.initOp(&op{op: "VPERMI2B.mask", kind: z.kind(), args: args}) + return z +} + +type Uint64x8 struct { + valAny +} + +var kindUint64x8 = &kind{typ: "Uint64x8", reg: regClassZ} + +func ConstUint64x8(c [8]uint64, name string) (y Uint64x8) { + // TODO: Sometimes these can be optimized into broadcast loads. + y.initOp(&op{op: "const", kind: y.kind(), c: c, name: name}) + return y +} + +func BroadcastUint64x8Zeroed(src Uint64, mask Mask8) (z Uint64x8) { + z.initOp(&op{op: "VPBROADCASTQ.Z", kind: z.kind(), args: []*op{src.op, mask.op}}) + return z +} + +func (x Uint64x8) BroadcastMasked(src Uint64, mask Mask8) (z Uint64x8) { + z.initOp(&op{op: "VPBROADCASTQ.mask", kind: z.kind(), args: []*op{src.op, mask.op, x.op}}) + return z +} + +func (Uint64x8) kind() *kind { + return kindUint64x8 +} + +func (Uint64x8) wrap(x *op) Uint64x8 { + var y Uint64x8 + y.initOp(x) + return y +} + +func (x Uint64x8) Or(y Uint64x8) (z Uint64x8) { + z.initOp(&op{op: "VPORQ", kind: z.kind(), args: []*op{y.op, x.op}}) + return z +} + +func (x Uint64x8) Sub(y Uint64x8) (z Uint64x8) { + // Arguments are backwards + z.initOp(&op{op: "VPSUBQ", kind: z.kind(), args: []*op{y.op, x.op}}) + return z +} + +func (x Uint64x8) ToUint8x64() (z Uint8x64) { + z.op = x.op + return z +} + +func (x Uint64x8) GF2P8Affine(y Uint8x64) (z Uint8x64) { + // matrix, vector + z.initOp(&op{op: "VGF2P8AFFINEQB", kind: z.kind(), args: []*op{x.op, y.op}}) + return z +} + +func (x Uint64x8) ShuffleBits(y Uint8x64) (z Mask64) { + z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op}}) + return z +} + +func (x Uint64x8) ShuffleBitsMasked(y Uint8x64, mask Mask64) (z Mask64) { + // This is always zeroing if the mask is provided. 
+ z.initOp(&op{op: "VPSHUFBITQMB", kind: z.kind(), args: []*op{y.op, x.op, mask.op}}) + return z +} + +type Mask8 struct { + valAny +} + +var kindMask8 = &kind{typ: "Mask8", reg: regClassK} + +func ConstMask8(c uint8) (y Mask8) { + var tmp Uint64 + tmp.initOp(&op{op: "MOVQ", kind: tmp.kind(), args: []*op{imm(c)}}) + y.initOp(&op{op: "KMOVB", kind: y.kind(), args: []*op{tmp.op}}) + return y +} + +func (Mask8) kind() *kind { + return kindMask8 +} + +func (Mask8) wrap(x *op) Mask8 { + var y Mask8 + y.initOp(x) + return y +} + +func (x Mask8) ToUint8() (z Uint64) { + z.initOp(&op{op: "KMOVB", kind: z.kind(), args: []*op{x.op}}) + return z +} + +func (x Mask8) Or(y Mask8) (z Mask8) { + z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}}) + return z +} + +func (x Mask8) ShiftLeft(c uint8) (z Mask8) { + if c == 0 { + z = x + } else { + z.initOp(&op{op: "KSHIFTLB", kind: z.kind(), args: []*op{imm(c), x.op}}) + } + return z +} + +type Mask64 struct { + valAny +} + +var kindMask64 = &kind{typ: "Mask64", reg: regClassK} + +func ConstMask64(c uint64) (y Mask64) { + var tmp Uint64 + tmp.initOp(&op{op: "MOVQ", kind: tmp.kind(), args: []*op{imm(c)}}) + y.initOp(&op{op: "KMOVQ", kind: y.kind(), args: []*op{tmp.op}}) + return y +} + +func (Mask64) kind() *kind { + return kindMask64 +} + +func (Mask64) wrap(x *op) Mask64 { + var y Mask64 + y.initOp(x) + return y +} + +func (x Mask64) ToUint64() (z Uint64) { + z.initOp(&op{op: "KMOVQ", kind: z.kind(), args: []*op{x.op}}) + return z +} + +func (x Mask64) Or(y Mask64) (z Mask64) { + z.initOp(&op{op: "KORQ", kind: z.kind(), args: []*op{y.op, x.op}}) + return z +} + +func (x Mask64) ShiftLeft(c uint8) (z Mask64) { + if c == 0 { + z = x + } else { + z.initOp(&op{op: "KSHIFTLQ", kind: z.kind(), args: []*op{imm(c), x.op}}) + } + return z +} + +func (x Mask64) ShiftRight(c uint8) (z Mask64) { + if c == 0 { + z = x + } else { + z.initOp(&op{op: "KSHIFTRQ", kind: z.kind(), args: []*op{imm(c), x.op}}) + } + return z +} diff --git 
a/src/internal/runtime/gc/internal/gen/val.go b/src/internal/runtime/gc/internal/gen/val.go new file mode 100644 index 0000000000..24a843a62c --- /dev/null +++ b/src/internal/runtime/gc/internal/gen/val.go @@ -0,0 +1,137 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gen + +import "sync" + +type Value interface { + kind() *kind + getOp() *op +} + +type Word interface { + Value + isWord() +} + +// wrap is an unfortunate necessity so that we can pass Value types around as +// values (not pointers), but still have generic functions that can construct a +// new Value. Ideally we would just have a method on Value to initialize its op, +// but that needs to have a non-pointer receiver to satisfy the interface and +// then it can't mutate the Value. +type wrap[T Value] interface { + Value + wrap(x *op) T +} + +type kind struct { + typ string + reg regClass +} + +type void struct { + valAny +} + +var voidKind = &kind{typ: "void", reg: regClassNone} + +func (void) kind() *kind { return voidKind } + +type Ptr[T Value] struct { + valGP +} + +// Ptr is a Word +var _ Word = Ptr[void]{} + +var ptrKinds = sync.Map{} // *kind -> *kind + +func (Ptr[T]) kind() *kind { + var x T + xk := x.kind() + pk, ok := ptrKinds.Load(xk) + if !ok { + k := &kind{typ: "Ptr[" + x.kind().typ + "]", reg: regClassGP} + pk, _ = ptrKinds.LoadOrStore(xk, k) + } + return pk.(*kind) +} + +func (Ptr[T]) wrap(x *op) Ptr[T] { + var y Ptr[T] + y.initOp(x) + return y +} + +func (x Ptr[T]) AddConst(off int) (y Ptr[T]) { + base := x.op + for base.op == "addConst" { + off += base.args[1].c.(int) + base = base.args[0] + } + y.initOp(&op{op: "addConst", kind: y.kind(), args: []*op{base, imm(off)}}) + return y +} + +func Deref[W wrap[T], T Value](ptr Ptr[W]) T { + var off int + base := ptr.op + for base.op == "addConst" { + off += base.args[1].c.(int) + base = base.args[0] + } + + var y W + 
return y.wrap(&op{op: "deref", kind: y.kind(), args: []*op{base}, c: off}) +} + +type Array[T Value] struct { + valAny +} + +func ConstArray[T Value](vals []T, name string) (y Array[T]) { + // TODO: This probably doesn't actually work because emitConst won't + // understand vals. + y.initOp(&op{op: "const", kind: y.kind(), c: vals, name: name}) + return y +} + +func (Array[T]) kind() *kind { + // TODO: Cache this like Ptr.kind. + var x T + return &kind{typ: "Array[" + x.kind().typ + "]", reg: regClassNone} +} + +type valGP struct { + valAny +} + +func (valGP) isWord() {} + +type valAny struct { + *op +} + +func (v *valAny) initOp(x *op) { + if v.op != nil { + panic("double init of val") + } + if x.kind == nil { + panic("val missing kind") + } + v.op = x + + // Figure out this value's function. + for _, arg := range x.args { + if fn := arg.fn; fn != nil { + fn.attach(x) + break + } + } +} + +func (v valAny) getOp() *op { + return v.op +} diff --git a/src/internal/runtime/gc/malloc.go b/src/internal/runtime/gc/malloc.go index bb54fff686..7c36a6bfbe 100644 --- a/src/internal/runtime/gc/malloc.go +++ b/src/internal/runtime/gc/malloc.go @@ -7,7 +7,8 @@ package gc import "internal/goarch" const ( - ptrBits = 8 * goarch.PtrSize + // PageWords is the number of pointer-words per page. + PageWords = PageSize / goarch.PtrSize // A malloc header is functionally a single type pointer, but // we need to use 8 here to ensure 8-byte alignment of allocations @@ -43,7 +44,7 @@ const ( // would not be invariant to size-class rounding. Eschewing this property means a // more complex check or possibly storing additional state to determine whether a // span has malloc headers. - MinSizeForMallocHeader = goarch.PtrSize * ptrBits + MinSizeForMallocHeader = goarch.PtrSize * goarch.PtrBits // PageSize is the increment in which spans are managed. 
PageSize = 1 << PageShift diff --git a/src/internal/runtime/gc/mksizeclasses.go b/src/internal/runtime/gc/mksizeclasses.go index ea48cda469..e7b848af02 100644 --- a/src/internal/runtime/gc/mksizeclasses.go +++ b/src/internal/runtime/gc/mksizeclasses.go @@ -52,7 +52,7 @@ func main() { fmt.Fprintln(&b, "// Code generated by mksizeclasses.go; DO NOT EDIT.") fmt.Fprintln(&b, "//go:generate go run mksizeclasses.go") fmt.Fprintln(&b) - fmt.Fprintln(&b, "package runtime") + fmt.Fprintln(&b, "package gc") classes := makeClasses() printComment(&b, classes) @@ -287,6 +287,14 @@ func maxObjsPerSpan(classes []class) int { return most } +func maxNPages(classes []class) int { + most := 0 + for _, c := range classes[1:] { + most = max(most, c.npages) + } + return most +} + func printClasses(w io.Writer, classes []class) { fmt.Fprintln(w, "const (") fmt.Fprintf(w, "MinHeapAlign = %d\n", minHeapAlign) @@ -297,6 +305,7 @@ func printClasses(w io.Writer, classes []class) { fmt.Fprintf(w, "NumSizeClasses = %d\n", len(classes)) fmt.Fprintf(w, "PageShift = %d\n", pageShift) fmt.Fprintf(w, "MaxObjsPerSpan = %d\n", maxObjsPerSpan(classes)) + fmt.Fprintf(w, "MaxSizeClassNPages = %d\n", maxNPages(classes)) fmt.Fprintln(w, ")") fmt.Fprint(w, "var SizeClassToSize = [NumSizeClasses]uint16 {") diff --git a/src/internal/runtime/gc/scan/expand_amd64.go b/src/internal/runtime/gc/scan/expand_amd64.go new file mode 100644 index 0000000000..9bea471abe --- /dev/null +++ b/src/internal/runtime/gc/scan/expand_amd64.go @@ -0,0 +1,22 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scan + +import "internal/runtime/gc" + +// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked, +// where f is the word size of objects in sizeClass. +// +// This is a testing entrypoint to the expanders used by scanSpanPacked*. 
+// +//go:noescape +func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) + +// gcExpandersAVX512 is the PCs of expander functions. These cannot be called directly +// as they don't follow the Go ABI, but you can use this to check if a given +// expander PC is 0. +// +// It is defined in assembly. +var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr diff --git a/src/internal/runtime/gc/scan/expand_amd64.s b/src/internal/runtime/gc/scan/expand_amd64.s new file mode 100644 index 0000000000..6b0be44cc1 --- /dev/null +++ b/src/internal/runtime/gc/scan/expand_amd64.s @@ -0,0 +1,2631 @@ +// Code generated by mkasm.go. DO NOT EDIT. + +#include "go_asm.h" +#include "textflag.h" + +GLOBL ·gcExpandersAVX512(SB), RODATA, $0x220 +DATA ·gcExpandersAVX512+0x00(SB)/8, $0 +DATA ·gcExpandersAVX512+0x08(SB)/8, $expandAVX512_1<>(SB) +DATA ·gcExpandersAVX512+0x10(SB)/8, $expandAVX512_2<>(SB) +DATA ·gcExpandersAVX512+0x18(SB)/8, $expandAVX512_3<>(SB) +DATA ·gcExpandersAVX512+0x20(SB)/8, $expandAVX512_4<>(SB) +DATA ·gcExpandersAVX512+0x28(SB)/8, $expandAVX512_6<>(SB) +DATA ·gcExpandersAVX512+0x30(SB)/8, $expandAVX512_8<>(SB) +DATA ·gcExpandersAVX512+0x38(SB)/8, $expandAVX512_10<>(SB) +DATA ·gcExpandersAVX512+0x40(SB)/8, $expandAVX512_12<>(SB) +DATA ·gcExpandersAVX512+0x48(SB)/8, $expandAVX512_14<>(SB) +DATA ·gcExpandersAVX512+0x50(SB)/8, $expandAVX512_16<>(SB) +DATA ·gcExpandersAVX512+0x58(SB)/8, $expandAVX512_18<>(SB) +DATA ·gcExpandersAVX512+0x60(SB)/8, $expandAVX512_20<>(SB) +DATA ·gcExpandersAVX512+0x68(SB)/8, $expandAVX512_22<>(SB) +DATA ·gcExpandersAVX512+0x70(SB)/8, $expandAVX512_24<>(SB) +DATA ·gcExpandersAVX512+0x78(SB)/8, $expandAVX512_26<>(SB) +DATA ·gcExpandersAVX512+0x80(SB)/8, $expandAVX512_28<>(SB) +DATA ·gcExpandersAVX512+0x88(SB)/8, $expandAVX512_30<>(SB) +DATA ·gcExpandersAVX512+0x90(SB)/8, $expandAVX512_32<>(SB) +DATA ·gcExpandersAVX512+0x98(SB)/8, $expandAVX512_36<>(SB) +DATA ·gcExpandersAVX512+0xa0(SB)/8, $expandAVX512_40<>(SB) +DATA 
·gcExpandersAVX512+0xa8(SB)/8, $expandAVX512_44<>(SB) +DATA ·gcExpandersAVX512+0xb0(SB)/8, $expandAVX512_48<>(SB) +DATA ·gcExpandersAVX512+0xb8(SB)/8, $expandAVX512_52<>(SB) +DATA ·gcExpandersAVX512+0xc0(SB)/8, $expandAVX512_56<>(SB) +DATA ·gcExpandersAVX512+0xc8(SB)/8, $expandAVX512_60<>(SB) +DATA ·gcExpandersAVX512+0xd0(SB)/8, $expandAVX512_64<>(SB) +DATA ·gcExpandersAVX512+0xd8(SB)/8, $0 +DATA ·gcExpandersAVX512+0xe0(SB)/8, $0 +DATA ·gcExpandersAVX512+0xe8(SB)/8, $0 +DATA ·gcExpandersAVX512+0xf0(SB)/8, $0 +DATA ·gcExpandersAVX512+0xf8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x100(SB)/8, $0 +DATA ·gcExpandersAVX512+0x108(SB)/8, $0 +DATA ·gcExpandersAVX512+0x110(SB)/8, $0 +DATA ·gcExpandersAVX512+0x118(SB)/8, $0 +DATA ·gcExpandersAVX512+0x120(SB)/8, $0 +DATA ·gcExpandersAVX512+0x128(SB)/8, $0 +DATA ·gcExpandersAVX512+0x130(SB)/8, $0 +DATA ·gcExpandersAVX512+0x138(SB)/8, $0 +DATA ·gcExpandersAVX512+0x140(SB)/8, $0 +DATA ·gcExpandersAVX512+0x148(SB)/8, $0 +DATA ·gcExpandersAVX512+0x150(SB)/8, $0 +DATA ·gcExpandersAVX512+0x158(SB)/8, $0 +DATA ·gcExpandersAVX512+0x160(SB)/8, $0 +DATA ·gcExpandersAVX512+0x168(SB)/8, $0 +DATA ·gcExpandersAVX512+0x170(SB)/8, $0 +DATA ·gcExpandersAVX512+0x178(SB)/8, $0 +DATA ·gcExpandersAVX512+0x180(SB)/8, $0 +DATA ·gcExpandersAVX512+0x188(SB)/8, $0 +DATA ·gcExpandersAVX512+0x190(SB)/8, $0 +DATA ·gcExpandersAVX512+0x198(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1a0(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1a8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1b0(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1b8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1c0(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1c8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1d0(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1d8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1e0(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1e8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1f0(SB)/8, $0 +DATA ·gcExpandersAVX512+0x1f8(SB)/8, $0 +DATA ·gcExpandersAVX512+0x200(SB)/8, $0 +DATA ·gcExpandersAVX512+0x208(SB)/8, $0 +DATA ·gcExpandersAVX512+0x210(SB)/8, $0 +DATA 
·gcExpandersAVX512+0x218(SB)/8, $0 + +TEXT expandAVX512_1<>(SB), NOSPLIT, $0-0 + VMOVDQU64 (AX), Z1 + VMOVDQU64 64(AX), Z2 + RET + +GLOBL expandAVX512_2_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_2_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512_2_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512_2_inShuf0<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_2_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_2_inShuf0<>+0x20(SB)/8, $0x1716151413121110 +DATA expandAVX512_2_inShuf0<>+0x28(SB)/8, $0x1716151413121110 +DATA expandAVX512_2_inShuf0<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_2_inShuf0<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918 + +GLOBL expandAVX512_2_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_2_mat0<>+0x00(SB)/8, $0x0101020204040808 +DATA expandAVX512_2_mat0<>+0x08(SB)/8, $0x1010202040408080 +DATA expandAVX512_2_mat0<>+0x10(SB)/8, $0x0101020204040808 +DATA expandAVX512_2_mat0<>+0x18(SB)/8, $0x1010202040408080 +DATA expandAVX512_2_mat0<>+0x20(SB)/8, $0x0101020204040808 +DATA expandAVX512_2_mat0<>+0x28(SB)/8, $0x1010202040408080 +DATA expandAVX512_2_mat0<>+0x30(SB)/8, $0x0101020204040808 +DATA expandAVX512_2_mat0<>+0x38(SB)/8, $0x1010202040408080 + +GLOBL expandAVX512_2_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_2_inShuf1<>+0x00(SB)/8, $0x2726252423222120 +DATA expandAVX512_2_inShuf1<>+0x08(SB)/8, $0x2726252423222120 +DATA expandAVX512_2_inShuf1<>+0x10(SB)/8, $0x2f2e2d2c2b2a2928 +DATA expandAVX512_2_inShuf1<>+0x18(SB)/8, $0x2f2e2d2c2b2a2928 +DATA expandAVX512_2_inShuf1<>+0x20(SB)/8, $0x3736353433323130 +DATA expandAVX512_2_inShuf1<>+0x28(SB)/8, $0x3736353433323130 +DATA expandAVX512_2_inShuf1<>+0x30(SB)/8, $0x3f3e3d3c3b3a3938 +DATA expandAVX512_2_inShuf1<>+0x38(SB)/8, $0x3f3e3d3c3b3a3938 + +GLOBL expandAVX512_2_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_2_outShufLo+0x00(SB)/8, $0x0b030a0209010800 +DATA expandAVX512_2_outShufLo+0x08(SB)/8, $0x0f070e060d050c04 +DATA expandAVX512_2_outShufLo+0x10(SB)/8, 
$0x1b131a1219111810 +DATA expandAVX512_2_outShufLo+0x18(SB)/8, $0x1f171e161d151c14 +DATA expandAVX512_2_outShufLo+0x20(SB)/8, $0x2b232a2229212820 +DATA expandAVX512_2_outShufLo+0x28(SB)/8, $0x2f272e262d252c24 +DATA expandAVX512_2_outShufLo+0x30(SB)/8, $0x3b333a3239313830 +DATA expandAVX512_2_outShufLo+0x38(SB)/8, $0x3f373e363d353c34 + +TEXT expandAVX512_2<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_2_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_2_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512_2_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_2_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512_3_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_3_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512_3_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512_3_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512_3_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_3_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_3_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_3_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_3_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_3_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_3_mat0<>+0x00(SB)/8, $0x0101010202020404 +DATA expandAVX512_3_mat0<>+0x08(SB)/8, $0x0408080810101020 +DATA expandAVX512_3_mat0<>+0x10(SB)/8, $0x2020404040808080 +DATA expandAVX512_3_mat0<>+0x18(SB)/8, $0x0101010202020404 +DATA expandAVX512_3_mat0<>+0x20(SB)/8, $0x0408080810101020 +DATA expandAVX512_3_mat0<>+0x28(SB)/8, $0x2020404040808080 +DATA expandAVX512_3_mat0<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_3_mat0<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_3_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_3_inShuf1<>+0x00(SB)/8, $0x1716151413121110 +DATA expandAVX512_3_inShuf1<>+0x08(SB)/8, $0x1716151413121110 +DATA 
expandAVX512_3_inShuf1<>+0x10(SB)/8, $0x1716151413121110 +DATA expandAVX512_3_inShuf1<>+0x18(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_3_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_3_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_3_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_3_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_3_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_3_inShuf2<>+0x00(SB)/8, $0x2726252423222120 +DATA expandAVX512_3_inShuf2<>+0x08(SB)/8, $0x2726252423222120 +DATA expandAVX512_3_inShuf2<>+0x10(SB)/8, $0x2726252423222120 +DATA expandAVX512_3_inShuf2<>+0x18(SB)/8, $0xffffffffff2a2928 +DATA expandAVX512_3_inShuf2<>+0x20(SB)/8, $0xffffffffff2a2928 +DATA expandAVX512_3_inShuf2<>+0x28(SB)/8, $0xffffffffffff2928 +DATA expandAVX512_3_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_3_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_3_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_3_outShufLo+0x00(SB)/8, $0x0a02110901100800 +DATA expandAVX512_3_outShufLo+0x08(SB)/8, $0x05140c04130b0312 +DATA expandAVX512_3_outShufLo+0x10(SB)/8, $0x170f07160e06150d +DATA expandAVX512_3_outShufLo+0x18(SB)/8, $0x221a292119282018 +DATA expandAVX512_3_outShufLo+0x20(SB)/8, $0x1d2c241c2b231b2a +DATA expandAVX512_3_outShufLo+0x28(SB)/8, $0x2f271f2e261e2d25 +DATA expandAVX512_3_outShufLo+0x30(SB)/8, $0x4a42514941504840 +DATA expandAVX512_3_outShufLo+0x38(SB)/8, $0x45544c44534b4352 + +GLOBL expandAVX512_3_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_3_outShufHi+0x00(SB)/8, $0x170f07160e06150d +DATA expandAVX512_3_outShufHi+0x08(SB)/8, $0x221a292119282018 +DATA expandAVX512_3_outShufHi+0x10(SB)/8, $0x1d2c241c2b231b2a +DATA expandAVX512_3_outShufHi+0x18(SB)/8, $0x2f271f2e261e2d25 +DATA expandAVX512_3_outShufHi+0x20(SB)/8, $0x4a42514941504840 +DATA expandAVX512_3_outShufHi+0x28(SB)/8, $0x45544c44534b4352 +DATA expandAVX512_3_outShufHi+0x30(SB)/8, $0x574f47564e46554d +DATA 
expandAVX512_3_outShufHi+0x38(SB)/8, $0x625a696159686058 + +TEXT expandAVX512_3<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_3_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_3_mat0<>(SB), Z3 + VMOVDQU64 expandAVX512_3_inShuf1<>(SB), Z4 + VMOVDQU64 expandAVX512_3_inShuf2<>(SB), Z5 + VMOVDQU64 expandAVX512_3_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_3_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z6 + VPERMB Z6, Z0, Z0 + VGF2P8AFFINEQB $0, Z3, Z0, Z0 + VPERMB Z6, Z4, Z4 + VGF2P8AFFINEQB $0, Z3, Z4, Z4 + VPERMB Z6, Z5, Z5 + VGF2P8AFFINEQB $0, Z3, Z5, Z3 + VPERMI2B Z4, Z0, Z1 + VPERMI2B Z3, Z4, Z2 + RET + +GLOBL expandAVX512_4_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_4_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512_4_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512_4_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512_4_inShuf0<>+0x18(SB)/8, $0x0706050403020100 +DATA expandAVX512_4_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_4_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_4_inShuf0<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_4_inShuf0<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 + +GLOBL expandAVX512_4_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_4_mat0<>+0x00(SB)/8, $0x0101010102020202 +DATA expandAVX512_4_mat0<>+0x08(SB)/8, $0x0404040408080808 +DATA expandAVX512_4_mat0<>+0x10(SB)/8, $0x1010101020202020 +DATA expandAVX512_4_mat0<>+0x18(SB)/8, $0x4040404080808080 +DATA expandAVX512_4_mat0<>+0x20(SB)/8, $0x0101010102020202 +DATA expandAVX512_4_mat0<>+0x28(SB)/8, $0x0404040408080808 +DATA expandAVX512_4_mat0<>+0x30(SB)/8, $0x1010101020202020 +DATA expandAVX512_4_mat0<>+0x38(SB)/8, $0x4040404080808080 + +GLOBL expandAVX512_4_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_4_inShuf1<>+0x00(SB)/8, $0x1716151413121110 +DATA expandAVX512_4_inShuf1<>+0x08(SB)/8, $0x1716151413121110 +DATA expandAVX512_4_inShuf1<>+0x10(SB)/8, $0x1716151413121110 +DATA expandAVX512_4_inShuf1<>+0x18(SB)/8, $0x1716151413121110 +DATA 
expandAVX512_4_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_4_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_4_inShuf1<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_4_inShuf1<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918 + +GLOBL expandAVX512_4_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_4_outShufLo+0x00(SB)/8, $0x1911090118100800 +DATA expandAVX512_4_outShufLo+0x08(SB)/8, $0x1b130b031a120a02 +DATA expandAVX512_4_outShufLo+0x10(SB)/8, $0x1d150d051c140c04 +DATA expandAVX512_4_outShufLo+0x18(SB)/8, $0x1f170f071e160e06 +DATA expandAVX512_4_outShufLo+0x20(SB)/8, $0x3931292138302820 +DATA expandAVX512_4_outShufLo+0x28(SB)/8, $0x3b332b233a322a22 +DATA expandAVX512_4_outShufLo+0x30(SB)/8, $0x3d352d253c342c24 +DATA expandAVX512_4_outShufLo+0x38(SB)/8, $0x3f372f273e362e26 + +TEXT expandAVX512_4<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_4_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_4_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512_4_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_4_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512_6_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_6_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512_6_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512_6_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512_6_inShuf0<>+0x18(SB)/8, $0x0706050403020100 +DATA expandAVX512_6_inShuf0<>+0x20(SB)/8, $0x0706050403020100 +DATA expandAVX512_6_inShuf0<>+0x28(SB)/8, $0x0706050403020100 +DATA expandAVX512_6_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_6_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_6_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_6_mat0<>+0x00(SB)/8, $0x0101010101010202 +DATA expandAVX512_6_mat0<>+0x08(SB)/8, $0x0202020204040404 +DATA expandAVX512_6_mat0<>+0x10(SB)/8, $0x0404080808080808 +DATA 
expandAVX512_6_mat0<>+0x18(SB)/8, $0x1010101010102020 +DATA expandAVX512_6_mat0<>+0x20(SB)/8, $0x2020202040404040 +DATA expandAVX512_6_mat0<>+0x28(SB)/8, $0x4040808080808080 +DATA expandAVX512_6_mat0<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_6_mat0<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_6_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_6_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_6_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_6_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_6_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_6_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_6_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_6_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_6_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_6_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_6_inShuf2<>+0x00(SB)/8, $0xffff151413121110 +DATA expandAVX512_6_inShuf2<>+0x08(SB)/8, $0xffff151413121110 +DATA expandAVX512_6_inShuf2<>+0x10(SB)/8, $0xffffff1413121110 +DATA expandAVX512_6_inShuf2<>+0x18(SB)/8, $0xffffff1413121110 +DATA expandAVX512_6_inShuf2<>+0x20(SB)/8, $0xffffff1413121110 +DATA expandAVX512_6_inShuf2<>+0x28(SB)/8, $0xffffff1413121110 +DATA expandAVX512_6_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_6_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_6_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_6_outShufLo+0x00(SB)/8, $0x0901282018100800 +DATA expandAVX512_6_outShufLo+0x08(SB)/8, $0x1a120a0229211911 +DATA expandAVX512_6_outShufLo+0x10(SB)/8, $0x2b231b130b032a22 +DATA expandAVX512_6_outShufLo+0x18(SB)/8, $0x0d052c241c140c04 +DATA expandAVX512_6_outShufLo+0x20(SB)/8, $0x1e160e062d251d15 +DATA expandAVX512_6_outShufLo+0x28(SB)/8, $0x2f271f170f072e26 +DATA expandAVX512_6_outShufLo+0x30(SB)/8, $0x4941686058504840 +DATA expandAVX512_6_outShufLo+0x38(SB)/8, $0x5a524a4269615951 + +GLOBL 
expandAVX512_6_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_6_outShufHi+0x00(SB)/8, $0x2b231b130b032a22 +DATA expandAVX512_6_outShufHi+0x08(SB)/8, $0x0d052c241c140c04 +DATA expandAVX512_6_outShufHi+0x10(SB)/8, $0x1e160e062d251d15 +DATA expandAVX512_6_outShufHi+0x18(SB)/8, $0x2f271f170f072e26 +DATA expandAVX512_6_outShufHi+0x20(SB)/8, $0x4941686058504840 +DATA expandAVX512_6_outShufHi+0x28(SB)/8, $0x5a524a4269615951 +DATA expandAVX512_6_outShufHi+0x30(SB)/8, $0x6b635b534b436a62 +DATA expandAVX512_6_outShufHi+0x38(SB)/8, $0x4d456c645c544c44 + +TEXT expandAVX512_6<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_6_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_6_mat0<>(SB), Z3 + VMOVDQU64 expandAVX512_6_inShuf1<>(SB), Z4 + VMOVDQU64 expandAVX512_6_inShuf2<>(SB), Z5 + VMOVDQU64 expandAVX512_6_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_6_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z6 + VPERMB Z6, Z0, Z0 + VGF2P8AFFINEQB $0, Z3, Z0, Z0 + VPERMB Z6, Z4, Z4 + VGF2P8AFFINEQB $0, Z3, Z4, Z4 + VPERMB Z6, Z5, Z5 + VGF2P8AFFINEQB $0, Z3, Z5, Z3 + VPERMI2B Z4, Z0, Z1 + VPERMI2B Z3, Z4, Z2 + RET + +GLOBL expandAVX512_8_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_8_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x18(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x20(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x28(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x30(SB)/8, $0x0706050403020100 +DATA expandAVX512_8_inShuf0<>+0x38(SB)/8, $0x0706050403020100 + +GLOBL expandAVX512_8_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_8_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_8_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_8_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_8_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_8_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA 
expandAVX512_8_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_8_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_8_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_8_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_8_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_8_inShuf1<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 + +GLOBL expandAVX512_8_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_8_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512_8_outShufLo+0x08(SB)/8, $0x3931292119110901 +DATA expandAVX512_8_outShufLo+0x10(SB)/8, $0x3a322a221a120a02 +DATA expandAVX512_8_outShufLo+0x18(SB)/8, $0x3b332b231b130b03 +DATA expandAVX512_8_outShufLo+0x20(SB)/8, $0x3c342c241c140c04 +DATA expandAVX512_8_outShufLo+0x28(SB)/8, $0x3d352d251d150d05 +DATA expandAVX512_8_outShufLo+0x30(SB)/8, $0x3e362e261e160e06 +DATA expandAVX512_8_outShufLo+0x38(SB)/8, $0x3f372f271f170f07 + +TEXT expandAVX512_8<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_8_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_8_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512_8_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_8_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512_10_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_10_inShuf0<>+0x00(SB)/8, $0xff06050403020100 +DATA expandAVX512_10_inShuf0<>+0x08(SB)/8, $0xff06050403020100 +DATA expandAVX512_10_inShuf0<>+0x10(SB)/8, $0xff06050403020100 +DATA expandAVX512_10_inShuf0<>+0x18(SB)/8, $0xff06050403020100 +DATA 
expandAVX512_10_inShuf0<>+0x20(SB)/8, $0xffff050403020100 +DATA expandAVX512_10_inShuf0<>+0x28(SB)/8, $0xffff050403020100 +DATA expandAVX512_10_inShuf0<>+0x30(SB)/8, $0xffff050403020100 +DATA expandAVX512_10_inShuf0<>+0x38(SB)/8, $0xffff050403020100 + +GLOBL expandAVX512_10_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_10_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_10_mat0<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512_10_mat0<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512_10_mat0<>+0x18(SB)/8, $0x0404040404040808 +DATA expandAVX512_10_mat0<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512_10_mat0<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512_10_mat0<>+0x30(SB)/8, $0x1010202020202020 +DATA expandAVX512_10_mat0<>+0x38(SB)/8, $0x2020202040404040 + +GLOBL expandAVX512_10_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_10_inShuf1<>+0x00(SB)/8, $0xffff050403020100 +DATA expandAVX512_10_inShuf1<>+0x08(SB)/8, $0xffff050403020100 +DATA expandAVX512_10_inShuf1<>+0x10(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512_10_inShuf1<>+0x18(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512_10_inShuf1<>+0x20(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512_10_inShuf1<>+0x28(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512_10_inShuf1<>+0x30(SB)/8, $0xffff0b0a09080706 +DATA expandAVX512_10_inShuf1<>+0x38(SB)/8, $0xffff0b0a09080706 + +GLOBL expandAVX512_10_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_10_mat1<>+0x00(SB)/8, $0x4040404040408080 +DATA expandAVX512_10_mat1<>+0x08(SB)/8, $0x8080808080808080 +DATA expandAVX512_10_mat1<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512_10_mat1<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512_10_mat1<>+0x20(SB)/8, $0x1010202020202020 +DATA expandAVX512_10_mat1<>+0x28(SB)/8, $0x2020202040404040 +DATA expandAVX512_10_mat1<>+0x30(SB)/8, $0x4040404040408080 +DATA expandAVX512_10_mat1<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_10_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_10_inShuf2<>+0x00(SB)/8, 
$0xffff0c0b0a090807 +DATA expandAVX512_10_inShuf2<>+0x08(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512_10_inShuf2<>+0x10(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512_10_inShuf2<>+0x18(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512_10_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_10_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_10_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_10_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_10_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_10_mat2<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_10_mat2<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512_10_mat2<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512_10_mat2<>+0x18(SB)/8, $0x0404040404040808 +DATA expandAVX512_10_mat2<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_10_mat2<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_10_mat2<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_10_mat2<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_10_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_10_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512_10_outShufLo+0x08(SB)/8, $0x2921191109014840 +DATA expandAVX512_10_outShufLo+0x10(SB)/8, $0x1a120a0249413931 +DATA expandAVX512_10_outShufLo+0x18(SB)/8, $0x0b034a423a322a22 +DATA expandAVX512_10_outShufLo+0x20(SB)/8, $0x4b433b332b231b13 +DATA expandAVX512_10_outShufLo+0x28(SB)/8, $0x3c342c241c140c04 +DATA expandAVX512_10_outShufLo+0x30(SB)/8, $0x2d251d150d054c44 +DATA expandAVX512_10_outShufLo+0x38(SB)/8, $0x1e160e064d453d35 + +GLOBL expandAVX512_10_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_10_outShufHi+0x00(SB)/8, $0x4840383028201810 +DATA expandAVX512_10_outShufHi+0x08(SB)/8, $0x3931292119115850 +DATA expandAVX512_10_outShufHi+0x10(SB)/8, $0x2a221a1259514941 +DATA expandAVX512_10_outShufHi+0x18(SB)/8, $0x1b135a524a423a32 +DATA expandAVX512_10_outShufHi+0x20(SB)/8, $0x5b534b433b332b23 +DATA expandAVX512_10_outShufHi+0x28(SB)/8, 
$0x4c443c342c241c14 +DATA expandAVX512_10_outShufHi+0x30(SB)/8, $0x3d352d251d155c54 +DATA expandAVX512_10_outShufHi+0x38(SB)/8, $0x2e261e165d554d45 + +TEXT expandAVX512_10<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_10_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_10_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512_10_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512_10_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_10_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_10_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_10_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_10_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512_12_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_12_inShuf0<>+0x00(SB)/8, $0xffff050403020100 +DATA expandAVX512_12_inShuf0<>+0x08(SB)/8, $0xffff050403020100 +DATA expandAVX512_12_inShuf0<>+0x10(SB)/8, $0xffff050403020100 +DATA expandAVX512_12_inShuf0<>+0x18(SB)/8, $0xffff050403020100 +DATA expandAVX512_12_inShuf0<>+0x20(SB)/8, $0xffffff0403020100 +DATA expandAVX512_12_inShuf0<>+0x28(SB)/8, $0xffffff0403020100 +DATA expandAVX512_12_inShuf0<>+0x30(SB)/8, $0xffffff0403020100 +DATA expandAVX512_12_inShuf0<>+0x38(SB)/8, $0xffffff0403020100 + +GLOBL expandAVX512_12_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_12_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_12_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_12_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_12_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_12_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_12_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_12_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_12_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_12_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_12_inShuf1<>+0x00(SB)/8, $0xffffff0403020100 +DATA expandAVX512_12_inShuf1<>+0x08(SB)/8, 
$0xffffff0403020100 +DATA expandAVX512_12_inShuf1<>+0x10(SB)/8, $0xffffff0403020100 +DATA expandAVX512_12_inShuf1<>+0x18(SB)/8, $0xffffff0403020100 +DATA expandAVX512_12_inShuf1<>+0x20(SB)/8, $0xffff0a0908070605 +DATA expandAVX512_12_inShuf1<>+0x28(SB)/8, $0xffff0a0908070605 +DATA expandAVX512_12_inShuf1<>+0x30(SB)/8, $0xffff0a0908070605 +DATA expandAVX512_12_inShuf1<>+0x38(SB)/8, $0xffff0a0908070605 + +GLOBL expandAVX512_12_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_12_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_12_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_12_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_12_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_12_mat1<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_12_mat1<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_12_mat1<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_12_mat1<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_12_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_12_inShuf2<>+0x00(SB)/8, $0xffffff0908070605 +DATA expandAVX512_12_inShuf2<>+0x08(SB)/8, $0xffffff0908070605 +DATA expandAVX512_12_inShuf2<>+0x10(SB)/8, $0xffffff0908070605 +DATA expandAVX512_12_inShuf2<>+0x18(SB)/8, $0xffffff0908070605 +DATA expandAVX512_12_inShuf2<>+0x20(SB)/8, $0xffffff0a09080706 +DATA expandAVX512_12_inShuf2<>+0x28(SB)/8, $0xffffff0a09080706 +DATA expandAVX512_12_inShuf2<>+0x30(SB)/8, $0xffffff0a09080706 +DATA expandAVX512_12_inShuf2<>+0x38(SB)/8, $0xffffff0a09080706 + +GLOBL expandAVX512_12_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_12_mat2<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_12_mat2<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_12_mat2<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_12_mat2<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_12_mat2<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_12_mat2<>+0x28(SB)/8, $0x0101010102020202 +DATA expandAVX512_12_mat2<>+0x30(SB)/8, $0x0202020202020202 +DATA 
expandAVX512_12_mat2<>+0x38(SB)/8, $0x0404040404040404 + +GLOBL expandAVX512_12_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_12_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512_12_outShufLo+0x08(SB)/8, $0x1911090158504840 +DATA expandAVX512_12_outShufLo+0x10(SB)/8, $0x5951494139312921 +DATA expandAVX512_12_outShufLo+0x18(SB)/8, $0x3a322a221a120a02 +DATA expandAVX512_12_outShufLo+0x20(SB)/8, $0x1b130b035a524a42 +DATA expandAVX512_12_outShufLo+0x28(SB)/8, $0x5b534b433b332b23 +DATA expandAVX512_12_outShufLo+0x30(SB)/8, $0x3c342c241c140c04 +DATA expandAVX512_12_outShufLo+0x38(SB)/8, $0x1d150d055c544c44 + +GLOBL expandAVX512_12_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_12_outShufHi+0x00(SB)/8, $0x5850484038302820 +DATA expandAVX512_12_outShufHi+0x08(SB)/8, $0x3931292178706860 +DATA expandAVX512_12_outShufHi+0x10(SB)/8, $0x7971696159514941 +DATA expandAVX512_12_outShufHi+0x18(SB)/8, $0x5a524a423a322a22 +DATA expandAVX512_12_outShufHi+0x20(SB)/8, $0x3b332b237a726a62 +DATA expandAVX512_12_outShufHi+0x28(SB)/8, $0x7b736b635b534b43 +DATA expandAVX512_12_outShufHi+0x30(SB)/8, $0x5c544c443c342c24 +DATA expandAVX512_12_outShufHi+0x38(SB)/8, $0x3d352d257c746c64 + +TEXT expandAVX512_12<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_12_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_12_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512_12_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512_12_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_12_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_12_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_12_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_12_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512_14_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_14_inShuf0<>+0x00(SB)/8, $0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x08(SB)/8, $0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x10(SB)/8, 
$0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x18(SB)/8, $0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x20(SB)/8, $0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x28(SB)/8, $0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x30(SB)/8, $0xffffff0403020100 +DATA expandAVX512_14_inShuf0<>+0x38(SB)/8, $0xffffff0403020100 + +GLOBL expandAVX512_14_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_14_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_14_mat0<>+0x08(SB)/8, $0x0101010101010202 +DATA expandAVX512_14_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_14_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512_14_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512_14_mat0<>+0x28(SB)/8, $0x0404080808080808 +DATA expandAVX512_14_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512_14_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512_14_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_14_inShuf1<>+0x00(SB)/8, $0xffffffff03020100 +DATA expandAVX512_14_inShuf1<>+0x08(SB)/8, $0xffffffff03020100 +DATA expandAVX512_14_inShuf1<>+0x10(SB)/8, $0xffffffff03020100 +DATA expandAVX512_14_inShuf1<>+0x18(SB)/8, $0xffffffff03020100 +DATA expandAVX512_14_inShuf1<>+0x20(SB)/8, $0xffffffff03020100 +DATA expandAVX512_14_inShuf1<>+0x28(SB)/8, $0xffffffff03020100 +DATA expandAVX512_14_inShuf1<>+0x30(SB)/8, $0xffffff0807060504 +DATA expandAVX512_14_inShuf1<>+0x38(SB)/8, $0xffffff0807060504 + +GLOBL expandAVX512_14_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_14_mat1<>+0x00(SB)/8, $0x1010101010102020 +DATA expandAVX512_14_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512_14_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512_14_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_14_mat1<>+0x20(SB)/8, $0x4040808080808080 +DATA expandAVX512_14_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_14_mat1<>+0x30(SB)/8, $0x1010101010102020 +DATA expandAVX512_14_mat1<>+0x38(SB)/8, $0x2020202020202020 + +GLOBL 
expandAVX512_14_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_14_inShuf2<>+0x00(SB)/8, $0xffffff0807060504 +DATA expandAVX512_14_inShuf2<>+0x08(SB)/8, $0xffffff0807060504 +DATA expandAVX512_14_inShuf2<>+0x10(SB)/8, $0xffffff0807060504 +DATA expandAVX512_14_inShuf2<>+0x18(SB)/8, $0xffffff0807060504 +DATA expandAVX512_14_inShuf2<>+0x20(SB)/8, $0xffffff0908070605 +DATA expandAVX512_14_inShuf2<>+0x28(SB)/8, $0xffffff0908070605 +DATA expandAVX512_14_inShuf2<>+0x30(SB)/8, $0xffffffff08070605 +DATA expandAVX512_14_inShuf2<>+0x38(SB)/8, $0xffffffff08070605 + +GLOBL expandAVX512_14_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_14_mat2<>+0x00(SB)/8, $0x2020202040404040 +DATA expandAVX512_14_mat2<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_14_mat2<>+0x10(SB)/8, $0x4040808080808080 +DATA expandAVX512_14_mat2<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_14_mat2<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_14_mat2<>+0x28(SB)/8, $0x0101010101010202 +DATA expandAVX512_14_mat2<>+0x30(SB)/8, $0x0202020202020202 +DATA expandAVX512_14_mat2<>+0x38(SB)/8, $0x0202020204040404 + +GLOBL expandAVX512_14_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_14_inShuf3<>+0x00(SB)/8, $0xffffffff08070605 +DATA expandAVX512_14_inShuf3<>+0x08(SB)/8, $0xffffffff08070605 +DATA expandAVX512_14_inShuf3<>+0x10(SB)/8, $0xffffffff08070605 +DATA expandAVX512_14_inShuf3<>+0x18(SB)/8, $0xffffffff08070605 +DATA expandAVX512_14_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_14_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_14_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_14_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_14_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_14_mat3<>+0x00(SB)/8, $0x0404040404040404 +DATA expandAVX512_14_mat3<>+0x08(SB)/8, $0x0404080808080808 +DATA expandAVX512_14_mat3<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512_14_mat3<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512_14_mat3<>+0x20(SB)/8, 
$0x0000000000000000 +DATA expandAVX512_14_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_14_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_14_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_14_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_14_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512_14_outShufLo+0x08(SB)/8, $0x0901686058504840 +DATA expandAVX512_14_outShufLo+0x10(SB)/8, $0x4941393129211911 +DATA expandAVX512_14_outShufLo+0x18(SB)/8, $0x1a120a0269615951 +DATA expandAVX512_14_outShufLo+0x20(SB)/8, $0x5a524a423a322a22 +DATA expandAVX512_14_outShufLo+0x28(SB)/8, $0x2b231b130b036a62 +DATA expandAVX512_14_outShufLo+0x30(SB)/8, $0x6b635b534b433b33 +DATA expandAVX512_14_outShufLo+0x38(SB)/8, $0x3c342c241c140c04 + +GLOBL expandAVX512_14_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_14_outShufHi0+0x00(SB)/8, $0x6860585048403830 +DATA expandAVX512_14_outShufHi0+0x08(SB)/8, $0x3931ffffffff7870 +DATA expandAVX512_14_outShufHi0+0x10(SB)/8, $0x7971696159514941 +DATA expandAVX512_14_outShufHi0+0x18(SB)/8, $0x4a423a32ffffffff +DATA expandAVX512_14_outShufHi0+0x20(SB)/8, $0xffff7a726a625a52 +DATA expandAVX512_14_outShufHi0+0x28(SB)/8, $0x5b534b433b33ffff +DATA expandAVX512_14_outShufHi0+0x30(SB)/8, $0xffffffff7b736b63 +DATA expandAVX512_14_outShufHi0+0x38(SB)/8, $0x6c645c544c443c34 + +GLOBL expandAVX512_14_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_14_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_14_outShufHi1+0x08(SB)/8, $0xffff18100800ffff +DATA expandAVX512_14_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_14_outShufHi1+0x18(SB)/8, $0xffffffff19110901 +DATA expandAVX512_14_outShufHi1+0x20(SB)/8, $0x0a02ffffffffffff +DATA expandAVX512_14_outShufHi1+0x28(SB)/8, $0xffffffffffff1a12 +DATA expandAVX512_14_outShufHi1+0x30(SB)/8, $0x1b130b03ffffffff +DATA expandAVX512_14_outShufHi1+0x38(SB)/8, $0xffffffffffffffff + +TEXT expandAVX512_14<>(SB), NOSPLIT, $0-0 + VMOVDQU64 
expandAVX512_14_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_14_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_14_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_14_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_14_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_14_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_14_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_14_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_14_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_14_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_14_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xff0ffc3ff0ffc3ff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0xf003c00f003c00, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_16_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_16_inShuf0<>+0x00(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x08(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x10(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x18(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x20(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x28(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x30(SB)/8, $0x0303020201010000 +DATA expandAVX512_16_inShuf0<>+0x38(SB)/8, $0x0303020201010000 + +GLOBL expandAVX512_16_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_16_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_16_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_16_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_16_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_16_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_16_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_16_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_16_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_16_inShuf1<>(SB), RODATA, $0x40 +DATA 
expandAVX512_16_inShuf1<>+0x00(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x08(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x10(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x18(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x20(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x28(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x30(SB)/8, $0x0707060605050404 +DATA expandAVX512_16_inShuf1<>+0x38(SB)/8, $0x0707060605050404 + +GLOBL expandAVX512_16_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_16_outShufLo+0x00(SB)/8, $0x1918111009080100 +DATA expandAVX512_16_outShufLo+0x08(SB)/8, $0x3938313029282120 +DATA expandAVX512_16_outShufLo+0x10(SB)/8, $0x1b1a13120b0a0302 +DATA expandAVX512_16_outShufLo+0x18(SB)/8, $0x3b3a33322b2a2322 +DATA expandAVX512_16_outShufLo+0x20(SB)/8, $0x1d1c15140d0c0504 +DATA expandAVX512_16_outShufLo+0x28(SB)/8, $0x3d3c35342d2c2524 +DATA expandAVX512_16_outShufLo+0x30(SB)/8, $0x1f1e17160f0e0706 +DATA expandAVX512_16_outShufLo+0x38(SB)/8, $0x3f3e37362f2e2726 + +TEXT expandAVX512_16<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_16_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_16_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512_16_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_16_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512_18_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_18_inShuf0<>+0x00(SB)/8, $0x0303020201010000 +DATA expandAVX512_18_inShuf0<>+0x08(SB)/8, $0xffffffff03020100 +DATA expandAVX512_18_inShuf0<>+0x10(SB)/8, $0xffffffff03020100 +DATA expandAVX512_18_inShuf0<>+0x18(SB)/8, $0xffffffff03020100 +DATA expandAVX512_18_inShuf0<>+0x20(SB)/8, $0xffffffff03020100 +DATA expandAVX512_18_inShuf0<>+0x28(SB)/8, $0xffffffff03020100 +DATA expandAVX512_18_inShuf0<>+0x30(SB)/8, $0x0303020201010000 +DATA 
expandAVX512_18_inShuf0<>+0x38(SB)/8, $0xff03020201010000 + +GLOBL expandAVX512_18_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_18_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_18_mat0<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512_18_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_18_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512_18_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512_18_mat0<>+0x28(SB)/8, $0x0404040404040808 +DATA expandAVX512_18_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512_18_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512_18_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_18_inShuf1<>+0x00(SB)/8, $0xffffffffff020100 +DATA expandAVX512_18_inShuf1<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512_18_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 +DATA expandAVX512_18_inShuf1<>+0x18(SB)/8, $0xffffffffff020100 +DATA expandAVX512_18_inShuf1<>+0x20(SB)/8, $0xffffffffff020100 +DATA expandAVX512_18_inShuf1<>+0x28(SB)/8, $0xffff020201010000 +DATA expandAVX512_18_inShuf1<>+0x30(SB)/8, $0xff06060505040403 +DATA expandAVX512_18_inShuf1<>+0x38(SB)/8, $0xffffffff06050403 + +GLOBL expandAVX512_18_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_18_mat1<>+0x00(SB)/8, $0x1010202020202020 +DATA expandAVX512_18_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512_18_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512_18_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_18_mat1<>+0x20(SB)/8, $0x4040404040408080 +DATA expandAVX512_18_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_18_mat1<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_18_mat1<>+0x38(SB)/8, $0x1010202020202020 + +GLOBL expandAVX512_18_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_18_inShuf2<>+0x00(SB)/8, $0xffffffff06050403 +DATA expandAVX512_18_inShuf2<>+0x08(SB)/8, $0xffffffff06050403 +DATA expandAVX512_18_inShuf2<>+0x10(SB)/8, $0xffffffff06050403 +DATA expandAVX512_18_inShuf2<>+0x18(SB)/8, 
$0xffffffff06050403 +DATA expandAVX512_18_inShuf2<>+0x20(SB)/8, $0x0606050504040303 +DATA expandAVX512_18_inShuf2<>+0x28(SB)/8, $0x0707060605050404 +DATA expandAVX512_18_inShuf2<>+0x30(SB)/8, $0xffffffffff060504 +DATA expandAVX512_18_inShuf2<>+0x38(SB)/8, $0xffffffffff060504 + +GLOBL expandAVX512_18_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_18_mat2<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_18_mat2<>+0x08(SB)/8, $0x2020202040404040 +DATA expandAVX512_18_mat2<>+0x10(SB)/8, $0x4040404040404040 +DATA expandAVX512_18_mat2<>+0x18(SB)/8, $0x4040404040408080 +DATA expandAVX512_18_mat2<>+0x20(SB)/8, $0x8080808080808080 +DATA expandAVX512_18_mat2<>+0x28(SB)/8, $0x0101010101010101 +DATA expandAVX512_18_mat2<>+0x30(SB)/8, $0x0101020202020202 +DATA expandAVX512_18_mat2<>+0x38(SB)/8, $0x0202020202020202 + +GLOBL expandAVX512_18_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_18_inShuf3<>+0x00(SB)/8, $0xffffffffff060504 +DATA expandAVX512_18_inShuf3<>+0x08(SB)/8, $0xffffffffff060504 +DATA expandAVX512_18_inShuf3<>+0x10(SB)/8, $0xffffffffff060504 +DATA expandAVX512_18_inShuf3<>+0x18(SB)/8, $0xffff060605050404 +DATA expandAVX512_18_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_18_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_18_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_18_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_18_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_18_mat3<>+0x00(SB)/8, $0x0202020204040404 +DATA expandAVX512_18_mat3<>+0x08(SB)/8, $0x0404040404040404 +DATA expandAVX512_18_mat3<>+0x10(SB)/8, $0x0404040404040808 +DATA expandAVX512_18_mat3<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_18_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_18_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_18_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_18_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_18_outShufLo(SB), RODATA, $0x40 +DATA 
expandAVX512_18_outShufLo+0x00(SB)/8, $0x3028201810080100 +DATA expandAVX512_18_outShufLo+0x08(SB)/8, $0x6058504840393831 +DATA expandAVX512_18_outShufLo+0x10(SB)/8, $0x2119110903026968 +DATA expandAVX512_18_outShufLo+0x18(SB)/8, $0x5149413b3a333229 +DATA expandAVX512_18_outShufLo+0x20(SB)/8, $0x120a05046b6a6159 +DATA expandAVX512_18_outShufLo+0x28(SB)/8, $0x423d3c35342a221a +DATA expandAVX512_18_outShufLo+0x30(SB)/8, $0x07066d6c625a524a +DATA expandAVX512_18_outShufLo+0x38(SB)/8, $0x3e37362b231b130b + +GLOBL expandAVX512_18_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_18_outShufHi0+0x00(SB)/8, $0x6160585048403830 +DATA expandAVX512_18_outShufHi0+0x08(SB)/8, $0xffffffff78706968 +DATA expandAVX512_18_outShufHi0+0x10(SB)/8, $0x59514941393231ff +DATA expandAVX512_18_outShufHi0+0x18(SB)/8, $0xffff79716b6a6362 +DATA expandAVX512_18_outShufHi0+0x20(SB)/8, $0x4a423a3433ffffff +DATA expandAVX512_18_outShufHi0+0x28(SB)/8, $0x7a726d6c65645a52 +DATA expandAVX512_18_outShufHi0+0x30(SB)/8, $0x3b3635ffffffffff +DATA expandAVX512_18_outShufHi0+0x38(SB)/8, $0x6f6e67665b534b43 + +GLOBL expandAVX512_18_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_18_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_18_outShufHi1+0x08(SB)/8, $0x18100800ffffffff +DATA expandAVX512_18_outShufHi1+0x10(SB)/8, $0xffffffffffffff19 +DATA expandAVX512_18_outShufHi1+0x18(SB)/8, $0x0901ffffffffffff +DATA expandAVX512_18_outShufHi1+0x20(SB)/8, $0xffffffffff1b1a11 +DATA expandAVX512_18_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_18_outShufHi1+0x30(SB)/8, $0xffffff1d1c120a02 +DATA expandAVX512_18_outShufHi1+0x38(SB)/8, $0xffffffffffffffff + +TEXT expandAVX512_18<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_18_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_18_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_18_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_18_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_18_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_18_outShufHi0(SB), Z5 + VMOVDQU64 
expandAVX512_18_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_18_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_18_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_18_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_18_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xffe0fff83ffe0fff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x1f0007c001f000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_20_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_20_inShuf0<>+0x00(SB)/8, $0x0303020201010000 +DATA expandAVX512_20_inShuf0<>+0x08(SB)/8, $0xffffffff03020100 +DATA expandAVX512_20_inShuf0<>+0x10(SB)/8, $0xff03020201010000 +DATA expandAVX512_20_inShuf0<>+0x18(SB)/8, $0xffff020201010000 +DATA expandAVX512_20_inShuf0<>+0x20(SB)/8, $0xffffffffff020100 +DATA expandAVX512_20_inShuf0<>+0x28(SB)/8, $0xffff020201010000 +DATA expandAVX512_20_inShuf0<>+0x30(SB)/8, $0xffff020201010000 +DATA expandAVX512_20_inShuf0<>+0x38(SB)/8, $0xffffffffff020100 + +GLOBL expandAVX512_20_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_20_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_20_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_20_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_20_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_20_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_20_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_20_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_20_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_20_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_20_inShuf1<>+0x00(SB)/8, $0xffff020201010000 +DATA expandAVX512_20_inShuf1<>+0x08(SB)/8, $0xffff020201010000 +DATA expandAVX512_20_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 +DATA expandAVX512_20_inShuf1<>+0x18(SB)/8, $0xffff020201010000 +DATA 
expandAVX512_20_inShuf1<>+0x20(SB)/8, $0xff06060505040403 +DATA expandAVX512_20_inShuf1<>+0x28(SB)/8, $0x0606050504040303 +DATA expandAVX512_20_inShuf1<>+0x30(SB)/8, $0xffffffff06050403 +DATA expandAVX512_20_inShuf1<>+0x38(SB)/8, $0xffff050504040303 + +GLOBL expandAVX512_20_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_20_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_20_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_20_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_20_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_20_mat1<>+0x20(SB)/8, $0x0202020202020202 +DATA expandAVX512_20_mat1<>+0x28(SB)/8, $0x0404040404040404 +DATA expandAVX512_20_mat1<>+0x30(SB)/8, $0x0404040408080808 +DATA expandAVX512_20_mat1<>+0x38(SB)/8, $0x0808080808080808 + +GLOBL expandAVX512_20_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_20_inShuf2<>+0x00(SB)/8, $0xffff050504040303 +DATA expandAVX512_20_inShuf2<>+0x08(SB)/8, $0xffffffffff050403 +DATA expandAVX512_20_inShuf2<>+0x10(SB)/8, $0xffff050504040303 +DATA expandAVX512_20_inShuf2<>+0x18(SB)/8, $0xffff050504040303 +DATA expandAVX512_20_inShuf2<>+0x20(SB)/8, $0xffffffffff050403 +DATA expandAVX512_20_inShuf2<>+0x28(SB)/8, $0xffff050504040303 +DATA expandAVX512_20_inShuf2<>+0x30(SB)/8, $0xffff060605050404 +DATA expandAVX512_20_inShuf2<>+0x38(SB)/8, $0xffffffffff060504 + +GLOBL expandAVX512_20_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_20_mat2<>+0x00(SB)/8, $0x1010101010101010 +DATA expandAVX512_20_mat2<>+0x08(SB)/8, $0x1010101020202020 +DATA expandAVX512_20_mat2<>+0x10(SB)/8, $0x2020202020202020 +DATA expandAVX512_20_mat2<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_20_mat2<>+0x20(SB)/8, $0x4040404080808080 +DATA expandAVX512_20_mat2<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_20_mat2<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512_20_mat2<>+0x38(SB)/8, $0x0101010102020202 + +GLOBL expandAVX512_20_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_20_outShufLo+0x00(SB)/8, 
$0x2019181110080100 +DATA expandAVX512_20_outShufLo+0x08(SB)/8, $0x4841403831302928 +DATA expandAVX512_20_outShufLo+0x10(SB)/8, $0x1209030259585049 +DATA expandAVX512_20_outShufLo+0x18(SB)/8, $0x33322b2a211b1a13 +DATA expandAVX512_20_outShufLo+0x20(SB)/8, $0x5b5a514b4a434239 +DATA expandAVX512_20_outShufLo+0x28(SB)/8, $0x221d1c15140a0504 +DATA expandAVX512_20_outShufLo+0x30(SB)/8, $0x4c45443a35342d2c +DATA expandAVX512_20_outShufLo+0x38(SB)/8, $0x160b07065d5c524d + +GLOBL expandAVX512_20_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_20_outShufHi+0x00(SB)/8, $0x4140393830292820 +DATA expandAVX512_20_outShufHi+0x08(SB)/8, $0x6968605958515048 +DATA expandAVX512_20_outShufHi+0x10(SB)/8, $0x312b2a2221787170 +DATA expandAVX512_20_outShufHi+0x18(SB)/8, $0x5a53524943423b3a +DATA expandAVX512_20_outShufHi+0x20(SB)/8, $0x237973726b6a615b +DATA expandAVX512_20_outShufHi+0x28(SB)/8, $0x45443d3c322d2c24 +DATA expandAVX512_20_outShufHi+0x30(SB)/8, $0x6d6c625d5c55544a +DATA expandAVX512_20_outShufHi+0x38(SB)/8, $0x332f2e26257a7574 + +TEXT expandAVX512_20<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_20_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_20_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512_20_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512_20_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_20_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_20_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_20_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_20_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512_22_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_22_inShuf0<>+0x00(SB)/8, $0xffff020201010000 +DATA expandAVX512_22_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512_22_inShuf0<>+0x10(SB)/8, $0xffff020201010000 +DATA expandAVX512_22_inShuf0<>+0x18(SB)/8, $0xffffffffff020100 +DATA expandAVX512_22_inShuf0<>+0x20(SB)/8, $0xffff020201010000 +DATA 
expandAVX512_22_inShuf0<>+0x28(SB)/8, $0xffffffffff020100 +DATA expandAVX512_22_inShuf0<>+0x30(SB)/8, $0xffff020201010000 +DATA expandAVX512_22_inShuf0<>+0x38(SB)/8, $0xffff020201010000 + +GLOBL expandAVX512_22_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_22_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_22_mat0<>+0x08(SB)/8, $0x0101010101010202 +DATA expandAVX512_22_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_22_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512_22_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512_22_mat0<>+0x28(SB)/8, $0x0404080808080808 +DATA expandAVX512_22_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512_22_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512_22_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_22_inShuf1<>+0x00(SB)/8, $0xffffffffff020100 +DATA expandAVX512_22_inShuf1<>+0x08(SB)/8, $0xffff020201010000 +DATA expandAVX512_22_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 +DATA expandAVX512_22_inShuf1<>+0x18(SB)/8, $0xffff020201010000 +DATA expandAVX512_22_inShuf1<>+0x20(SB)/8, $0xffffffffff020100 +DATA expandAVX512_22_inShuf1<>+0x28(SB)/8, $0xffffffff01010000 +DATA expandAVX512_22_inShuf1<>+0x30(SB)/8, $0xffff040403030202 +DATA expandAVX512_22_inShuf1<>+0x38(SB)/8, $0xffff050504040303 + +GLOBL expandAVX512_22_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_22_mat1<>+0x00(SB)/8, $0x1010101010102020 +DATA expandAVX512_22_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512_22_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512_22_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_22_mat1<>+0x20(SB)/8, $0x4040808080808080 +DATA expandAVX512_22_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_22_mat1<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512_22_mat1<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512_22_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_22_inShuf2<>+0x00(SB)/8, $0xffffffffff050403 +DATA expandAVX512_22_inShuf2<>+0x08(SB)/8, 
$0xffff050504040303 +DATA expandAVX512_22_inShuf2<>+0x10(SB)/8, $0xffffffffff050403 +DATA expandAVX512_22_inShuf2<>+0x18(SB)/8, $0xffff050504040303 +DATA expandAVX512_22_inShuf2<>+0x20(SB)/8, $0xffffffffff050403 +DATA expandAVX512_22_inShuf2<>+0x28(SB)/8, $0xffff050504040303 +DATA expandAVX512_22_inShuf2<>+0x30(SB)/8, $0xffff050504040303 +DATA expandAVX512_22_inShuf2<>+0x38(SB)/8, $0xffffffffff050403 + +GLOBL expandAVX512_22_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_22_mat2<>+0x00(SB)/8, $0x0101010101010202 +DATA expandAVX512_22_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_22_mat2<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512_22_mat2<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_22_mat2<>+0x20(SB)/8, $0x0404080808080808 +DATA expandAVX512_22_mat2<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_22_mat2<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_22_mat2<>+0x38(SB)/8, $0x1010101010102020 + +GLOBL expandAVX512_22_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_22_inShuf3<>+0x00(SB)/8, $0xffff050504040303 +DATA expandAVX512_22_inShuf3<>+0x08(SB)/8, $0xffffffffff050403 +DATA expandAVX512_22_inShuf3<>+0x10(SB)/8, $0xffffff0504040303 +DATA expandAVX512_22_inShuf3<>+0x18(SB)/8, $0xffffffffffff0403 +DATA expandAVX512_22_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_22_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_22_mat3<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_22_mat3<>+0x08(SB)/8, $0x2020202040404040 +DATA expandAVX512_22_mat3<>+0x10(SB)/8, $0x4040404040404040 +DATA expandAVX512_22_mat3<>+0x18(SB)/8, $0x4040808080808080 +DATA expandAVX512_22_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_22_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_22_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA 
expandAVX512_22_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_22_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_22_outShufLo+0x00(SB)/8, $0x2120181110080100 +DATA expandAVX512_22_outShufLo+0x08(SB)/8, $0x4948403938313028 +DATA expandAVX512_22_outShufLo+0x10(SB)/8, $0x0302696860595850 +DATA expandAVX512_22_outShufLo+0x18(SB)/8, $0x3229232219131209 +DATA expandAVX512_22_outShufLo+0x20(SB)/8, $0x5a514b4a413b3a33 +DATA expandAVX512_22_outShufLo+0x28(SB)/8, $0x140a05046b6a615b +DATA expandAVX512_22_outShufLo+0x30(SB)/8, $0x3c35342a25241a15 +DATA expandAVX512_22_outShufLo+0x38(SB)/8, $0x625d5c524d4c423d + +GLOBL expandAVX512_22_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_22_outShufHi0+0x00(SB)/8, $0x5049484039383130 +DATA expandAVX512_22_outShufHi0+0x08(SB)/8, $0x7871706968605958 +DATA expandAVX512_22_outShufHi0+0x10(SB)/8, $0x3332ffffffffffff +DATA expandAVX512_22_outShufHi0+0x18(SB)/8, $0x5b5a514b4a413b3a +DATA expandAVX512_22_outShufHi0+0x20(SB)/8, $0xffff7973726b6a61 +DATA expandAVX512_22_outShufHi0+0x28(SB)/8, $0x3d3c3534ffffffff +DATA expandAVX512_22_outShufHi0+0x30(SB)/8, $0x6c625d5c524d4c42 +DATA expandAVX512_22_outShufHi0+0x38(SB)/8, $0xffffffff7a75746d + +GLOBL expandAVX512_22_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_22_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_outShufHi1+0x10(SB)/8, $0xffff181110080100 +DATA expandAVX512_22_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_outShufHi1+0x20(SB)/8, $0x0302ffffffffffff +DATA expandAVX512_22_outShufHi1+0x28(SB)/8, $0xffffffff19131209 +DATA expandAVX512_22_outShufHi1+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_22_outShufHi1+0x38(SB)/8, $0x140a0504ffffffff + +TEXT expandAVX512_22<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_22_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_22_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_22_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_22_inShuf3<>(SB), Z4 
+ VMOVDQU64 expandAVX512_22_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_22_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_22_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_22_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_22_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_22_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_22_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xffff03fffc0ffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0xf0000fc0003f0000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_24_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_24_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512_24_inShuf0<>+0x08(SB)/8, $0x0202010101000000 +DATA expandAVX512_24_inShuf0<>+0x10(SB)/8, $0x0202010101000000 +DATA expandAVX512_24_inShuf0<>+0x18(SB)/8, $0x0202010101000000 +DATA expandAVX512_24_inShuf0<>+0x20(SB)/8, $0x0202010101000000 +DATA expandAVX512_24_inShuf0<>+0x28(SB)/8, $0xff02010101000000 +DATA expandAVX512_24_inShuf0<>+0x30(SB)/8, $0xffff010101000000 +DATA expandAVX512_24_inShuf0<>+0x38(SB)/8, $0xffff010101000000 + +GLOBL expandAVX512_24_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_24_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_24_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_24_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_24_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_24_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_24_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_24_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_24_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_24_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_24_inShuf1<>+0x00(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_24_inShuf1<>+0x08(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_24_inShuf1<>+0x10(SB)/8, 
$0xffffffffffffff02 +DATA expandAVX512_24_inShuf1<>+0x18(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_24_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_24_inShuf1<>+0x28(SB)/8, $0x0404040303030202 +DATA expandAVX512_24_inShuf1<>+0x30(SB)/8, $0x0404030303020202 +DATA expandAVX512_24_inShuf1<>+0x38(SB)/8, $0x0404030303020202 + +GLOBL expandAVX512_24_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_24_inShuf2<>+0x00(SB)/8, $0x0505040404030303 +DATA expandAVX512_24_inShuf2<>+0x08(SB)/8, $0x0505040404030303 +DATA expandAVX512_24_inShuf2<>+0x10(SB)/8, $0x0505040404030303 +DATA expandAVX512_24_inShuf2<>+0x18(SB)/8, $0xffff040404030303 +DATA expandAVX512_24_inShuf2<>+0x20(SB)/8, $0xffff040404030303 +DATA expandAVX512_24_inShuf2<>+0x28(SB)/8, $0xffffffffffffff04 +DATA expandAVX512_24_inShuf2<>+0x30(SB)/8, $0xffffffffffffff04 +DATA expandAVX512_24_inShuf2<>+0x38(SB)/8, $0xffffffffffffff05 + +GLOBL expandAVX512_24_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_24_mat2<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_24_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_24_mat2<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_24_mat2<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_24_mat2<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_24_mat2<>+0x28(SB)/8, $0x4040404040404040 +DATA expandAVX512_24_mat2<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512_24_mat2<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512_24_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_24_inShuf3<>+0x00(SB)/8, $0xffffffffffffff05 +DATA expandAVX512_24_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_inShuf3<>+0x38(SB)/8, 
$0xffffffffffffffff + +GLOBL expandAVX512_24_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_24_mat3<>+0x00(SB)/8, $0x0202020202020202 +DATA expandAVX512_24_mat3<>+0x08(SB)/8, $0x0000000000000000 +DATA expandAVX512_24_mat3<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512_24_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512_24_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_24_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_24_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_24_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_24_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_24_outShufLo+0x00(SB)/8, $0x11100a0908020100 +DATA expandAVX512_24_outShufLo+0x08(SB)/8, $0x282221201a191812 +DATA expandAVX512_24_outShufLo+0x10(SB)/8, $0x3a39383231302a29 +DATA expandAVX512_24_outShufLo+0x18(SB)/8, $0x14130d0c0b050403 +DATA expandAVX512_24_outShufLo+0x20(SB)/8, $0x2b2524231d1c1b15 +DATA expandAVX512_24_outShufLo+0x28(SB)/8, $0x3d3c3b3534332d2c +DATA expandAVX512_24_outShufLo+0x30(SB)/8, $0x1716480f0e400706 +DATA expandAVX512_24_outShufLo+0x38(SB)/8, $0x2e602726581f1e50 + +GLOBL expandAVX512_24_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_24_outShufHi0+0x00(SB)/8, $0x3a39383231302928 +DATA expandAVX512_24_outShufHi0+0x08(SB)/8, $0x51504a4948424140 +DATA expandAVX512_24_outShufHi0+0x10(SB)/8, $0x2a6261605a595852 +DATA expandAVX512_24_outShufHi0+0x18(SB)/8, $0x3d3c3b3534332c2b +DATA expandAVX512_24_outShufHi0+0x20(SB)/8, $0x54534d4c4b454443 +DATA expandAVX512_24_outShufHi0+0x28(SB)/8, $0x2d6564635d5c5b55 +DATA expandAVX512_24_outShufHi0+0x30(SB)/8, $0x703f3e6837362f2e +DATA expandAVX512_24_outShufHi0+0x38(SB)/8, $0x5756ff4f4e784746 + +GLOBL expandAVX512_24_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_24_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x18(SB)/8, 
$0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_24_outShufHi1+0x38(SB)/8, $0xffff00ffffffffff + +TEXT expandAVX512_24<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_24_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_24_mat0<>(SB), Z2 + VMOVDQU64 expandAVX512_24_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512_24_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512_24_inShuf3<>(SB), Z5 + VMOVDQU64 expandAVX512_24_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_24_outShufHi0(SB), Z6 + VMOVDQU64 expandAVX512_24_outShufHi1(SB), Z7 + VMOVDQU64 (AX), Z8 + VPERMB Z8, Z0, Z0 + VGF2P8AFFINEQB $0, Z2, Z0, Z0 + VPERMB Z8, Z3, Z3 + VGF2P8AFFINEQB $0, Z2, Z3, Z2 + VPERMB Z8, Z4, Z3 + VGF2P8AFFINEQB $0, expandAVX512_24_mat2<>(SB), Z3, Z3 + VPERMB Z8, Z5, Z4 + VGF2P8AFFINEQB $0, expandAVX512_24_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xdfffffffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z6 + MOVQ $0x2000000000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z7, K1, Z0 + VPORQ Z0, Z6, Z2 + RET + +GLOBL expandAVX512_26_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_26_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512_26_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512_26_inShuf0<>+0x10(SB)/8, $0xffff020201010000 +DATA expandAVX512_26_inShuf0<>+0x18(SB)/8, $0xffffffffff020100 +DATA expandAVX512_26_inShuf0<>+0x20(SB)/8, $0xffff020201010000 +DATA expandAVX512_26_inShuf0<>+0x28(SB)/8, $0xffffffffff020100 +DATA expandAVX512_26_inShuf0<>+0x30(SB)/8, $0x0202010101000000 +DATA expandAVX512_26_inShuf0<>+0x38(SB)/8, $0xffff010101000000 + +GLOBL expandAVX512_26_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_26_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_26_mat0<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512_26_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA 
expandAVX512_26_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512_26_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512_26_mat0<>+0x28(SB)/8, $0x0404040404040808 +DATA expandAVX512_26_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512_26_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512_26_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_26_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_26_inShuf1<>+0x08(SB)/8, $0xffffffff01010000 +DATA expandAVX512_26_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_26_inShuf1<>+0x18(SB)/8, $0xffffffff01010000 +DATA expandAVX512_26_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_26_inShuf1<>+0x28(SB)/8, $0xffff010101000000 +DATA expandAVX512_26_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_26_inShuf1<>+0x38(SB)/8, $0xff04040403030302 + +GLOBL expandAVX512_26_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_26_mat1<>+0x00(SB)/8, $0x1010202020202020 +DATA expandAVX512_26_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512_26_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512_26_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_26_mat1<>+0x20(SB)/8, $0x4040404040408080 +DATA expandAVX512_26_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_26_mat1<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512_26_mat1<>+0x38(SB)/8, $0x0808080808080808 + +GLOBL expandAVX512_26_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_26_inShuf2<>+0x00(SB)/8, $0x0404030303020202 +DATA expandAVX512_26_inShuf2<>+0x08(SB)/8, $0xffffffffff040302 +DATA expandAVX512_26_inShuf2<>+0x10(SB)/8, $0xffff040403030202 +DATA expandAVX512_26_inShuf2<>+0x18(SB)/8, $0xffffffffff040302 +DATA expandAVX512_26_inShuf2<>+0x20(SB)/8, $0xffff040403030202 +DATA expandAVX512_26_inShuf2<>+0x28(SB)/8, $0xffffffffff040302 +DATA expandAVX512_26_inShuf2<>+0x30(SB)/8, $0xff04030303020202 +DATA expandAVX512_26_inShuf2<>+0x38(SB)/8, $0xffff040404030303 + +GLOBL 
expandAVX512_26_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_26_mat2<>+0x00(SB)/8, $0x1010101010101010 +DATA expandAVX512_26_mat2<>+0x08(SB)/8, $0x1010202020202020 +DATA expandAVX512_26_mat2<>+0x10(SB)/8, $0x2020202020202020 +DATA expandAVX512_26_mat2<>+0x18(SB)/8, $0x2020202040404040 +DATA expandAVX512_26_mat2<>+0x20(SB)/8, $0x4040404040404040 +DATA expandAVX512_26_mat2<>+0x28(SB)/8, $0x4040404040408080 +DATA expandAVX512_26_mat2<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512_26_mat2<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512_26_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_26_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403 +DATA expandAVX512_26_inShuf3<>+0x08(SB)/8, $0xffffffff04040303 +DATA expandAVX512_26_inShuf3<>+0x10(SB)/8, $0xffffffffffff0403 +DATA expandAVX512_26_inShuf3<>+0x18(SB)/8, $0xffffffff04040303 +DATA expandAVX512_26_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403 +DATA expandAVX512_26_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04 +DATA expandAVX512_26_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_26_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_26_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_26_mat3<>+0x00(SB)/8, $0x0101020202020202 +DATA expandAVX512_26_mat3<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_26_mat3<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512_26_mat3<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_26_mat3<>+0x20(SB)/8, $0x0404040404040808 +DATA expandAVX512_26_mat3<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512_26_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_26_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_26_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_26_outShufLo+0x00(SB)/8, $0x2018111008020100 +DATA expandAVX512_26_outShufLo+0x08(SB)/8, $0x3a39383231302821 +DATA expandAVX512_26_outShufLo+0x10(SB)/8, $0x6860595850494840 +DATA expandAVX512_26_outShufLo+0x18(SB)/8, $0x1312090504036a69 +DATA expandAVX512_26_outShufLo+0x20(SB)/8, 
$0x3b35343329232219 +DATA expandAVX512_26_outShufLo+0x28(SB)/8, $0x5b5a514b4a413d3c +DATA expandAVX512_26_outShufLo+0x30(SB)/8, $0x0a7007066d6c6b61 +DATA expandAVX512_26_outShufLo+0x38(SB)/8, $0x37362a25241a1514 + +GLOBL expandAVX512_26_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_26_outShufHi0+0x00(SB)/8, $0x5851504842414038 +DATA expandAVX512_26_outShufHi0+0x08(SB)/8, $0x7978727170686160 +DATA expandAVX512_26_outShufHi0+0x10(SB)/8, $0xffffffffffffff7a +DATA expandAVX512_26_outShufHi0+0x18(SB)/8, $0x52494544433b3a39 +DATA expandAVX512_26_outShufHi0+0x20(SB)/8, $0x7574736963625953 +DATA expandAVX512_26_outShufHi0+0x28(SB)/8, $0xffffffffff7d7c7b +DATA expandAVX512_26_outShufHi0+0x30(SB)/8, $0xff47463e3d3cffff +DATA expandAVX512_26_outShufHi0+0x38(SB)/8, $0x766a65645a55544a + +GLOBL expandAVX512_26_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_26_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_26_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_26_outShufHi1+0x10(SB)/8, $0x20191810090800ff +DATA expandAVX512_26_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_26_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_26_outShufHi1+0x28(SB)/8, $0x1a110b0a01ffffff +DATA expandAVX512_26_outShufHi1+0x30(SB)/8, $0x28ffffffffff211b +DATA expandAVX512_26_outShufHi1+0x38(SB)/8, $0xffffffffffffffff + +TEXT expandAVX512_26<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_26_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_26_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_26_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_26_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_26_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_26_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_26_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_26_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_26_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_26_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 
+ VGF2P8AFFINEQB $0, expandAVX512_26_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xff7c07ffff01ffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x83f80000fe0000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_28_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_28_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512_28_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512_28_inShuf0<>+0x10(SB)/8, $0x0202010101000000 +DATA expandAVX512_28_inShuf0<>+0x18(SB)/8, $0xff02010101000000 +DATA expandAVX512_28_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_28_inShuf0<>+0x28(SB)/8, $0xffff010101000000 +DATA expandAVX512_28_inShuf0<>+0x30(SB)/8, $0xffff010101000000 +DATA expandAVX512_28_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100 + +GLOBL expandAVX512_28_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_28_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_28_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_28_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_28_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_28_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_28_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_28_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_28_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_28_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_28_inShuf1<>+0x00(SB)/8, $0xffff010101000000 +DATA expandAVX512_28_inShuf1<>+0x08(SB)/8, $0xffff010101000000 +DATA expandAVX512_28_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_28_inShuf1<>+0x18(SB)/8, $0xffff010101000000 +DATA expandAVX512_28_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_28_inShuf1<>+0x28(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_28_inShuf1<>+0x30(SB)/8, $0x0404040303030202 +DATA expandAVX512_28_inShuf1<>+0x38(SB)/8, $0xffffffffff040302 + +GLOBL expandAVX512_28_mat1<>(SB), RODATA, $0x40 +DATA 
expandAVX512_28_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_28_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_28_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_28_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_28_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_28_mat1<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512_28_mat1<>+0x30(SB)/8, $0x0404040404040404 +DATA expandAVX512_28_mat1<>+0x38(SB)/8, $0x0404040408080808 + +GLOBL expandAVX512_28_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_28_inShuf2<>+0x00(SB)/8, $0x0404030303020202 +DATA expandAVX512_28_inShuf2<>+0x08(SB)/8, $0x0404030303020202 +DATA expandAVX512_28_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302 +DATA expandAVX512_28_inShuf2<>+0x18(SB)/8, $0xffff030303020202 +DATA expandAVX512_28_inShuf2<>+0x20(SB)/8, $0xffff030303020202 +DATA expandAVX512_28_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302 +DATA expandAVX512_28_inShuf2<>+0x30(SB)/8, $0xffff030303020202 +DATA expandAVX512_28_inShuf2<>+0x38(SB)/8, $0xffff040404030303 + +GLOBL expandAVX512_28_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_28_mat2<>+0x00(SB)/8, $0x0808080808080808 +DATA expandAVX512_28_mat2<>+0x08(SB)/8, $0x1010101010101010 +DATA expandAVX512_28_mat2<>+0x10(SB)/8, $0x1010101020202020 +DATA expandAVX512_28_mat2<>+0x18(SB)/8, $0x2020202020202020 +DATA expandAVX512_28_mat2<>+0x20(SB)/8, $0x4040404040404040 +DATA expandAVX512_28_mat2<>+0x28(SB)/8, $0x4040404080808080 +DATA expandAVX512_28_mat2<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512_28_mat2<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512_28_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_28_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403 +DATA expandAVX512_28_inShuf3<>+0x08(SB)/8, $0xffff040404030303 +DATA expandAVX512_28_inShuf3<>+0x10(SB)/8, $0xffffffffffffff04 +DATA expandAVX512_28_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_28_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA 
expandAVX512_28_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_28_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_28_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_28_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_28_mat3<>+0x00(SB)/8, $0x0101010102020202 +DATA expandAVX512_28_mat3<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_28_mat3<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512_28_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512_28_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_28_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_28_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_28_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_28_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_28_outShufLo+0x00(SB)/8, $0x1812111008020100 +DATA expandAVX512_28_outShufLo+0x08(SB)/8, $0x31302a2928201a19 +DATA expandAVX512_28_outShufLo+0x10(SB)/8, $0x4a49484241403832 +DATA expandAVX512_28_outShufLo+0x18(SB)/8, $0x090504035a595850 +DATA expandAVX512_28_outShufLo+0x20(SB)/8, $0x2b211d1c1b151413 +DATA expandAVX512_28_outShufLo+0x28(SB)/8, $0x4443393534332d2c +DATA expandAVX512_28_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b45 +DATA expandAVX512_28_outShufLo+0x38(SB)/8, $0x1e6817160a600706 + +GLOBL expandAVX512_28_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_28_outShufHi0+0x00(SB)/8, $0x4948424140383130 +DATA expandAVX512_28_outShufHi0+0x08(SB)/8, $0x6261605a5958504a +DATA expandAVX512_28_outShufHi0+0x10(SB)/8, $0xff7a797872717068 +DATA expandAVX512_28_outShufHi0+0x18(SB)/8, $0x4339343332ffffff +DATA expandAVX512_28_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b4544 +DATA expandAVX512_28_outShufHi0+0x28(SB)/8, $0x757473696564635d +DATA expandAVX512_28_outShufHi0+0x30(SB)/8, $0x35ffffffff7d7c7b +DATA expandAVX512_28_outShufHi0+0x38(SB)/8, $0x4f4eff47463a3736 + +GLOBL expandAVX512_28_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_28_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA 
expandAVX512_28_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_28_outShufHi1+0x10(SB)/8, $0x00ffffffffffffff +DATA expandAVX512_28_outShufHi1+0x18(SB)/8, $0xffffffffff0a0908 +DATA expandAVX512_28_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_28_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_28_outShufHi1+0x30(SB)/8, $0xff0d0c0b01ffffff +DATA expandAVX512_28_outShufHi1+0x38(SB)/8, $0xffff10ffffffffff + +TEXT expandAVX512_28<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_28_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_28_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_28_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_28_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_28_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_28_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_28_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_28_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_28_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_28_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_28_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xdf87fffff87fffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x2078000007800000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_30_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_30_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512_30_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512_30_inShuf0<>+0x10(SB)/8, $0xffff010101000000 +DATA expandAVX512_30_inShuf0<>+0x18(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_30_inShuf0<>+0x20(SB)/8, $0xffff010101000000 +DATA expandAVX512_30_inShuf0<>+0x28(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_30_inShuf0<>+0x30(SB)/8, $0xffff010101000000 +DATA expandAVX512_30_inShuf0<>+0x38(SB)/8, $0xffff010101000000 + +GLOBL expandAVX512_30_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_30_mat0<>+0x00(SB)/8, 
$0x0101010101010101 +DATA expandAVX512_30_mat0<>+0x08(SB)/8, $0x0101010101010202 +DATA expandAVX512_30_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_30_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512_30_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512_30_mat0<>+0x28(SB)/8, $0x0404080808080808 +DATA expandAVX512_30_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512_30_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512_30_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_30_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_30_inShuf1<>+0x08(SB)/8, $0xffff010101000000 +DATA expandAVX512_30_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_30_inShuf1<>+0x18(SB)/8, $0xffff010101000000 +DATA expandAVX512_30_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_30_inShuf1<>+0x28(SB)/8, $0xffff010101000000 +DATA expandAVX512_30_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_30_inShuf1<>+0x38(SB)/8, $0x0404030303020202 + +GLOBL expandAVX512_30_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_30_mat1<>+0x00(SB)/8, $0x1010101010102020 +DATA expandAVX512_30_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512_30_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512_30_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_30_mat1<>+0x20(SB)/8, $0x4040808080808080 +DATA expandAVX512_30_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_30_mat1<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512_30_mat1<>+0x38(SB)/8, $0x0202020202020202 + +GLOBL expandAVX512_30_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_30_inShuf2<>+0x00(SB)/8, $0xffffffffff040302 +DATA expandAVX512_30_inShuf2<>+0x08(SB)/8, $0xffff030303020202 +DATA expandAVX512_30_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302 +DATA expandAVX512_30_inShuf2<>+0x18(SB)/8, $0xffff030303020202 +DATA expandAVX512_30_inShuf2<>+0x20(SB)/8, $0xffff030303020202 +DATA expandAVX512_30_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302 +DATA 
expandAVX512_30_inShuf2<>+0x30(SB)/8, $0xffff030303020202 +DATA expandAVX512_30_inShuf2<>+0x38(SB)/8, $0xffffffffffff0302 + +GLOBL expandAVX512_30_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_30_mat2<>+0x00(SB)/8, $0x0202020204040404 +DATA expandAVX512_30_mat2<>+0x08(SB)/8, $0x0404040404040404 +DATA expandAVX512_30_mat2<>+0x10(SB)/8, $0x0404080808080808 +DATA expandAVX512_30_mat2<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_30_mat2<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_30_mat2<>+0x28(SB)/8, $0x1010101010102020 +DATA expandAVX512_30_mat2<>+0x30(SB)/8, $0x2020202020202020 +DATA expandAVX512_30_mat2<>+0x38(SB)/8, $0x2020202040404040 + +GLOBL expandAVX512_30_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_30_inShuf3<>+0x00(SB)/8, $0xffff030303020202 +DATA expandAVX512_30_inShuf3<>+0x08(SB)/8, $0xffffffffffff0302 +DATA expandAVX512_30_inShuf3<>+0x10(SB)/8, $0xffff030303020202 +DATA expandAVX512_30_inShuf3<>+0x18(SB)/8, $0xffff040404030303 +DATA expandAVX512_30_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403 +DATA expandAVX512_30_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04 +DATA expandAVX512_30_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_30_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_30_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_30_mat3<>+0x00(SB)/8, $0x4040404040404040 +DATA expandAVX512_30_mat3<>+0x08(SB)/8, $0x4040808080808080 +DATA expandAVX512_30_mat3<>+0x10(SB)/8, $0x8080808080808080 +DATA expandAVX512_30_mat3<>+0x18(SB)/8, $0x0101010101010101 +DATA expandAVX512_30_mat3<>+0x20(SB)/8, $0x0101010101010202 +DATA expandAVX512_30_mat3<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512_30_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_30_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_30_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_30_outShufLo+0x00(SB)/8, $0x1812111008020100 +DATA expandAVX512_30_outShufLo+0x08(SB)/8, $0x3832313028222120 +DATA expandAVX512_30_outShufLo+0x10(SB)/8, 
$0x58504a4948403a39 +DATA expandAVX512_30_outShufLo+0x18(SB)/8, $0x04036a6968605a59 +DATA expandAVX512_30_outShufLo+0x20(SB)/8, $0x2423191514130905 +DATA expandAVX512_30_outShufLo+0x28(SB)/8, $0x3d3c3b3534332925 +DATA expandAVX512_30_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b41 +DATA expandAVX512_30_outShufLo+0x38(SB)/8, $0x0a7007066d6c6b61 + +GLOBL expandAVX512_30_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_30_outShufHi0+0x00(SB)/8, $0x504a4948403a3938 +DATA expandAVX512_30_outShufHi0+0x08(SB)/8, $0x70686261605a5958 +DATA expandAVX512_30_outShufHi0+0x10(SB)/8, $0xffffffffff787271 +DATA expandAVX512_30_outShufHi0+0x18(SB)/8, $0x3c3bffffffffffff +DATA expandAVX512_30_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b413d +DATA expandAVX512_30_outShufHi0+0x28(SB)/8, $0x757473696564635d +DATA expandAVX512_30_outShufHi0+0x30(SB)/8, $0xffffffffffffff79 +DATA expandAVX512_30_outShufHi0+0x38(SB)/8, $0x42ff3f3effffffff + +GLOBL expandAVX512_30_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_30_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_30_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_30_outShufHi1+0x10(SB)/8, $0x1008020100ffffff +DATA expandAVX512_30_outShufHi1+0x18(SB)/8, $0xffff201a19181211 +DATA expandAVX512_30_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_30_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_30_outShufHi1+0x30(SB)/8, $0x15141309050403ff +DATA expandAVX512_30_outShufHi1+0x38(SB)/8, $0xff28ffff211d1c1b + +TEXT expandAVX512_30<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_30_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_30_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_30_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_30_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_30_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_30_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_30_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_30_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, 
expandAVX512_30_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_30_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_30_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xb001ffffc007ffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x4ffe00003ff80000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_32_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_32_inShuf0<>+0x00(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x08(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x10(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x18(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x20(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x28(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x30(SB)/8, $0x0101010100000000 +DATA expandAVX512_32_inShuf0<>+0x38(SB)/8, $0x0101010100000000 + +GLOBL expandAVX512_32_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_32_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_32_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_32_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_32_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_32_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_32_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_32_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_32_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_32_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_32_inShuf1<>+0x00(SB)/8, $0x0303030302020202 +DATA expandAVX512_32_inShuf1<>+0x08(SB)/8, $0x0303030302020202 +DATA expandAVX512_32_inShuf1<>+0x10(SB)/8, $0x0303030302020202 +DATA expandAVX512_32_inShuf1<>+0x18(SB)/8, $0x0303030302020202 +DATA expandAVX512_32_inShuf1<>+0x20(SB)/8, $0x0303030302020202 +DATA expandAVX512_32_inShuf1<>+0x28(SB)/8, $0x0303030302020202 +DATA expandAVX512_32_inShuf1<>+0x30(SB)/8, $0x0303030302020202 +DATA 
expandAVX512_32_inShuf1<>+0x38(SB)/8, $0x0303030302020202 + +GLOBL expandAVX512_32_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_32_outShufLo+0x00(SB)/8, $0x0b0a090803020100 +DATA expandAVX512_32_outShufLo+0x08(SB)/8, $0x1b1a191813121110 +DATA expandAVX512_32_outShufLo+0x10(SB)/8, $0x2b2a292823222120 +DATA expandAVX512_32_outShufLo+0x18(SB)/8, $0x3b3a393833323130 +DATA expandAVX512_32_outShufLo+0x20(SB)/8, $0x0f0e0d0c07060504 +DATA expandAVX512_32_outShufLo+0x28(SB)/8, $0x1f1e1d1c17161514 +DATA expandAVX512_32_outShufLo+0x30(SB)/8, $0x2f2e2d2c27262524 +DATA expandAVX512_32_outShufLo+0x38(SB)/8, $0x3f3e3d3c37363534 + +TEXT expandAVX512_32<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_32_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_32_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512_32_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_32_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512_36_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_36_inShuf0<>+0x00(SB)/8, $0x0101010100000000 +DATA expandAVX512_36_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_36_inShuf0<>+0x10(SB)/8, $0x0101010100000000 +DATA expandAVX512_36_inShuf0<>+0x18(SB)/8, $0x0101010100000000 +DATA expandAVX512_36_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_36_inShuf0<>+0x28(SB)/8, $0x0101010100000000 +DATA expandAVX512_36_inShuf0<>+0x30(SB)/8, $0x0101010100000000 +DATA expandAVX512_36_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100 + +GLOBL expandAVX512_36_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_36_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_36_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_36_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_36_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_36_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_36_mat0<>+0x28(SB)/8, 
$0x0808080808080808 +DATA expandAVX512_36_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_36_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_36_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_36_inShuf1<>+0x00(SB)/8, $0x0101010100000000 +DATA expandAVX512_36_inShuf1<>+0x08(SB)/8, $0xffffff0100000000 +DATA expandAVX512_36_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_36_inShuf1<>+0x18(SB)/8, $0xffffffff00000000 +DATA expandAVX512_36_inShuf1<>+0x20(SB)/8, $0xff02020202010101 +DATA expandAVX512_36_inShuf1<>+0x28(SB)/8, $0xffffffffffff0201 +DATA expandAVX512_36_inShuf1<>+0x30(SB)/8, $0x0202020201010101 +DATA expandAVX512_36_inShuf1<>+0x38(SB)/8, $0x0303030302020202 + +GLOBL expandAVX512_36_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_36_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_36_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_36_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_36_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_36_mat1<>+0x20(SB)/8, $0x4040404040404040 +DATA expandAVX512_36_mat1<>+0x28(SB)/8, $0x4040404080808080 +DATA expandAVX512_36_mat1<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512_36_mat1<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512_36_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_36_inShuf2<>+0x00(SB)/8, $0xffffffffffff0302 +DATA expandAVX512_36_inShuf2<>+0x08(SB)/8, $0x0303030302020202 +DATA expandAVX512_36_inShuf2<>+0x10(SB)/8, $0x0303030302020202 +DATA expandAVX512_36_inShuf2<>+0x18(SB)/8, $0xffffffffffff0302 +DATA expandAVX512_36_inShuf2<>+0x20(SB)/8, $0x0303030302020202 +DATA expandAVX512_36_inShuf2<>+0x28(SB)/8, $0xffff030302020202 +DATA expandAVX512_36_inShuf2<>+0x30(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_36_inShuf2<>+0x38(SB)/8, $0xffffffff02020202 + +GLOBL expandAVX512_36_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_36_mat2<>+0x00(SB)/8, $0x0101010102020202 +DATA expandAVX512_36_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA 
expandAVX512_36_mat2<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_36_mat2<>+0x18(SB)/8, $0x0404040408080808 +DATA expandAVX512_36_mat2<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512_36_mat2<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512_36_mat2<>+0x30(SB)/8, $0x1010101020202020 +DATA expandAVX512_36_mat2<>+0x38(SB)/8, $0x2020202020202020 + +GLOBL expandAVX512_36_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_36_outShufLo+0x00(SB)/8, $0x1211100803020100 +DATA expandAVX512_36_outShufLo+0x08(SB)/8, $0x2928201b1a191813 +DATA expandAVX512_36_outShufLo+0x10(SB)/8, $0x4038333231302b2a +DATA expandAVX512_36_outShufLo+0x18(SB)/8, $0x504b4a4948434241 +DATA expandAVX512_36_outShufLo+0x20(SB)/8, $0x070605045b5a5958 +DATA expandAVX512_36_outShufLo+0x28(SB)/8, $0x1e1d1c1716151409 +DATA expandAVX512_36_outShufLo+0x30(SB)/8, $0x35342f2e2d2c211f +DATA expandAVX512_36_outShufLo+0x38(SB)/8, $0x4c47464544393736 + +GLOBL expandAVX512_36_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_36_outShufHi+0x00(SB)/8, $0x3332313028222120 +DATA expandAVX512_36_outShufHi+0x08(SB)/8, $0x4a4948403b3a3938 +DATA expandAVX512_36_outShufHi+0x10(SB)/8, $0x616058535251504b +DATA expandAVX512_36_outShufHi+0x18(SB)/8, $0x78706b6a69686362 +DATA expandAVX512_36_outShufHi+0x20(SB)/8, $0x29262524237b7a79 +DATA expandAVX512_36_outShufHi+0x28(SB)/8, $0x3f3e3d3c37363534 +DATA expandAVX512_36_outShufHi+0x30(SB)/8, $0x5655544f4e4d4c41 +DATA expandAVX512_36_outShufHi+0x38(SB)/8, $0x6d6c676665645957 + +TEXT expandAVX512_36<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_36_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_36_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512_36_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512_36_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_36_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_36_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_36_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, 
expandAVX512_36_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512_40_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_40_inShuf0<>+0x00(SB)/8, $0x0101010000000000 +DATA expandAVX512_40_inShuf0<>+0x08(SB)/8, $0x0101010000000000 +DATA expandAVX512_40_inShuf0<>+0x10(SB)/8, $0x0101010000000000 +DATA expandAVX512_40_inShuf0<>+0x18(SB)/8, $0x0101010000000000 +DATA expandAVX512_40_inShuf0<>+0x20(SB)/8, $0x0101010000000000 +DATA expandAVX512_40_inShuf0<>+0x28(SB)/8, $0xffffff0000000000 +DATA expandAVX512_40_inShuf0<>+0x30(SB)/8, $0xffffff0000000000 +DATA expandAVX512_40_inShuf0<>+0x38(SB)/8, $0xffffff0000000000 + +GLOBL expandAVX512_40_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_40_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_40_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_40_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_40_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_40_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_40_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_40_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_40_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_40_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_40_inShuf1<>+0x00(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_40_inShuf1<>+0x08(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_40_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_40_inShuf1<>+0x18(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_40_inShuf1<>+0x20(SB)/8, $0xffffffffffffff01 +DATA expandAVX512_40_inShuf1<>+0x28(SB)/8, $0xffff020202020201 +DATA expandAVX512_40_inShuf1<>+0x30(SB)/8, $0x0202020101010101 +DATA expandAVX512_40_inShuf1<>+0x38(SB)/8, $0x0202020101010101 + +GLOBL expandAVX512_40_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_40_mat1<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_40_mat1<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_40_mat1<>+0x10(SB)/8, $0x0404040404040404 +DATA 
expandAVX512_40_mat1<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_40_mat1<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_40_mat1<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512_40_mat1<>+0x30(SB)/8, $0x2020202020202020 +DATA expandAVX512_40_mat1<>+0x38(SB)/8, $0x4040404040404040 + +GLOBL expandAVX512_40_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_40_inShuf2<>+0x00(SB)/8, $0x0202020101010101 +DATA expandAVX512_40_inShuf2<>+0x08(SB)/8, $0x0303030202020202 +DATA expandAVX512_40_inShuf2<>+0x10(SB)/8, $0x0303030202020202 +DATA expandAVX512_40_inShuf2<>+0x18(SB)/8, $0xffffff0202020202 +DATA expandAVX512_40_inShuf2<>+0x20(SB)/8, $0xffffff0202020202 +DATA expandAVX512_40_inShuf2<>+0x28(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_40_inShuf2<>+0x30(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_40_inShuf2<>+0x38(SB)/8, $0xffffffffffff0202 + +GLOBL expandAVX512_40_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_40_mat2<>+0x00(SB)/8, $0x8080808080808080 +DATA expandAVX512_40_mat2<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512_40_mat2<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_40_mat2<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_40_mat2<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512_40_mat2<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_40_mat2<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_40_mat2<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_40_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_40_inShuf3<>+0x00(SB)/8, $0xffffffffffff0303 +DATA expandAVX512_40_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL 
expandAVX512_40_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_40_mat3<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_40_mat3<>+0x08(SB)/8, $0x0000000000000000 +DATA expandAVX512_40_mat3<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512_40_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512_40_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_40_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_40_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_40_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_40_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_40_outShufLo+0x00(SB)/8, $0x0a09080403020100 +DATA expandAVX512_40_outShufLo+0x08(SB)/8, $0x1814131211100c0b +DATA expandAVX512_40_outShufLo+0x10(SB)/8, $0x232221201c1b1a19 +DATA expandAVX512_40_outShufLo+0x18(SB)/8, $0x31302c2b2a292824 +DATA expandAVX512_40_outShufLo+0x20(SB)/8, $0x3c3b3a3938343332 +DATA expandAVX512_40_outShufLo+0x28(SB)/8, $0x0f0e0d4140070605 +DATA expandAVX512_40_outShufLo+0x30(SB)/8, $0x1d51501716154948 +DATA expandAVX512_40_outShufLo+0x38(SB)/8, $0x6027262559581f1e + +GLOBL expandAVX512_40_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_40_outShufHi0+0x00(SB)/8, $0x3938343332313028 +DATA expandAVX512_40_outShufHi0+0x08(SB)/8, $0x44434241403c3b3a +DATA expandAVX512_40_outShufHi0+0x10(SB)/8, $0x5251504c4b4a4948 +DATA expandAVX512_40_outShufHi0+0x18(SB)/8, $0x605c5b5a59585453 +DATA expandAVX512_40_outShufHi0+0x20(SB)/8, $0x2c2b2a2964636261 +DATA expandAVX512_40_outShufHi0+0x28(SB)/8, $0x3e3d69683736352d +DATA expandAVX512_40_outShufHi0+0x30(SB)/8, $0x797847464571703f +DATA expandAVX512_40_outShufHi0+0x38(SB)/8, $0x575655ffff4f4e4d + +GLOBL expandAVX512_40_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_40_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA 
expandAVX512_40_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_outShufHi1+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_40_outShufHi1+0x38(SB)/8, $0xffffff0100ffffff + +TEXT expandAVX512_40<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_40_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_40_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_40_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_40_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_40_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_40_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_40_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_40_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_40_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_40_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_40_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xe7ffffffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x1800000000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_44_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_44_inShuf0<>+0x00(SB)/8, $0x0101010000000000 +DATA expandAVX512_44_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_44_inShuf0<>+0x10(SB)/8, $0x0101010000000000 +DATA expandAVX512_44_inShuf0<>+0x18(SB)/8, $0x0101010000000000 +DATA expandAVX512_44_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_44_inShuf0<>+0x28(SB)/8, $0x0101010000000000 +DATA expandAVX512_44_inShuf0<>+0x30(SB)/8, $0xffffff0000000000 +DATA expandAVX512_44_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 + +GLOBL expandAVX512_44_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_44_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_44_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_44_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_44_mat0<>+0x18(SB)/8, 
$0x0404040404040404 +DATA expandAVX512_44_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_44_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_44_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_44_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_44_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_44_inShuf1<>+0x00(SB)/8, $0xffffff0000000000 +DATA expandAVX512_44_inShuf1<>+0x08(SB)/8, $0xffffff0000000000 +DATA expandAVX512_44_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_44_inShuf1<>+0x18(SB)/8, $0xffffff0000000000 +DATA expandAVX512_44_inShuf1<>+0x20(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_44_inShuf1<>+0x28(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_44_inShuf1<>+0x30(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_44_inShuf1<>+0x38(SB)/8, $0xff02020202020101 + +GLOBL expandAVX512_44_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_44_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_44_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_44_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_44_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_44_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_44_mat1<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512_44_mat1<>+0x30(SB)/8, $0x0404040404040404 +DATA expandAVX512_44_mat1<>+0x38(SB)/8, $0x0808080808080808 + +GLOBL expandAVX512_44_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_44_inShuf2<>+0x00(SB)/8, $0x0202020101010101 +DATA expandAVX512_44_inShuf2<>+0x08(SB)/8, $0xffffffffffff0201 +DATA expandAVX512_44_inShuf2<>+0x10(SB)/8, $0x0202020101010101 +DATA expandAVX512_44_inShuf2<>+0x18(SB)/8, $0x0202020101010101 +DATA expandAVX512_44_inShuf2<>+0x20(SB)/8, $0xffffffffffff0201 +DATA expandAVX512_44_inShuf2<>+0x28(SB)/8, $0xffff020101010101 +DATA expandAVX512_44_inShuf2<>+0x30(SB)/8, $0xffffff0202020202 +DATA expandAVX512_44_inShuf2<>+0x38(SB)/8, $0xffffffffffffff02 + +GLOBL expandAVX512_44_mat2<>(SB), RODATA, $0x40 +DATA 
expandAVX512_44_mat2<>+0x00(SB)/8, $0x1010101010101010 +DATA expandAVX512_44_mat2<>+0x08(SB)/8, $0x1010101020202020 +DATA expandAVX512_44_mat2<>+0x10(SB)/8, $0x2020202020202020 +DATA expandAVX512_44_mat2<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512_44_mat2<>+0x20(SB)/8, $0x4040404080808080 +DATA expandAVX512_44_mat2<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512_44_mat2<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512_44_mat2<>+0x38(SB)/8, $0x0101010102020202 + +GLOBL expandAVX512_44_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_44_inShuf3<>+0x00(SB)/8, $0xffffff0202020202 +DATA expandAVX512_44_inShuf3<>+0x08(SB)/8, $0xffffff0202020202 +DATA expandAVX512_44_inShuf3<>+0x10(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_44_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_44_inShuf3<>+0x20(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_44_inShuf3<>+0x28(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_44_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_44_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_44_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_44_mat3<>+0x00(SB)/8, $0x0202020202020202 +DATA expandAVX512_44_mat3<>+0x08(SB)/8, $0x0404040404040404 +DATA expandAVX512_44_mat3<>+0x10(SB)/8, $0x0404040408080808 +DATA expandAVX512_44_mat3<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512_44_mat3<>+0x20(SB)/8, $0x2020202020202020 +DATA expandAVX512_44_mat3<>+0x28(SB)/8, $0x4040404040404040 +DATA expandAVX512_44_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_44_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_44_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_44_outShufLo+0x00(SB)/8, $0x1110080403020100 +DATA expandAVX512_44_outShufLo+0x08(SB)/8, $0x1c1b1a1918141312 +DATA expandAVX512_44_outShufLo+0x10(SB)/8, $0x31302c2b2a292820 +DATA expandAVX512_44_outShufLo+0x18(SB)/8, $0x4342414038343332 +DATA expandAVX512_44_outShufLo+0x20(SB)/8, $0x58504c4b4a494844 +DATA 
expandAVX512_44_outShufLo+0x28(SB)/8, $0x600706055c5b5a59 +DATA expandAVX512_44_outShufLo+0x30(SB)/8, $0x1d69681716150961 +DATA expandAVX512_44_outShufLo+0x38(SB)/8, $0x2f2e2d2171701f1e + +GLOBL expandAVX512_44_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_44_outShufHi0+0x00(SB)/8, $0x4844434241403938 +DATA expandAVX512_44_outShufHi0+0x08(SB)/8, $0x5a59585453525150 +DATA expandAVX512_44_outShufHi0+0x10(SB)/8, $0x6c6b6a6968605c5b +DATA expandAVX512_44_outShufHi0+0x18(SB)/8, $0xffff787473727170 +DATA expandAVX512_44_outShufHi0+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_44_outShufHi0+0x28(SB)/8, $0x46453e3d3c3b3aff +DATA expandAVX512_44_outShufHi0+0x30(SB)/8, $0xff57565549ffff47 +DATA expandAVX512_44_outShufHi0+0x38(SB)/8, $0x6d61ffff5f5e5dff + +GLOBL expandAVX512_44_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_44_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_44_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_44_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_44_outShufHi1+0x18(SB)/8, $0x0100ffffffffffff +DATA expandAVX512_44_outShufHi1+0x20(SB)/8, $0x0c0b0a0908040302 +DATA expandAVX512_44_outShufHi1+0x28(SB)/8, $0xffffffffffffff10 +DATA expandAVX512_44_outShufHi1+0x30(SB)/8, $0x20ffffffff1918ff +DATA expandAVX512_44_outShufHi1+0x38(SB)/8, $0xffff2928ffffff21 + +TEXT expandAVX512_44<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_44_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_44_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_44_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_44_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_44_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_44_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_44_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_44_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_44_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_44_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, 
expandAVX512_44_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xce79fe003fffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x318601ffc0000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_48_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_48_inShuf0<>+0x00(SB)/8, $0x0101000000000000 +DATA expandAVX512_48_inShuf0<>+0x08(SB)/8, $0x0101000000000000 +DATA expandAVX512_48_inShuf0<>+0x10(SB)/8, $0x0101000000000000 +DATA expandAVX512_48_inShuf0<>+0x18(SB)/8, $0xffff000000000000 +DATA expandAVX512_48_inShuf0<>+0x20(SB)/8, $0xffff000000000000 +DATA expandAVX512_48_inShuf0<>+0x28(SB)/8, $0xffff000000000000 +DATA expandAVX512_48_inShuf0<>+0x30(SB)/8, $0xffff000000000000 +DATA expandAVX512_48_inShuf0<>+0x38(SB)/8, $0xffff000000000000 + +GLOBL expandAVX512_48_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_48_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_48_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_48_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_48_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_48_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_48_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_48_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_48_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_48_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_48_inShuf1<>+0x00(SB)/8, $0xffffffff01010101 +DATA expandAVX512_48_inShuf1<>+0x08(SB)/8, $0xffffffff01010101 +DATA expandAVX512_48_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101 +DATA expandAVX512_48_inShuf1<>+0x18(SB)/8, $0x0202020202020101 +DATA expandAVX512_48_inShuf1<>+0x20(SB)/8, $0x0202010101010101 +DATA expandAVX512_48_inShuf1<>+0x28(SB)/8, $0x0202010101010101 +DATA expandAVX512_48_inShuf1<>+0x30(SB)/8, $0x0202010101010101 +DATA expandAVX512_48_inShuf1<>+0x38(SB)/8, $0xffff010101010101 + +GLOBL expandAVX512_48_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_48_mat1<>+0x00(SB)/8, 
$0x0101010101010101 +DATA expandAVX512_48_mat1<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_48_mat1<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_48_mat1<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_48_mat1<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512_48_mat1<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512_48_mat1<>+0x30(SB)/8, $0x2020202020202020 +DATA expandAVX512_48_mat1<>+0x38(SB)/8, $0x4040404040404040 + +GLOBL expandAVX512_48_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_48_inShuf2<>+0x00(SB)/8, $0xffff010101010101 +DATA expandAVX512_48_inShuf2<>+0x08(SB)/8, $0xffff020202020202 +DATA expandAVX512_48_inShuf2<>+0x10(SB)/8, $0xffff020202020202 +DATA expandAVX512_48_inShuf2<>+0x18(SB)/8, $0xffffffff02020202 +DATA expandAVX512_48_inShuf2<>+0x20(SB)/8, $0xffffffff02020202 +DATA expandAVX512_48_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_48_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_48_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_48_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_48_mat2<>+0x00(SB)/8, $0x8080808080808080 +DATA expandAVX512_48_mat2<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512_48_mat2<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_48_mat2<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_48_mat2<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_48_mat2<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_48_mat2<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_48_mat2<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_48_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_48_outShufLo+0x00(SB)/8, $0x0908050403020100 +DATA expandAVX512_48_outShufLo+0x08(SB)/8, $0x131211100d0c0b0a +DATA expandAVX512_48_outShufLo+0x10(SB)/8, $0x1d1c1b1a19181514 +DATA expandAVX512_48_outShufLo+0x18(SB)/8, $0x2928252423222120 +DATA expandAVX512_48_outShufLo+0x20(SB)/8, $0x333231302d2c2b2a +DATA expandAVX512_48_outShufLo+0x28(SB)/8, $0x3d3c3b3a39383534 +DATA 
expandAVX512_48_outShufLo+0x30(SB)/8, $0x0f0e434241400706 +DATA expandAVX512_48_outShufLo+0x38(SB)/8, $0x515017164b4a4948 + +GLOBL expandAVX512_48_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_48_outShufHi+0x00(SB)/8, $0x2524232221201918 +DATA expandAVX512_48_outShufHi+0x08(SB)/8, $0x31302d2c2b2a2928 +DATA expandAVX512_48_outShufHi+0x10(SB)/8, $0x3b3a393835343332 +DATA expandAVX512_48_outShufHi+0x18(SB)/8, $0x4544434241403d3c +DATA expandAVX512_48_outShufHi+0x20(SB)/8, $0x51504d4c4b4a4948 +DATA expandAVX512_48_outShufHi+0x28(SB)/8, $0x1d1c1b1a55545352 +DATA expandAVX512_48_outShufHi+0x30(SB)/8, $0x5b5a595827261f1e +DATA expandAVX512_48_outShufHi+0x38(SB)/8, $0x3736636261602f2e + +TEXT expandAVX512_48<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_48_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_48_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512_48_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512_48_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_48_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_48_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_48_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_48_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512_52_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_52_inShuf0<>+0x00(SB)/8, $0x0101000000000000 +DATA expandAVX512_52_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 +DATA expandAVX512_52_inShuf0<>+0x10(SB)/8, $0x0101000000000000 +DATA expandAVX512_52_inShuf0<>+0x18(SB)/8, $0xffff000000000000 +DATA expandAVX512_52_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_52_inShuf0<>+0x28(SB)/8, $0xffff000000000000 +DATA expandAVX512_52_inShuf0<>+0x30(SB)/8, $0xffff000000000000 +DATA expandAVX512_52_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 + +GLOBL expandAVX512_52_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_52_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_52_mat0<>+0x08(SB)/8, 
$0x0101010102020202 +DATA expandAVX512_52_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_52_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_52_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_52_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_52_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_52_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_52_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_52_inShuf1<>+0x00(SB)/8, $0xffff000000000000 +DATA expandAVX512_52_inShuf1<>+0x08(SB)/8, $0xffff000000000000 +DATA expandAVX512_52_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_52_inShuf1<>+0x18(SB)/8, $0xffff000000000000 +DATA expandAVX512_52_inShuf1<>+0x20(SB)/8, $0xffffffff01010101 +DATA expandAVX512_52_inShuf1<>+0x28(SB)/8, $0xffffffffff010101 +DATA expandAVX512_52_inShuf1<>+0x30(SB)/8, $0xff02020202020201 +DATA expandAVX512_52_inShuf1<>+0x38(SB)/8, $0x0202010101010101 + +GLOBL expandAVX512_52_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_52_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_52_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_52_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_52_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_52_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_52_mat1<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512_52_mat1<>+0x30(SB)/8, $0x0202020202020202 +DATA expandAVX512_52_mat1<>+0x38(SB)/8, $0x0404040404040404 + +GLOBL expandAVX512_52_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_52_inShuf2<>+0x00(SB)/8, $0xffffffffffff0201 +DATA expandAVX512_52_inShuf2<>+0x08(SB)/8, $0x0202010101010101 +DATA expandAVX512_52_inShuf2<>+0x10(SB)/8, $0xffff010101010101 +DATA expandAVX512_52_inShuf2<>+0x18(SB)/8, $0xffffffffffffff01 +DATA expandAVX512_52_inShuf2<>+0x20(SB)/8, $0xffff010101010101 +DATA expandAVX512_52_inShuf2<>+0x28(SB)/8, $0xffff010101010101 +DATA expandAVX512_52_inShuf2<>+0x30(SB)/8, $0xffffffffffffff01 +DATA 
expandAVX512_52_inShuf2<>+0x38(SB)/8, $0xffff010101010101 + +GLOBL expandAVX512_52_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_52_mat2<>+0x00(SB)/8, $0x0404040408080808 +DATA expandAVX512_52_mat2<>+0x08(SB)/8, $0x0808080808080808 +DATA expandAVX512_52_mat2<>+0x10(SB)/8, $0x1010101010101010 +DATA expandAVX512_52_mat2<>+0x18(SB)/8, $0x1010101020202020 +DATA expandAVX512_52_mat2<>+0x20(SB)/8, $0x2020202020202020 +DATA expandAVX512_52_mat2<>+0x28(SB)/8, $0x4040404040404040 +DATA expandAVX512_52_mat2<>+0x30(SB)/8, $0x4040404080808080 +DATA expandAVX512_52_mat2<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_52_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_52_inShuf3<>+0x00(SB)/8, $0xffff020202020202 +DATA expandAVX512_52_inShuf3<>+0x08(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_52_inShuf3<>+0x10(SB)/8, $0xffffffff02020202 +DATA expandAVX512_52_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_52_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_52_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_52_mat3<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_52_mat3<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_52_mat3<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_52_mat3<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_52_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_52_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_52_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_52_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_52_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_52_outShufLo+0x00(SB)/8, $0x1008050403020100 +DATA expandAVX512_52_outShufLo+0x08(SB)/8, $0x1a19181514131211 +DATA expandAVX512_52_outShufLo+0x10(SB)/8, $0x2b2a2928201d1c1b +DATA expandAVX512_52_outShufLo+0x18(SB)/8, 
$0x3534333231302d2c +DATA expandAVX512_52_outShufLo+0x20(SB)/8, $0x4845444342414038 +DATA expandAVX512_52_outShufLo+0x28(SB)/8, $0x5958504d4c4b4a49 +DATA expandAVX512_52_outShufLo+0x30(SB)/8, $0x616007065d5c5b5a +DATA expandAVX512_52_outShufLo+0x38(SB)/8, $0x6a69681716096362 + +GLOBL expandAVX512_52_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_52_outShufHi0+0x00(SB)/8, $0x403d3c3b3a393830 +DATA expandAVX512_52_outShufHi0+0x08(SB)/8, $0x51504d4c4b4a4948 +DATA expandAVX512_52_outShufHi0+0x10(SB)/8, $0x6261605855545352 +DATA expandAVX512_52_outShufHi0+0x18(SB)/8, $0x6c6b6a6968656463 +DATA expandAVX512_52_outShufHi0+0x20(SB)/8, $0x7d7c7b7a7978706d +DATA expandAVX512_52_outShufHi0+0x28(SB)/8, $0x31ffffffffffffff +DATA expandAVX512_52_outShufHi0+0x30(SB)/8, $0xff3f3e3635343332 +DATA expandAVX512_52_outShufHi0+0x38(SB)/8, $0xffff4f4e41ffffff + +GLOBL expandAVX512_52_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_52_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_52_outShufHi1+0x28(SB)/8, $0xff08050403020100 +DATA expandAVX512_52_outShufHi1+0x30(SB)/8, $0x10ffffffffffffff +DATA expandAVX512_52_outShufHi1+0x38(SB)/8, $0x1918ffffff131211 + +TEXT expandAVX512_52<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_52_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_52_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_52_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_52_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_52_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_52_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_52_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_52_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_52_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + 
VGF2P8AFFINEQB $0, expandAVX512_52_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_52_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0x387f80ffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0xc7807f0000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_56_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_56_inShuf0<>+0x00(SB)/8, $0x0100000000000000 +DATA expandAVX512_56_inShuf0<>+0x08(SB)/8, $0x0100000000000000 +DATA expandAVX512_56_inShuf0<>+0x10(SB)/8, $0xff00000000000000 +DATA expandAVX512_56_inShuf0<>+0x18(SB)/8, $0xff00000000000000 +DATA expandAVX512_56_inShuf0<>+0x20(SB)/8, $0xff00000000000000 +DATA expandAVX512_56_inShuf0<>+0x28(SB)/8, $0xff00000000000000 +DATA expandAVX512_56_inShuf0<>+0x30(SB)/8, $0xff00000000000000 +DATA expandAVX512_56_inShuf0<>+0x38(SB)/8, $0xff00000000000000 + +GLOBL expandAVX512_56_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_56_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_56_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_56_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_56_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_56_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_56_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_56_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_56_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_56_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_56_inShuf1<>+0x00(SB)/8, $0xffff010101010101 +DATA expandAVX512_56_inShuf1<>+0x08(SB)/8, $0x0202010101010101 +DATA expandAVX512_56_inShuf1<>+0x10(SB)/8, $0x0201010101010101 +DATA expandAVX512_56_inShuf1<>+0x18(SB)/8, $0xff01010101010101 +DATA expandAVX512_56_inShuf1<>+0x20(SB)/8, $0xff01010101010101 +DATA expandAVX512_56_inShuf1<>+0x28(SB)/8, $0xff01010101010101 +DATA expandAVX512_56_inShuf1<>+0x30(SB)/8, $0xff01010101010101 +DATA expandAVX512_56_inShuf1<>+0x38(SB)/8, $0xff01010101010101 + 
+GLOBL expandAVX512_56_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_56_inShuf2<>+0x00(SB)/8, $0xff02020202020202 +DATA expandAVX512_56_inShuf2<>+0x08(SB)/8, $0xffffff0202020202 +DATA expandAVX512_56_inShuf2<>+0x10(SB)/8, $0xffffffffffffff02 +DATA expandAVX512_56_inShuf2<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_56_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_56_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_56_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_56_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_56_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_56_mat2<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_56_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_56_mat2<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_56_mat2<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512_56_mat2<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_56_mat2<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_56_mat2<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_56_mat2<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_56_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_56_outShufLo+0x00(SB)/8, $0x0806050403020100 +DATA expandAVX512_56_outShufLo+0x08(SB)/8, $0x11100e0d0c0b0a09 +DATA expandAVX512_56_outShufLo+0x10(SB)/8, $0x1a19181615141312 +DATA expandAVX512_56_outShufLo+0x18(SB)/8, $0x232221201e1d1c1b +DATA expandAVX512_56_outShufLo+0x20(SB)/8, $0x2c2b2a2928262524 +DATA expandAVX512_56_outShufLo+0x28(SB)/8, $0x3534333231302e2d +DATA expandAVX512_56_outShufLo+0x30(SB)/8, $0x3e3d3c3b3a393836 +DATA expandAVX512_56_outShufLo+0x38(SB)/8, $0x0f45444342414007 + +GLOBL expandAVX512_56_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512_56_outShufHi+0x00(SB)/8, $0x11100d0c0b0a0908 +DATA expandAVX512_56_outShufHi+0x08(SB)/8, $0x1a19181615141312 +DATA expandAVX512_56_outShufHi+0x10(SB)/8, $0x232221201e1d1c1b +DATA expandAVX512_56_outShufHi+0x18(SB)/8, $0x2c2b2a2928262524 +DATA 
expandAVX512_56_outShufHi+0x20(SB)/8, $0x3534333231302e2d +DATA expandAVX512_56_outShufHi+0x28(SB)/8, $0x3e3d3c3b3a393836 +DATA expandAVX512_56_outShufHi+0x30(SB)/8, $0x0e46454443424140 +DATA expandAVX512_56_outShufHi+0x38(SB)/8, $0x50174c4b4a49480f + +TEXT expandAVX512_56<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_56_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_56_mat0<>(SB), Z3 + VMOVDQU64 expandAVX512_56_inShuf1<>(SB), Z4 + VMOVDQU64 expandAVX512_56_inShuf2<>(SB), Z5 + VMOVDQU64 expandAVX512_56_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_56_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z6 + VPERMB Z6, Z0, Z0 + VGF2P8AFFINEQB $0, Z3, Z0, Z0 + VPERMB Z6, Z4, Z4 + VGF2P8AFFINEQB $0, Z3, Z4, Z3 + VPERMB Z6, Z5, Z4 + VGF2P8AFFINEQB $0, expandAVX512_56_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512_60_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512_60_inShuf0<>+0x00(SB)/8, $0x0100000000000000 +DATA expandAVX512_60_inShuf0<>+0x08(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_60_inShuf0<>+0x10(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf0<>+0x18(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_60_inShuf0<>+0x28(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf0<>+0x30(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 + +GLOBL expandAVX512_60_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_60_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_60_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512_60_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512_60_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512_60_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512_60_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512_60_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512_60_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512_60_inShuf1<>(SB), RODATA, $0x40 +DATA 
expandAVX512_60_inShuf1<>+0x00(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf1<>+0x08(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512_60_inShuf1<>+0x18(SB)/8, $0xff00000000000000 +DATA expandAVX512_60_inShuf1<>+0x20(SB)/8, $0xffffffffff010101 +DATA expandAVX512_60_inShuf1<>+0x28(SB)/8, $0x0202020202010101 +DATA expandAVX512_60_inShuf1<>+0x30(SB)/8, $0xffffffffffff0201 +DATA expandAVX512_60_inShuf1<>+0x38(SB)/8, $0xff01010101010101 + +GLOBL expandAVX512_60_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512_60_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512_60_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512_60_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512_60_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512_60_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_60_mat1<>+0x28(SB)/8, $0x0101010101010101 +DATA expandAVX512_60_mat1<>+0x30(SB)/8, $0x0101010102020202 +DATA expandAVX512_60_mat1<>+0x38(SB)/8, $0x0202020202020202 + +GLOBL expandAVX512_60_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512_60_inShuf2<>+0x00(SB)/8, $0xff01010101010101 +DATA expandAVX512_60_inShuf2<>+0x08(SB)/8, $0xffffffffffffff01 +DATA expandAVX512_60_inShuf2<>+0x10(SB)/8, $0xff01010101010101 +DATA expandAVX512_60_inShuf2<>+0x18(SB)/8, $0xff01010101010101 +DATA expandAVX512_60_inShuf2<>+0x20(SB)/8, $0xffffffffffffff01 +DATA expandAVX512_60_inShuf2<>+0x28(SB)/8, $0xff01010101010101 +DATA expandAVX512_60_inShuf2<>+0x30(SB)/8, $0xff01010101010101 +DATA expandAVX512_60_inShuf2<>+0x38(SB)/8, $0xffffffffffffff01 + +GLOBL expandAVX512_60_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512_60_mat2<>+0x00(SB)/8, $0x0404040404040404 +DATA expandAVX512_60_mat2<>+0x08(SB)/8, $0x0404040408080808 +DATA expandAVX512_60_mat2<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512_60_mat2<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512_60_mat2<>+0x20(SB)/8, $0x1010101020202020 +DATA 
expandAVX512_60_mat2<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_60_mat2<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_60_mat2<>+0x38(SB)/8, $0x4040404080808080 + +GLOBL expandAVX512_60_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512_60_inShuf3<>+0x00(SB)/8, $0xff01010101010101 +DATA expandAVX512_60_inShuf3<>+0x08(SB)/8, $0xffffffffffff0202 +DATA expandAVX512_60_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512_60_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512_60_mat3<>+0x00(SB)/8, $0x8080808080808080 +DATA expandAVX512_60_mat3<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512_60_mat3<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512_60_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512_60_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_60_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_60_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_60_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_60_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_60_outShufLo+0x00(SB)/8, $0x0806050403020100 +DATA expandAVX512_60_outShufLo+0x08(SB)/8, $0x1816151413121110 +DATA expandAVX512_60_outShufLo+0x10(SB)/8, $0x28201e1d1c1b1a19 +DATA expandAVX512_60_outShufLo+0x18(SB)/8, $0x31302e2d2c2b2a29 +DATA expandAVX512_60_outShufLo+0x20(SB)/8, $0x4140383635343332 +DATA expandAVX512_60_outShufLo+0x28(SB)/8, $0x4a49484645444342 +DATA expandAVX512_60_outShufLo+0x30(SB)/8, $0x5a5958504e4d4c4b +DATA expandAVX512_60_outShufLo+0x38(SB)/8, $0x626160075e5d5c5b + +GLOBL expandAVX512_60_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512_60_outShufHi0+0x00(SB)/8, $0x3b3a3938302a2928 +DATA 
expandAVX512_60_outShufHi0+0x08(SB)/8, $0x44434241403e3d3c +DATA expandAVX512_60_outShufHi0+0x10(SB)/8, $0x5453525150484645 +DATA expandAVX512_60_outShufHi0+0x18(SB)/8, $0x5d5c5b5a59585655 +DATA expandAVX512_60_outShufHi0+0x20(SB)/8, $0x6d6c6b6a6968605e +DATA expandAVX512_60_outShufHi0+0x28(SB)/8, $0x767574737271706e +DATA expandAVX512_60_outShufHi0+0x30(SB)/8, $0xffffffffffffff78 +DATA expandAVX512_60_outShufHi0+0x38(SB)/8, $0x31ffff2f2e2d2c2b + +GLOBL expandAVX512_60_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512_60_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512_60_outShufHi1+0x30(SB)/8, $0x06050403020100ff +DATA expandAVX512_60_outShufHi1+0x38(SB)/8, $0xff0908ffffffffff + +TEXT expandAVX512_60<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_60_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_60_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_60_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512_60_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512_60_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512_60_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512_60_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512_60_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512_60_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512_60_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512_60_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0x9f01ffffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x60fe000000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512_64_inShuf0<>(SB), RODATA, $0x40 +DATA 
expandAVX512_64_inShuf0<>+0x00(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x08(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512_64_inShuf0<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512_64_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512_64_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512_64_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512_64_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512_64_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512_64_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512_64_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512_64_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512_64_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512_64_inShuf1<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x10(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x18(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x28(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512_64_inShuf1<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512_64_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512_64_outShufLo+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512_64_outShufLo+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512_64_outShufLo+0x10(SB)/8, $0x1716151413121110 +DATA expandAVX512_64_outShufLo+0x18(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512_64_outShufLo+0x20(SB)/8, $0x2726252423222120 +DATA 
expandAVX512_64_outShufLo+0x28(SB)/8, $0x2f2e2d2c2b2a2928 +DATA expandAVX512_64_outShufLo+0x30(SB)/8, $0x3736353433323130 +DATA expandAVX512_64_outShufLo+0x38(SB)/8, $0x3f3e3d3c3b3a3938 + +TEXT expandAVX512_64<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512_64_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512_64_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512_64_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512_64_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + diff --git a/src/internal/runtime/gc/scan/expand_amd64_test.go b/src/internal/runtime/gc/scan/expand_amd64_test.go new file mode 100644 index 0000000000..a8f5b88c5c --- /dev/null +++ b/src/internal/runtime/gc/scan/expand_amd64_test.go @@ -0,0 +1,19 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 + +package scan_test + +import ( + "internal/runtime/gc/scan" + "testing" +) + +func TestExpandAVX512(t *testing.T) { + if !scan.CanAVX512() { + t.Skip("no AVX512") + } + testExpand(t, scan.ExpandAVX512) +} diff --git a/src/internal/runtime/gc/scan/expand_reference.go b/src/internal/runtime/gc/scan/expand_reference.go new file mode 100644 index 0000000000..45446528d7 --- /dev/null +++ b/src/internal/runtime/gc/scan/expand_reference.go @@ -0,0 +1,39 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scan + +import ( + "internal/goarch" + "internal/runtime/gc" +) + +// ExpandReference is a reference implementation of an expander function +// that translates object mark bits into a bitmap of one bit per word of +// marked object, assuming the object is of the provided size class. 
+func ExpandReference(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) { + // Look up the size and derive the number of objects in a span. + // We're only concerned with small objects in single-page spans, + // and gc.PtrMask enforces this by being statically sized to + // accomodate only such spans. + size := uintptr(gc.SizeClassToSize[sizeClass]) + nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size + + // f is the expansion factor. For example, if our objects are of size 48, + // then each mark bit will translate into 6 (48/8 = 6) set bits in the + // pointer bitmap. + f := size / goarch.PtrSize + for i := range nObj { + // Check if the object is marked. + if packed[i/goarch.PtrBits]&(uintptr(1)<<(i%goarch.PtrBits)) == 0 { + continue + } + // Propagate that mark into the destination into one bit per the + // expansion factor f, offset to the object's offset within the span. + for j := range f { + b := i*f + j // i*f is the start bit for the object, j indexes into each corresponding word after. + unpacked[b/goarch.PtrBits] |= uintptr(1) << (b % goarch.PtrBits) + } + } +} diff --git a/src/internal/runtime/gc/scan/expand_test.go b/src/internal/runtime/gc/scan/expand_test.go new file mode 100644 index 0000000000..692817d8b2 --- /dev/null +++ b/src/internal/runtime/gc/scan/expand_test.go @@ -0,0 +1,37 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package scan_test + +import ( + "internal/goarch" + "internal/runtime/gc" + "internal/runtime/gc/scan" + "testing" +) + +type expandFunc func(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) + +func testExpand(t *testing.T, expF expandFunc) { + expR := scan.ExpandReference + + testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) { + var want, got gc.PtrMask + expR(sizeClass, objs, &want) + expF(sizeClass, objs, &got) + + for i := range want { + if got[i] != want[i] { + t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize) + if goarch.PtrSize == 4 { + t.Logf("got: %032b", got[i]) + t.Logf("want: %032b", want[i]) + } else { + t.Logf("got: %064b", got[i]) + t.Logf("want: %064b", want[i]) + } + } + } + }) +} diff --git a/src/internal/runtime/gc/scan/filter.go b/src/internal/runtime/gc/scan/filter.go new file mode 100644 index 0000000000..63cee9abf0 --- /dev/null +++ b/src/internal/runtime/gc/scan/filter.go @@ -0,0 +1,35 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scan + +import "unsafe" + +// FilterNil packs non-nil (non-zero) values in bufp together +// at the beginning of bufp, returning the length of the +// packed buffer. It treats bufp as an array of size n. +// +// TODO(mknyszek): Add a faster SIMD-based implementation. 
// FilterNil compacts the n-element array starting at bufp so that all
// non-zero values end up in a prefix of the array, and returns the length
// of that prefix. The relative order of the surviving values is not
// preserved: holes near the front are filled with values taken from the
// back.
//
// TODO(mknyszek): Add a faster SIMD-based implementation.
func FilterNil(bufp *uintptr, n int32) int32 {
	vals := unsafe.Slice(bufp, int(n))

	// head walks forward looking for a hole (zero); tail walks backward
	// looking for a value to plug it with.
	head, tail := 0, len(vals)-1
	for head < tail {
		switch {
		case vals[tail] == 0:
			// Trailing zero: simply drop it.
			tail--
		case vals[head] != 0:
			// Already in place.
			head++
		default:
			// vals[head] is a hole and vals[tail] is a value: move it.
			vals[head] = vals[tail]
			tail--
		}
	}

	// head and tail have met (or the input was empty). The element they
	// met on has not been classified yet.
	if tail >= 0 && vals[tail] == 0 {
		tail--
	}
	return int32(tail) + 1
}
{ + testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12}) + }) + t.Run("twoElimMulti", func(t *testing.T) { + testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12}) + }) + t.Run("Multi", func(t *testing.T) { + testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5}) + }) +} + +func testFilterNil(t *testing.T, buf, want []uintptr) { + var bufp *uintptr + if len(buf) != 0 { + bufp = &buf[0] + } + n := scan.FilterNil(bufp, int32(len(buf))) + if n > int32(len(buf)) { + t.Errorf("bogus new length returned: %d > %d", n, len(buf)) + return + } + buf = buf[:n] + if len(buf) != len(want) { + t.Errorf("lengths differ: got %d, want %d", len(buf), len(want)) + } + + wantMap := make(map[uintptr]int) + gotMap := make(map[uintptr]int) + for _, p := range want { + wantMap[p]++ + } + for _, p := range buf { + gotMap[p]++ + } + for p, nWant := range wantMap { + if nGot, ok := gotMap[p]; !ok { + t.Errorf("want %d, but missing from output", p) + } else if nGot != nWant { + t.Errorf("want %d copies of %d, but got %d", nWant, p, nGot) + } + } + for p := range gotMap { + if _, ok := wantMap[p]; !ok { + t.Errorf("got %d, but didn't want it", p) + } + } + t.Logf("got: %v", buf) + t.Logf("want: %v", want) +} diff --git a/src/internal/runtime/gc/scan/mem_nounix_test.go b/src/internal/runtime/gc/scan/mem_nounix_test.go new file mode 100644 index 0000000000..f4d21d8a85 --- /dev/null +++ b/src/internal/runtime/gc/scan/mem_nounix_test.go @@ -0,0 +1,16 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build !unix + +package scan_test + +import ( + "testing" +) + +func makeMem(t testing.TB, nPages int) ([]uintptr, func()) { + t.Skip("mmap unsupported") + return nil, nil +} diff --git a/src/internal/runtime/gc/scan/mem_unix_test.go b/src/internal/runtime/gc/scan/mem_unix_test.go new file mode 100644 index 0000000000..03f0bd5dd0 --- /dev/null +++ b/src/internal/runtime/gc/scan/mem_unix_test.go @@ -0,0 +1,25 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build unix + +package scan_test + +import ( + "internal/runtime/gc" + "syscall" + "testing" + "unsafe" +) + +func makeMem(t testing.TB, nPages int) ([]uintptr, func()) { + mem, err := syscall.Mmap(-1, 0, int(gc.PageSize*nPages), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE|syscall.MAP_ANON) + if err != nil { + t.Fatalf("mmap failed: %s", err) + } + free := func() { + syscall.Munmap(mem) + } + return unsafe.Slice((*uintptr)(unsafe.Pointer(unsafe.SliceData(mem))), len(mem)/8), free +} diff --git a/src/internal/runtime/gc/scan/mkasm.go b/src/internal/runtime/gc/scan/mkasm.go new file mode 100644 index 0000000000..e36defb2e1 --- /dev/null +++ b/src/internal/runtime/gc/scan/mkasm.go @@ -0,0 +1,412 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ignore + +package main + +import ( + "bytes" + "fmt" + "io" + "log" + "os" + "slices" + "strconv" + + "internal/runtime/gc" + "internal/runtime/gc/internal/gen" +) + +const header = "// Code generated by mkasm.go. 
DO NOT EDIT.\n\n" + +func main() { + generate("expand_amd64.s", genExpanders) +} + +func generate(fileName string, genFunc func(*gen.File)) { + var buf bytes.Buffer + tee := io.MultiWriter(&buf, os.Stdout) + + file := gen.NewFile(tee) + + genFunc(file) + + fmt.Fprintf(tee, header) + file.Compile() + + f, err := os.Create(fileName) + if err != nil { + log.Fatal(err) + } + defer f.Close() + _, err = f.Write(buf.Bytes()) + if err != nil { + log.Fatal(err) + } +} + +func genExpanders(file *gen.File) { + gcExpandersAVX512 := make([]*gen.Func, len(gc.SizeClassToSize)) + for sc, ob := range gc.SizeClassToSize { + if gc.SizeClassToNPages[sc] != 1 { + // These functions all produce a bitmap that covers exactly one + // page. + continue + } + if ob > gc.MinSizeForMallocHeader { + // This size class is too big to have a packed pointer/scalar bitmap. + break + } + + xf := int(ob) / 8 + log.Printf("size class %d bytes, expansion %dx", ob, xf) + + fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf)) + ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn) + + if xf == 1 { + expandIdentity(ptrObjBits) + } else { + ok := gfExpander(xf, ptrObjBits) + if !ok { + log.Printf("failed to generate expander for size class %d", sc) + } + } + file.AddFunc(fn) + gcExpandersAVX512[sc] = fn + } + + // Generate table mapping size class to expander PC + file.AddConst("·gcExpandersAVX512", gcExpandersAVX512) +} + +// mat8x8 is an 8x8 bit matrix. +type mat8x8 struct { + mat [8]uint8 +} + +func matGroupToVec(mats *[8]mat8x8) [8]uint64 { + var out [8]uint64 + for i, mat := range mats { + for j, row := range mat.mat { + // For some reason, Intel flips the rows. + out[i] |= uint64(row) << ((7 - j) * 8) + } + } + return out +} + +// expandIdentity implements 1x expansion (that is, no expansion). 
// expandIdentity implements 1x expansion (that is, no expansion): it returns
// the vector at ptrObjBits and the vector at ptrObjBits.AddConst(64)
// unchanged as the low and high halves of the result.
func expandIdentity(ptrObjBits gen.Ptr[gen.Uint8x64]) {
	objBitsLo := gen.Deref(ptrObjBits)
	objBitsHi := gen.Deref(ptrObjBits.AddConst(64))
	gen.Return(objBitsLo, objBitsHi)
}

// gfExpander produces a function that expands each bit in an input bitmap into
// f consecutive bits in an output bitmap.
//
// The input is
//
//	AX *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
//	Z1 [64]uint8 = The bottom 512 bits of the expanded bitmap
//	Z2 [64]uint8 = The top 512 bits of the expanded bitmap
//
// TODO(austin): This should be Z0/Z1.
//
// gfExpander reports whether generation succeeded. It returns false (after
// logging the reason) if an output byte would need bits from two different
// input bytes, if f <= 8 and the super-groups do not all share the same
// matrix vector, or if the final shuffle needs more inputs than genShuffle
// supports.
func gfExpander(f int, ptrObjBits gen.Ptr[gen.Uint8x64]) bool {
	// TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.

	// TODO(austin): For f >= 8, I suspect there are better ways to do this.
	//
	// For example, we could use a mask expansion to get a full byte for each
	// input bit, and separately create the bytes that blend adjacent bits, then
	// shuffle those bytes together. Certainly for f >= 16 this makes sense
	// because each of those bytes will be used, possibly more than once.

	objBits := gen.Deref(ptrObjBits)

	// A term describes how to compute one output byte: take input byte
	// iByte, multiply it by mat, and store the result in output byte oByte.
	type term struct {
		iByte, oByte int
		mat          mat8x8
	}
	var terms []term

	// Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
	// the output byte from the appropriate input byte. Gather all of these into
	// "terms".
	for oByte := 0; oByte < 1024/8; oByte++ {
		var byteMat mat8x8
		iByte := -1
		for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
			iBit := oBit / f
			if iByte == -1 {
				iByte = iBit / 8
			} else if iByte != iBit/8 {
				log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
				return false
			}
			// One way to view this is that the i'th row of the matrix will be
			// ANDed with the input byte, and the parity of the result will set
			// the i'th bit in the output. We use a simple 1 bit mask, so the
			// parity is irrelevant beyond selecting out that one bit.
			byteMat.mat[oBit%8] = 1 << (iBit % 8)
		}
		terms = append(terms, term{iByte, oByte, byteMat})
	}

	// Debugging aid, disabled by default.
	if false {
		// Print input byte -> output byte as a matrix
		maxIByte, maxOByte := 0, 0
		for _, term := range terms {
			maxIByte = max(maxIByte, term.iByte)
			maxOByte = max(maxOByte, term.oByte)
		}
		iToO := make([][]rune, maxIByte+1)
		for i := range iToO {
			iToO[i] = make([]rune, maxOByte+1)
		}
		matMap := make(map[mat8x8]int)
		for _, term := range terms {
			i, ok := matMap[term.mat]
			if !ok {
				i = len(matMap)
				matMap[term.mat] = i
			}
			iToO[term.iByte][term.oByte] = 'A' + rune(i)
		}
		for o := range maxOByte + 1 {
			fmt.Printf("%d", o)
			for i := range maxIByte + 1 {
				fmt.Printf(",")
				if mat := iToO[i][o]; mat != 0 {
					fmt.Printf("%c", mat)
				}
			}
			fmt.Println()
		}
	}

	// In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
	// and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
	//
	//	abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
	//	mat0     mat1     mat2     mat3     mat4     mat5     mat6     mat7

	// Group the terms by matrix, but limit each group to 8 terms.
	const termsPerGroup = 8       // Number of terms we can multiply by the same matrix.
	const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.

	// matMap maps a matrix to the index of its currently open (not yet
	// full) term group; allMats records every distinct matrix seen.
	matMap := make(map[mat8x8]int)
	allMats := make(map[mat8x8]bool)
	var termGroups [][]term
	for _, term := range terms {
		allMats[term.mat] = true

		i, ok := matMap[term.mat]
		if ok && f > groupsPerSuperGroup {
			// The output is ultimately produced in two [64]uint8 registers.
			// Getting every byte in the right place of each of these requires a
			// final permutation that often requires more than one source.
			//
			// Up to 8x expansion, we can get a really nice grouping so we can use
			// the same 8 matrix vector several times, without producing
			// permutations that require more than two sources.
			//
			// Above 8x, however, we can't get nice matrixes anyway, so we
			// instead prefer reducing the complexity of the permutations we
			// need to produce the final outputs. To do this, avoid grouping
			// together terms that are split across the two registers.
			outRegister := termGroups[i][0].oByte / 64
			if term.oByte/64 != outRegister {
				ok = false
			}
		}
		if !ok {
			// Start a new term group.
			i = len(termGroups)
			matMap[term.mat] = i
			termGroups = append(termGroups, nil)
		}

		termGroups[i] = append(termGroups[i], term)

		if len(termGroups[i]) == termsPerGroup {
			// This term group is full.
			delete(matMap, term.mat)
		}
	}

	for i, termGroup := range termGroups {
		log.Printf("term group %d:", i)
		for _, term := range termGroup {
			log.Printf(" %+v", term)
		}
	}

	// We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
	// as many term groups as we can into each super-group to minimize the
	// number of matrix multiplies.
	//
	// Ideally, we use the same matrix in each super-group, which might mean
	// doing fewer than 8 multiplies at a time. That's fine because it never
	// increases the total number of matrix multiplies.
	//
	// TODO: Packing the matrixes less densely may let us use more broadcast
	// loads instead of general permutations, though. That replaces a load of
	// the permutation with a load of the matrix, but is probably still slightly
	// better.
	var sgSize, nSuperGroups int
	oneMatVec := f <= groupsPerSuperGroup
	if oneMatVec {
		// We can use the same matrix in each multiply by doing sgSize
		// multiplies at a time.
		sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
		nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
	} else {
		// We can't use the same matrix for each multiply. Just do as many at a
		// time as we can.
		//
		// TODO: This is going to produce several distinct matrixes, when we
		// probably only need two. Be smarter about how we create super-groups
		// in this case. Maybe we build up an array of super-groups and then the
		// loop below just turns them into ops?
		sgSize = 8
		nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
	}

	// Construct each super-group.
	var matGroup [8]mat8x8
	var matMuls []gen.Uint8x64
	// perm[o] is the flat index (super-group, group, slot) of the matrix
	// multiply result byte that becomes output byte o.
	var perm [128]int
	for sgi := range nSuperGroups {
		var iperm [64]uint8
		for i := range iperm {
			iperm[i] = 0xff // "Don't care"
		}
		// Pick off sgSize term groups.
		superGroup := termGroups[:min(len(termGroups), sgSize)]
		termGroups = termGroups[len(superGroup):]
		// Build the matrix and permutations for this super-group.
		var thisMatGroup [8]mat8x8
		for i, termGroup := range superGroup {
			// All terms in this group have the same matrix. Pick one.
			thisMatGroup[i] = termGroup[0].mat
			for j, term := range termGroup {
				// Build the input permutation.
				iperm[i*termsPerGroup+j] = uint8(term.iByte)
				// Build the output permutation.
				perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
			}
		}
		log.Printf("input permutation %d: %v", sgi, iperm)

		// Check that we're not making more distinct matrixes than expected.
		if oneMatVec {
			if sgi == 0 {
				matGroup = thisMatGroup
			} else if matGroup != thisMatGroup {
				log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
				return false
			}
		}

		// Emit matrix op.
		matConst := gen.ConstUint64x8(matGroupToVec(&thisMatGroup), fmt.Sprintf("*_mat%d<>", sgi))
		inOp := objBits.Shuffle(gen.ConstUint8x64(iperm, fmt.Sprintf("*_inShuf%d<>", sgi)))
		matMul := matConst.GF2P8Affine(inOp)
		matMuls = append(matMuls, matMul)
	}

	log.Printf("output permutation: %v", perm)

	// Shuffle the matrix multiply results into the low and high 64 output
	// bytes.
	outLo, ok := genShuffle("*_outShufLo", (*[64]int)(perm[:64]), matMuls...)
	if !ok {
		log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
		return false
	}
	outHi, ok := genShuffle("*_outShufHi", (*[64]int)(perm[64:]), matMuls...)
	if !ok {
		log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
		return false
	}
	gen.Return(outLo, outHi)

	return true
}

// genShuffle emits the operations that assemble one 64-byte output vector
// from args according to perm. Entry perm[i] selects byte perm[i]%64 of
// args[perm[i]/64] for output byte i. name is used for the shuffle constant
// (with "0" and "1" appended when two constants are needed).
//
// Up to four distinct source vectors are supported. With more than four,
// genShuffle returns args[0] and false.
func genShuffle(name string, perm *[64]int, args ...gen.Uint8x64) (gen.Uint8x64, bool) {
	// Construct flattened permutation.
	var vperm [64]byte

	// Get the inputs used by this permutation.
	var inputs []int
	for i, src := range perm {
		inputIdx := slices.Index(inputs, src/64)
		if inputIdx == -1 {
			inputIdx = len(inputs)
			inputs = append(inputs, src/64)
		}
		// Low 6 bits: byte within the source; upper bits: which source.
		vperm[i] = byte(src%64 | (inputIdx << 6))
	}

	// Emit instructions for easy cases.
	switch len(inputs) {
	case 1:
		constOp := gen.ConstUint8x64(vperm, name)
		return args[inputs[0]].Shuffle(constOp), true
	case 2:
		constOp := gen.ConstUint8x64(vperm, name)
		return args[inputs[0]].Shuffle2(args[inputs[1]], constOp), true
	}

	// Harder case, we need to shuffle in from up to 2 more tables.
	//
	// Perform two shuffles. One shuffle will get its data from the first
	// two inputs, the other shuffle will get its data from the other one
	// or two inputs. All values that a given shuffle doesn't care about
	// will be zeroed.
	var vperms [2][64]byte
	var masks [2]uint64
	for j, idx := range vperm {
		for i := range vperms {
			vperms[i][j] = 0xff // "Don't care"
		}
		// NOTE(review): every vperm entry is assigned above, and with four
		// inputs byte 63 of the fourth input also encodes as 0xff. Such an
		// entry would be skipped here and trip the mask check below —
		// confirm whether this sentinel test is needed.
		if idx == 0xff {
			continue
		}
		vperms[idx/128][j] = idx % 128
		masks[idx/128] |= uint64(1) << j
	}

	// Validate that the masks are fully disjoint.
	if masks[0]^masks[1] != ^uint64(0) {
		panic("bad shuffle!")
	}

	// Generate constants.
	constOps := make([]gen.Uint8x64, len(vperms))
	for i, v := range vperms {
		constOps[i] = gen.ConstUint8x64(v, name+strconv.Itoa(i))
	}

	// Generate shuffles.
	switch len(inputs) {
	case 3:
		r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
		r1 := args[inputs[2]].ShuffleZeroed(constOps[1], gen.ConstMask64(masks[1]))
		return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
	case 4:
		r0 := args[inputs[0]].Shuffle2Zeroed(args[inputs[1]], constOps[0], gen.ConstMask64(masks[0]))
		r1 := args[inputs[2]].Shuffle2Zeroed(args[inputs[3]], constOps[1], gen.ConstMask64(masks[1]))
		return r0.ToUint64x8().Or(r1.ToUint64x8()).ToUint8x64(), true
	}

	// Too many inputs. To support more, we'd need to separate tables much earlier.
	// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
	return args[0], false
}
+ +package scan + +import ( + "internal/cpu" + "internal/runtime/gc" + "unsafe" +) + +func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + if CanAVX512() { + return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask) + } + panic("not implemented") +} + +func HasFastScanSpanPacked() bool { + return avx512ScanPackedReqsMet +} + +// -- AVX512 -- + +func CanAVX512() bool { + return avx512ScanPackedReqsMet +} + +func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)) +} + +//go:noescape +func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) + +var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL && + cpu.X86.HasAVX512BW && + cpu.X86.HasGFNI && + cpu.X86.HasAVX512BITALG && + cpu.X86.HasAVX512VBMI diff --git a/src/internal/runtime/gc/scan/scan_amd64.s b/src/internal/runtime/gc/scan/scan_amd64.s new file mode 100644 index 0000000000..055995fa38 --- /dev/null +++ b/src/internal/runtime/gc/scan/scan_amd64.s @@ -0,0 +1,103 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// Test-only. 
// ExpandAVX512 loads sizeClass into CX and packed into AX, calls through
// entry sizeClass of the gcExpandersAVX512 table, and stores the resulting
// Z1 and Z2 registers as 128 contiguous bytes at unpacked.
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
	MOVQ sizeClass+0(FP), CX
	MOVQ packed+8(FP), AX

	// Call the expander for this size class
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)

	MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
	VMOVDQU64 Z1, 0(DI)
	VMOVDQU64 Z2, 64(DI)
	VZEROUPPER
	RET

// scanSpanPackedAVX512 is the AVX512 scan kernel.
//
// Arguments (frame offsets): mem+0, bufp+8, objMarks+16, sizeClass+24,
// ptrMask+32; the 32-bit result is written to count+40.
//
// The 256-byte frame holds the 128-byte scan mask at 0(SP) and the 128
// per-byte popcounts of that mask at 128(SP).
//
// The frame load below uses VMOVDQA64, so mem is expected to be 64-byte
// aligned.
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
	// Z1+Z2 = Expand the grey object mask into a grey word mask
	MOVQ objMarks+16(FP), AX
	MOVQ sizeClass+24(FP), CX
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)

	// Z3+Z4 = Load the pointer mask
	MOVQ ptrMask+32(FP), AX
	VMOVDQU64 0(AX), Z3
	VMOVDQU64 64(AX), Z4

	// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
	VPANDQ Z1, Z3, Z1
	VPANDQ Z2, Z4, Z2

	// Now each bit of Z1+Z2 represents one word of the span.
	// Thus, each byte covers 64 bytes of memory, which is also how
	// much we can fit in a Z register.
	//
	// We do a load/compress for each 64 byte frame.
	//
	// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
	VPOPCNTB Z1, Z3 // Requires BITALG
	VPOPCNTB Z2, Z4

	// Store the scan mask and word counts at 0(SP) and 128(SP).
	//
	// TODO: Is it better to read directly from the registers?
	VMOVDQU64 Z1, 0(SP)
	VMOVDQU64 Z2, 64(SP)
	VMOVDQU64 Z3, 128(SP)
	VMOVDQU64 Z4, 192(SP)

	// SI = Current address in span
	MOVQ mem+0(FP), SI
	// DI = Scan buffer base
	MOVQ bufp+8(FP), DI
	// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
	MOVQ $0, DX

	// AX = address in scan mask, 128(AX) = address in popcount
	LEAQ 0(SP), AX

	// Loop over the 64 byte frames in this span.
	// BX = 1 past the end of the scan mask
	LEAQ 128(SP), BX

	// Align loop to a cache line so that performance is less sensitive
	// to how this function ends up laid out in memory. This is a hot
	// function in the GC, and this is a tight loop. We don't want
	// performance to waver wildly due to unrelated changes.
	PCALIGN $64
loop:
	// CX = Fetch the mask of words to load from this frame.
	MOVBQZX 0(AX), CX
	// Skip empty frames.
	TESTQ CX, CX
	JZ skip

	// Load the 64 byte frame.
	KMOVB CX, K1
	VMOVDQA64 0(SI), Z1

	// Collect just the pointers from the greyed objects into the scan buffer,
	// i.e., copy the word indices in the mask from Z1 into contiguous memory.
	VPCOMPRESSQ Z1, K1, (DI)(DX*8)
	// Advance the scan buffer position by the number of pointers.
	MOVBQZX 128(AX), CX
	ADDQ CX, DX

skip:
	// Advance to the next 64 byte frame and the next mask byte, whether or
	// not this frame contributed anything.
	ADDQ $64, SI
	ADDQ $1, AX
	CMPQ AX, BX
	JB loop

end:
	// Return the number of words written to the scan buffer.
	MOVL DX, count+40(FP)
	VZEROUPPER
	RET
// ScanSpanPackedGo is an optimized pure Go implementation of ScanSpanPacked.
//
// For every object whose bit is set in objMarks it visits each word of that
// object in mem (objects are gc.SizeClassToSize[sizeClass] bytes long) and
// appends the word's value to the buffer at bufp if the word's bit is set in
// ptrMask and the value is at least 4096. It returns the number of values
// appended.
//
// Values are written to bufp speculatively, so bufp must have room for one
// write per visited word; the function panics if gc.PageWords or more values
// were appended.
func ScanSpanPackedGo(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
	buf := newUnsafeBuf(bufp)
	objBytes := uintptr(gc.SizeClassToSize[sizeClass])
	// TODO(austin): Trim objMarks to the number of objects in this size class?
	for markI, markWord := range objMarks {
		// Visit the set bits of markWord from lowest to highest, clearing
		// each one as it is consumed.
		for range sys.OnesCount64(uint64(markWord)) {
			bitI := sys.TrailingZeros64(uint64(markWord))
			markWord &^= 1 << bitI

			objIndex := markI*goarch.PtrBits + bitI

			// objStartInSpan is the index of the word from mem where the
			// object starts. objEndInSpan points to the next object, i.e.
			// it's an exclusive upper bound.
			objStartInSpan := objBytes * uintptr(objIndex) / goarch.PtrSize
			objEndInSpan := objStartInSpan + objBytes/goarch.PtrSize

			// TODO: Another way to do this would be to extract the pointer mask
			// for this object (it's at most 64 bits) and do a bit iteration
			// over that.

			for wordI := objStartInSpan; wordI < objEndInSpan; wordI++ {
				val := *(*uintptr)(unsafe.Add(mem, wordI*goarch.PtrSize))
				// Check if we should enqueue this word.
				//
				// We load the word before the check because, even though this
				// can lead to loading much more than necessary, it's faster.
				// Most likely this is because it warms up the hardware
				// prefetcher much better, and gives us more time before we need
				// the value.
				//
				// We discard values that can't possibly be useful pointers
				// here, too, because this filters out a lot of words and does
				// so with as little processing as possible.
				//
				// TODO: This is close to, but not entirely branchless.
				isPtr := bool2int(ptrMask[wordI/goarch.PtrBits]&(1<<(wordI%goarch.PtrBits)) != 0)
				isNonNil := bool2int(val >= 4096)
				pred := isPtr&isNonNil != 0
				buf.addIf(val, pred)
			}
		}
	}
	// We don't know the true size of bufp, but we can at least catch obvious errors
	// in this function by making sure we didn't write more than gc.PageWords pointers
	// into the buffer.
	buf.check(gc.PageWords)
	return int32(buf.n)
}
// unsafeBuf appends values to caller-provided memory without bounds checks or
// branches. base is the first element of that memory and n is the number of
// values appended so far.
type unsafeBuf[T any] struct {
	base *T
	n    int
}

// newUnsafeBuf returns an empty unsafeBuf that writes starting at base.
func newUnsafeBuf[T any](base *T) unsafeBuf[T] {
	return unsafeBuf[T]{base: base}
}

// addIf stores val in the next free slot and keeps it only when pred is true.
//
// The store happens unconditionally; pred only decides whether the length
// advances. The caller must therefore guarantee that slot n is within the
// real capacity of the underlying memory, even when pred is false.
func (b *unsafeBuf[T]) addIf(val T, pred bool) {
	offset := b.n * int(unsafe.Sizeof(val))
	slot := (*T)(unsafe.Add(unsafe.Pointer(b.base), offset))
	*slot = val
	b.n += bool2int(pred)
}

// check is the deferred bounds check for a batch of addIf calls: it panics if
// the buffer's length has reached cap. Run it soon after the appends so that
// misuse is detected as early as possible; a single check covering many
// appends keeps the append path cheap while still catching overruns.
func (b unsafeBuf[T]) check(cap int) {
	// A length equal to cap is rejected as well, since addIf always writes
	// to slot n before deciding whether to keep the value.
	if b.n < cap {
		return
	}
	panic("unsafeBuf overflow")
}

// bool2int converts a bool to 0 or 1.
func bool2int(x bool) int {
	// Keep this exact shape: it is a pattern the compiler optimizes.
	var v int
	if x {
		v = 1
	}
	return v
}
+// +// Concretely, ScanScanPacked functions read pointers from mem, assumed to be gc.PageSize-aligned and gc.PageSize in size, +// and writes them to bufp, which is large enough to guarantee that even if pointer-word of mem is a pointer, it will fit. +// Therefore bufp, is always at least gc.PageSize in size. +// +// ScanSpanPacked is supposed to identify pointers by first filtering words by objMarks, where each bit of the mask +// represents gc.SizeClassToSize[sizeClass] bytes of memory, and then filtering again by the bits in ptrMask. +func ScanSpanPackedReference(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + buf := unsafe.Slice(bufp, gc.PageWords) + expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize + for word := range gc.PageWords { + objI := uintptr(word) / expandBy + if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 { + continue + } + if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 { + continue + } + ptr := *(*uintptr)(unsafe.Add(mem, word*goarch.PtrSize)) + if ptr == 0 { + continue + } + buf[count] = ptr + count++ + } + return count +} diff --git a/src/internal/runtime/gc/scan/scan_test.go b/src/internal/runtime/gc/scan/scan_test.go new file mode 100644 index 0000000000..9b577155ed --- /dev/null +++ b/src/internal/runtime/gc/scan/scan_test.go @@ -0,0 +1,254 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package scan_test + +import ( + "fmt" + "internal/cpu" + "internal/goarch" + "internal/runtime/gc" + "internal/runtime/gc/scan" + "math/bits" + "math/rand/v2" + "slices" + "sync" + "testing" + "unsafe" +) + +type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) + +func testScanSpanPacked(t *testing.T, scanF scanFunc) { + scanR := scan.ScanSpanPackedReference + + // Construct a fake memory + mem, free := makeMem(t, 1) + defer free() + for i := range mem { + // Use values > heap.PageSize because a scan function can discard + // pointers smaller than this. + mem[i] = uintptr(int(gc.PageSize) + i + 1) + } + + // Construct a random pointer mask + rnd := rand.New(rand.NewPCG(42, 42)) + var ptrs gc.PtrMask + for i := range ptrs { + ptrs[i] = uintptr(rnd.Uint64()) + } + + bufF := make([]uintptr, gc.PageWords) + bufR := make([]uintptr, gc.PageWords) + testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) { + nF := scanF(unsafe.Pointer(&mem[0]), &bufF[0], objs, uintptr(sizeClass), &ptrs) + nR := scanR(unsafe.Pointer(&mem[0]), &bufR[0], objs, uintptr(sizeClass), &ptrs) + + if nR != nF { + t.Errorf("want %d count, got %d", nR, nF) + } else if !slices.Equal(bufF[:nF], bufR[:nR]) { + t.Errorf("want scanned pointers %d, got %d", bufR[:nR], bufF[:nF]) + } + }) +} + +func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) { + for sizeClass := range gc.NumSizeClasses { + if sizeClass == 0 { + continue + } + size := uintptr(gc.SizeClassToSize[sizeClass]) + if size > gc.MinSizeForMallocHeader { + break // Pointer/scalar metadata is not packed for larger sizes. + } + t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) { + // Scan a few objects near i to test boundary conditions. 
+ const objMask = 0x101 + nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size + for i := range nObj - uintptr(bits.Len(objMask)-1) { + t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) { + var objs gc.ObjMask + objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits) + f(t, sizeClass, &objs) + }) + } + }) + } +} + +var dataCacheSizes = sync.OnceValue(func() []uintptr { + cs := cpu.DataCacheSizes() + for i, c := range cs { + fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize) + } + return cs +}) + +func BenchmarkScanSpanPacked(b *testing.B) { + benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses) +} + +func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) { + cacheSizes := dataCacheSizes() + b.Run("cache=tiny/pages=1", func(b *testing.B) { + fn(b, 1) + }) + for i, cacheBytes := range cacheSizes { + pages := int(cacheBytes*3/4) / gc.PageSize + b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) { + fn(b, pages) + }) + } + ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize + b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) { + fn(b, ramPages) + }) +} + +func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) { + for sc := range gc.NumSizeClasses { + if sc == 0 { + continue + } + if sc >= gc.MinSizeForMallocHeader { + break + } + b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) { + benchmarkScanSpanPacked(b, nPages, sc) + }) + } +} + +func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) { + rnd := rand.New(rand.NewPCG(42, 42)) + + // Construct a fake memory + mem, free := makeMem(b, nPages) + defer free() + for i := range mem { + // Use values > heap.PageSize because a scan function can discard + // pointers smaller than this. 
+ mem[i] = uintptr(int(gc.PageSize) + i + 1) + } + + // Construct a random pointer mask + ptrs := make([]gc.PtrMask, nPages) + for i := range ptrs { + for j := range ptrs[i] { + ptrs[i][j] = uintptr(rnd.Uint64()) + } + } + + // Visit the pages in a random order + pageOrder := rnd.Perm(nPages) + + // Create the scan buffer. + buf := make([]uintptr, gc.PageWords) + + // Sweep from 0 marks to all marks. We'll use the same marks for each page + // because I don't think that predictability matters. + objBytes := uintptr(gc.SizeClassToSize[sizeClass]) + nObj := gc.PageSize / objBytes + markOrder := rnd.Perm(int(nObj)) + const steps = 11 + for i := 0; i < steps; i++ { + frac := float64(i) / float64(steps-1) + // Set frac marks. + nMarks := int(float64(len(markOrder))*frac + 0.5) + var objMarks gc.ObjMask + for _, mark := range markOrder[:nMarks] { + objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits) + } + greyClusters := 0 + for page := range ptrs { + greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page]) + } + + // Report MB/s of how much memory they're actually hitting. This assumes + // 64 byte cache lines (TODO: Should it assume 128 byte cache lines?) + // and expands each access to the whole cache line. This is useful for + // comparing against memory bandwidth. + // + // TODO: Add a benchmark that just measures single core memory bandwidth + // for comparison. (See runtime memcpy benchmarks.) + // + // TODO: Should there be a separate measure where we don't expand to + // cache lines? 
+ avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs)) + + b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) { + b.Run("impl=Reference", func(b *testing.B) { + b.SetBytes(avgBytes) + for i := range b.N { + page := pageOrder[i%len(pageOrder)] + scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page]) + } + }) + b.Run("impl=Go", func(b *testing.B) { + b.SetBytes(avgBytes) + for i := range b.N { + page := pageOrder[i%len(pageOrder)] + scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page]) + } + }) + if scan.HasFastScanSpanPacked() { + b.Run("impl=Platform", func(b *testing.B) { + b.SetBytes(avgBytes) + for i := range b.N { + page := pageOrder[i%len(pageOrder)] + scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page]) + } + }) + } + }) + } +} + +func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int { + clusters := 0 + lastCluster := -1 + + expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize + for word := range gc.PageWords { + objI := uintptr(word) / expandBy + if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 { + continue + } + if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 { + continue + } + c := word * 8 / goarch.PtrBits + if c != lastCluster { + lastCluster = c + clusters++ + } + } + return clusters +} + +func BenchmarkScanMaxBandwidth(b *testing.B) { + // Measure the theoretical "maximum" bandwidth of scanning by reproducing + // the memory access pattern of a full page scan, but using memcpy as the + // kernel instead of scanning. 
+ benchmarkCacheSizes(b, func(b *testing.B, heapPages int) { + mem, free := makeMem(b, heapPages) + defer free() + for i := range mem { + mem[i] = uintptr(int(gc.PageSize) + i + 1) + } + buf := make([]uintptr, gc.PageWords) + + // Visit the pages in a random order + rnd := rand.New(rand.NewPCG(42, 42)) + pageOrder := rnd.Perm(heapPages) + + b.SetBytes(int64(gc.PageSize)) + + b.ResetTimer() + for i := range b.N { + page := pageOrder[i%len(pageOrder)] + copy(buf, mem[gc.PageWords*page:]) + } + }) +} diff --git a/src/internal/runtime/gc/sizeclasses.go b/src/internal/runtime/gc/sizeclasses.go index d2cca1cef1..e5d562f943 100644 --- a/src/internal/runtime/gc/sizeclasses.go +++ b/src/internal/runtime/gc/sizeclasses.go @@ -82,14 +82,15 @@ package gc // 8192 13 32768 const ( - MinHeapAlign = 8 - MaxSmallSize = 32768 - SmallSizeDiv = 8 - SmallSizeMax = 1024 - LargeSizeDiv = 128 - NumSizeClasses = 68 - PageShift = 13 - MaxObjsPerSpan = 1024 + MinHeapAlign = 8 + MaxSmallSize = 32768 + SmallSizeDiv = 8 + SmallSizeMax = 1024 + LargeSizeDiv = 128 + NumSizeClasses = 68 + PageShift = 13 + MaxObjsPerSpan = 1024 + MaxSizeClassNPages = 10 ) var SizeClassToSize = [NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768} -- 2.51.0