[dev.simd] internal/runtime/gc: add simd package based greentea kernels
author    Junyang Shao <shaojunyang@google.com>
          Sun, 9 Mar 2025 17:19:48 +0000 (17:19 +0000)
committer Junyang Shao <shaojunyang@google.com>
          Fri, 21 Nov 2025 21:14:21 +0000 (13:14 -0800)
This CL adds a new generator to internal/runtime/gc/scan that emits expander
kernels written in Go SIMD. It also adds a Go SIMD scan kernel and a
Go SIMD filter kernel.

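For orientation, the expander kernels take a bitmap with one mark bit per
object and replicate each bit across every word of that object, so kernel n
serves the size class whose objects span n words. A minimal scalar sketch of
that mapping, under that reading of the generated code (the function below is
illustrative only, not part of the package; the deleted assembly kernels
further down do the same work a vector at a time with VPERMB and
VGF2P8AFFINEQB):

    // expandScalar models what an expander kernel computes: input bit i
    // (one bit per object) is replicated into objWords consecutive output
    // bits, one per word of the object.
    func expandScalar(dst, src []uint64, objWords int) {
        for i := 0; i < len(src)*64; i++ {
            if src[i/64]&(1<<(i%64)) == 0 {
                continue
            }
            for w := 0; w < objWords; w++ {
                j := i*objWords + w
                dst[j/64] |= 1 << (j % 64)
            }
        }
    }
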
This CL also includes the plumbing: the Go SIMD kernels are used when
goexperiment.simd is on.

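The kernel selection is a build-time decision, made with the usual
goexperiment build-tag pattern and enabled by building with GOEXPERIMENT=simd.
A sketch of that plumbing, using the file names from the list below; the
constant is hypothetical and only illustrates how the two sibling files keep
the rest of the package unaware of which kernels are linked in:

    //go:build goexperiment.simd

    // scan_simd_amd64.go (sketch): compiled only when GOEXPERIMENT=simd is
    // set. The sibling scan_nosimd_amd64.go carries the inverse constraint,
    // //go:build !goexperiment.simd, and keeps the pre-existing kernels, so
    // callers never need a run-time check.
    package scan

    // useSIMDKernels is a hypothetical compile-time marker recording which
    // kernel family this build of the package uses.
    const useSIMDKernels = true
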
Benchmark results:
...
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=80-88     354.8n ±  1%   272.4n ±  0%  -23.22% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=90-88     375.7n ±  0%   287.1n ±  0%  -23.58% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=100-88    450.0n ±  1%   327.4n ±  0%  -27.24% (p=0.002 n=6)
geomean                                                      246.5n         199.4n        -19.10%

Throughput +25%.
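(Rough cross-check, assuming scan throughput scales as the inverse of the
latency geomean: 246.5n / 199.4n ≈ 1.24, i.e. roughly +24%.)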

Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/719520
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
19 files changed:
src/cmd/compile/internal/ssa/stmtlines_test.go
src/go/build/deps_test.go
src/internal/runtime/gc/scan/expand_amd64.s [deleted file]
src/internal/runtime/gc/scan/expand_amd64_test.go
src/internal/runtime/gc/scan/expand_simd_amd64_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/expand_test.go
src/internal/runtime/gc/scan/expanders_amd64.go [new file with mode: 0644]
src/internal/runtime/gc/scan/expanders_amd64.s [new file with mode: 0644]
src/internal/runtime/gc/scan/export_amd64_test.go [moved from src/internal/runtime/gc/scan/expand_amd64.go with 76% similarity]
src/internal/runtime/gc/scan/export_simd_amd64_test.go [new file with mode: 0644]
src/internal/runtime/gc/scan/mkasm.go
src/internal/runtime/gc/scan/mkexpanders.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_amd64.go
src/internal/runtime/gc/scan/scan_amd64.s
src/internal/runtime/gc/scan/scan_amd64_test.go
src/internal/runtime/gc/scan/scan_generic.go
src/internal/runtime/gc/scan/scan_nosimd_amd64.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_simd_amd64.go [new file with mode: 0644]
src/internal/runtime/gc/scan/scan_test.go

index 2bdd6c80b2d66cf0e10f75eefd6443288e2c45b7..34c3cf2255d1f023a982ea08de9612b6f1cc27bd 100644 (file)
@@ -140,7 +140,7 @@ func TestStmtLines(t *testing.T) {
        var m float64
        switch runtime.GOARCH {
        case "amd64":
-               m = 0.0111 // > 98.89% obtained on amd64, no backsliding
+               m = 0.0112 // > 98.88% obtained on amd64, no backsliding
        case "riscv64":
                m = 0.03 // XXX temporary update threshold to 97% for regabi
        default:
index 1b6e32d07cbc18af4d19ebd7c85eb63dbde4b0a1..0725aca43abdf5842d023376da5bf533218a0ce7 100644 (file)
@@ -88,6 +88,7 @@ var depsRules = `
        internal/strconv,
        internal/trace/tracev2,
        math/bits,
+       simd,
        structs
        < internal/bytealg
        < internal/stringslite
@@ -835,7 +836,8 @@ var depsRules = `
        os,
        reflect,
        strings,
-       sync
+       sync,
+       regexp
        < internal/runtime/gc/internal/gen;
 
        regexp, internal/txtar, internal/trace, internal/trace/raw
diff --git a/src/internal/runtime/gc/scan/expand_amd64.s b/src/internal/runtime/gc/scan/expand_amd64.s
deleted file mode 100644 (file)
index 6b0be44..0000000
+++ /dev/null
@@ -1,2631 +0,0 @@
-// Code generated by mkasm.go. DO NOT EDIT.
-
-#include "go_asm.h"
-#include "textflag.h"
-
-GLOBL ·gcExpandersAVX512(SB), RODATA, $0x220
-DATA  ·gcExpandersAVX512+0x00(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x08(SB)/8, $expandAVX512_1<>(SB)
-DATA  ·gcExpandersAVX512+0x10(SB)/8, $expandAVX512_2<>(SB)
-DATA  ·gcExpandersAVX512+0x18(SB)/8, $expandAVX512_3<>(SB)
-DATA  ·gcExpandersAVX512+0x20(SB)/8, $expandAVX512_4<>(SB)
-DATA  ·gcExpandersAVX512+0x28(SB)/8, $expandAVX512_6<>(SB)
-DATA  ·gcExpandersAVX512+0x30(SB)/8, $expandAVX512_8<>(SB)
-DATA  ·gcExpandersAVX512+0x38(SB)/8, $expandAVX512_10<>(SB)
-DATA  ·gcExpandersAVX512+0x40(SB)/8, $expandAVX512_12<>(SB)
-DATA  ·gcExpandersAVX512+0x48(SB)/8, $expandAVX512_14<>(SB)
-DATA  ·gcExpandersAVX512+0x50(SB)/8, $expandAVX512_16<>(SB)
-DATA  ·gcExpandersAVX512+0x58(SB)/8, $expandAVX512_18<>(SB)
-DATA  ·gcExpandersAVX512+0x60(SB)/8, $expandAVX512_20<>(SB)
-DATA  ·gcExpandersAVX512+0x68(SB)/8, $expandAVX512_22<>(SB)
-DATA  ·gcExpandersAVX512+0x70(SB)/8, $expandAVX512_24<>(SB)
-DATA  ·gcExpandersAVX512+0x78(SB)/8, $expandAVX512_26<>(SB)
-DATA  ·gcExpandersAVX512+0x80(SB)/8, $expandAVX512_28<>(SB)
-DATA  ·gcExpandersAVX512+0x88(SB)/8, $expandAVX512_30<>(SB)
-DATA  ·gcExpandersAVX512+0x90(SB)/8, $expandAVX512_32<>(SB)
-DATA  ·gcExpandersAVX512+0x98(SB)/8, $expandAVX512_36<>(SB)
-DATA  ·gcExpandersAVX512+0xa0(SB)/8, $expandAVX512_40<>(SB)
-DATA  ·gcExpandersAVX512+0xa8(SB)/8, $expandAVX512_44<>(SB)
-DATA  ·gcExpandersAVX512+0xb0(SB)/8, $expandAVX512_48<>(SB)
-DATA  ·gcExpandersAVX512+0xb8(SB)/8, $expandAVX512_52<>(SB)
-DATA  ·gcExpandersAVX512+0xc0(SB)/8, $expandAVX512_56<>(SB)
-DATA  ·gcExpandersAVX512+0xc8(SB)/8, $expandAVX512_60<>(SB)
-DATA  ·gcExpandersAVX512+0xd0(SB)/8, $expandAVX512_64<>(SB)
-DATA  ·gcExpandersAVX512+0xd8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0xe0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0xe8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0xf0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0xf8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x100(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x108(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x110(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x118(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x120(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x128(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x130(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x138(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x140(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x148(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x150(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x158(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x160(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x168(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x170(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x178(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x180(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x188(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x190(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x198(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1a0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1a8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1b0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1b8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1c0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1c8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1d0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1d8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1e0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1e8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1f0(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x1f8(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x200(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x208(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x210(SB)/8, $0
-DATA  ·gcExpandersAVX512+0x218(SB)/8, $0
-
-TEXT expandAVX512_1<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 (AX), Z1
-       VMOVDQU64 64(AX), Z2
-       RET
-
-GLOBL expandAVX512_2_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_2_inShuf0<>+0x00(SB)/8, $0x0706050403020100
-DATA  expandAVX512_2_inShuf0<>+0x08(SB)/8, $0x0706050403020100
-DATA  expandAVX512_2_inShuf0<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_2_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_2_inShuf0<>+0x20(SB)/8, $0x1716151413121110
-DATA  expandAVX512_2_inShuf0<>+0x28(SB)/8, $0x1716151413121110
-DATA  expandAVX512_2_inShuf0<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_2_inShuf0<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918
-
-GLOBL expandAVX512_2_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_2_mat0<>+0x00(SB)/8, $0x0101020204040808
-DATA  expandAVX512_2_mat0<>+0x08(SB)/8, $0x1010202040408080
-DATA  expandAVX512_2_mat0<>+0x10(SB)/8, $0x0101020204040808
-DATA  expandAVX512_2_mat0<>+0x18(SB)/8, $0x1010202040408080
-DATA  expandAVX512_2_mat0<>+0x20(SB)/8, $0x0101020204040808
-DATA  expandAVX512_2_mat0<>+0x28(SB)/8, $0x1010202040408080
-DATA  expandAVX512_2_mat0<>+0x30(SB)/8, $0x0101020204040808
-DATA  expandAVX512_2_mat0<>+0x38(SB)/8, $0x1010202040408080
-
-GLOBL expandAVX512_2_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_2_inShuf1<>+0x00(SB)/8, $0x2726252423222120
-DATA  expandAVX512_2_inShuf1<>+0x08(SB)/8, $0x2726252423222120
-DATA  expandAVX512_2_inShuf1<>+0x10(SB)/8, $0x2f2e2d2c2b2a2928
-DATA  expandAVX512_2_inShuf1<>+0x18(SB)/8, $0x2f2e2d2c2b2a2928
-DATA  expandAVX512_2_inShuf1<>+0x20(SB)/8, $0x3736353433323130
-DATA  expandAVX512_2_inShuf1<>+0x28(SB)/8, $0x3736353433323130
-DATA  expandAVX512_2_inShuf1<>+0x30(SB)/8, $0x3f3e3d3c3b3a3938
-DATA  expandAVX512_2_inShuf1<>+0x38(SB)/8, $0x3f3e3d3c3b3a3938
-
-GLOBL expandAVX512_2_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_2_outShufLo+0x00(SB)/8, $0x0b030a0209010800
-DATA  expandAVX512_2_outShufLo+0x08(SB)/8, $0x0f070e060d050c04
-DATA  expandAVX512_2_outShufLo+0x10(SB)/8, $0x1b131a1219111810
-DATA  expandAVX512_2_outShufLo+0x18(SB)/8, $0x1f171e161d151c14
-DATA  expandAVX512_2_outShufLo+0x20(SB)/8, $0x2b232a2229212820
-DATA  expandAVX512_2_outShufLo+0x28(SB)/8, $0x2f272e262d252c24
-DATA  expandAVX512_2_outShufLo+0x30(SB)/8, $0x3b333a3239313830
-DATA  expandAVX512_2_outShufLo+0x38(SB)/8, $0x3f373e363d353c34
-
-TEXT expandAVX512_2<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_2_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_2_mat0<>(SB), Z1
-       VMOVDQU64 expandAVX512_2_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_2_outShufLo(SB), Z3
-       VMOVDQU64 (AX), Z4
-       VPERMB Z4, Z0, Z0
-       VGF2P8AFFINEQB $0, Z1, Z0, Z0
-       VPERMB Z4, Z2, Z2
-       VGF2P8AFFINEQB $0, Z1, Z2, Z2
-       VPERMB Z0, Z3, Z1
-       VPERMB Z2, Z3, Z2
-       RET
-
-GLOBL expandAVX512_3_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_3_inShuf0<>+0x00(SB)/8, $0x0706050403020100
-DATA  expandAVX512_3_inShuf0<>+0x08(SB)/8, $0x0706050403020100
-DATA  expandAVX512_3_inShuf0<>+0x10(SB)/8, $0x0706050403020100
-DATA  expandAVX512_3_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_3_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_3_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_3_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_3_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_3_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_3_mat0<>+0x00(SB)/8, $0x0101010202020404
-DATA  expandAVX512_3_mat0<>+0x08(SB)/8, $0x0408080810101020
-DATA  expandAVX512_3_mat0<>+0x10(SB)/8, $0x2020404040808080
-DATA  expandAVX512_3_mat0<>+0x18(SB)/8, $0x0101010202020404
-DATA  expandAVX512_3_mat0<>+0x20(SB)/8, $0x0408080810101020
-DATA  expandAVX512_3_mat0<>+0x28(SB)/8, $0x2020404040808080
-DATA  expandAVX512_3_mat0<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_3_mat0<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_3_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_3_inShuf1<>+0x00(SB)/8, $0x1716151413121110
-DATA  expandAVX512_3_inShuf1<>+0x08(SB)/8, $0x1716151413121110
-DATA  expandAVX512_3_inShuf1<>+0x10(SB)/8, $0x1716151413121110
-DATA  expandAVX512_3_inShuf1<>+0x18(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_3_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_3_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_3_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_3_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_3_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_3_inShuf2<>+0x00(SB)/8, $0x2726252423222120
-DATA  expandAVX512_3_inShuf2<>+0x08(SB)/8, $0x2726252423222120
-DATA  expandAVX512_3_inShuf2<>+0x10(SB)/8, $0x2726252423222120
-DATA  expandAVX512_3_inShuf2<>+0x18(SB)/8, $0xffffffffff2a2928
-DATA  expandAVX512_3_inShuf2<>+0x20(SB)/8, $0xffffffffff2a2928
-DATA  expandAVX512_3_inShuf2<>+0x28(SB)/8, $0xffffffffffff2928
-DATA  expandAVX512_3_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_3_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_3_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_3_outShufLo+0x00(SB)/8, $0x0a02110901100800
-DATA  expandAVX512_3_outShufLo+0x08(SB)/8, $0x05140c04130b0312
-DATA  expandAVX512_3_outShufLo+0x10(SB)/8, $0x170f07160e06150d
-DATA  expandAVX512_3_outShufLo+0x18(SB)/8, $0x221a292119282018
-DATA  expandAVX512_3_outShufLo+0x20(SB)/8, $0x1d2c241c2b231b2a
-DATA  expandAVX512_3_outShufLo+0x28(SB)/8, $0x2f271f2e261e2d25
-DATA  expandAVX512_3_outShufLo+0x30(SB)/8, $0x4a42514941504840
-DATA  expandAVX512_3_outShufLo+0x38(SB)/8, $0x45544c44534b4352
-
-GLOBL expandAVX512_3_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_3_outShufHi+0x00(SB)/8, $0x170f07160e06150d
-DATA  expandAVX512_3_outShufHi+0x08(SB)/8, $0x221a292119282018
-DATA  expandAVX512_3_outShufHi+0x10(SB)/8, $0x1d2c241c2b231b2a
-DATA  expandAVX512_3_outShufHi+0x18(SB)/8, $0x2f271f2e261e2d25
-DATA  expandAVX512_3_outShufHi+0x20(SB)/8, $0x4a42514941504840
-DATA  expandAVX512_3_outShufHi+0x28(SB)/8, $0x45544c44534b4352
-DATA  expandAVX512_3_outShufHi+0x30(SB)/8, $0x574f47564e46554d
-DATA  expandAVX512_3_outShufHi+0x38(SB)/8, $0x625a696159686058
-
-TEXT expandAVX512_3<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_3_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_3_mat0<>(SB), Z3
-       VMOVDQU64 expandAVX512_3_inShuf1<>(SB), Z4
-       VMOVDQU64 expandAVX512_3_inShuf2<>(SB), Z5
-       VMOVDQU64 expandAVX512_3_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_3_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z6
-       VPERMB Z6, Z0, Z0
-       VGF2P8AFFINEQB $0, Z3, Z0, Z0
-       VPERMB Z6, Z4, Z4
-       VGF2P8AFFINEQB $0, Z3, Z4, Z4
-       VPERMB Z6, Z5, Z5
-       VGF2P8AFFINEQB $0, Z3, Z5, Z3
-       VPERMI2B Z4, Z0, Z1
-       VPERMI2B Z3, Z4, Z2
-       RET
-
-GLOBL expandAVX512_4_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_4_inShuf0<>+0x00(SB)/8, $0x0706050403020100
-DATA  expandAVX512_4_inShuf0<>+0x08(SB)/8, $0x0706050403020100
-DATA  expandAVX512_4_inShuf0<>+0x10(SB)/8, $0x0706050403020100
-DATA  expandAVX512_4_inShuf0<>+0x18(SB)/8, $0x0706050403020100
-DATA  expandAVX512_4_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_4_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_4_inShuf0<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_4_inShuf0<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908
-
-GLOBL expandAVX512_4_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_4_mat0<>+0x00(SB)/8, $0x0101010102020202
-DATA  expandAVX512_4_mat0<>+0x08(SB)/8, $0x0404040408080808
-DATA  expandAVX512_4_mat0<>+0x10(SB)/8, $0x1010101020202020
-DATA  expandAVX512_4_mat0<>+0x18(SB)/8, $0x4040404080808080
-DATA  expandAVX512_4_mat0<>+0x20(SB)/8, $0x0101010102020202
-DATA  expandAVX512_4_mat0<>+0x28(SB)/8, $0x0404040408080808
-DATA  expandAVX512_4_mat0<>+0x30(SB)/8, $0x1010101020202020
-DATA  expandAVX512_4_mat0<>+0x38(SB)/8, $0x4040404080808080
-
-GLOBL expandAVX512_4_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_4_inShuf1<>+0x00(SB)/8, $0x1716151413121110
-DATA  expandAVX512_4_inShuf1<>+0x08(SB)/8, $0x1716151413121110
-DATA  expandAVX512_4_inShuf1<>+0x10(SB)/8, $0x1716151413121110
-DATA  expandAVX512_4_inShuf1<>+0x18(SB)/8, $0x1716151413121110
-DATA  expandAVX512_4_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_4_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_4_inShuf1<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_4_inShuf1<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918
-
-GLOBL expandAVX512_4_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_4_outShufLo+0x00(SB)/8, $0x1911090118100800
-DATA  expandAVX512_4_outShufLo+0x08(SB)/8, $0x1b130b031a120a02
-DATA  expandAVX512_4_outShufLo+0x10(SB)/8, $0x1d150d051c140c04
-DATA  expandAVX512_4_outShufLo+0x18(SB)/8, $0x1f170f071e160e06
-DATA  expandAVX512_4_outShufLo+0x20(SB)/8, $0x3931292138302820
-DATA  expandAVX512_4_outShufLo+0x28(SB)/8, $0x3b332b233a322a22
-DATA  expandAVX512_4_outShufLo+0x30(SB)/8, $0x3d352d253c342c24
-DATA  expandAVX512_4_outShufLo+0x38(SB)/8, $0x3f372f273e362e26
-
-TEXT expandAVX512_4<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_4_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_4_mat0<>(SB), Z1
-       VMOVDQU64 expandAVX512_4_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_4_outShufLo(SB), Z3
-       VMOVDQU64 (AX), Z4
-       VPERMB Z4, Z0, Z0
-       VGF2P8AFFINEQB $0, Z1, Z0, Z0
-       VPERMB Z4, Z2, Z2
-       VGF2P8AFFINEQB $0, Z1, Z2, Z2
-       VPERMB Z0, Z3, Z1
-       VPERMB Z2, Z3, Z2
-       RET
-
-GLOBL expandAVX512_6_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_6_inShuf0<>+0x00(SB)/8, $0x0706050403020100
-DATA  expandAVX512_6_inShuf0<>+0x08(SB)/8, $0x0706050403020100
-DATA  expandAVX512_6_inShuf0<>+0x10(SB)/8, $0x0706050403020100
-DATA  expandAVX512_6_inShuf0<>+0x18(SB)/8, $0x0706050403020100
-DATA  expandAVX512_6_inShuf0<>+0x20(SB)/8, $0x0706050403020100
-DATA  expandAVX512_6_inShuf0<>+0x28(SB)/8, $0x0706050403020100
-DATA  expandAVX512_6_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_6_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_6_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_6_mat0<>+0x00(SB)/8, $0x0101010101010202
-DATA  expandAVX512_6_mat0<>+0x08(SB)/8, $0x0202020204040404
-DATA  expandAVX512_6_mat0<>+0x10(SB)/8, $0x0404080808080808
-DATA  expandAVX512_6_mat0<>+0x18(SB)/8, $0x1010101010102020
-DATA  expandAVX512_6_mat0<>+0x20(SB)/8, $0x2020202040404040
-DATA  expandAVX512_6_mat0<>+0x28(SB)/8, $0x4040808080808080
-DATA  expandAVX512_6_mat0<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_6_mat0<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_6_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_6_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_6_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_6_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_6_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_6_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_6_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_6_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_6_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_6_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_6_inShuf2<>+0x00(SB)/8, $0xffff151413121110
-DATA  expandAVX512_6_inShuf2<>+0x08(SB)/8, $0xffff151413121110
-DATA  expandAVX512_6_inShuf2<>+0x10(SB)/8, $0xffffff1413121110
-DATA  expandAVX512_6_inShuf2<>+0x18(SB)/8, $0xffffff1413121110
-DATA  expandAVX512_6_inShuf2<>+0x20(SB)/8, $0xffffff1413121110
-DATA  expandAVX512_6_inShuf2<>+0x28(SB)/8, $0xffffff1413121110
-DATA  expandAVX512_6_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_6_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_6_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_6_outShufLo+0x00(SB)/8, $0x0901282018100800
-DATA  expandAVX512_6_outShufLo+0x08(SB)/8, $0x1a120a0229211911
-DATA  expandAVX512_6_outShufLo+0x10(SB)/8, $0x2b231b130b032a22
-DATA  expandAVX512_6_outShufLo+0x18(SB)/8, $0x0d052c241c140c04
-DATA  expandAVX512_6_outShufLo+0x20(SB)/8, $0x1e160e062d251d15
-DATA  expandAVX512_6_outShufLo+0x28(SB)/8, $0x2f271f170f072e26
-DATA  expandAVX512_6_outShufLo+0x30(SB)/8, $0x4941686058504840
-DATA  expandAVX512_6_outShufLo+0x38(SB)/8, $0x5a524a4269615951
-
-GLOBL expandAVX512_6_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_6_outShufHi+0x00(SB)/8, $0x2b231b130b032a22
-DATA  expandAVX512_6_outShufHi+0x08(SB)/8, $0x0d052c241c140c04
-DATA  expandAVX512_6_outShufHi+0x10(SB)/8, $0x1e160e062d251d15
-DATA  expandAVX512_6_outShufHi+0x18(SB)/8, $0x2f271f170f072e26
-DATA  expandAVX512_6_outShufHi+0x20(SB)/8, $0x4941686058504840
-DATA  expandAVX512_6_outShufHi+0x28(SB)/8, $0x5a524a4269615951
-DATA  expandAVX512_6_outShufHi+0x30(SB)/8, $0x6b635b534b436a62
-DATA  expandAVX512_6_outShufHi+0x38(SB)/8, $0x4d456c645c544c44
-
-TEXT expandAVX512_6<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_6_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_6_mat0<>(SB), Z3
-       VMOVDQU64 expandAVX512_6_inShuf1<>(SB), Z4
-       VMOVDQU64 expandAVX512_6_inShuf2<>(SB), Z5
-       VMOVDQU64 expandAVX512_6_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_6_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z6
-       VPERMB Z6, Z0, Z0
-       VGF2P8AFFINEQB $0, Z3, Z0, Z0
-       VPERMB Z6, Z4, Z4
-       VGF2P8AFFINEQB $0, Z3, Z4, Z4
-       VPERMB Z6, Z5, Z5
-       VGF2P8AFFINEQB $0, Z3, Z5, Z3
-       VPERMI2B Z4, Z0, Z1
-       VPERMI2B Z3, Z4, Z2
-       RET
-
-GLOBL expandAVX512_8_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_8_inShuf0<>+0x00(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x08(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x10(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x18(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x20(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x28(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x30(SB)/8, $0x0706050403020100
-DATA  expandAVX512_8_inShuf0<>+0x38(SB)/8, $0x0706050403020100
-
-GLOBL expandAVX512_8_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_8_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_8_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_8_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_8_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_8_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_8_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_8_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_8_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_8_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_8_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_8_inShuf1<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908
-
-GLOBL expandAVX512_8_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_8_outShufLo+0x00(SB)/8, $0x3830282018100800
-DATA  expandAVX512_8_outShufLo+0x08(SB)/8, $0x3931292119110901
-DATA  expandAVX512_8_outShufLo+0x10(SB)/8, $0x3a322a221a120a02
-DATA  expandAVX512_8_outShufLo+0x18(SB)/8, $0x3b332b231b130b03
-DATA  expandAVX512_8_outShufLo+0x20(SB)/8, $0x3c342c241c140c04
-DATA  expandAVX512_8_outShufLo+0x28(SB)/8, $0x3d352d251d150d05
-DATA  expandAVX512_8_outShufLo+0x30(SB)/8, $0x3e362e261e160e06
-DATA  expandAVX512_8_outShufLo+0x38(SB)/8, $0x3f372f271f170f07
-
-TEXT expandAVX512_8<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_8_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_8_mat0<>(SB), Z1
-       VMOVDQU64 expandAVX512_8_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_8_outShufLo(SB), Z3
-       VMOVDQU64 (AX), Z4
-       VPERMB Z4, Z0, Z0
-       VGF2P8AFFINEQB $0, Z1, Z0, Z0
-       VPERMB Z4, Z2, Z2
-       VGF2P8AFFINEQB $0, Z1, Z2, Z2
-       VPERMB Z0, Z3, Z1
-       VPERMB Z2, Z3, Z2
-       RET
-
-GLOBL expandAVX512_10_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_10_inShuf0<>+0x00(SB)/8, $0xff06050403020100
-DATA  expandAVX512_10_inShuf0<>+0x08(SB)/8, $0xff06050403020100
-DATA  expandAVX512_10_inShuf0<>+0x10(SB)/8, $0xff06050403020100
-DATA  expandAVX512_10_inShuf0<>+0x18(SB)/8, $0xff06050403020100
-DATA  expandAVX512_10_inShuf0<>+0x20(SB)/8, $0xffff050403020100
-DATA  expandAVX512_10_inShuf0<>+0x28(SB)/8, $0xffff050403020100
-DATA  expandAVX512_10_inShuf0<>+0x30(SB)/8, $0xffff050403020100
-DATA  expandAVX512_10_inShuf0<>+0x38(SB)/8, $0xffff050403020100
-
-GLOBL expandAVX512_10_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_10_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_10_mat0<>+0x08(SB)/8, $0x0101020202020202
-DATA  expandAVX512_10_mat0<>+0x10(SB)/8, $0x0202020204040404
-DATA  expandAVX512_10_mat0<>+0x18(SB)/8, $0x0404040404040808
-DATA  expandAVX512_10_mat0<>+0x20(SB)/8, $0x0808080808080808
-DATA  expandAVX512_10_mat0<>+0x28(SB)/8, $0x1010101010101010
-DATA  expandAVX512_10_mat0<>+0x30(SB)/8, $0x1010202020202020
-DATA  expandAVX512_10_mat0<>+0x38(SB)/8, $0x2020202040404040
-
-GLOBL expandAVX512_10_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_10_inShuf1<>+0x00(SB)/8, $0xffff050403020100
-DATA  expandAVX512_10_inShuf1<>+0x08(SB)/8, $0xffff050403020100
-DATA  expandAVX512_10_inShuf1<>+0x10(SB)/8, $0xff0c0b0a09080706
-DATA  expandAVX512_10_inShuf1<>+0x18(SB)/8, $0xff0c0b0a09080706
-DATA  expandAVX512_10_inShuf1<>+0x20(SB)/8, $0xff0c0b0a09080706
-DATA  expandAVX512_10_inShuf1<>+0x28(SB)/8, $0xff0c0b0a09080706
-DATA  expandAVX512_10_inShuf1<>+0x30(SB)/8, $0xffff0b0a09080706
-DATA  expandAVX512_10_inShuf1<>+0x38(SB)/8, $0xffff0b0a09080706
-
-GLOBL expandAVX512_10_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_10_mat1<>+0x00(SB)/8, $0x4040404040408080
-DATA  expandAVX512_10_mat1<>+0x08(SB)/8, $0x8080808080808080
-DATA  expandAVX512_10_mat1<>+0x10(SB)/8, $0x0808080808080808
-DATA  expandAVX512_10_mat1<>+0x18(SB)/8, $0x1010101010101010
-DATA  expandAVX512_10_mat1<>+0x20(SB)/8, $0x1010202020202020
-DATA  expandAVX512_10_mat1<>+0x28(SB)/8, $0x2020202040404040
-DATA  expandAVX512_10_mat1<>+0x30(SB)/8, $0x4040404040408080
-DATA  expandAVX512_10_mat1<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_10_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_10_inShuf2<>+0x00(SB)/8, $0xffff0c0b0a090807
-DATA  expandAVX512_10_inShuf2<>+0x08(SB)/8, $0xffff0c0b0a090807
-DATA  expandAVX512_10_inShuf2<>+0x10(SB)/8, $0xffff0c0b0a090807
-DATA  expandAVX512_10_inShuf2<>+0x18(SB)/8, $0xffff0c0b0a090807
-DATA  expandAVX512_10_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_10_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_10_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_10_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_10_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_10_mat2<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_10_mat2<>+0x08(SB)/8, $0x0101020202020202
-DATA  expandAVX512_10_mat2<>+0x10(SB)/8, $0x0202020204040404
-DATA  expandAVX512_10_mat2<>+0x18(SB)/8, $0x0404040404040808
-DATA  expandAVX512_10_mat2<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_10_mat2<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_10_mat2<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_10_mat2<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_10_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_10_outShufLo+0x00(SB)/8, $0x3830282018100800
-DATA  expandAVX512_10_outShufLo+0x08(SB)/8, $0x2921191109014840
-DATA  expandAVX512_10_outShufLo+0x10(SB)/8, $0x1a120a0249413931
-DATA  expandAVX512_10_outShufLo+0x18(SB)/8, $0x0b034a423a322a22
-DATA  expandAVX512_10_outShufLo+0x20(SB)/8, $0x4b433b332b231b13
-DATA  expandAVX512_10_outShufLo+0x28(SB)/8, $0x3c342c241c140c04
-DATA  expandAVX512_10_outShufLo+0x30(SB)/8, $0x2d251d150d054c44
-DATA  expandAVX512_10_outShufLo+0x38(SB)/8, $0x1e160e064d453d35
-
-GLOBL expandAVX512_10_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_10_outShufHi+0x00(SB)/8, $0x4840383028201810
-DATA  expandAVX512_10_outShufHi+0x08(SB)/8, $0x3931292119115850
-DATA  expandAVX512_10_outShufHi+0x10(SB)/8, $0x2a221a1259514941
-DATA  expandAVX512_10_outShufHi+0x18(SB)/8, $0x1b135a524a423a32
-DATA  expandAVX512_10_outShufHi+0x20(SB)/8, $0x5b534b433b332b23
-DATA  expandAVX512_10_outShufHi+0x28(SB)/8, $0x4c443c342c241c14
-DATA  expandAVX512_10_outShufHi+0x30(SB)/8, $0x3d352d251d155c54
-DATA  expandAVX512_10_outShufHi+0x38(SB)/8, $0x2e261e165d554d45
-
-TEXT expandAVX512_10<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_10_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_10_inShuf1<>(SB), Z3
-       VMOVDQU64 expandAVX512_10_inShuf2<>(SB), Z4
-       VMOVDQU64 expandAVX512_10_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_10_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z5
-       VPERMB Z5, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_10_mat0<>(SB), Z0, Z0
-       VPERMB Z5, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_10_mat1<>(SB), Z3, Z3
-       VPERMB Z5, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_10_mat2<>(SB), Z4, Z4
-       VPERMI2B Z3, Z0, Z1
-       VPERMI2B Z4, Z3, Z2
-       RET
-
-GLOBL expandAVX512_12_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_12_inShuf0<>+0x00(SB)/8, $0xffff050403020100
-DATA  expandAVX512_12_inShuf0<>+0x08(SB)/8, $0xffff050403020100
-DATA  expandAVX512_12_inShuf0<>+0x10(SB)/8, $0xffff050403020100
-DATA  expandAVX512_12_inShuf0<>+0x18(SB)/8, $0xffff050403020100
-DATA  expandAVX512_12_inShuf0<>+0x20(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf0<>+0x28(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf0<>+0x30(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf0<>+0x38(SB)/8, $0xffffff0403020100
-
-GLOBL expandAVX512_12_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_12_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_12_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_12_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_12_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_12_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_12_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_12_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_12_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_12_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_12_inShuf1<>+0x00(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf1<>+0x08(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf1<>+0x10(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf1<>+0x18(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_12_inShuf1<>+0x20(SB)/8, $0xffff0a0908070605
-DATA  expandAVX512_12_inShuf1<>+0x28(SB)/8, $0xffff0a0908070605
-DATA  expandAVX512_12_inShuf1<>+0x30(SB)/8, $0xffff0a0908070605
-DATA  expandAVX512_12_inShuf1<>+0x38(SB)/8, $0xffff0a0908070605
-
-GLOBL expandAVX512_12_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_12_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_12_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_12_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_12_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_12_mat1<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_12_mat1<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_12_mat1<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_12_mat1<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_12_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_12_inShuf2<>+0x00(SB)/8, $0xffffff0908070605
-DATA  expandAVX512_12_inShuf2<>+0x08(SB)/8, $0xffffff0908070605
-DATA  expandAVX512_12_inShuf2<>+0x10(SB)/8, $0xffffff0908070605
-DATA  expandAVX512_12_inShuf2<>+0x18(SB)/8, $0xffffff0908070605
-DATA  expandAVX512_12_inShuf2<>+0x20(SB)/8, $0xffffff0a09080706
-DATA  expandAVX512_12_inShuf2<>+0x28(SB)/8, $0xffffff0a09080706
-DATA  expandAVX512_12_inShuf2<>+0x30(SB)/8, $0xffffff0a09080706
-DATA  expandAVX512_12_inShuf2<>+0x38(SB)/8, $0xffffff0a09080706
-
-GLOBL expandAVX512_12_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_12_mat2<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_12_mat2<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_12_mat2<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_12_mat2<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_12_mat2<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_12_mat2<>+0x28(SB)/8, $0x0101010102020202
-DATA  expandAVX512_12_mat2<>+0x30(SB)/8, $0x0202020202020202
-DATA  expandAVX512_12_mat2<>+0x38(SB)/8, $0x0404040404040404
-
-GLOBL expandAVX512_12_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_12_outShufLo+0x00(SB)/8, $0x3830282018100800
-DATA  expandAVX512_12_outShufLo+0x08(SB)/8, $0x1911090158504840
-DATA  expandAVX512_12_outShufLo+0x10(SB)/8, $0x5951494139312921
-DATA  expandAVX512_12_outShufLo+0x18(SB)/8, $0x3a322a221a120a02
-DATA  expandAVX512_12_outShufLo+0x20(SB)/8, $0x1b130b035a524a42
-DATA  expandAVX512_12_outShufLo+0x28(SB)/8, $0x5b534b433b332b23
-DATA  expandAVX512_12_outShufLo+0x30(SB)/8, $0x3c342c241c140c04
-DATA  expandAVX512_12_outShufLo+0x38(SB)/8, $0x1d150d055c544c44
-
-GLOBL expandAVX512_12_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_12_outShufHi+0x00(SB)/8, $0x5850484038302820
-DATA  expandAVX512_12_outShufHi+0x08(SB)/8, $0x3931292178706860
-DATA  expandAVX512_12_outShufHi+0x10(SB)/8, $0x7971696159514941
-DATA  expandAVX512_12_outShufHi+0x18(SB)/8, $0x5a524a423a322a22
-DATA  expandAVX512_12_outShufHi+0x20(SB)/8, $0x3b332b237a726a62
-DATA  expandAVX512_12_outShufHi+0x28(SB)/8, $0x7b736b635b534b43
-DATA  expandAVX512_12_outShufHi+0x30(SB)/8, $0x5c544c443c342c24
-DATA  expandAVX512_12_outShufHi+0x38(SB)/8, $0x3d352d257c746c64
-
-TEXT expandAVX512_12<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_12_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_12_inShuf1<>(SB), Z3
-       VMOVDQU64 expandAVX512_12_inShuf2<>(SB), Z4
-       VMOVDQU64 expandAVX512_12_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_12_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z5
-       VPERMB Z5, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_12_mat0<>(SB), Z0, Z0
-       VPERMB Z5, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_12_mat1<>(SB), Z3, Z3
-       VPERMB Z5, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_12_mat2<>(SB), Z4, Z4
-       VPERMI2B Z3, Z0, Z1
-       VPERMI2B Z4, Z3, Z2
-       RET
-
-GLOBL expandAVX512_14_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_inShuf0<>+0x00(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x08(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x10(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x18(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x20(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x28(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x30(SB)/8, $0xffffff0403020100
-DATA  expandAVX512_14_inShuf0<>+0x38(SB)/8, $0xffffff0403020100
-
-GLOBL expandAVX512_14_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_14_mat0<>+0x08(SB)/8, $0x0101010101010202
-DATA  expandAVX512_14_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_14_mat0<>+0x18(SB)/8, $0x0202020204040404
-DATA  expandAVX512_14_mat0<>+0x20(SB)/8, $0x0404040404040404
-DATA  expandAVX512_14_mat0<>+0x28(SB)/8, $0x0404080808080808
-DATA  expandAVX512_14_mat0<>+0x30(SB)/8, $0x0808080808080808
-DATA  expandAVX512_14_mat0<>+0x38(SB)/8, $0x1010101010101010
-
-GLOBL expandAVX512_14_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_inShuf1<>+0x00(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_14_inShuf1<>+0x08(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_14_inShuf1<>+0x10(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_14_inShuf1<>+0x18(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_14_inShuf1<>+0x20(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_14_inShuf1<>+0x28(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_14_inShuf1<>+0x30(SB)/8, $0xffffff0807060504
-DATA  expandAVX512_14_inShuf1<>+0x38(SB)/8, $0xffffff0807060504
-
-GLOBL expandAVX512_14_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_mat1<>+0x00(SB)/8, $0x1010101010102020
-DATA  expandAVX512_14_mat1<>+0x08(SB)/8, $0x2020202020202020
-DATA  expandAVX512_14_mat1<>+0x10(SB)/8, $0x2020202040404040
-DATA  expandAVX512_14_mat1<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_14_mat1<>+0x20(SB)/8, $0x4040808080808080
-DATA  expandAVX512_14_mat1<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_14_mat1<>+0x30(SB)/8, $0x1010101010102020
-DATA  expandAVX512_14_mat1<>+0x38(SB)/8, $0x2020202020202020
-
-GLOBL expandAVX512_14_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_inShuf2<>+0x00(SB)/8, $0xffffff0807060504
-DATA  expandAVX512_14_inShuf2<>+0x08(SB)/8, $0xffffff0807060504
-DATA  expandAVX512_14_inShuf2<>+0x10(SB)/8, $0xffffff0807060504
-DATA  expandAVX512_14_inShuf2<>+0x18(SB)/8, $0xffffff0807060504
-DATA  expandAVX512_14_inShuf2<>+0x20(SB)/8, $0xffffff0908070605
-DATA  expandAVX512_14_inShuf2<>+0x28(SB)/8, $0xffffff0908070605
-DATA  expandAVX512_14_inShuf2<>+0x30(SB)/8, $0xffffffff08070605
-DATA  expandAVX512_14_inShuf2<>+0x38(SB)/8, $0xffffffff08070605
-
-GLOBL expandAVX512_14_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_mat2<>+0x00(SB)/8, $0x2020202040404040
-DATA  expandAVX512_14_mat2<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_14_mat2<>+0x10(SB)/8, $0x4040808080808080
-DATA  expandAVX512_14_mat2<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_14_mat2<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_14_mat2<>+0x28(SB)/8, $0x0101010101010202
-DATA  expandAVX512_14_mat2<>+0x30(SB)/8, $0x0202020202020202
-DATA  expandAVX512_14_mat2<>+0x38(SB)/8, $0x0202020204040404
-
-GLOBL expandAVX512_14_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_inShuf3<>+0x00(SB)/8, $0xffffffff08070605
-DATA  expandAVX512_14_inShuf3<>+0x08(SB)/8, $0xffffffff08070605
-DATA  expandAVX512_14_inShuf3<>+0x10(SB)/8, $0xffffffff08070605
-DATA  expandAVX512_14_inShuf3<>+0x18(SB)/8, $0xffffffff08070605
-DATA  expandAVX512_14_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_14_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_14_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_14_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_14_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_14_mat3<>+0x00(SB)/8, $0x0404040404040404
-DATA  expandAVX512_14_mat3<>+0x08(SB)/8, $0x0404080808080808
-DATA  expandAVX512_14_mat3<>+0x10(SB)/8, $0x0808080808080808
-DATA  expandAVX512_14_mat3<>+0x18(SB)/8, $0x1010101010101010
-DATA  expandAVX512_14_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_14_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_14_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_14_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_14_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_14_outShufLo+0x00(SB)/8, $0x3830282018100800
-DATA  expandAVX512_14_outShufLo+0x08(SB)/8, $0x0901686058504840
-DATA  expandAVX512_14_outShufLo+0x10(SB)/8, $0x4941393129211911
-DATA  expandAVX512_14_outShufLo+0x18(SB)/8, $0x1a120a0269615951
-DATA  expandAVX512_14_outShufLo+0x20(SB)/8, $0x5a524a423a322a22
-DATA  expandAVX512_14_outShufLo+0x28(SB)/8, $0x2b231b130b036a62
-DATA  expandAVX512_14_outShufLo+0x30(SB)/8, $0x6b635b534b433b33
-DATA  expandAVX512_14_outShufLo+0x38(SB)/8, $0x3c342c241c140c04
-
-GLOBL expandAVX512_14_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_14_outShufHi0+0x00(SB)/8, $0x6860585048403830
-DATA  expandAVX512_14_outShufHi0+0x08(SB)/8, $0x3931ffffffff7870
-DATA  expandAVX512_14_outShufHi0+0x10(SB)/8, $0x7971696159514941
-DATA  expandAVX512_14_outShufHi0+0x18(SB)/8, $0x4a423a32ffffffff
-DATA  expandAVX512_14_outShufHi0+0x20(SB)/8, $0xffff7a726a625a52
-DATA  expandAVX512_14_outShufHi0+0x28(SB)/8, $0x5b534b433b33ffff
-DATA  expandAVX512_14_outShufHi0+0x30(SB)/8, $0xffffffff7b736b63
-DATA  expandAVX512_14_outShufHi0+0x38(SB)/8, $0x6c645c544c443c34
-
-GLOBL expandAVX512_14_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_14_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_14_outShufHi1+0x08(SB)/8, $0xffff18100800ffff
-DATA  expandAVX512_14_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_14_outShufHi1+0x18(SB)/8, $0xffffffff19110901
-DATA  expandAVX512_14_outShufHi1+0x20(SB)/8, $0x0a02ffffffffffff
-DATA  expandAVX512_14_outShufHi1+0x28(SB)/8, $0xffffffffffff1a12
-DATA  expandAVX512_14_outShufHi1+0x30(SB)/8, $0x1b130b03ffffffff
-DATA  expandAVX512_14_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
-
-TEXT expandAVX512_14<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_14_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_14_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_14_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_14_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_14_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_14_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_14_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_14_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_14_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_14_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_14_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xff0ffc3ff0ffc3ff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0xf003c00f003c00, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_16_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_16_inShuf0<>+0x00(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x08(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x10(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x18(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x20(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x28(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x30(SB)/8, $0x0303020201010000
-DATA  expandAVX512_16_inShuf0<>+0x38(SB)/8, $0x0303020201010000
-
-GLOBL expandAVX512_16_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_16_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_16_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_16_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_16_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_16_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_16_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_16_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_16_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_16_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_16_inShuf1<>+0x00(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x08(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x10(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x18(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x20(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x28(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x30(SB)/8, $0x0707060605050404
-DATA  expandAVX512_16_inShuf1<>+0x38(SB)/8, $0x0707060605050404
-
-GLOBL expandAVX512_16_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_16_outShufLo+0x00(SB)/8, $0x1918111009080100
-DATA  expandAVX512_16_outShufLo+0x08(SB)/8, $0x3938313029282120
-DATA  expandAVX512_16_outShufLo+0x10(SB)/8, $0x1b1a13120b0a0302
-DATA  expandAVX512_16_outShufLo+0x18(SB)/8, $0x3b3a33322b2a2322
-DATA  expandAVX512_16_outShufLo+0x20(SB)/8, $0x1d1c15140d0c0504
-DATA  expandAVX512_16_outShufLo+0x28(SB)/8, $0x3d3c35342d2c2524
-DATA  expandAVX512_16_outShufLo+0x30(SB)/8, $0x1f1e17160f0e0706
-DATA  expandAVX512_16_outShufLo+0x38(SB)/8, $0x3f3e37362f2e2726
-
-TEXT expandAVX512_16<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_16_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_16_mat0<>(SB), Z1
-       VMOVDQU64 expandAVX512_16_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_16_outShufLo(SB), Z3
-       VMOVDQU64 (AX), Z4
-       VPERMB Z4, Z0, Z0
-       VGF2P8AFFINEQB $0, Z1, Z0, Z0
-       VPERMB Z4, Z2, Z2
-       VGF2P8AFFINEQB $0, Z1, Z2, Z2
-       VPERMB Z0, Z3, Z1
-       VPERMB Z2, Z3, Z2
-       RET
-
-GLOBL expandAVX512_18_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_inShuf0<>+0x00(SB)/8, $0x0303020201010000
-DATA  expandAVX512_18_inShuf0<>+0x08(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_18_inShuf0<>+0x10(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_18_inShuf0<>+0x18(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_18_inShuf0<>+0x20(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_18_inShuf0<>+0x28(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_18_inShuf0<>+0x30(SB)/8, $0x0303020201010000
-DATA  expandAVX512_18_inShuf0<>+0x38(SB)/8, $0xff03020201010000
-
-GLOBL expandAVX512_18_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_18_mat0<>+0x08(SB)/8, $0x0101020202020202
-DATA  expandAVX512_18_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_18_mat0<>+0x18(SB)/8, $0x0202020204040404
-DATA  expandAVX512_18_mat0<>+0x20(SB)/8, $0x0404040404040404
-DATA  expandAVX512_18_mat0<>+0x28(SB)/8, $0x0404040404040808
-DATA  expandAVX512_18_mat0<>+0x30(SB)/8, $0x0808080808080808
-DATA  expandAVX512_18_mat0<>+0x38(SB)/8, $0x1010101010101010
-
-GLOBL expandAVX512_18_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_inShuf1<>+0x00(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_18_inShuf1<>+0x08(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_18_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_18_inShuf1<>+0x18(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_18_inShuf1<>+0x20(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_18_inShuf1<>+0x28(SB)/8, $0xffff020201010000
-DATA  expandAVX512_18_inShuf1<>+0x30(SB)/8, $0xff06060505040403
-DATA  expandAVX512_18_inShuf1<>+0x38(SB)/8, $0xffffffff06050403
-
-GLOBL expandAVX512_18_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_mat1<>+0x00(SB)/8, $0x1010202020202020
-DATA  expandAVX512_18_mat1<>+0x08(SB)/8, $0x2020202020202020
-DATA  expandAVX512_18_mat1<>+0x10(SB)/8, $0x2020202040404040
-DATA  expandAVX512_18_mat1<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_18_mat1<>+0x20(SB)/8, $0x4040404040408080
-DATA  expandAVX512_18_mat1<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_18_mat1<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_18_mat1<>+0x38(SB)/8, $0x1010202020202020
-
-GLOBL expandAVX512_18_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_inShuf2<>+0x00(SB)/8, $0xffffffff06050403
-DATA  expandAVX512_18_inShuf2<>+0x08(SB)/8, $0xffffffff06050403
-DATA  expandAVX512_18_inShuf2<>+0x10(SB)/8, $0xffffffff06050403
-DATA  expandAVX512_18_inShuf2<>+0x18(SB)/8, $0xffffffff06050403
-DATA  expandAVX512_18_inShuf2<>+0x20(SB)/8, $0x0606050504040303
-DATA  expandAVX512_18_inShuf2<>+0x28(SB)/8, $0x0707060605050404
-DATA  expandAVX512_18_inShuf2<>+0x30(SB)/8, $0xffffffffff060504
-DATA  expandAVX512_18_inShuf2<>+0x38(SB)/8, $0xffffffffff060504
-
-GLOBL expandAVX512_18_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_mat2<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_18_mat2<>+0x08(SB)/8, $0x2020202040404040
-DATA  expandAVX512_18_mat2<>+0x10(SB)/8, $0x4040404040404040
-DATA  expandAVX512_18_mat2<>+0x18(SB)/8, $0x4040404040408080
-DATA  expandAVX512_18_mat2<>+0x20(SB)/8, $0x8080808080808080
-DATA  expandAVX512_18_mat2<>+0x28(SB)/8, $0x0101010101010101
-DATA  expandAVX512_18_mat2<>+0x30(SB)/8, $0x0101020202020202
-DATA  expandAVX512_18_mat2<>+0x38(SB)/8, $0x0202020202020202
-
-GLOBL expandAVX512_18_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_inShuf3<>+0x00(SB)/8, $0xffffffffff060504
-DATA  expandAVX512_18_inShuf3<>+0x08(SB)/8, $0xffffffffff060504
-DATA  expandAVX512_18_inShuf3<>+0x10(SB)/8, $0xffffffffff060504
-DATA  expandAVX512_18_inShuf3<>+0x18(SB)/8, $0xffff060605050404
-DATA  expandAVX512_18_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_18_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_18_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_18_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_18_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_18_mat3<>+0x00(SB)/8, $0x0202020204040404
-DATA  expandAVX512_18_mat3<>+0x08(SB)/8, $0x0404040404040404
-DATA  expandAVX512_18_mat3<>+0x10(SB)/8, $0x0404040404040808
-DATA  expandAVX512_18_mat3<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_18_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_18_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_18_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_18_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_18_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_18_outShufLo+0x00(SB)/8, $0x3028201810080100
-DATA  expandAVX512_18_outShufLo+0x08(SB)/8, $0x6058504840393831
-DATA  expandAVX512_18_outShufLo+0x10(SB)/8, $0x2119110903026968
-DATA  expandAVX512_18_outShufLo+0x18(SB)/8, $0x5149413b3a333229
-DATA  expandAVX512_18_outShufLo+0x20(SB)/8, $0x120a05046b6a6159
-DATA  expandAVX512_18_outShufLo+0x28(SB)/8, $0x423d3c35342a221a
-DATA  expandAVX512_18_outShufLo+0x30(SB)/8, $0x07066d6c625a524a
-DATA  expandAVX512_18_outShufLo+0x38(SB)/8, $0x3e37362b231b130b
-
-GLOBL expandAVX512_18_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_18_outShufHi0+0x00(SB)/8, $0x6160585048403830
-DATA  expandAVX512_18_outShufHi0+0x08(SB)/8, $0xffffffff78706968
-DATA  expandAVX512_18_outShufHi0+0x10(SB)/8, $0x59514941393231ff
-DATA  expandAVX512_18_outShufHi0+0x18(SB)/8, $0xffff79716b6a6362
-DATA  expandAVX512_18_outShufHi0+0x20(SB)/8, $0x4a423a3433ffffff
-DATA  expandAVX512_18_outShufHi0+0x28(SB)/8, $0x7a726d6c65645a52
-DATA  expandAVX512_18_outShufHi0+0x30(SB)/8, $0x3b3635ffffffffff
-DATA  expandAVX512_18_outShufHi0+0x38(SB)/8, $0x6f6e67665b534b43
-
-GLOBL expandAVX512_18_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_18_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_18_outShufHi1+0x08(SB)/8, $0x18100800ffffffff
-DATA  expandAVX512_18_outShufHi1+0x10(SB)/8, $0xffffffffffffff19
-DATA  expandAVX512_18_outShufHi1+0x18(SB)/8, $0x0901ffffffffffff
-DATA  expandAVX512_18_outShufHi1+0x20(SB)/8, $0xffffffffff1b1a11
-DATA  expandAVX512_18_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_18_outShufHi1+0x30(SB)/8, $0xffffff1d1c120a02
-DATA  expandAVX512_18_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
-
-TEXT expandAVX512_18<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_18_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_18_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_18_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_18_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_18_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_18_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_18_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_18_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_18_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_18_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_18_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xffe0fff83ffe0fff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x1f0007c001f000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_20_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_20_inShuf0<>+0x00(SB)/8, $0x0303020201010000
-DATA  expandAVX512_20_inShuf0<>+0x08(SB)/8, $0xffffffff03020100
-DATA  expandAVX512_20_inShuf0<>+0x10(SB)/8, $0xff03020201010000
-DATA  expandAVX512_20_inShuf0<>+0x18(SB)/8, $0xffff020201010000
-DATA  expandAVX512_20_inShuf0<>+0x20(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_20_inShuf0<>+0x28(SB)/8, $0xffff020201010000
-DATA  expandAVX512_20_inShuf0<>+0x30(SB)/8, $0xffff020201010000
-DATA  expandAVX512_20_inShuf0<>+0x38(SB)/8, $0xffffffffff020100
-
-GLOBL expandAVX512_20_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_20_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_20_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_20_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_20_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_20_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_20_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_20_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_20_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_20_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_20_inShuf1<>+0x00(SB)/8, $0xffff020201010000
-DATA  expandAVX512_20_inShuf1<>+0x08(SB)/8, $0xffff020201010000
-DATA  expandAVX512_20_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_20_inShuf1<>+0x18(SB)/8, $0xffff020201010000
-DATA  expandAVX512_20_inShuf1<>+0x20(SB)/8, $0xff06060505040403
-DATA  expandAVX512_20_inShuf1<>+0x28(SB)/8, $0x0606050504040303
-DATA  expandAVX512_20_inShuf1<>+0x30(SB)/8, $0xffffffff06050403
-DATA  expandAVX512_20_inShuf1<>+0x38(SB)/8, $0xffff050504040303
-
-GLOBL expandAVX512_20_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_20_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_20_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_20_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_20_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_20_mat1<>+0x20(SB)/8, $0x0202020202020202
-DATA  expandAVX512_20_mat1<>+0x28(SB)/8, $0x0404040404040404
-DATA  expandAVX512_20_mat1<>+0x30(SB)/8, $0x0404040408080808
-DATA  expandAVX512_20_mat1<>+0x38(SB)/8, $0x0808080808080808
-
-GLOBL expandAVX512_20_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_20_inShuf2<>+0x00(SB)/8, $0xffff050504040303
-DATA  expandAVX512_20_inShuf2<>+0x08(SB)/8, $0xffffffffff050403
-DATA  expandAVX512_20_inShuf2<>+0x10(SB)/8, $0xffff050504040303
-DATA  expandAVX512_20_inShuf2<>+0x18(SB)/8, $0xffff050504040303
-DATA  expandAVX512_20_inShuf2<>+0x20(SB)/8, $0xffffffffff050403
-DATA  expandAVX512_20_inShuf2<>+0x28(SB)/8, $0xffff050504040303
-DATA  expandAVX512_20_inShuf2<>+0x30(SB)/8, $0xffff060605050404
-DATA  expandAVX512_20_inShuf2<>+0x38(SB)/8, $0xffffffffff060504
-
-GLOBL expandAVX512_20_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_20_mat2<>+0x00(SB)/8, $0x1010101010101010
-DATA  expandAVX512_20_mat2<>+0x08(SB)/8, $0x1010101020202020
-DATA  expandAVX512_20_mat2<>+0x10(SB)/8, $0x2020202020202020
-DATA  expandAVX512_20_mat2<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_20_mat2<>+0x20(SB)/8, $0x4040404080808080
-DATA  expandAVX512_20_mat2<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_20_mat2<>+0x30(SB)/8, $0x0101010101010101
-DATA  expandAVX512_20_mat2<>+0x38(SB)/8, $0x0101010102020202
-
-GLOBL expandAVX512_20_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_20_outShufLo+0x00(SB)/8, $0x2019181110080100
-DATA  expandAVX512_20_outShufLo+0x08(SB)/8, $0x4841403831302928
-DATA  expandAVX512_20_outShufLo+0x10(SB)/8, $0x1209030259585049
-DATA  expandAVX512_20_outShufLo+0x18(SB)/8, $0x33322b2a211b1a13
-DATA  expandAVX512_20_outShufLo+0x20(SB)/8, $0x5b5a514b4a434239
-DATA  expandAVX512_20_outShufLo+0x28(SB)/8, $0x221d1c15140a0504
-DATA  expandAVX512_20_outShufLo+0x30(SB)/8, $0x4c45443a35342d2c
-DATA  expandAVX512_20_outShufLo+0x38(SB)/8, $0x160b07065d5c524d
-
-GLOBL expandAVX512_20_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_20_outShufHi+0x00(SB)/8, $0x4140393830292820
-DATA  expandAVX512_20_outShufHi+0x08(SB)/8, $0x6968605958515048
-DATA  expandAVX512_20_outShufHi+0x10(SB)/8, $0x312b2a2221787170
-DATA  expandAVX512_20_outShufHi+0x18(SB)/8, $0x5a53524943423b3a
-DATA  expandAVX512_20_outShufHi+0x20(SB)/8, $0x237973726b6a615b
-DATA  expandAVX512_20_outShufHi+0x28(SB)/8, $0x45443d3c322d2c24
-DATA  expandAVX512_20_outShufHi+0x30(SB)/8, $0x6d6c625d5c55544a
-DATA  expandAVX512_20_outShufHi+0x38(SB)/8, $0x332f2e26257a7574
-
-TEXT expandAVX512_20<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_20_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_20_inShuf1<>(SB), Z3
-       VMOVDQU64 expandAVX512_20_inShuf2<>(SB), Z4
-       VMOVDQU64 expandAVX512_20_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_20_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z5
-       VPERMB Z5, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_20_mat0<>(SB), Z0, Z0
-       VPERMB Z5, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_20_mat1<>(SB), Z3, Z3
-       VPERMB Z5, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_20_mat2<>(SB), Z4, Z4
-       VPERMI2B Z3, Z0, Z1
-       VPERMI2B Z4, Z3, Z2
-       RET
-
-GLOBL expandAVX512_22_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_inShuf0<>+0x00(SB)/8, $0xffff020201010000
-DATA  expandAVX512_22_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_22_inShuf0<>+0x10(SB)/8, $0xffff020201010000
-DATA  expandAVX512_22_inShuf0<>+0x18(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_22_inShuf0<>+0x20(SB)/8, $0xffff020201010000
-DATA  expandAVX512_22_inShuf0<>+0x28(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_22_inShuf0<>+0x30(SB)/8, $0xffff020201010000
-DATA  expandAVX512_22_inShuf0<>+0x38(SB)/8, $0xffff020201010000
-
-GLOBL expandAVX512_22_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_22_mat0<>+0x08(SB)/8, $0x0101010101010202
-DATA  expandAVX512_22_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_22_mat0<>+0x18(SB)/8, $0x0202020204040404
-DATA  expandAVX512_22_mat0<>+0x20(SB)/8, $0x0404040404040404
-DATA  expandAVX512_22_mat0<>+0x28(SB)/8, $0x0404080808080808
-DATA  expandAVX512_22_mat0<>+0x30(SB)/8, $0x0808080808080808
-DATA  expandAVX512_22_mat0<>+0x38(SB)/8, $0x1010101010101010
-
-GLOBL expandAVX512_22_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_inShuf1<>+0x00(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_22_inShuf1<>+0x08(SB)/8, $0xffff020201010000
-DATA  expandAVX512_22_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_22_inShuf1<>+0x18(SB)/8, $0xffff020201010000
-DATA  expandAVX512_22_inShuf1<>+0x20(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_22_inShuf1<>+0x28(SB)/8, $0xffffffff01010000
-DATA  expandAVX512_22_inShuf1<>+0x30(SB)/8, $0xffff040403030202
-DATA  expandAVX512_22_inShuf1<>+0x38(SB)/8, $0xffff050504040303
-
-GLOBL expandAVX512_22_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_mat1<>+0x00(SB)/8, $0x1010101010102020
-DATA  expandAVX512_22_mat1<>+0x08(SB)/8, $0x2020202020202020
-DATA  expandAVX512_22_mat1<>+0x10(SB)/8, $0x2020202040404040
-DATA  expandAVX512_22_mat1<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_22_mat1<>+0x20(SB)/8, $0x4040808080808080
-DATA  expandAVX512_22_mat1<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_22_mat1<>+0x30(SB)/8, $0x8080808080808080
-DATA  expandAVX512_22_mat1<>+0x38(SB)/8, $0x0101010101010101
-
-GLOBL expandAVX512_22_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_inShuf2<>+0x00(SB)/8, $0xffffffffff050403
-DATA  expandAVX512_22_inShuf2<>+0x08(SB)/8, $0xffff050504040303
-DATA  expandAVX512_22_inShuf2<>+0x10(SB)/8, $0xffffffffff050403
-DATA  expandAVX512_22_inShuf2<>+0x18(SB)/8, $0xffff050504040303
-DATA  expandAVX512_22_inShuf2<>+0x20(SB)/8, $0xffffffffff050403
-DATA  expandAVX512_22_inShuf2<>+0x28(SB)/8, $0xffff050504040303
-DATA  expandAVX512_22_inShuf2<>+0x30(SB)/8, $0xffff050504040303
-DATA  expandAVX512_22_inShuf2<>+0x38(SB)/8, $0xffffffffff050403
-
-GLOBL expandAVX512_22_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_mat2<>+0x00(SB)/8, $0x0101010101010202
-DATA  expandAVX512_22_mat2<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_22_mat2<>+0x10(SB)/8, $0x0202020204040404
-DATA  expandAVX512_22_mat2<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_22_mat2<>+0x20(SB)/8, $0x0404080808080808
-DATA  expandAVX512_22_mat2<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_22_mat2<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_22_mat2<>+0x38(SB)/8, $0x1010101010102020
-
-GLOBL expandAVX512_22_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_inShuf3<>+0x00(SB)/8, $0xffff050504040303
-DATA  expandAVX512_22_inShuf3<>+0x08(SB)/8, $0xffffffffff050403
-DATA  expandAVX512_22_inShuf3<>+0x10(SB)/8, $0xffffff0504040303
-DATA  expandAVX512_22_inShuf3<>+0x18(SB)/8, $0xffffffffffff0403
-DATA  expandAVX512_22_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_22_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_22_mat3<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_22_mat3<>+0x08(SB)/8, $0x2020202040404040
-DATA  expandAVX512_22_mat3<>+0x10(SB)/8, $0x4040404040404040
-DATA  expandAVX512_22_mat3<>+0x18(SB)/8, $0x4040808080808080
-DATA  expandAVX512_22_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_22_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_22_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_22_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_22_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_22_outShufLo+0x00(SB)/8, $0x2120181110080100
-DATA  expandAVX512_22_outShufLo+0x08(SB)/8, $0x4948403938313028
-DATA  expandAVX512_22_outShufLo+0x10(SB)/8, $0x0302696860595850
-DATA  expandAVX512_22_outShufLo+0x18(SB)/8, $0x3229232219131209
-DATA  expandAVX512_22_outShufLo+0x20(SB)/8, $0x5a514b4a413b3a33
-DATA  expandAVX512_22_outShufLo+0x28(SB)/8, $0x140a05046b6a615b
-DATA  expandAVX512_22_outShufLo+0x30(SB)/8, $0x3c35342a25241a15
-DATA  expandAVX512_22_outShufLo+0x38(SB)/8, $0x625d5c524d4c423d
-
-GLOBL expandAVX512_22_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_22_outShufHi0+0x00(SB)/8, $0x5049484039383130
-DATA  expandAVX512_22_outShufHi0+0x08(SB)/8, $0x7871706968605958
-DATA  expandAVX512_22_outShufHi0+0x10(SB)/8, $0x3332ffffffffffff
-DATA  expandAVX512_22_outShufHi0+0x18(SB)/8, $0x5b5a514b4a413b3a
-DATA  expandAVX512_22_outShufHi0+0x20(SB)/8, $0xffff7973726b6a61
-DATA  expandAVX512_22_outShufHi0+0x28(SB)/8, $0x3d3c3534ffffffff
-DATA  expandAVX512_22_outShufHi0+0x30(SB)/8, $0x6c625d5c524d4c42
-DATA  expandAVX512_22_outShufHi0+0x38(SB)/8, $0xffffffff7a75746d
-
-GLOBL expandAVX512_22_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_22_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_outShufHi1+0x10(SB)/8, $0xffff181110080100
-DATA  expandAVX512_22_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_outShufHi1+0x20(SB)/8, $0x0302ffffffffffff
-DATA  expandAVX512_22_outShufHi1+0x28(SB)/8, $0xffffffff19131209
-DATA  expandAVX512_22_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_22_outShufHi1+0x38(SB)/8, $0x140a0504ffffffff
-
-TEXT expandAVX512_22<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_22_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_22_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_22_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_22_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_22_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_22_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_22_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_22_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_22_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_22_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_22_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xffff03fffc0ffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0xf0000fc0003f0000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_24_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_inShuf0<>+0x00(SB)/8, $0x0202010101000000
-DATA  expandAVX512_24_inShuf0<>+0x08(SB)/8, $0x0202010101000000
-DATA  expandAVX512_24_inShuf0<>+0x10(SB)/8, $0x0202010101000000
-DATA  expandAVX512_24_inShuf0<>+0x18(SB)/8, $0x0202010101000000
-DATA  expandAVX512_24_inShuf0<>+0x20(SB)/8, $0x0202010101000000
-DATA  expandAVX512_24_inShuf0<>+0x28(SB)/8, $0xff02010101000000
-DATA  expandAVX512_24_inShuf0<>+0x30(SB)/8, $0xffff010101000000
-DATA  expandAVX512_24_inShuf0<>+0x38(SB)/8, $0xffff010101000000
-
-GLOBL expandAVX512_24_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_24_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_24_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_24_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_24_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_24_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_24_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_24_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_24_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_inShuf1<>+0x00(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_24_inShuf1<>+0x08(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_24_inShuf1<>+0x10(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_24_inShuf1<>+0x18(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_24_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_24_inShuf1<>+0x28(SB)/8, $0x0404040303030202
-DATA  expandAVX512_24_inShuf1<>+0x30(SB)/8, $0x0404030303020202
-DATA  expandAVX512_24_inShuf1<>+0x38(SB)/8, $0x0404030303020202
-
-GLOBL expandAVX512_24_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_inShuf2<>+0x00(SB)/8, $0x0505040404030303
-DATA  expandAVX512_24_inShuf2<>+0x08(SB)/8, $0x0505040404030303
-DATA  expandAVX512_24_inShuf2<>+0x10(SB)/8, $0x0505040404030303
-DATA  expandAVX512_24_inShuf2<>+0x18(SB)/8, $0xffff040404030303
-DATA  expandAVX512_24_inShuf2<>+0x20(SB)/8, $0xffff040404030303
-DATA  expandAVX512_24_inShuf2<>+0x28(SB)/8, $0xffffffffffffff04
-DATA  expandAVX512_24_inShuf2<>+0x30(SB)/8, $0xffffffffffffff04
-DATA  expandAVX512_24_inShuf2<>+0x38(SB)/8, $0xffffffffffffff05
-
-GLOBL expandAVX512_24_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_mat2<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_24_mat2<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_24_mat2<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_24_mat2<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_24_mat2<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_24_mat2<>+0x28(SB)/8, $0x4040404040404040
-DATA  expandAVX512_24_mat2<>+0x30(SB)/8, $0x8080808080808080
-DATA  expandAVX512_24_mat2<>+0x38(SB)/8, $0x0101010101010101
-
-GLOBL expandAVX512_24_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_inShuf3<>+0x00(SB)/8, $0xffffffffffffff05
-DATA  expandAVX512_24_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_24_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_24_mat3<>+0x00(SB)/8, $0x0202020202020202
-DATA  expandAVX512_24_mat3<>+0x08(SB)/8, $0x0000000000000000
-DATA  expandAVX512_24_mat3<>+0x10(SB)/8, $0x0000000000000000
-DATA  expandAVX512_24_mat3<>+0x18(SB)/8, $0x0000000000000000
-DATA  expandAVX512_24_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_24_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_24_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_24_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_24_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_24_outShufLo+0x00(SB)/8, $0x11100a0908020100
-DATA  expandAVX512_24_outShufLo+0x08(SB)/8, $0x282221201a191812
-DATA  expandAVX512_24_outShufLo+0x10(SB)/8, $0x3a39383231302a29
-DATA  expandAVX512_24_outShufLo+0x18(SB)/8, $0x14130d0c0b050403
-DATA  expandAVX512_24_outShufLo+0x20(SB)/8, $0x2b2524231d1c1b15
-DATA  expandAVX512_24_outShufLo+0x28(SB)/8, $0x3d3c3b3534332d2c
-DATA  expandAVX512_24_outShufLo+0x30(SB)/8, $0x1716480f0e400706
-DATA  expandAVX512_24_outShufLo+0x38(SB)/8, $0x2e602726581f1e50
-
-GLOBL expandAVX512_24_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_24_outShufHi0+0x00(SB)/8, $0x3a39383231302928
-DATA  expandAVX512_24_outShufHi0+0x08(SB)/8, $0x51504a4948424140
-DATA  expandAVX512_24_outShufHi0+0x10(SB)/8, $0x2a6261605a595852
-DATA  expandAVX512_24_outShufHi0+0x18(SB)/8, $0x3d3c3b3534332c2b
-DATA  expandAVX512_24_outShufHi0+0x20(SB)/8, $0x54534d4c4b454443
-DATA  expandAVX512_24_outShufHi0+0x28(SB)/8, $0x2d6564635d5c5b55
-DATA  expandAVX512_24_outShufHi0+0x30(SB)/8, $0x703f3e6837362f2e
-DATA  expandAVX512_24_outShufHi0+0x38(SB)/8, $0x5756ff4f4e784746
-
-GLOBL expandAVX512_24_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_24_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_24_outShufHi1+0x38(SB)/8, $0xffff00ffffffffff
-
-TEXT expandAVX512_24<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_24_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_24_mat0<>(SB), Z2
-       VMOVDQU64 expandAVX512_24_inShuf1<>(SB), Z3
-       VMOVDQU64 expandAVX512_24_inShuf2<>(SB), Z4
-       VMOVDQU64 expandAVX512_24_inShuf3<>(SB), Z5
-       VMOVDQU64 expandAVX512_24_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_24_outShufHi0(SB), Z6
-       VMOVDQU64 expandAVX512_24_outShufHi1(SB), Z7
-       VMOVDQU64 (AX), Z8
-       VPERMB Z8, Z0, Z0
-       VGF2P8AFFINEQB $0, Z2, Z0, Z0
-       VPERMB Z8, Z3, Z3
-       VGF2P8AFFINEQB $0, Z2, Z3, Z2
-       VPERMB Z8, Z4, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_24_mat2<>(SB), Z3, Z3
-       VPERMB Z8, Z5, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_24_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xdfffffffffffffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z6
-       MOVQ $0x2000000000000000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z7, K1, Z0
-       VPORQ Z0, Z6, Z2
-       RET
-
-GLOBL expandAVX512_26_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_inShuf0<>+0x00(SB)/8, $0x0202010101000000
-DATA  expandAVX512_26_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_26_inShuf0<>+0x10(SB)/8, $0xffff020201010000
-DATA  expandAVX512_26_inShuf0<>+0x18(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_26_inShuf0<>+0x20(SB)/8, $0xffff020201010000
-DATA  expandAVX512_26_inShuf0<>+0x28(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_26_inShuf0<>+0x30(SB)/8, $0x0202010101000000
-DATA  expandAVX512_26_inShuf0<>+0x38(SB)/8, $0xffff010101000000
-
-GLOBL expandAVX512_26_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_26_mat0<>+0x08(SB)/8, $0x0101020202020202
-DATA  expandAVX512_26_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_26_mat0<>+0x18(SB)/8, $0x0202020204040404
-DATA  expandAVX512_26_mat0<>+0x20(SB)/8, $0x0404040404040404
-DATA  expandAVX512_26_mat0<>+0x28(SB)/8, $0x0404040404040808
-DATA  expandAVX512_26_mat0<>+0x30(SB)/8, $0x0808080808080808
-DATA  expandAVX512_26_mat0<>+0x38(SB)/8, $0x1010101010101010
-
-GLOBL expandAVX512_26_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_26_inShuf1<>+0x08(SB)/8, $0xffffffff01010000
-DATA  expandAVX512_26_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_26_inShuf1<>+0x18(SB)/8, $0xffffffff01010000
-DATA  expandAVX512_26_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_26_inShuf1<>+0x28(SB)/8, $0xffff010101000000
-DATA  expandAVX512_26_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_26_inShuf1<>+0x38(SB)/8, $0xff04040403030302
-
-GLOBL expandAVX512_26_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_mat1<>+0x00(SB)/8, $0x1010202020202020
-DATA  expandAVX512_26_mat1<>+0x08(SB)/8, $0x2020202020202020
-DATA  expandAVX512_26_mat1<>+0x10(SB)/8, $0x2020202040404040
-DATA  expandAVX512_26_mat1<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_26_mat1<>+0x20(SB)/8, $0x4040404040408080
-DATA  expandAVX512_26_mat1<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_26_mat1<>+0x30(SB)/8, $0x0101010101010101
-DATA  expandAVX512_26_mat1<>+0x38(SB)/8, $0x0808080808080808
-
-GLOBL expandAVX512_26_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_inShuf2<>+0x00(SB)/8, $0x0404030303020202
-DATA  expandAVX512_26_inShuf2<>+0x08(SB)/8, $0xffffffffff040302
-DATA  expandAVX512_26_inShuf2<>+0x10(SB)/8, $0xffff040403030202
-DATA  expandAVX512_26_inShuf2<>+0x18(SB)/8, $0xffffffffff040302
-DATA  expandAVX512_26_inShuf2<>+0x20(SB)/8, $0xffff040403030202
-DATA  expandAVX512_26_inShuf2<>+0x28(SB)/8, $0xffffffffff040302
-DATA  expandAVX512_26_inShuf2<>+0x30(SB)/8, $0xff04030303020202
-DATA  expandAVX512_26_inShuf2<>+0x38(SB)/8, $0xffff040404030303
-
-GLOBL expandAVX512_26_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_mat2<>+0x00(SB)/8, $0x1010101010101010
-DATA  expandAVX512_26_mat2<>+0x08(SB)/8, $0x1010202020202020
-DATA  expandAVX512_26_mat2<>+0x10(SB)/8, $0x2020202020202020
-DATA  expandAVX512_26_mat2<>+0x18(SB)/8, $0x2020202040404040
-DATA  expandAVX512_26_mat2<>+0x20(SB)/8, $0x4040404040404040
-DATA  expandAVX512_26_mat2<>+0x28(SB)/8, $0x4040404040408080
-DATA  expandAVX512_26_mat2<>+0x30(SB)/8, $0x8080808080808080
-DATA  expandAVX512_26_mat2<>+0x38(SB)/8, $0x0101010101010101
-
-GLOBL expandAVX512_26_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403
-DATA  expandAVX512_26_inShuf3<>+0x08(SB)/8, $0xffffffff04040303
-DATA  expandAVX512_26_inShuf3<>+0x10(SB)/8, $0xffffffffffff0403
-DATA  expandAVX512_26_inShuf3<>+0x18(SB)/8, $0xffffffff04040303
-DATA  expandAVX512_26_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403
-DATA  expandAVX512_26_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04
-DATA  expandAVX512_26_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_26_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_26_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_26_mat3<>+0x00(SB)/8, $0x0101020202020202
-DATA  expandAVX512_26_mat3<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_26_mat3<>+0x10(SB)/8, $0x0202020204040404
-DATA  expandAVX512_26_mat3<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_26_mat3<>+0x20(SB)/8, $0x0404040404040808
-DATA  expandAVX512_26_mat3<>+0x28(SB)/8, $0x1010101010101010
-DATA  expandAVX512_26_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_26_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_26_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_26_outShufLo+0x00(SB)/8, $0x2018111008020100
-DATA  expandAVX512_26_outShufLo+0x08(SB)/8, $0x3a39383231302821
-DATA  expandAVX512_26_outShufLo+0x10(SB)/8, $0x6860595850494840
-DATA  expandAVX512_26_outShufLo+0x18(SB)/8, $0x1312090504036a69
-DATA  expandAVX512_26_outShufLo+0x20(SB)/8, $0x3b35343329232219
-DATA  expandAVX512_26_outShufLo+0x28(SB)/8, $0x5b5a514b4a413d3c
-DATA  expandAVX512_26_outShufLo+0x30(SB)/8, $0x0a7007066d6c6b61
-DATA  expandAVX512_26_outShufLo+0x38(SB)/8, $0x37362a25241a1514
-
-GLOBL expandAVX512_26_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_26_outShufHi0+0x00(SB)/8, $0x5851504842414038
-DATA  expandAVX512_26_outShufHi0+0x08(SB)/8, $0x7978727170686160
-DATA  expandAVX512_26_outShufHi0+0x10(SB)/8, $0xffffffffffffff7a
-DATA  expandAVX512_26_outShufHi0+0x18(SB)/8, $0x52494544433b3a39
-DATA  expandAVX512_26_outShufHi0+0x20(SB)/8, $0x7574736963625953
-DATA  expandAVX512_26_outShufHi0+0x28(SB)/8, $0xffffffffff7d7c7b
-DATA  expandAVX512_26_outShufHi0+0x30(SB)/8, $0xff47463e3d3cffff
-DATA  expandAVX512_26_outShufHi0+0x38(SB)/8, $0x766a65645a55544a
-
-GLOBL expandAVX512_26_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_26_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_26_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_26_outShufHi1+0x10(SB)/8, $0x20191810090800ff
-DATA  expandAVX512_26_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_26_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_26_outShufHi1+0x28(SB)/8, $0x1a110b0a01ffffff
-DATA  expandAVX512_26_outShufHi1+0x30(SB)/8, $0x28ffffffffff211b
-DATA  expandAVX512_26_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
-
-TEXT expandAVX512_26<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_26_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_26_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_26_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_26_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_26_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_26_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_26_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_26_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_26_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_26_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_26_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xff7c07ffff01ffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x83f80000fe0000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_28_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_inShuf0<>+0x00(SB)/8, $0x0202010101000000
-DATA  expandAVX512_28_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_28_inShuf0<>+0x10(SB)/8, $0x0202010101000000
-DATA  expandAVX512_28_inShuf0<>+0x18(SB)/8, $0xff02010101000000
-DATA  expandAVX512_28_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_28_inShuf0<>+0x28(SB)/8, $0xffff010101000000
-DATA  expandAVX512_28_inShuf0<>+0x30(SB)/8, $0xffff010101000000
-DATA  expandAVX512_28_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100
-
-GLOBL expandAVX512_28_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_28_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_28_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_28_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_28_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_28_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_28_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_28_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_28_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_inShuf1<>+0x00(SB)/8, $0xffff010101000000
-DATA  expandAVX512_28_inShuf1<>+0x08(SB)/8, $0xffff010101000000
-DATA  expandAVX512_28_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_28_inShuf1<>+0x18(SB)/8, $0xffff010101000000
-DATA  expandAVX512_28_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_28_inShuf1<>+0x28(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_28_inShuf1<>+0x30(SB)/8, $0x0404040303030202
-DATA  expandAVX512_28_inShuf1<>+0x38(SB)/8, $0xffffffffff040302
-
-GLOBL expandAVX512_28_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_28_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_28_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_28_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_28_mat1<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_28_mat1<>+0x28(SB)/8, $0x0202020202020202
-DATA  expandAVX512_28_mat1<>+0x30(SB)/8, $0x0404040404040404
-DATA  expandAVX512_28_mat1<>+0x38(SB)/8, $0x0404040408080808
-
-GLOBL expandAVX512_28_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_inShuf2<>+0x00(SB)/8, $0x0404030303020202
-DATA  expandAVX512_28_inShuf2<>+0x08(SB)/8, $0x0404030303020202
-DATA  expandAVX512_28_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_28_inShuf2<>+0x18(SB)/8, $0xffff030303020202
-DATA  expandAVX512_28_inShuf2<>+0x20(SB)/8, $0xffff030303020202
-DATA  expandAVX512_28_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_28_inShuf2<>+0x30(SB)/8, $0xffff030303020202
-DATA  expandAVX512_28_inShuf2<>+0x38(SB)/8, $0xffff040404030303
-
-GLOBL expandAVX512_28_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_mat2<>+0x00(SB)/8, $0x0808080808080808
-DATA  expandAVX512_28_mat2<>+0x08(SB)/8, $0x1010101010101010
-DATA  expandAVX512_28_mat2<>+0x10(SB)/8, $0x1010101020202020
-DATA  expandAVX512_28_mat2<>+0x18(SB)/8, $0x2020202020202020
-DATA  expandAVX512_28_mat2<>+0x20(SB)/8, $0x4040404040404040
-DATA  expandAVX512_28_mat2<>+0x28(SB)/8, $0x4040404080808080
-DATA  expandAVX512_28_mat2<>+0x30(SB)/8, $0x8080808080808080
-DATA  expandAVX512_28_mat2<>+0x38(SB)/8, $0x0101010101010101
-
-GLOBL expandAVX512_28_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403
-DATA  expandAVX512_28_inShuf3<>+0x08(SB)/8, $0xffff040404030303
-DATA  expandAVX512_28_inShuf3<>+0x10(SB)/8, $0xffffffffffffff04
-DATA  expandAVX512_28_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_28_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_28_mat3<>+0x00(SB)/8, $0x0101010102020202
-DATA  expandAVX512_28_mat3<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_28_mat3<>+0x10(SB)/8, $0x0808080808080808
-DATA  expandAVX512_28_mat3<>+0x18(SB)/8, $0x0000000000000000
-DATA  expandAVX512_28_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_28_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_28_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_28_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_28_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_28_outShufLo+0x00(SB)/8, $0x1812111008020100
-DATA  expandAVX512_28_outShufLo+0x08(SB)/8, $0x31302a2928201a19
-DATA  expandAVX512_28_outShufLo+0x10(SB)/8, $0x4a49484241403832
-DATA  expandAVX512_28_outShufLo+0x18(SB)/8, $0x090504035a595850
-DATA  expandAVX512_28_outShufLo+0x20(SB)/8, $0x2b211d1c1b151413
-DATA  expandAVX512_28_outShufLo+0x28(SB)/8, $0x4443393534332d2c
-DATA  expandAVX512_28_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b45
-DATA  expandAVX512_28_outShufLo+0x38(SB)/8, $0x1e6817160a600706
-
-GLOBL expandAVX512_28_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_28_outShufHi0+0x00(SB)/8, $0x4948424140383130
-DATA  expandAVX512_28_outShufHi0+0x08(SB)/8, $0x6261605a5958504a
-DATA  expandAVX512_28_outShufHi0+0x10(SB)/8, $0xff7a797872717068
-DATA  expandAVX512_28_outShufHi0+0x18(SB)/8, $0x4339343332ffffff
-DATA  expandAVX512_28_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b4544
-DATA  expandAVX512_28_outShufHi0+0x28(SB)/8, $0x757473696564635d
-DATA  expandAVX512_28_outShufHi0+0x30(SB)/8, $0x35ffffffff7d7c7b
-DATA  expandAVX512_28_outShufHi0+0x38(SB)/8, $0x4f4eff47463a3736
-
-GLOBL expandAVX512_28_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_28_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_outShufHi1+0x10(SB)/8, $0x00ffffffffffffff
-DATA  expandAVX512_28_outShufHi1+0x18(SB)/8, $0xffffffffff0a0908
-DATA  expandAVX512_28_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_28_outShufHi1+0x30(SB)/8, $0xff0d0c0b01ffffff
-DATA  expandAVX512_28_outShufHi1+0x38(SB)/8, $0xffff10ffffffffff
-
-TEXT expandAVX512_28<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_28_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_28_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_28_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_28_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_28_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_28_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_28_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_28_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_28_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_28_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_28_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xdf87fffff87fffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x2078000007800000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_30_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_inShuf0<>+0x00(SB)/8, $0x0202010101000000
-DATA  expandAVX512_30_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
-DATA  expandAVX512_30_inShuf0<>+0x10(SB)/8, $0xffff010101000000
-DATA  expandAVX512_30_inShuf0<>+0x18(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_30_inShuf0<>+0x20(SB)/8, $0xffff010101000000
-DATA  expandAVX512_30_inShuf0<>+0x28(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_30_inShuf0<>+0x30(SB)/8, $0xffff010101000000
-DATA  expandAVX512_30_inShuf0<>+0x38(SB)/8, $0xffff010101000000
-
-GLOBL expandAVX512_30_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_30_mat0<>+0x08(SB)/8, $0x0101010101010202
-DATA  expandAVX512_30_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_30_mat0<>+0x18(SB)/8, $0x0202020204040404
-DATA  expandAVX512_30_mat0<>+0x20(SB)/8, $0x0404040404040404
-DATA  expandAVX512_30_mat0<>+0x28(SB)/8, $0x0404080808080808
-DATA  expandAVX512_30_mat0<>+0x30(SB)/8, $0x0808080808080808
-DATA  expandAVX512_30_mat0<>+0x38(SB)/8, $0x1010101010101010
-
-GLOBL expandAVX512_30_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_30_inShuf1<>+0x08(SB)/8, $0xffff010101000000
-DATA  expandAVX512_30_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_30_inShuf1<>+0x18(SB)/8, $0xffff010101000000
-DATA  expandAVX512_30_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_30_inShuf1<>+0x28(SB)/8, $0xffff010101000000
-DATA  expandAVX512_30_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_30_inShuf1<>+0x38(SB)/8, $0x0404030303020202
-
-GLOBL expandAVX512_30_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_mat1<>+0x00(SB)/8, $0x1010101010102020
-DATA  expandAVX512_30_mat1<>+0x08(SB)/8, $0x2020202020202020
-DATA  expandAVX512_30_mat1<>+0x10(SB)/8, $0x2020202040404040
-DATA  expandAVX512_30_mat1<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_30_mat1<>+0x20(SB)/8, $0x4040808080808080
-DATA  expandAVX512_30_mat1<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_30_mat1<>+0x30(SB)/8, $0x0101010101010101
-DATA  expandAVX512_30_mat1<>+0x38(SB)/8, $0x0202020202020202
-
-GLOBL expandAVX512_30_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_inShuf2<>+0x00(SB)/8, $0xffffffffff040302
-DATA  expandAVX512_30_inShuf2<>+0x08(SB)/8, $0xffff030303020202
-DATA  expandAVX512_30_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_30_inShuf2<>+0x18(SB)/8, $0xffff030303020202
-DATA  expandAVX512_30_inShuf2<>+0x20(SB)/8, $0xffff030303020202
-DATA  expandAVX512_30_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_30_inShuf2<>+0x30(SB)/8, $0xffff030303020202
-DATA  expandAVX512_30_inShuf2<>+0x38(SB)/8, $0xffffffffffff0302
-
-GLOBL expandAVX512_30_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_mat2<>+0x00(SB)/8, $0x0202020204040404
-DATA  expandAVX512_30_mat2<>+0x08(SB)/8, $0x0404040404040404
-DATA  expandAVX512_30_mat2<>+0x10(SB)/8, $0x0404080808080808
-DATA  expandAVX512_30_mat2<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_30_mat2<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_30_mat2<>+0x28(SB)/8, $0x1010101010102020
-DATA  expandAVX512_30_mat2<>+0x30(SB)/8, $0x2020202020202020
-DATA  expandAVX512_30_mat2<>+0x38(SB)/8, $0x2020202040404040
-
-GLOBL expandAVX512_30_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_inShuf3<>+0x00(SB)/8, $0xffff030303020202
-DATA  expandAVX512_30_inShuf3<>+0x08(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_30_inShuf3<>+0x10(SB)/8, $0xffff030303020202
-DATA  expandAVX512_30_inShuf3<>+0x18(SB)/8, $0xffff040404030303
-DATA  expandAVX512_30_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403
-DATA  expandAVX512_30_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04
-DATA  expandAVX512_30_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_30_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_30_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_30_mat3<>+0x00(SB)/8, $0x4040404040404040
-DATA  expandAVX512_30_mat3<>+0x08(SB)/8, $0x4040808080808080
-DATA  expandAVX512_30_mat3<>+0x10(SB)/8, $0x8080808080808080
-DATA  expandAVX512_30_mat3<>+0x18(SB)/8, $0x0101010101010101
-DATA  expandAVX512_30_mat3<>+0x20(SB)/8, $0x0101010101010202
-DATA  expandAVX512_30_mat3<>+0x28(SB)/8, $0x0202020202020202
-DATA  expandAVX512_30_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_30_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_30_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_30_outShufLo+0x00(SB)/8, $0x1812111008020100
-DATA  expandAVX512_30_outShufLo+0x08(SB)/8, $0x3832313028222120
-DATA  expandAVX512_30_outShufLo+0x10(SB)/8, $0x58504a4948403a39
-DATA  expandAVX512_30_outShufLo+0x18(SB)/8, $0x04036a6968605a59
-DATA  expandAVX512_30_outShufLo+0x20(SB)/8, $0x2423191514130905
-DATA  expandAVX512_30_outShufLo+0x28(SB)/8, $0x3d3c3b3534332925
-DATA  expandAVX512_30_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b41
-DATA  expandAVX512_30_outShufLo+0x38(SB)/8, $0x0a7007066d6c6b61
-
-GLOBL expandAVX512_30_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_30_outShufHi0+0x00(SB)/8, $0x504a4948403a3938
-DATA  expandAVX512_30_outShufHi0+0x08(SB)/8, $0x70686261605a5958
-DATA  expandAVX512_30_outShufHi0+0x10(SB)/8, $0xffffffffff787271
-DATA  expandAVX512_30_outShufHi0+0x18(SB)/8, $0x3c3bffffffffffff
-DATA  expandAVX512_30_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b413d
-DATA  expandAVX512_30_outShufHi0+0x28(SB)/8, $0x757473696564635d
-DATA  expandAVX512_30_outShufHi0+0x30(SB)/8, $0xffffffffffffff79
-DATA  expandAVX512_30_outShufHi0+0x38(SB)/8, $0x42ff3f3effffffff
-
-GLOBL expandAVX512_30_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_30_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_30_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_30_outShufHi1+0x10(SB)/8, $0x1008020100ffffff
-DATA  expandAVX512_30_outShufHi1+0x18(SB)/8, $0xffff201a19181211
-DATA  expandAVX512_30_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_30_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_30_outShufHi1+0x30(SB)/8, $0x15141309050403ff
-DATA  expandAVX512_30_outShufHi1+0x38(SB)/8, $0xff28ffff211d1c1b
-
-TEXT expandAVX512_30<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_30_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_30_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_30_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_30_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_30_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_30_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_30_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_30_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_30_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_30_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_30_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xb001ffffc007ffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x4ffe00003ff80000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_32_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_32_inShuf0<>+0x00(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x08(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x10(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x18(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x20(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x28(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x30(SB)/8, $0x0101010100000000
-DATA  expandAVX512_32_inShuf0<>+0x38(SB)/8, $0x0101010100000000
-
-GLOBL expandAVX512_32_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_32_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_32_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_32_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_32_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_32_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_32_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_32_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_32_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_32_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_32_inShuf1<>+0x00(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x08(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x10(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x18(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x20(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x28(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x30(SB)/8, $0x0303030302020202
-DATA  expandAVX512_32_inShuf1<>+0x38(SB)/8, $0x0303030302020202
-
-GLOBL expandAVX512_32_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_32_outShufLo+0x00(SB)/8, $0x0b0a090803020100
-DATA  expandAVX512_32_outShufLo+0x08(SB)/8, $0x1b1a191813121110
-DATA  expandAVX512_32_outShufLo+0x10(SB)/8, $0x2b2a292823222120
-DATA  expandAVX512_32_outShufLo+0x18(SB)/8, $0x3b3a393833323130
-DATA  expandAVX512_32_outShufLo+0x20(SB)/8, $0x0f0e0d0c07060504
-DATA  expandAVX512_32_outShufLo+0x28(SB)/8, $0x1f1e1d1c17161514
-DATA  expandAVX512_32_outShufLo+0x30(SB)/8, $0x2f2e2d2c27262524
-DATA  expandAVX512_32_outShufLo+0x38(SB)/8, $0x3f3e3d3c37363534
-
-TEXT expandAVX512_32<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_32_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_32_mat0<>(SB), Z1
-       VMOVDQU64 expandAVX512_32_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_32_outShufLo(SB), Z3
-       VMOVDQU64 (AX), Z4
-       VPERMB Z4, Z0, Z0
-       VGF2P8AFFINEQB $0, Z1, Z0, Z0
-       VPERMB Z4, Z2, Z2
-       VGF2P8AFFINEQB $0, Z1, Z2, Z2
-       VPERMB Z0, Z3, Z1
-       VPERMB Z2, Z3, Z2
-       RET
-
-GLOBL expandAVX512_36_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_36_inShuf0<>+0x00(SB)/8, $0x0101010100000000
-DATA  expandAVX512_36_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_36_inShuf0<>+0x10(SB)/8, $0x0101010100000000
-DATA  expandAVX512_36_inShuf0<>+0x18(SB)/8, $0x0101010100000000
-DATA  expandAVX512_36_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_36_inShuf0<>+0x28(SB)/8, $0x0101010100000000
-DATA  expandAVX512_36_inShuf0<>+0x30(SB)/8, $0x0101010100000000
-DATA  expandAVX512_36_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100
-
-GLOBL expandAVX512_36_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_36_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_36_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_36_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_36_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_36_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_36_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_36_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_36_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_36_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_36_inShuf1<>+0x00(SB)/8, $0x0101010100000000
-DATA  expandAVX512_36_inShuf1<>+0x08(SB)/8, $0xffffff0100000000
-DATA  expandAVX512_36_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_36_inShuf1<>+0x18(SB)/8, $0xffffffff00000000
-DATA  expandAVX512_36_inShuf1<>+0x20(SB)/8, $0xff02020202010101
-DATA  expandAVX512_36_inShuf1<>+0x28(SB)/8, $0xffffffffffff0201
-DATA  expandAVX512_36_inShuf1<>+0x30(SB)/8, $0x0202020201010101
-DATA  expandAVX512_36_inShuf1<>+0x38(SB)/8, $0x0303030302020202
-
-GLOBL expandAVX512_36_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_36_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_36_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_36_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_36_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_36_mat1<>+0x20(SB)/8, $0x4040404040404040
-DATA  expandAVX512_36_mat1<>+0x28(SB)/8, $0x4040404080808080
-DATA  expandAVX512_36_mat1<>+0x30(SB)/8, $0x8080808080808080
-DATA  expandAVX512_36_mat1<>+0x38(SB)/8, $0x0101010101010101
-
-GLOBL expandAVX512_36_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_36_inShuf2<>+0x00(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_36_inShuf2<>+0x08(SB)/8, $0x0303030302020202
-DATA  expandAVX512_36_inShuf2<>+0x10(SB)/8, $0x0303030302020202
-DATA  expandAVX512_36_inShuf2<>+0x18(SB)/8, $0xffffffffffff0302
-DATA  expandAVX512_36_inShuf2<>+0x20(SB)/8, $0x0303030302020202
-DATA  expandAVX512_36_inShuf2<>+0x28(SB)/8, $0xffff030302020202
-DATA  expandAVX512_36_inShuf2<>+0x30(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_36_inShuf2<>+0x38(SB)/8, $0xffffffff02020202
-
-GLOBL expandAVX512_36_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_36_mat2<>+0x00(SB)/8, $0x0101010102020202
-DATA  expandAVX512_36_mat2<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_36_mat2<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_36_mat2<>+0x18(SB)/8, $0x0404040408080808
-DATA  expandAVX512_36_mat2<>+0x20(SB)/8, $0x0808080808080808
-DATA  expandAVX512_36_mat2<>+0x28(SB)/8, $0x1010101010101010
-DATA  expandAVX512_36_mat2<>+0x30(SB)/8, $0x1010101020202020
-DATA  expandAVX512_36_mat2<>+0x38(SB)/8, $0x2020202020202020
-
-GLOBL expandAVX512_36_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_36_outShufLo+0x00(SB)/8, $0x1211100803020100
-DATA  expandAVX512_36_outShufLo+0x08(SB)/8, $0x2928201b1a191813
-DATA  expandAVX512_36_outShufLo+0x10(SB)/8, $0x4038333231302b2a
-DATA  expandAVX512_36_outShufLo+0x18(SB)/8, $0x504b4a4948434241
-DATA  expandAVX512_36_outShufLo+0x20(SB)/8, $0x070605045b5a5958
-DATA  expandAVX512_36_outShufLo+0x28(SB)/8, $0x1e1d1c1716151409
-DATA  expandAVX512_36_outShufLo+0x30(SB)/8, $0x35342f2e2d2c211f
-DATA  expandAVX512_36_outShufLo+0x38(SB)/8, $0x4c47464544393736
-
-GLOBL expandAVX512_36_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_36_outShufHi+0x00(SB)/8, $0x3332313028222120
-DATA  expandAVX512_36_outShufHi+0x08(SB)/8, $0x4a4948403b3a3938
-DATA  expandAVX512_36_outShufHi+0x10(SB)/8, $0x616058535251504b
-DATA  expandAVX512_36_outShufHi+0x18(SB)/8, $0x78706b6a69686362
-DATA  expandAVX512_36_outShufHi+0x20(SB)/8, $0x29262524237b7a79
-DATA  expandAVX512_36_outShufHi+0x28(SB)/8, $0x3f3e3d3c37363534
-DATA  expandAVX512_36_outShufHi+0x30(SB)/8, $0x5655544f4e4d4c41
-DATA  expandAVX512_36_outShufHi+0x38(SB)/8, $0x6d6c676665645957
-
-TEXT expandAVX512_36<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_36_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_36_inShuf1<>(SB), Z3
-       VMOVDQU64 expandAVX512_36_inShuf2<>(SB), Z4
-       VMOVDQU64 expandAVX512_36_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_36_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z5
-       VPERMB Z5, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_36_mat0<>(SB), Z0, Z0
-       VPERMB Z5, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_36_mat1<>(SB), Z3, Z3
-       VPERMB Z5, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_36_mat2<>(SB), Z4, Z4
-       VPERMI2B Z3, Z0, Z1
-       VPERMI2B Z4, Z3, Z2
-       RET
-
-GLOBL expandAVX512_40_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_inShuf0<>+0x00(SB)/8, $0x0101010000000000
-DATA  expandAVX512_40_inShuf0<>+0x08(SB)/8, $0x0101010000000000
-DATA  expandAVX512_40_inShuf0<>+0x10(SB)/8, $0x0101010000000000
-DATA  expandAVX512_40_inShuf0<>+0x18(SB)/8, $0x0101010000000000
-DATA  expandAVX512_40_inShuf0<>+0x20(SB)/8, $0x0101010000000000
-DATA  expandAVX512_40_inShuf0<>+0x28(SB)/8, $0xffffff0000000000
-DATA  expandAVX512_40_inShuf0<>+0x30(SB)/8, $0xffffff0000000000
-DATA  expandAVX512_40_inShuf0<>+0x38(SB)/8, $0xffffff0000000000
-
-GLOBL expandAVX512_40_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_40_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_40_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_40_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_40_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_40_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_40_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_40_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_40_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_inShuf1<>+0x00(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_40_inShuf1<>+0x08(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_40_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_40_inShuf1<>+0x18(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_40_inShuf1<>+0x20(SB)/8, $0xffffffffffffff01
-DATA  expandAVX512_40_inShuf1<>+0x28(SB)/8, $0xffff020202020201
-DATA  expandAVX512_40_inShuf1<>+0x30(SB)/8, $0x0202020101010101
-DATA  expandAVX512_40_inShuf1<>+0x38(SB)/8, $0x0202020101010101
-
-GLOBL expandAVX512_40_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_mat1<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_40_mat1<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_40_mat1<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_40_mat1<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_40_mat1<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_40_mat1<>+0x28(SB)/8, $0x1010101010101010
-DATA  expandAVX512_40_mat1<>+0x30(SB)/8, $0x2020202020202020
-DATA  expandAVX512_40_mat1<>+0x38(SB)/8, $0x4040404040404040
-
-GLOBL expandAVX512_40_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_inShuf2<>+0x00(SB)/8, $0x0202020101010101
-DATA  expandAVX512_40_inShuf2<>+0x08(SB)/8, $0x0303030202020202
-DATA  expandAVX512_40_inShuf2<>+0x10(SB)/8, $0x0303030202020202
-DATA  expandAVX512_40_inShuf2<>+0x18(SB)/8, $0xffffff0202020202
-DATA  expandAVX512_40_inShuf2<>+0x20(SB)/8, $0xffffff0202020202
-DATA  expandAVX512_40_inShuf2<>+0x28(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_40_inShuf2<>+0x30(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_40_inShuf2<>+0x38(SB)/8, $0xffffffffffff0202
-
-GLOBL expandAVX512_40_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_mat2<>+0x00(SB)/8, $0x8080808080808080
-DATA  expandAVX512_40_mat2<>+0x08(SB)/8, $0x0101010101010101
-DATA  expandAVX512_40_mat2<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_40_mat2<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_40_mat2<>+0x20(SB)/8, $0x0808080808080808
-DATA  expandAVX512_40_mat2<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_40_mat2<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_40_mat2<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_40_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_inShuf3<>+0x00(SB)/8, $0xffffffffffff0303
-DATA  expandAVX512_40_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_40_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_40_mat3<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_40_mat3<>+0x08(SB)/8, $0x0000000000000000
-DATA  expandAVX512_40_mat3<>+0x10(SB)/8, $0x0000000000000000
-DATA  expandAVX512_40_mat3<>+0x18(SB)/8, $0x0000000000000000
-DATA  expandAVX512_40_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_40_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_40_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_40_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_40_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_40_outShufLo+0x00(SB)/8, $0x0a09080403020100
-DATA  expandAVX512_40_outShufLo+0x08(SB)/8, $0x1814131211100c0b
-DATA  expandAVX512_40_outShufLo+0x10(SB)/8, $0x232221201c1b1a19
-DATA  expandAVX512_40_outShufLo+0x18(SB)/8, $0x31302c2b2a292824
-DATA  expandAVX512_40_outShufLo+0x20(SB)/8, $0x3c3b3a3938343332
-DATA  expandAVX512_40_outShufLo+0x28(SB)/8, $0x0f0e0d4140070605
-DATA  expandAVX512_40_outShufLo+0x30(SB)/8, $0x1d51501716154948
-DATA  expandAVX512_40_outShufLo+0x38(SB)/8, $0x6027262559581f1e
-
-GLOBL expandAVX512_40_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_40_outShufHi0+0x00(SB)/8, $0x3938343332313028
-DATA  expandAVX512_40_outShufHi0+0x08(SB)/8, $0x44434241403c3b3a
-DATA  expandAVX512_40_outShufHi0+0x10(SB)/8, $0x5251504c4b4a4948
-DATA  expandAVX512_40_outShufHi0+0x18(SB)/8, $0x605c5b5a59585453
-DATA  expandAVX512_40_outShufHi0+0x20(SB)/8, $0x2c2b2a2964636261
-DATA  expandAVX512_40_outShufHi0+0x28(SB)/8, $0x3e3d69683736352d
-DATA  expandAVX512_40_outShufHi0+0x30(SB)/8, $0x797847464571703f
-DATA  expandAVX512_40_outShufHi0+0x38(SB)/8, $0x575655ffff4f4e4d
-
-GLOBL expandAVX512_40_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_40_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_40_outShufHi1+0x38(SB)/8, $0xffffff0100ffffff
-
-TEXT expandAVX512_40<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_40_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_40_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_40_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_40_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_40_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_40_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_40_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_40_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_40_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_40_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_40_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xe7ffffffffffffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x1800000000000000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_44_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_inShuf0<>+0x00(SB)/8, $0x0101010000000000
-DATA  expandAVX512_44_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_44_inShuf0<>+0x10(SB)/8, $0x0101010000000000
-DATA  expandAVX512_44_inShuf0<>+0x18(SB)/8, $0x0101010000000000
-DATA  expandAVX512_44_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_44_inShuf0<>+0x28(SB)/8, $0x0101010000000000
-DATA  expandAVX512_44_inShuf0<>+0x30(SB)/8, $0xffffff0000000000
-DATA  expandAVX512_44_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
-
-GLOBL expandAVX512_44_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_44_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_44_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_44_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_44_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_44_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_44_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_44_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_44_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_inShuf1<>+0x00(SB)/8, $0xffffff0000000000
-DATA  expandAVX512_44_inShuf1<>+0x08(SB)/8, $0xffffff0000000000
-DATA  expandAVX512_44_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_44_inShuf1<>+0x18(SB)/8, $0xffffff0000000000
-DATA  expandAVX512_44_inShuf1<>+0x20(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_44_inShuf1<>+0x28(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_44_inShuf1<>+0x30(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_44_inShuf1<>+0x38(SB)/8, $0xff02020202020101
-
-GLOBL expandAVX512_44_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_44_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_44_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_44_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_44_mat1<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_44_mat1<>+0x28(SB)/8, $0x0202020202020202
-DATA  expandAVX512_44_mat1<>+0x30(SB)/8, $0x0404040404040404
-DATA  expandAVX512_44_mat1<>+0x38(SB)/8, $0x0808080808080808
-
-GLOBL expandAVX512_44_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_inShuf2<>+0x00(SB)/8, $0x0202020101010101
-DATA  expandAVX512_44_inShuf2<>+0x08(SB)/8, $0xffffffffffff0201
-DATA  expandAVX512_44_inShuf2<>+0x10(SB)/8, $0x0202020101010101
-DATA  expandAVX512_44_inShuf2<>+0x18(SB)/8, $0x0202020101010101
-DATA  expandAVX512_44_inShuf2<>+0x20(SB)/8, $0xffffffffffff0201
-DATA  expandAVX512_44_inShuf2<>+0x28(SB)/8, $0xffff020101010101
-DATA  expandAVX512_44_inShuf2<>+0x30(SB)/8, $0xffffff0202020202
-DATA  expandAVX512_44_inShuf2<>+0x38(SB)/8, $0xffffffffffffff02
-
-GLOBL expandAVX512_44_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_mat2<>+0x00(SB)/8, $0x1010101010101010
-DATA  expandAVX512_44_mat2<>+0x08(SB)/8, $0x1010101020202020
-DATA  expandAVX512_44_mat2<>+0x10(SB)/8, $0x2020202020202020
-DATA  expandAVX512_44_mat2<>+0x18(SB)/8, $0x4040404040404040
-DATA  expandAVX512_44_mat2<>+0x20(SB)/8, $0x4040404080808080
-DATA  expandAVX512_44_mat2<>+0x28(SB)/8, $0x8080808080808080
-DATA  expandAVX512_44_mat2<>+0x30(SB)/8, $0x0101010101010101
-DATA  expandAVX512_44_mat2<>+0x38(SB)/8, $0x0101010102020202
-
-GLOBL expandAVX512_44_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_inShuf3<>+0x00(SB)/8, $0xffffff0202020202
-DATA  expandAVX512_44_inShuf3<>+0x08(SB)/8, $0xffffff0202020202
-DATA  expandAVX512_44_inShuf3<>+0x10(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_44_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_44_inShuf3<>+0x20(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_44_inShuf3<>+0x28(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_44_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_44_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_44_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_44_mat3<>+0x00(SB)/8, $0x0202020202020202
-DATA  expandAVX512_44_mat3<>+0x08(SB)/8, $0x0404040404040404
-DATA  expandAVX512_44_mat3<>+0x10(SB)/8, $0x0404040408080808
-DATA  expandAVX512_44_mat3<>+0x18(SB)/8, $0x1010101010101010
-DATA  expandAVX512_44_mat3<>+0x20(SB)/8, $0x2020202020202020
-DATA  expandAVX512_44_mat3<>+0x28(SB)/8, $0x4040404040404040
-DATA  expandAVX512_44_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_44_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_44_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_44_outShufLo+0x00(SB)/8, $0x1110080403020100
-DATA  expandAVX512_44_outShufLo+0x08(SB)/8, $0x1c1b1a1918141312
-DATA  expandAVX512_44_outShufLo+0x10(SB)/8, $0x31302c2b2a292820
-DATA  expandAVX512_44_outShufLo+0x18(SB)/8, $0x4342414038343332
-DATA  expandAVX512_44_outShufLo+0x20(SB)/8, $0x58504c4b4a494844
-DATA  expandAVX512_44_outShufLo+0x28(SB)/8, $0x600706055c5b5a59
-DATA  expandAVX512_44_outShufLo+0x30(SB)/8, $0x1d69681716150961
-DATA  expandAVX512_44_outShufLo+0x38(SB)/8, $0x2f2e2d2171701f1e
-
-GLOBL expandAVX512_44_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_44_outShufHi0+0x00(SB)/8, $0x4844434241403938
-DATA  expandAVX512_44_outShufHi0+0x08(SB)/8, $0x5a59585453525150
-DATA  expandAVX512_44_outShufHi0+0x10(SB)/8, $0x6c6b6a6968605c5b
-DATA  expandAVX512_44_outShufHi0+0x18(SB)/8, $0xffff787473727170
-DATA  expandAVX512_44_outShufHi0+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_44_outShufHi0+0x28(SB)/8, $0x46453e3d3c3b3aff
-DATA  expandAVX512_44_outShufHi0+0x30(SB)/8, $0xff57565549ffff47
-DATA  expandAVX512_44_outShufHi0+0x38(SB)/8, $0x6d61ffff5f5e5dff
-
-GLOBL expandAVX512_44_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_44_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_44_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_44_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_44_outShufHi1+0x18(SB)/8, $0x0100ffffffffffff
-DATA  expandAVX512_44_outShufHi1+0x20(SB)/8, $0x0c0b0a0908040302
-DATA  expandAVX512_44_outShufHi1+0x28(SB)/8, $0xffffffffffffff10
-DATA  expandAVX512_44_outShufHi1+0x30(SB)/8, $0x20ffffffff1918ff
-DATA  expandAVX512_44_outShufHi1+0x38(SB)/8, $0xffff2928ffffff21
-
-TEXT expandAVX512_44<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_44_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_44_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_44_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_44_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_44_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_44_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_44_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_44_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_44_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_44_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_44_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0xce79fe003fffffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x318601ffc0000000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_48_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_48_inShuf0<>+0x00(SB)/8, $0x0101000000000000
-DATA  expandAVX512_48_inShuf0<>+0x08(SB)/8, $0x0101000000000000
-DATA  expandAVX512_48_inShuf0<>+0x10(SB)/8, $0x0101000000000000
-DATA  expandAVX512_48_inShuf0<>+0x18(SB)/8, $0xffff000000000000
-DATA  expandAVX512_48_inShuf0<>+0x20(SB)/8, $0xffff000000000000
-DATA  expandAVX512_48_inShuf0<>+0x28(SB)/8, $0xffff000000000000
-DATA  expandAVX512_48_inShuf0<>+0x30(SB)/8, $0xffff000000000000
-DATA  expandAVX512_48_inShuf0<>+0x38(SB)/8, $0xffff000000000000
-
-GLOBL expandAVX512_48_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_48_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_48_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_48_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_48_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_48_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_48_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_48_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_48_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_48_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_48_inShuf1<>+0x00(SB)/8, $0xffffffff01010101
-DATA  expandAVX512_48_inShuf1<>+0x08(SB)/8, $0xffffffff01010101
-DATA  expandAVX512_48_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101
-DATA  expandAVX512_48_inShuf1<>+0x18(SB)/8, $0x0202020202020101
-DATA  expandAVX512_48_inShuf1<>+0x20(SB)/8, $0x0202010101010101
-DATA  expandAVX512_48_inShuf1<>+0x28(SB)/8, $0x0202010101010101
-DATA  expandAVX512_48_inShuf1<>+0x30(SB)/8, $0x0202010101010101
-DATA  expandAVX512_48_inShuf1<>+0x38(SB)/8, $0xffff010101010101
-
-GLOBL expandAVX512_48_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_48_mat1<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_48_mat1<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_48_mat1<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_48_mat1<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_48_mat1<>+0x20(SB)/8, $0x0808080808080808
-DATA  expandAVX512_48_mat1<>+0x28(SB)/8, $0x1010101010101010
-DATA  expandAVX512_48_mat1<>+0x30(SB)/8, $0x2020202020202020
-DATA  expandAVX512_48_mat1<>+0x38(SB)/8, $0x4040404040404040
-
-GLOBL expandAVX512_48_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_48_inShuf2<>+0x00(SB)/8, $0xffff010101010101
-DATA  expandAVX512_48_inShuf2<>+0x08(SB)/8, $0xffff020202020202
-DATA  expandAVX512_48_inShuf2<>+0x10(SB)/8, $0xffff020202020202
-DATA  expandAVX512_48_inShuf2<>+0x18(SB)/8, $0xffffffff02020202
-DATA  expandAVX512_48_inShuf2<>+0x20(SB)/8, $0xffffffff02020202
-DATA  expandAVX512_48_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_48_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_48_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_48_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_48_mat2<>+0x00(SB)/8, $0x8080808080808080
-DATA  expandAVX512_48_mat2<>+0x08(SB)/8, $0x0101010101010101
-DATA  expandAVX512_48_mat2<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_48_mat2<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_48_mat2<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_48_mat2<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_48_mat2<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_48_mat2<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_48_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_48_outShufLo+0x00(SB)/8, $0x0908050403020100
-DATA  expandAVX512_48_outShufLo+0x08(SB)/8, $0x131211100d0c0b0a
-DATA  expandAVX512_48_outShufLo+0x10(SB)/8, $0x1d1c1b1a19181514
-DATA  expandAVX512_48_outShufLo+0x18(SB)/8, $0x2928252423222120
-DATA  expandAVX512_48_outShufLo+0x20(SB)/8, $0x333231302d2c2b2a
-DATA  expandAVX512_48_outShufLo+0x28(SB)/8, $0x3d3c3b3a39383534
-DATA  expandAVX512_48_outShufLo+0x30(SB)/8, $0x0f0e434241400706
-DATA  expandAVX512_48_outShufLo+0x38(SB)/8, $0x515017164b4a4948
-
-GLOBL expandAVX512_48_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_48_outShufHi+0x00(SB)/8, $0x2524232221201918
-DATA  expandAVX512_48_outShufHi+0x08(SB)/8, $0x31302d2c2b2a2928
-DATA  expandAVX512_48_outShufHi+0x10(SB)/8, $0x3b3a393835343332
-DATA  expandAVX512_48_outShufHi+0x18(SB)/8, $0x4544434241403d3c
-DATA  expandAVX512_48_outShufHi+0x20(SB)/8, $0x51504d4c4b4a4948
-DATA  expandAVX512_48_outShufHi+0x28(SB)/8, $0x1d1c1b1a55545352
-DATA  expandAVX512_48_outShufHi+0x30(SB)/8, $0x5b5a595827261f1e
-DATA  expandAVX512_48_outShufHi+0x38(SB)/8, $0x3736636261602f2e
-
-TEXT expandAVX512_48<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_48_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_48_inShuf1<>(SB), Z3
-       VMOVDQU64 expandAVX512_48_inShuf2<>(SB), Z4
-       VMOVDQU64 expandAVX512_48_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_48_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z5
-       VPERMB Z5, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_48_mat0<>(SB), Z0, Z0
-       VPERMB Z5, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_48_mat1<>(SB), Z3, Z3
-       VPERMB Z5, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_48_mat2<>(SB), Z4, Z4
-       VPERMI2B Z3, Z0, Z1
-       VPERMI2B Z4, Z3, Z2
-       RET
-
-GLOBL expandAVX512_52_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_inShuf0<>+0x00(SB)/8, $0x0101000000000000
-DATA  expandAVX512_52_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
-DATA  expandAVX512_52_inShuf0<>+0x10(SB)/8, $0x0101000000000000
-DATA  expandAVX512_52_inShuf0<>+0x18(SB)/8, $0xffff000000000000
-DATA  expandAVX512_52_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_52_inShuf0<>+0x28(SB)/8, $0xffff000000000000
-DATA  expandAVX512_52_inShuf0<>+0x30(SB)/8, $0xffff000000000000
-DATA  expandAVX512_52_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
-
-GLOBL expandAVX512_52_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_52_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_52_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_52_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_52_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_52_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_52_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_52_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_52_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_inShuf1<>+0x00(SB)/8, $0xffff000000000000
-DATA  expandAVX512_52_inShuf1<>+0x08(SB)/8, $0xffff000000000000
-DATA  expandAVX512_52_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_52_inShuf1<>+0x18(SB)/8, $0xffff000000000000
-DATA  expandAVX512_52_inShuf1<>+0x20(SB)/8, $0xffffffff01010101
-DATA  expandAVX512_52_inShuf1<>+0x28(SB)/8, $0xffffffffff010101
-DATA  expandAVX512_52_inShuf1<>+0x30(SB)/8, $0xff02020202020201
-DATA  expandAVX512_52_inShuf1<>+0x38(SB)/8, $0x0202010101010101
-
-GLOBL expandAVX512_52_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_52_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_52_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_52_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_52_mat1<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_52_mat1<>+0x28(SB)/8, $0x0202020202020202
-DATA  expandAVX512_52_mat1<>+0x30(SB)/8, $0x0202020202020202
-DATA  expandAVX512_52_mat1<>+0x38(SB)/8, $0x0404040404040404
-
-GLOBL expandAVX512_52_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_inShuf2<>+0x00(SB)/8, $0xffffffffffff0201
-DATA  expandAVX512_52_inShuf2<>+0x08(SB)/8, $0x0202010101010101
-DATA  expandAVX512_52_inShuf2<>+0x10(SB)/8, $0xffff010101010101
-DATA  expandAVX512_52_inShuf2<>+0x18(SB)/8, $0xffffffffffffff01
-DATA  expandAVX512_52_inShuf2<>+0x20(SB)/8, $0xffff010101010101
-DATA  expandAVX512_52_inShuf2<>+0x28(SB)/8, $0xffff010101010101
-DATA  expandAVX512_52_inShuf2<>+0x30(SB)/8, $0xffffffffffffff01
-DATA  expandAVX512_52_inShuf2<>+0x38(SB)/8, $0xffff010101010101
-
-GLOBL expandAVX512_52_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_mat2<>+0x00(SB)/8, $0x0404040408080808
-DATA  expandAVX512_52_mat2<>+0x08(SB)/8, $0x0808080808080808
-DATA  expandAVX512_52_mat2<>+0x10(SB)/8, $0x1010101010101010
-DATA  expandAVX512_52_mat2<>+0x18(SB)/8, $0x1010101020202020
-DATA  expandAVX512_52_mat2<>+0x20(SB)/8, $0x2020202020202020
-DATA  expandAVX512_52_mat2<>+0x28(SB)/8, $0x4040404040404040
-DATA  expandAVX512_52_mat2<>+0x30(SB)/8, $0x4040404080808080
-DATA  expandAVX512_52_mat2<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_52_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_inShuf3<>+0x00(SB)/8, $0xffff020202020202
-DATA  expandAVX512_52_inShuf3<>+0x08(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_52_inShuf3<>+0x10(SB)/8, $0xffffffff02020202
-DATA  expandAVX512_52_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_52_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_52_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_52_mat3<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_52_mat3<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_52_mat3<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_52_mat3<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_52_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_52_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_52_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_52_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_52_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_52_outShufLo+0x00(SB)/8, $0x1008050403020100
-DATA  expandAVX512_52_outShufLo+0x08(SB)/8, $0x1a19181514131211
-DATA  expandAVX512_52_outShufLo+0x10(SB)/8, $0x2b2a2928201d1c1b
-DATA  expandAVX512_52_outShufLo+0x18(SB)/8, $0x3534333231302d2c
-DATA  expandAVX512_52_outShufLo+0x20(SB)/8, $0x4845444342414038
-DATA  expandAVX512_52_outShufLo+0x28(SB)/8, $0x5958504d4c4b4a49
-DATA  expandAVX512_52_outShufLo+0x30(SB)/8, $0x616007065d5c5b5a
-DATA  expandAVX512_52_outShufLo+0x38(SB)/8, $0x6a69681716096362
-
-GLOBL expandAVX512_52_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_52_outShufHi0+0x00(SB)/8, $0x403d3c3b3a393830
-DATA  expandAVX512_52_outShufHi0+0x08(SB)/8, $0x51504d4c4b4a4948
-DATA  expandAVX512_52_outShufHi0+0x10(SB)/8, $0x6261605855545352
-DATA  expandAVX512_52_outShufHi0+0x18(SB)/8, $0x6c6b6a6968656463
-DATA  expandAVX512_52_outShufHi0+0x20(SB)/8, $0x7d7c7b7a7978706d
-DATA  expandAVX512_52_outShufHi0+0x28(SB)/8, $0x31ffffffffffffff
-DATA  expandAVX512_52_outShufHi0+0x30(SB)/8, $0xff3f3e3635343332
-DATA  expandAVX512_52_outShufHi0+0x38(SB)/8, $0xffff4f4e41ffffff
-
-GLOBL expandAVX512_52_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_52_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_52_outShufHi1+0x28(SB)/8, $0xff08050403020100
-DATA  expandAVX512_52_outShufHi1+0x30(SB)/8, $0x10ffffffffffffff
-DATA  expandAVX512_52_outShufHi1+0x38(SB)/8, $0x1918ffffff131211
-
-TEXT expandAVX512_52<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_52_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_52_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_52_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_52_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_52_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_52_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_52_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_52_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_52_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_52_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_52_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0x387f80ffffffffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0xc7807f0000000000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_56_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_56_inShuf0<>+0x00(SB)/8, $0x0100000000000000
-DATA  expandAVX512_56_inShuf0<>+0x08(SB)/8, $0x0100000000000000
-DATA  expandAVX512_56_inShuf0<>+0x10(SB)/8, $0xff00000000000000
-DATA  expandAVX512_56_inShuf0<>+0x18(SB)/8, $0xff00000000000000
-DATA  expandAVX512_56_inShuf0<>+0x20(SB)/8, $0xff00000000000000
-DATA  expandAVX512_56_inShuf0<>+0x28(SB)/8, $0xff00000000000000
-DATA  expandAVX512_56_inShuf0<>+0x30(SB)/8, $0xff00000000000000
-DATA  expandAVX512_56_inShuf0<>+0x38(SB)/8, $0xff00000000000000
-
-GLOBL expandAVX512_56_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_56_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_56_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_56_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_56_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_56_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_56_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_56_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_56_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_56_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_56_inShuf1<>+0x00(SB)/8, $0xffff010101010101
-DATA  expandAVX512_56_inShuf1<>+0x08(SB)/8, $0x0202010101010101
-DATA  expandAVX512_56_inShuf1<>+0x10(SB)/8, $0x0201010101010101
-DATA  expandAVX512_56_inShuf1<>+0x18(SB)/8, $0xff01010101010101
-DATA  expandAVX512_56_inShuf1<>+0x20(SB)/8, $0xff01010101010101
-DATA  expandAVX512_56_inShuf1<>+0x28(SB)/8, $0xff01010101010101
-DATA  expandAVX512_56_inShuf1<>+0x30(SB)/8, $0xff01010101010101
-DATA  expandAVX512_56_inShuf1<>+0x38(SB)/8, $0xff01010101010101
-
-GLOBL expandAVX512_56_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_56_inShuf2<>+0x00(SB)/8, $0xff02020202020202
-DATA  expandAVX512_56_inShuf2<>+0x08(SB)/8, $0xffffff0202020202
-DATA  expandAVX512_56_inShuf2<>+0x10(SB)/8, $0xffffffffffffff02
-DATA  expandAVX512_56_inShuf2<>+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_56_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_56_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_56_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_56_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_56_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_56_mat2<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_56_mat2<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_56_mat2<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_56_mat2<>+0x18(SB)/8, $0x0000000000000000
-DATA  expandAVX512_56_mat2<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_56_mat2<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_56_mat2<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_56_mat2<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_56_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_56_outShufLo+0x00(SB)/8, $0x0806050403020100
-DATA  expandAVX512_56_outShufLo+0x08(SB)/8, $0x11100e0d0c0b0a09
-DATA  expandAVX512_56_outShufLo+0x10(SB)/8, $0x1a19181615141312
-DATA  expandAVX512_56_outShufLo+0x18(SB)/8, $0x232221201e1d1c1b
-DATA  expandAVX512_56_outShufLo+0x20(SB)/8, $0x2c2b2a2928262524
-DATA  expandAVX512_56_outShufLo+0x28(SB)/8, $0x3534333231302e2d
-DATA  expandAVX512_56_outShufLo+0x30(SB)/8, $0x3e3d3c3b3a393836
-DATA  expandAVX512_56_outShufLo+0x38(SB)/8, $0x0f45444342414007
-
-GLOBL expandAVX512_56_outShufHi(SB), RODATA, $0x40
-DATA  expandAVX512_56_outShufHi+0x00(SB)/8, $0x11100d0c0b0a0908
-DATA  expandAVX512_56_outShufHi+0x08(SB)/8, $0x1a19181615141312
-DATA  expandAVX512_56_outShufHi+0x10(SB)/8, $0x232221201e1d1c1b
-DATA  expandAVX512_56_outShufHi+0x18(SB)/8, $0x2c2b2a2928262524
-DATA  expandAVX512_56_outShufHi+0x20(SB)/8, $0x3534333231302e2d
-DATA  expandAVX512_56_outShufHi+0x28(SB)/8, $0x3e3d3c3b3a393836
-DATA  expandAVX512_56_outShufHi+0x30(SB)/8, $0x0e46454443424140
-DATA  expandAVX512_56_outShufHi+0x38(SB)/8, $0x50174c4b4a49480f
-
-TEXT expandAVX512_56<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_56_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_56_mat0<>(SB), Z3
-       VMOVDQU64 expandAVX512_56_inShuf1<>(SB), Z4
-       VMOVDQU64 expandAVX512_56_inShuf2<>(SB), Z5
-       VMOVDQU64 expandAVX512_56_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_56_outShufHi(SB), Z2
-       VMOVDQU64 (AX), Z6
-       VPERMB Z6, Z0, Z0
-       VGF2P8AFFINEQB $0, Z3, Z0, Z0
-       VPERMB Z6, Z4, Z4
-       VGF2P8AFFINEQB $0, Z3, Z4, Z3
-       VPERMB Z6, Z5, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_56_mat2<>(SB), Z4, Z4
-       VPERMI2B Z3, Z0, Z1
-       VPERMI2B Z4, Z3, Z2
-       RET
-
-GLOBL expandAVX512_60_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_inShuf0<>+0x00(SB)/8, $0x0100000000000000
-DATA  expandAVX512_60_inShuf0<>+0x08(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_60_inShuf0<>+0x10(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf0<>+0x18(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_60_inShuf0<>+0x28(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf0<>+0x30(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
-
-GLOBL expandAVX512_60_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_60_mat0<>+0x08(SB)/8, $0x0101010102020202
-DATA  expandAVX512_60_mat0<>+0x10(SB)/8, $0x0202020202020202
-DATA  expandAVX512_60_mat0<>+0x18(SB)/8, $0x0404040404040404
-DATA  expandAVX512_60_mat0<>+0x20(SB)/8, $0x0404040408080808
-DATA  expandAVX512_60_mat0<>+0x28(SB)/8, $0x0808080808080808
-DATA  expandAVX512_60_mat0<>+0x30(SB)/8, $0x1010101010101010
-DATA  expandAVX512_60_mat0<>+0x38(SB)/8, $0x1010101020202020
-
-GLOBL expandAVX512_60_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_inShuf1<>+0x00(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf1<>+0x08(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
-DATA  expandAVX512_60_inShuf1<>+0x18(SB)/8, $0xff00000000000000
-DATA  expandAVX512_60_inShuf1<>+0x20(SB)/8, $0xffffffffff010101
-DATA  expandAVX512_60_inShuf1<>+0x28(SB)/8, $0x0202020202010101
-DATA  expandAVX512_60_inShuf1<>+0x30(SB)/8, $0xffffffffffff0201
-DATA  expandAVX512_60_inShuf1<>+0x38(SB)/8, $0xff01010101010101
-
-GLOBL expandAVX512_60_mat1<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_mat1<>+0x00(SB)/8, $0x2020202020202020
-DATA  expandAVX512_60_mat1<>+0x08(SB)/8, $0x4040404040404040
-DATA  expandAVX512_60_mat1<>+0x10(SB)/8, $0x4040404080808080
-DATA  expandAVX512_60_mat1<>+0x18(SB)/8, $0x8080808080808080
-DATA  expandAVX512_60_mat1<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_60_mat1<>+0x28(SB)/8, $0x0101010101010101
-DATA  expandAVX512_60_mat1<>+0x30(SB)/8, $0x0101010102020202
-DATA  expandAVX512_60_mat1<>+0x38(SB)/8, $0x0202020202020202
-
-GLOBL expandAVX512_60_inShuf2<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_inShuf2<>+0x00(SB)/8, $0xff01010101010101
-DATA  expandAVX512_60_inShuf2<>+0x08(SB)/8, $0xffffffffffffff01
-DATA  expandAVX512_60_inShuf2<>+0x10(SB)/8, $0xff01010101010101
-DATA  expandAVX512_60_inShuf2<>+0x18(SB)/8, $0xff01010101010101
-DATA  expandAVX512_60_inShuf2<>+0x20(SB)/8, $0xffffffffffffff01
-DATA  expandAVX512_60_inShuf2<>+0x28(SB)/8, $0xff01010101010101
-DATA  expandAVX512_60_inShuf2<>+0x30(SB)/8, $0xff01010101010101
-DATA  expandAVX512_60_inShuf2<>+0x38(SB)/8, $0xffffffffffffff01
-
-GLOBL expandAVX512_60_mat2<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_mat2<>+0x00(SB)/8, $0x0404040404040404
-DATA  expandAVX512_60_mat2<>+0x08(SB)/8, $0x0404040408080808
-DATA  expandAVX512_60_mat2<>+0x10(SB)/8, $0x0808080808080808
-DATA  expandAVX512_60_mat2<>+0x18(SB)/8, $0x1010101010101010
-DATA  expandAVX512_60_mat2<>+0x20(SB)/8, $0x1010101020202020
-DATA  expandAVX512_60_mat2<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_60_mat2<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_60_mat2<>+0x38(SB)/8, $0x4040404080808080
-
-GLOBL expandAVX512_60_inShuf3<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_inShuf3<>+0x00(SB)/8, $0xff01010101010101
-DATA  expandAVX512_60_inShuf3<>+0x08(SB)/8, $0xffffffffffff0202
-DATA  expandAVX512_60_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
-
-GLOBL expandAVX512_60_mat3<>(SB), RODATA, $0x40
-DATA  expandAVX512_60_mat3<>+0x00(SB)/8, $0x8080808080808080
-DATA  expandAVX512_60_mat3<>+0x08(SB)/8, $0x0101010101010101
-DATA  expandAVX512_60_mat3<>+0x10(SB)/8, $0x0000000000000000
-DATA  expandAVX512_60_mat3<>+0x18(SB)/8, $0x0000000000000000
-DATA  expandAVX512_60_mat3<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_60_mat3<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_60_mat3<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_60_mat3<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_60_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_60_outShufLo+0x00(SB)/8, $0x0806050403020100
-DATA  expandAVX512_60_outShufLo+0x08(SB)/8, $0x1816151413121110
-DATA  expandAVX512_60_outShufLo+0x10(SB)/8, $0x28201e1d1c1b1a19
-DATA  expandAVX512_60_outShufLo+0x18(SB)/8, $0x31302e2d2c2b2a29
-DATA  expandAVX512_60_outShufLo+0x20(SB)/8, $0x4140383635343332
-DATA  expandAVX512_60_outShufLo+0x28(SB)/8, $0x4a49484645444342
-DATA  expandAVX512_60_outShufLo+0x30(SB)/8, $0x5a5958504e4d4c4b
-DATA  expandAVX512_60_outShufLo+0x38(SB)/8, $0x626160075e5d5c5b
-
-GLOBL expandAVX512_60_outShufHi0(SB), RODATA, $0x40
-DATA  expandAVX512_60_outShufHi0+0x00(SB)/8, $0x3b3a3938302a2928
-DATA  expandAVX512_60_outShufHi0+0x08(SB)/8, $0x44434241403e3d3c
-DATA  expandAVX512_60_outShufHi0+0x10(SB)/8, $0x5453525150484645
-DATA  expandAVX512_60_outShufHi0+0x18(SB)/8, $0x5d5c5b5a59585655
-DATA  expandAVX512_60_outShufHi0+0x20(SB)/8, $0x6d6c6b6a6968605e
-DATA  expandAVX512_60_outShufHi0+0x28(SB)/8, $0x767574737271706e
-DATA  expandAVX512_60_outShufHi0+0x30(SB)/8, $0xffffffffffffff78
-DATA  expandAVX512_60_outShufHi0+0x38(SB)/8, $0x31ffff2f2e2d2c2b
-
-GLOBL expandAVX512_60_outShufHi1(SB), RODATA, $0x40
-DATA  expandAVX512_60_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
-DATA  expandAVX512_60_outShufHi1+0x30(SB)/8, $0x06050403020100ff
-DATA  expandAVX512_60_outShufHi1+0x38(SB)/8, $0xff0908ffffffffff
-
-TEXT expandAVX512_60<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_60_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_60_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_60_inShuf2<>(SB), Z3
-       VMOVDQU64 expandAVX512_60_inShuf3<>(SB), Z4
-       VMOVDQU64 expandAVX512_60_outShufLo(SB), Z1
-       VMOVDQU64 expandAVX512_60_outShufHi0(SB), Z5
-       VMOVDQU64 expandAVX512_60_outShufHi1(SB), Z6
-       VMOVDQU64 (AX), Z7
-       VPERMB Z7, Z0, Z0
-       VGF2P8AFFINEQB $0, expandAVX512_60_mat0<>(SB), Z0, Z0
-       VPERMB Z7, Z2, Z2
-       VGF2P8AFFINEQB $0, expandAVX512_60_mat1<>(SB), Z2, Z2
-       VPERMB Z7, Z3, Z3
-       VGF2P8AFFINEQB $0, expandAVX512_60_mat2<>(SB), Z3, Z3
-       VPERMB Z7, Z4, Z4
-       VGF2P8AFFINEQB $0, expandAVX512_60_mat3<>(SB), Z4, Z4
-       VPERMI2B Z2, Z0, Z1
-       MOVQ $0x9f01ffffffffffff, AX
-       KMOVQ AX, K1
-       VPERMI2B.Z Z3, Z2, K1, Z5
-       MOVQ $0x60fe000000000000, AX
-       KMOVQ AX, K1
-       VPERMB.Z Z4, Z6, K1, Z0
-       VPORQ Z0, Z5, Z2
-       RET
-
-GLOBL expandAVX512_64_inShuf0<>(SB), RODATA, $0x40
-DATA  expandAVX512_64_inShuf0<>+0x00(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x08(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x10(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x18(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x20(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x28(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x30(SB)/8, $0x0000000000000000
-DATA  expandAVX512_64_inShuf0<>+0x38(SB)/8, $0x0000000000000000
-
-GLOBL expandAVX512_64_mat0<>(SB), RODATA, $0x40
-DATA  expandAVX512_64_mat0<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_mat0<>+0x08(SB)/8, $0x0202020202020202
-DATA  expandAVX512_64_mat0<>+0x10(SB)/8, $0x0404040404040404
-DATA  expandAVX512_64_mat0<>+0x18(SB)/8, $0x0808080808080808
-DATA  expandAVX512_64_mat0<>+0x20(SB)/8, $0x1010101010101010
-DATA  expandAVX512_64_mat0<>+0x28(SB)/8, $0x2020202020202020
-DATA  expandAVX512_64_mat0<>+0x30(SB)/8, $0x4040404040404040
-DATA  expandAVX512_64_mat0<>+0x38(SB)/8, $0x8080808080808080
-
-GLOBL expandAVX512_64_inShuf1<>(SB), RODATA, $0x40
-DATA  expandAVX512_64_inShuf1<>+0x00(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x08(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x10(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x18(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x20(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x28(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x30(SB)/8, $0x0101010101010101
-DATA  expandAVX512_64_inShuf1<>+0x38(SB)/8, $0x0101010101010101
-
-GLOBL expandAVX512_64_outShufLo(SB), RODATA, $0x40
-DATA  expandAVX512_64_outShufLo+0x00(SB)/8, $0x0706050403020100
-DATA  expandAVX512_64_outShufLo+0x08(SB)/8, $0x0f0e0d0c0b0a0908
-DATA  expandAVX512_64_outShufLo+0x10(SB)/8, $0x1716151413121110
-DATA  expandAVX512_64_outShufLo+0x18(SB)/8, $0x1f1e1d1c1b1a1918
-DATA  expandAVX512_64_outShufLo+0x20(SB)/8, $0x2726252423222120
-DATA  expandAVX512_64_outShufLo+0x28(SB)/8, $0x2f2e2d2c2b2a2928
-DATA  expandAVX512_64_outShufLo+0x30(SB)/8, $0x3736353433323130
-DATA  expandAVX512_64_outShufLo+0x38(SB)/8, $0x3f3e3d3c3b3a3938
-
-TEXT expandAVX512_64<>(SB), NOSPLIT, $0-0
-       VMOVDQU64 expandAVX512_64_inShuf0<>(SB), Z0
-       VMOVDQU64 expandAVX512_64_mat0<>(SB), Z1
-       VMOVDQU64 expandAVX512_64_inShuf1<>(SB), Z2
-       VMOVDQU64 expandAVX512_64_outShufLo(SB), Z3
-       VMOVDQU64 (AX), Z4
-       VPERMB Z4, Z0, Z0
-       VGF2P8AFFINEQB $0, Z1, Z0, Z0
-       VPERMB Z4, Z2, Z2
-       VGF2P8AFFINEQB $0, Z1, Z2, Z2
-       VPERMB Z0, Z3, Z1
-       VPERMB Z2, Z3, Z2
-       RET
-
diff --git a/src/internal/runtime/gc/scan/expand_amd64_test.go b/src/internal/runtime/gc/scan/expand_amd64_test.go
index a8f5b88c5cb3a615735c86475cf79b595a0d74fc..89736f21dad5989e9502f0ab2ecb77936e45485b 100644 (file)
@@ -11,9 +11,9 @@ import (
        "testing"
 )
 
-func TestExpandAVX512(t *testing.T) {
+func TestExpandAVX512Asm(t *testing.T) {
        if !scan.CanAVX512() {
                t.Skip("no AVX512")
        }
-       testExpand(t, scan.ExpandAVX512)
+       testExpand(t, scan.ExpandAVX512Asm)
 }
diff --git a/src/internal/runtime/gc/scan/expand_simd_amd64_test.go b/src/internal/runtime/gc/scan/expand_simd_amd64_test.go
new file mode 100644 (file)
index 0000000..28f3147
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 && goexperiment.simd
+
+package scan_test
+
+import (
+       "internal/runtime/gc/scan"
+       "testing"
+)
+
+func TestExpandAVX512(t *testing.T) {
+       if !scan.CanAVX512() {
+               t.Skip("no AVX512")
+       }
+       testExpand(t, scan.ExpandAVX512)
+}
diff --git a/src/internal/runtime/gc/scan/expand_test.go b/src/internal/runtime/gc/scan/expand_test.go
index 692817d8b2bd966329a85bea1ae1022b985399cd..2e75574bab50554f6f5b078b7271049f51eaf443 100644 (file)
@@ -23,7 +23,7 @@ func testExpand(t *testing.T, expF expandFunc) {
 
                for i := range want {
                        if got[i] != want[i] {
-                               t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
+                               t.Errorf("expansion differs from reference at bit %d, sizeClass=%d", i*goarch.PtrSize, sizeClass)
                                if goarch.PtrSize == 4 {
                                        t.Logf("got:  %032b", got[i])
                                        t.Logf("want: %032b", want[i])
diff --git a/src/internal/runtime/gc/scan/expanders_amd64.go b/src/internal/runtime/gc/scan/expanders_amd64.go
new file mode 100644 (file)
index 0000000..878dc5f
--- /dev/null
@@ -0,0 +1,1530 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package scan
+
+import (
+       "simd"
+       "unsafe"
+)
+
+var gcExpandersAVX512 = [68]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
+       nil,
+       expandAVX512_1,
+       expandAVX512_2,
+       expandAVX512_3,
+       expandAVX512_4,
+       expandAVX512_6,
+       expandAVX512_8,
+       expandAVX512_10,
+       expandAVX512_12,
+       expandAVX512_14,
+       expandAVX512_16,
+       expandAVX512_18,
+       expandAVX512_20,
+       expandAVX512_22,
+       expandAVX512_24,
+       expandAVX512_26,
+       expandAVX512_28,
+       expandAVX512_30,
+       expandAVX512_32,
+       expandAVX512_36,
+       expandAVX512_40,
+       expandAVX512_44,
+       expandAVX512_48,
+       expandAVX512_52,
+       expandAVX512_56,
+       expandAVX512_60,
+       expandAVX512_64,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+       nil,
+}
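The table above appears to be indexed by the runtime's small size classes, with each kernel's numeric suffix giving the object size in pointer-words (size class 5, for example, holds 48-byte objects, i.e. six words, hence expandAVX512_6); the nil entries past class 26 are the classes whose objects exceed 512 bytes, and expandAVX512_1 below is just two 64-byte loads, consistent with the object bitmap already being the word bitmap for one-word objects. If that reading is right, a dispatch helper in this file (which already imports simd and unsafe) might look like the sketch below; expandFor, its bounds handling, and the lo/hi naming are illustrative, not part of the CL.

// expandFor is a hypothetical wrapper showing how a caller might select a
// kernel: index by size class and treat nil entries (class 0 and classes
// whose objects are larger than 512 bytes) as "no SIMD expander available".
func expandFor(sizeClass int, src unsafe.Pointer) (lo, hi simd.Uint64x8, ok bool) {
	if sizeClass < 0 || sizeClass >= len(gcExpandersAVX512) {
		return lo, hi, false
	}
	f := gcExpandersAVX512[sizeClass]
	if f == nil {
		return lo, hi, false
	}
	lo, hi = f(src) // assumed to be the low and high 512 bits of the word bitmap
	return lo, hi, true
}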
+
+func expandAVX512_1(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src) + 64))).AsUint8x64()
+       return x.AsUint64x8(), y.AsUint64x8()
+}
+
+var expandAVX512_2_mat0 = [8]uint64{
+       0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080,
+       0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080,
+}
+var expandAVX512_2_inShuf0 = [8]uint64{
+       0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+       0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918,
+}
+var expandAVX512_2_inShuf1 = [8]uint64{
+       0x2726252423222120, 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x2f2e2d2c2b2a2928,
+       0x3736353433323130, 0x3736353433323130, 0x3f3e3d3c3b3a3938, 0x3f3e3d3c3b3a3938,
+}
+var expandAVX512_2_outShufLo = [8]uint64{
+       0x0b030a0209010800, 0x0f070e060d050c04, 0x1b131a1219111810, 0x1f171e161d151c14,
+       0x2b232a2229212820, 0x2f272e262d252c24, 0x3b333a3239313830, 0x3f373e363d353c34,
+}
+
+func expandAVX512_2(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_2_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_2_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_2_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_2_outShufLo).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v4.Permute(v8)
+       v10 := v7.Permute(v8)
+       return v9.AsUint64x8(), v10.AsUint64x8()
+}
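expandAVX512_2 shows the shape shared by most of the kernels in this file: Permute gathers the relevant source bytes into each lane, GaloisFieldAffineTransform multiplies every byte by an 8x8 bit matrix over GF(2) to spread and replicate bits within bytes, and a final Permute (or, in later kernels, ConcatPermute) puts the produced bytes in output order. Under the replication reading sketched earlier, the 2-words-per-object kernel should agree with the scalar model below; expand2Scalar is illustrative only and not part of the CL.

// expand2Scalar is a hypothetical scalar equivalent of the 2-words-per-object
// case: each of the 512 input bits is duplicated into two adjacent output
// bits, turning one 512-bit mark vector into a 1024-bit word bitmap.
func expand2Scalar(in [8]uint64) (out [16]uint64) {
	for i := 0; i < 512; i++ {
		bit := in[i/64] >> (uint(i) % 64) & 1
		out[(2*i)/64] |= bit << (uint(2*i) % 64)
		out[(2*i+1)/64] |= bit << (uint(2*i+1) % 64)
	}
	return
}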
+
+var expandAVX512_3_mat0 = [8]uint64{
+       0x0101010202020404, 0x0408080810101020, 0x2020404040808080, 0x0101010202020404,
+       0x0408080810101020, 0x2020404040808080, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_3_inShuf0 = [8]uint64{
+       0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908,
+       0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3_inShuf1 = [8]uint64{
+       0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918,
+       0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3_inShuf2 = [8]uint64{
+       0x2726252423222120, 0x2726252423222120, 0x2726252423222120, 0xffffffffff2a2928,
+       0xffffffffff2a2928, 0xffffffffffff2928, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_3_outShufLo = [8]uint64{
+       0x0a02110901100800, 0x05140c04130b0312, 0x170f07160e06150d, 0x221a292119282018,
+       0x1d2c241c2b231b2a, 0x2f271f2e261e2d25, 0x4a42514941504840, 0x45544c44534b4352,
+}
+var expandAVX512_3_outShufHi = [8]uint64{
+       0x170f07160e06150d, 0x221a292119282018, 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25,
+       0x4a42514941504840, 0x45544c44534b4352, 0x574f47564e46554d, 0x625a696159686058,
+}
+
+func expandAVX512_3(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_3_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_3_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_3_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_3_inShuf2).AsUint8x64()
+       v11 := simd.LoadUint64x8(&expandAVX512_3_outShufLo).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_3_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v0.Permute(v8)
+       v10 := v9.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v12 := v4.ConcatPermute(v7, v11)
+       v14 := v7.ConcatPermute(v10, v13)
+       return v12.AsUint64x8(), v14.AsUint64x8()
+}
+
+var expandAVX512_4_mat0 = [8]uint64{
+       0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080,
+       0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080,
+}
+var expandAVX512_4_inShuf0 = [8]uint64{
+       0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+       0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+}
+var expandAVX512_4_inShuf1 = [8]uint64{
+       0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1716151413121110,
+       0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918,
+}
+var expandAVX512_4_outShufLo = [8]uint64{
+       0x1911090118100800, 0x1b130b031a120a02, 0x1d150d051c140c04, 0x1f170f071e160e06,
+       0x3931292138302820, 0x3b332b233a322a22, 0x3d352d253c342c24, 0x3f372f273e362e26,
+}
+
+func expandAVX512_4(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_4_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_4_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_4_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_4_outShufLo).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v4.Permute(v8)
+       v10 := v7.Permute(v8)
+       return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_6_mat0 = [8]uint64{
+       0x0101010101010202, 0x0202020204040404, 0x0404080808080808, 0x1010101010102020,
+       0x2020202040404040, 0x4040808080808080, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_6_inShuf0 = [8]uint64{
+       0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+       0x0706050403020100, 0x0706050403020100, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6_inShuf1 = [8]uint64{
+       0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+       0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6_inShuf2 = [8]uint64{
+       0xffff151413121110, 0xffff151413121110, 0xffffff1413121110, 0xffffff1413121110,
+       0xffffff1413121110, 0xffffff1413121110, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_6_outShufLo = [8]uint64{
+       0x0901282018100800, 0x1a120a0229211911, 0x2b231b130b032a22, 0x0d052c241c140c04,
+       0x1e160e062d251d15, 0x2f271f170f072e26, 0x4941686058504840, 0x5a524a4269615951,
+}
+var expandAVX512_6_outShufHi = [8]uint64{
+       0x2b231b130b032a22, 0x0d052c241c140c04, 0x1e160e062d251d15, 0x2f271f170f072e26,
+       0x4941686058504840, 0x5a524a4269615951, 0x6b635b534b436a62, 0x4d456c645c544c44,
+}
+
+func expandAVX512_6(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_6_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_6_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_6_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_6_inShuf2).AsUint8x64()
+       v11 := simd.LoadUint64x8(&expandAVX512_6_outShufLo).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_6_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v0.Permute(v8)
+       v10 := v9.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v12 := v4.ConcatPermute(v7, v11)
+       v14 := v7.ConcatPermute(v10, v13)
+       return v12.AsUint64x8(), v14.AsUint64x8()
+}
+
+var expandAVX512_8_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_8_inShuf0 = [8]uint64{
+       0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+       0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100,
+}
+var expandAVX512_8_inShuf1 = [8]uint64{
+       0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+       0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908,
+}
+var expandAVX512_8_outShufLo = [8]uint64{
+       0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03,
+       0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07,
+}
+
+func expandAVX512_8(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_8_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_8_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_8_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_8_outShufLo).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v4.Permute(v8)
+       v10 := v7.Permute(v8)
+       return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_10_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808,
+       0x0808080808080808, 0x1010101010101010, 0x1010202020202020, 0x2020202040404040,
+}
+var expandAVX512_10_inShuf0 = [8]uint64{
+       0xff06050403020100, 0xff06050403020100, 0xff06050403020100, 0xff06050403020100,
+       0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100,
+}
+var expandAVX512_10_mat1 = [8]uint64{
+       0x4040404040408080, 0x8080808080808080, 0x0808080808080808, 0x1010101010101010,
+       0x1010202020202020, 0x2020202040404040, 0x4040404040408080, 0x8080808080808080,
+}
+var expandAVX512_10_inShuf1 = [8]uint64{
+       0xffff050403020100, 0xffff050403020100, 0xff0c0b0a09080706, 0xff0c0b0a09080706,
+       0xff0c0b0a09080706, 0xff0c0b0a09080706, 0xffff0b0a09080706, 0xffff0b0a09080706,
+}
+var expandAVX512_10_mat2 = [8]uint64{
+       0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_10_inShuf2 = [8]uint64{
+       0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_10_outShufLo = [8]uint64{
+       0x3830282018100800, 0x2921191109014840, 0x1a120a0249413931, 0x0b034a423a322a22,
+       0x4b433b332b231b13, 0x3c342c241c140c04, 0x2d251d150d054c44, 0x1e160e064d453d35,
+}
+var expandAVX512_10_outShufHi = [8]uint64{
+       0x4840383028201810, 0x3931292119115850, 0x2a221a1259514941, 0x1b135a524a423a32,
+       0x5b534b433b332b23, 0x4c443c342c241c14, 0x3d352d251d155c54, 0x2e261e165d554d45,
+}
+
+func expandAVX512_10(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_10_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_10_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_10_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_10_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_10_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_10_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_10_outShufLo).AsUint8x64()
+       v15 := simd.LoadUint64x8(&expandAVX512_10_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v14 := v4.ConcatPermute(v8, v13)
+       v16 := v8.ConcatPermute(v12, v15)
+       return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_12_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_12_inShuf0 = [8]uint64{
+       0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100,
+       0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+}
+var expandAVX512_12_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_12_inShuf1 = [8]uint64{
+       0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+       0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605,
+}
+var expandAVX512_12_mat2 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+}
+var expandAVX512_12_inShuf2 = [8]uint64{
+       0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605,
+       0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706,
+}
+var expandAVX512_12_outShufLo = [8]uint64{
+       0x3830282018100800, 0x1911090158504840, 0x5951494139312921, 0x3a322a221a120a02,
+       0x1b130b035a524a42, 0x5b534b433b332b23, 0x3c342c241c140c04, 0x1d150d055c544c44,
+}
+var expandAVX512_12_outShufHi = [8]uint64{
+       0x5850484038302820, 0x3931292178706860, 0x7971696159514941, 0x5a524a423a322a22,
+       0x3b332b237a726a62, 0x7b736b635b534b43, 0x5c544c443c342c24, 0x3d352d257c746c64,
+}
+
+func expandAVX512_12(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_12_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_12_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_12_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_12_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_12_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_12_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_12_outShufLo).AsUint8x64()
+       v15 := simd.LoadUint64x8(&expandAVX512_12_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v14 := v4.ConcatPermute(v8, v13)
+       v16 := v8.ConcatPermute(v12, v15)
+       return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_14_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+       0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_14_inShuf0 = [8]uint64{
+       0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+       0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100,
+}
+var expandAVX512_14_mat1 = [8]uint64{
+       0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+       0x4040808080808080, 0x8080808080808080, 0x1010101010102020, 0x2020202020202020,
+}
+var expandAVX512_14_inShuf1 = [8]uint64{
+       0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100,
+       0xffffffff03020100, 0xffffffff03020100, 0xffffff0807060504, 0xffffff0807060504,
+}
+var expandAVX512_14_mat2 = [8]uint64{
+       0x2020202040404040, 0x4040404040404040, 0x4040808080808080, 0x8080808080808080,
+       0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+}
+var expandAVX512_14_inShuf2 = [8]uint64{
+       0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504,
+       0xffffff0908070605, 0xffffff0908070605, 0xffffffff08070605, 0xffffffff08070605,
+}
+var expandAVX512_14_mat3 = [8]uint64{
+       0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_14_inShuf3 = [8]uint64{
+       0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_14_outShufLo = [8]uint64{
+       0x3830282018100800, 0x0901686058504840, 0x4941393129211911, 0x1a120a0269615951,
+       0x5a524a423a322a22, 0x2b231b130b036a62, 0x6b635b534b433b33, 0x3c342c241c140c04,
+}
+var expandAVX512_14_outShufHi0 = [8]uint64{
+       0x6860585048403830, 0x3931ffffffff7870, 0x7971696159514941, 0x4a423a32ffffffff,
+       0xffff7a726a625a52, 0x5b534b433b33ffff, 0xffffffff7b736b63, 0x6c645c544c443c34,
+}
+var expandAVX512_14_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffff18100800ffff, 0xffffffffffffffff, 0xffffffff19110901,
+       0x0a02ffffffffffff, 0xffffffffffff1a12, 0x1b130b03ffffffff, 0xffffffffffffffff,
+}
+
+func expandAVX512_14(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_14_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_14_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_14_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_14_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_14_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_14_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_14_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_14_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_14_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_14_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_14_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xff0ffc3ff0ffc3ff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0xf003c00f003c00)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
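expandAVX512_14 is the first kernel in this file whose high half draws bytes from three intermediate vectors, which is more than a single two-source permute can express, so it is assembled from two zero-masked permutes joined with Or: m0 keeps the byte lanes produced by the ConcatPermute of v8 and v12, and m1 keeps the lanes produced by the plain Permute of v16, mirroring the KMOVQ/VPERMI2B.Z/VPORQ sequence in the deleted assembly. The two constants partition the 64 output bytes; the standalone snippet below is only an illustrative check of that, not part of the CL.

package main

import "fmt"

func main() {
	// The two mask constants from expandAVX512_14: each selects the byte
	// lanes contributed by one of the two permutes feeding the final Or.
	const m0 = uint64(0xff0ffc3ff0ffc3ff)
	const m1 = uint64(0x00f003c00f003c00)
	fmt.Println(m0&m1 == 0, m0|m1 == ^uint64(0)) // true true: disjoint and covering
}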
+
+var expandAVX512_16_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_16_inShuf0 = [8]uint64{
+       0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000,
+       0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000,
+}
+var expandAVX512_16_inShuf1 = [8]uint64{
+       0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404,
+       0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404,
+}
+var expandAVX512_16_outShufLo = [8]uint64{
+       0x1918111009080100, 0x3938313029282120, 0x1b1a13120b0a0302, 0x3b3a33322b2a2322,
+       0x1d1c15140d0c0504, 0x3d3c35342d2c2524, 0x1f1e17160f0e0706, 0x3f3e37362f2e2726,
+}
+
+func expandAVX512_16(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_16_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_16_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_16_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_16_outShufLo).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v4.Permute(v8)
+       v10 := v7.Permute(v8)
+       return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_18_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404,
+       0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_18_inShuf0 = [8]uint64{
+       0x0303020201010000, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100,
+       0xffffffff03020100, 0xffffffff03020100, 0x0303020201010000, 0xff03020201010000,
+}
+var expandAVX512_18_mat1 = [8]uint64{
+       0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+       0x4040404040408080, 0x8080808080808080, 0x1010101010101010, 0x1010202020202020,
+}
+var expandAVX512_18_inShuf1 = [8]uint64{
+       0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100,
+       0xffffffffff020100, 0xffff020201010000, 0xff06060505040403, 0xffffffff06050403,
+}
+var expandAVX512_18_mat2 = [8]uint64{
+       0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040404040408080,
+       0x8080808080808080, 0x0101010101010101, 0x0101020202020202, 0x0202020202020202,
+}
+var expandAVX512_18_inShuf2 = [8]uint64{
+       0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403,
+       0x0606050504040303, 0x0707060605050404, 0xffffffffff060504, 0xffffffffff060504,
+}
+var expandAVX512_18_mat3 = [8]uint64{
+       0x0202020204040404, 0x0404040404040404, 0x0404040404040808, 0x0808080808080808,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_18_inShuf3 = [8]uint64{
+       0xffffffffff060504, 0xffffffffff060504, 0xffffffffff060504, 0xffff060605050404,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_18_outShufLo = [8]uint64{
+       0x3028201810080100, 0x6058504840393831, 0x2119110903026968, 0x5149413b3a333229,
+       0x120a05046b6a6159, 0x423d3c35342a221a, 0x07066d6c625a524a, 0x3e37362b231b130b,
+}
+var expandAVX512_18_outShufHi0 = [8]uint64{
+       0x6160585048403830, 0xffffffff78706968, 0x59514941393231ff, 0xffff79716b6a6362,
+       0x4a423a3433ffffff, 0x7a726d6c65645a52, 0x3b3635ffffffffff, 0x6f6e67665b534b43,
+}
+var expandAVX512_18_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0x18100800ffffffff, 0xffffffffffffff19, 0x0901ffffffffffff,
+       0xffffffffff1b1a11, 0xffffffffffffffff, 0xffffff1d1c120a02, 0xffffffffffffffff,
+}
+
+func expandAVX512_18(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_18_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_18_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_18_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_18_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_18_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_18_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_18_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_18_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_18_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_18_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_18_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xffe0fff83ffe0fff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x1f0007c001f000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_20_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_20_inShuf0 = [8]uint64{
+       0x0303020201010000, 0xffffffff03020100, 0xff03020201010000, 0xffff020201010000,
+       0xffffffffff020100, 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100,
+}
+var expandAVX512_20_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x0808080808080808,
+}
+var expandAVX512_20_inShuf1 = [8]uint64{
+       0xffff020201010000, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000,
+       0xff06060505040403, 0x0606050504040303, 0xffffffff06050403, 0xffff050504040303,
+}
+var expandAVX512_20_mat2 = [8]uint64{
+       0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040,
+       0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202,
+}
+var expandAVX512_20_inShuf2 = [8]uint64{
+       0xffff050504040303, 0xffffffffff050403, 0xffff050504040303, 0xffff050504040303,
+       0xffffffffff050403, 0xffff050504040303, 0xffff060605050404, 0xffffffffff060504,
+}
+var expandAVX512_20_outShufLo = [8]uint64{
+       0x2019181110080100, 0x4841403831302928, 0x1209030259585049, 0x33322b2a211b1a13,
+       0x5b5a514b4a434239, 0x221d1c15140a0504, 0x4c45443a35342d2c, 0x160b07065d5c524d,
+}
+var expandAVX512_20_outShufHi = [8]uint64{
+       0x4140393830292820, 0x6968605958515048, 0x312b2a2221787170, 0x5a53524943423b3a,
+       0x237973726b6a615b, 0x45443d3c322d2c24, 0x6d6c625d5c55544a, 0x332f2e26257a7574,
+}
+
+func expandAVX512_20(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_20_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_20_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_20_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_20_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_20_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_20_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_20_outShufLo).AsUint8x64()
+       v15 := simd.LoadUint64x8(&expandAVX512_20_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v14 := v4.ConcatPermute(v8, v13)
+       v16 := v8.ConcatPermute(v12, v15)
+       return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_22_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+       0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_22_inShuf0 = [8]uint64{
+       0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100,
+       0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000,
+}
+var expandAVX512_22_mat1 = [8]uint64{
+       0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+       0x4040808080808080, 0x8080808080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_22_inShuf1 = [8]uint64{
+       0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000,
+       0xffffffffff020100, 0xffffffff01010000, 0xffff040403030202, 0xffff050504040303,
+}
+var expandAVX512_22_mat2 = [8]uint64{
+       0x0101010101010202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404,
+       0x0404080808080808, 0x0808080808080808, 0x1010101010101010, 0x1010101010102020,
+}
+var expandAVX512_22_inShuf2 = [8]uint64{
+       0xffffffffff050403, 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303,
+       0xffffffffff050403, 0xffff050504040303, 0xffff050504040303, 0xffffffffff050403,
+}
+var expandAVX512_22_mat3 = [8]uint64{
+       0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040808080808080,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_22_inShuf3 = [8]uint64{
+       0xffff050504040303, 0xffffffffff050403, 0xffffff0504040303, 0xffffffffffff0403,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_22_outShufLo = [8]uint64{
+       0x2120181110080100, 0x4948403938313028, 0x0302696860595850, 0x3229232219131209,
+       0x5a514b4a413b3a33, 0x140a05046b6a615b, 0x3c35342a25241a15, 0x625d5c524d4c423d,
+}
+var expandAVX512_22_outShufHi0 = [8]uint64{
+       0x5049484039383130, 0x7871706968605958, 0x3332ffffffffffff, 0x5b5a514b4a413b3a,
+       0xffff7973726b6a61, 0x3d3c3534ffffffff, 0x6c625d5c524d4c42, 0xffffffff7a75746d,
+}
+var expandAVX512_22_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffff181110080100, 0xffffffffffffffff,
+       0x0302ffffffffffff, 0xffffffff19131209, 0xffffffffffffffff, 0x140a0504ffffffff,
+}
+
+func expandAVX512_22(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_22_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_22_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_22_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_22_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_22_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_22_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_22_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_22_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_22_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_22_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_22_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xffff03fffc0ffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0xf0000fc0003f0000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_24_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_24_inShuf0 = [8]uint64{
+       0x0202010101000000, 0x0202010101000000, 0x0202010101000000, 0x0202010101000000,
+       0x0202010101000000, 0xff02010101000000, 0xffff010101000000, 0xffff010101000000,
+}
+var expandAVX512_24_inShuf1 = [8]uint64{
+       0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02,
+       0xffffffffffffff02, 0x0404040303030202, 0x0404030303020202, 0x0404030303020202,
+}
+var expandAVX512_24_mat2 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x4040404040404040, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_24_inShuf2 = [8]uint64{
+       0x0505040404030303, 0x0505040404030303, 0x0505040404030303, 0xffff040404030303,
+       0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffff04, 0xffffffffffffff05,
+}
+var expandAVX512_24_mat3 = [8]uint64{
+       0x0202020202020202, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_24_inShuf3 = [8]uint64{
+       0xffffffffffffff05, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_24_outShufLo = [8]uint64{
+       0x11100a0908020100, 0x282221201a191812, 0x3a39383231302a29, 0x14130d0c0b050403,
+       0x2b2524231d1c1b15, 0x3d3c3b3534332d2c, 0x1716480f0e400706, 0x2e602726581f1e50,
+}
+var expandAVX512_24_outShufHi0 = [8]uint64{
+       0x3a39383231302928, 0x51504a4948424140, 0x2a6261605a595852, 0x3d3c3b3534332c2b,
+       0x54534d4c4b454443, 0x2d6564635d5c5b55, 0x703f3e6837362f2e, 0x5756ff4f4e784746,
+}
+var expandAVX512_24_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff00ffffffffff,
+}
+
+func expandAVX512_24(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_24_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_24_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_24_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_24_mat2).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_24_inShuf2).AsUint8x64()
+       v12 := simd.LoadUint64x8(&expandAVX512_24_mat3).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_24_inShuf3).AsUint8x64()
+       v16 := simd.LoadUint64x8(&expandAVX512_24_outShufLo).AsUint8x64()
+       v18 := simd.LoadUint64x8(&expandAVX512_24_outShufHi0).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_24_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v10 := v0.Permute(v9)
+       v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0)
+       v14 := v0.Permute(v13)
+       v15 := v14.GaloisFieldAffineTransform(v12.AsUint64x8(), 0)
+       v17 := v4.ConcatPermute(v7, v16)
+       u0 := uint64(0xdfffffffffffffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v20 := v7.ConcatPermute(v11, v18).Masked(m0)
+       u1 := uint64(0x2000000000000000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v21 := v15.Permute(v19).Masked(m1)
+       v22 := v20.Or(v21)
+       return v17.AsUint64x8(), v22.AsUint64x8()
+}
+
+var expandAVX512_26_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404,
+       0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_26_inShuf0 = [8]uint64{
+       0x0202010101000000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100,
+       0xffff020201010000, 0xffffffffff020100, 0x0202010101000000, 0xffff010101000000,
+}
+var expandAVX512_26_mat1 = [8]uint64{
+       0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+       0x4040404040408080, 0x8080808080808080, 0x0101010101010101, 0x0808080808080808,
+}
+var expandAVX512_26_inShuf1 = [8]uint64{
+       0xffffffffffff0100, 0xffffffff01010000, 0xffffffffffff0100, 0xffffffff01010000,
+       0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0xff04040403030302,
+}
+var expandAVX512_26_mat2 = [8]uint64{
+       0x1010101010101010, 0x1010202020202020, 0x2020202020202020, 0x2020202040404040,
+       0x4040404040404040, 0x4040404040408080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_26_inShuf2 = [8]uint64{
+       0x0404030303020202, 0xffffffffff040302, 0xffff040403030202, 0xffffffffff040302,
+       0xffff040403030202, 0xffffffffff040302, 0xff04030303020202, 0xffff040404030303,
+}
+var expandAVX512_26_mat3 = [8]uint64{
+       0x0101020202020202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404,
+       0x0404040404040808, 0x1010101010101010, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_26_inShuf3 = [8]uint64{
+       0xffffffffffff0403, 0xffffffff04040303, 0xffffffffffff0403, 0xffffffff04040303,
+       0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_26_outShufLo = [8]uint64{
+       0x2018111008020100, 0x3a39383231302821, 0x6860595850494840, 0x1312090504036a69,
+       0x3b35343329232219, 0x5b5a514b4a413d3c, 0x0a7007066d6c6b61, 0x37362a25241a1514,
+}
+var expandAVX512_26_outShufHi0 = [8]uint64{
+       0x5851504842414038, 0x7978727170686160, 0xffffffffffffff7a, 0x52494544433b3a39,
+       0x7574736963625953, 0xffffffffff7d7c7b, 0xff47463e3d3cffff, 0x766a65645a55544a,
+}
+var expandAVX512_26_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0x20191810090800ff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0x1a110b0a01ffffff, 0x28ffffffffff211b, 0xffffffffffffffff,
+}
+
+func expandAVX512_26(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_26_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_26_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_26_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_26_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_26_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_26_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_26_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_26_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_26_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_26_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_26_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xff7c07ffff01ffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x83f80000fe0000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_28_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_28_inShuf0 = [8]uint64{
+       0x0202010101000000, 0xffffffffff020100, 0x0202010101000000, 0xff02010101000000,
+       0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100,
+}
+var expandAVX512_28_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808,
+}
+var expandAVX512_28_inShuf1 = [8]uint64{
+       0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000,
+       0xffffffffffffff02, 0xffffffffffffff02, 0x0404040303030202, 0xffffffffff040302,
+}
+var expandAVX512_28_mat2 = [8]uint64{
+       0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020,
+       0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_28_inShuf2 = [8]uint64{
+       0x0404030303020202, 0x0404030303020202, 0xffffffffffff0302, 0xffff030303020202,
+       0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303,
+}
+var expandAVX512_28_mat3 = [8]uint64{
+       0x0101010102020202, 0x0202020202020202, 0x0808080808080808, 0x0000000000000000,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_28_inShuf3 = [8]uint64{
+       0xffffffffffff0403, 0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_28_outShufLo = [8]uint64{
+       0x1812111008020100, 0x31302a2928201a19, 0x4a49484241403832, 0x090504035a595850,
+       0x2b211d1c1b151413, 0x4443393534332d2c, 0x5d5c5b514d4c4b45, 0x1e6817160a600706,
+}
+var expandAVX512_28_outShufHi0 = [8]uint64{
+       0x4948424140383130, 0x6261605a5958504a, 0xff7a797872717068, 0x4339343332ffffff,
+       0x5c5b514d4c4b4544, 0x757473696564635d, 0x35ffffffff7d7c7b, 0x4f4eff47463a3736,
+}
+var expandAVX512_28_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0x00ffffffffffffff, 0xffffffffff0a0908,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xff0d0c0b01ffffff, 0xffff10ffffffffff,
+}
+
+func expandAVX512_28(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_28_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_28_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_28_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_28_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_28_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_28_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_28_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_28_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_28_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_28_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_28_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xdf87fffff87fffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x2078000007800000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_30_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404,
+       0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010,
+}
+var expandAVX512_30_inShuf0 = [8]uint64{
+       0x0202010101000000, 0xffffffffff020100, 0xffff010101000000, 0xffffffffffff0100,
+       0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000,
+}
+var expandAVX512_30_mat1 = [8]uint64{
+       0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040,
+       0x4040808080808080, 0x8080808080808080, 0x0101010101010101, 0x0202020202020202,
+}
+var expandAVX512_30_inShuf1 = [8]uint64{
+       0xffffffffffff0100, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000,
+       0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0x0404030303020202,
+}
+var expandAVX512_30_mat2 = [8]uint64{
+       0x0202020204040404, 0x0404040404040404, 0x0404080808080808, 0x0808080808080808,
+       0x1010101010101010, 0x1010101010102020, 0x2020202020202020, 0x2020202040404040,
+}
+var expandAVX512_30_inShuf2 = [8]uint64{
+       0xffffffffff040302, 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202,
+       0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffffffffffff0302,
+}
+var expandAVX512_30_mat3 = [8]uint64{
+       0x4040404040404040, 0x4040808080808080, 0x8080808080808080, 0x0101010101010101,
+       0x0101010101010202, 0x0202020202020202, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_30_inShuf3 = [8]uint64{
+       0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303,
+       0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_30_outShufLo = [8]uint64{
+       0x1812111008020100, 0x3832313028222120, 0x58504a4948403a39, 0x04036a6968605a59,
+       0x2423191514130905, 0x3d3c3b3534332925, 0x5d5c5b514d4c4b41, 0x0a7007066d6c6b61,
+}
+var expandAVX512_30_outShufHi0 = [8]uint64{
+       0x504a4948403a3938, 0x70686261605a5958, 0xffffffffff787271, 0x3c3bffffffffffff,
+       0x5c5b514d4c4b413d, 0x757473696564635d, 0xffffffffffffff79, 0x42ff3f3effffffff,
+}
+var expandAVX512_30_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0x1008020100ffffff, 0xffff201a19181211,
+       0xffffffffffffffff, 0xffffffffffffffff, 0x15141309050403ff, 0xff28ffff211d1c1b,
+}
+
+func expandAVX512_30(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_30_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_30_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_30_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_30_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_30_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_30_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_30_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_30_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_30_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_30_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_30_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xb001ffffc007ffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x4ffe00003ff80000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_32_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_32_inShuf0 = [8]uint64{
+       0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000,
+       0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000,
+}
+var expandAVX512_32_inShuf1 = [8]uint64{
+       0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202,
+       0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202,
+}
+var expandAVX512_32_outShufLo = [8]uint64{
+       0x0b0a090803020100, 0x1b1a191813121110, 0x2b2a292823222120, 0x3b3a393833323130,
+       0x0f0e0d0c07060504, 0x1f1e1d1c17161514, 0x2f2e2d2c27262524, 0x3f3e3d3c37363534,
+}
+
+func expandAVX512_32(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_32_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_32_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_32_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_32_outShufLo).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v4.Permute(v8)
+       v10 := v7.Permute(v8)
+       return v9.AsUint64x8(), v10.AsUint64x8()
+}
+
+var expandAVX512_36_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_36_inShuf0 = [8]uint64{
+       0x0101010100000000, 0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000,
+       0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000, 0xffffffffffff0100,
+}
+var expandAVX512_36_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101,
+}
+var expandAVX512_36_inShuf1 = [8]uint64{
+       0x0101010100000000, 0xffffff0100000000, 0xffffffffffffff00, 0xffffffff00000000,
+       0xff02020202010101, 0xffffffffffff0201, 0x0202020201010101, 0x0303030302020202,
+}
+var expandAVX512_36_mat2 = [8]uint64{
+       0x0101010102020202, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808,
+       0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020,
+}
+var expandAVX512_36_inShuf2 = [8]uint64{
+       0xffffffffffff0302, 0x0303030302020202, 0x0303030302020202, 0xffffffffffff0302,
+       0x0303030302020202, 0xffff030302020202, 0xffffffffffffff02, 0xffffffff02020202,
+}
+var expandAVX512_36_outShufLo = [8]uint64{
+       0x1211100803020100, 0x2928201b1a191813, 0x4038333231302b2a, 0x504b4a4948434241,
+       0x070605045b5a5958, 0x1e1d1c1716151409, 0x35342f2e2d2c211f, 0x4c47464544393736,
+}
+var expandAVX512_36_outShufHi = [8]uint64{
+       0x3332313028222120, 0x4a4948403b3a3938, 0x616058535251504b, 0x78706b6a69686362,
+       0x29262524237b7a79, 0x3f3e3d3c37363534, 0x5655544f4e4d4c41, 0x6d6c676665645957,
+}
+
+func expandAVX512_36(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_36_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_36_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_36_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_36_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_36_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_36_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_36_outShufLo).AsUint8x64()
+       v15 := simd.LoadUint64x8(&expandAVX512_36_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v14 := v4.ConcatPermute(v8, v13)
+       v16 := v8.ConcatPermute(v12, v15)
+       return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_40_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_40_inShuf0 = [8]uint64{
+       0x0101010000000000, 0x0101010000000000, 0x0101010000000000, 0x0101010000000000,
+       0x0101010000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000,
+}
+var expandAVX512_40_mat1 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040,
+}
+var expandAVX512_40_inShuf1 = [8]uint64{
+       0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101,
+       0xffffffffffffff01, 0xffff020202020201, 0x0202020101010101, 0x0202020101010101,
+}
+var expandAVX512_40_mat2 = [8]uint64{
+       0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0404040404040404,
+       0x0808080808080808, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_40_inShuf2 = [8]uint64{
+       0x0202020101010101, 0x0303030202020202, 0x0303030202020202, 0xffffff0202020202,
+       0xffffff0202020202, 0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffff0202,
+}
+var expandAVX512_40_mat3 = [8]uint64{
+       0x0101010101010101, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_40_inShuf3 = [8]uint64{
+       0xffffffffffff0303, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_40_outShufLo = [8]uint64{
+       0x0a09080403020100, 0x1814131211100c0b, 0x232221201c1b1a19, 0x31302c2b2a292824,
+       0x3c3b3a3938343332, 0x0f0e0d4140070605, 0x1d51501716154948, 0x6027262559581f1e,
+}
+var expandAVX512_40_outShufHi0 = [8]uint64{
+       0x3938343332313028, 0x44434241403c3b3a, 0x5251504c4b4a4948, 0x605c5b5a59585453,
+       0x2c2b2a2964636261, 0x3e3d69683736352d, 0x797847464571703f, 0x575655ffff4f4e4d,
+}
+var expandAVX512_40_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffff0100ffffff,
+}
+
+func expandAVX512_40(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_40_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_40_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_40_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_40_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_40_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_40_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_40_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_40_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_40_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_40_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_40_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xe7ffffffffffffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x1800000000000000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_44_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_44_inShuf0 = [8]uint64{
+       0x0101010000000000, 0xffffffffffff0100, 0x0101010000000000, 0x0101010000000000,
+       0xffffffffffff0100, 0x0101010000000000, 0xffffff0000000000, 0xffffffffffffff00,
+}
+var expandAVX512_44_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+}
+var expandAVX512_44_inShuf1 = [8]uint64{
+       0xffffff0000000000, 0xffffff0000000000, 0xffffffffffffff00, 0xffffff0000000000,
+       0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xff02020202020101,
+}
+var expandAVX512_44_mat2 = [8]uint64{
+       0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040,
+       0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202,
+}
+var expandAVX512_44_inShuf2 = [8]uint64{
+       0x0202020101010101, 0xffffffffffff0201, 0x0202020101010101, 0x0202020101010101,
+       0xffffffffffff0201, 0xffff020101010101, 0xffffff0202020202, 0xffffffffffffff02,
+}
+var expandAVX512_44_mat3 = [8]uint64{
+       0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x1010101010101010,
+       0x2020202020202020, 0x4040404040404040, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_44_inShuf3 = [8]uint64{
+       0xffffff0202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffff0202,
+       0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_44_outShufLo = [8]uint64{
+       0x1110080403020100, 0x1c1b1a1918141312, 0x31302c2b2a292820, 0x4342414038343332,
+       0x58504c4b4a494844, 0x600706055c5b5a59, 0x1d69681716150961, 0x2f2e2d2171701f1e,
+}
+var expandAVX512_44_outShufHi0 = [8]uint64{
+       0x4844434241403938, 0x5a59585453525150, 0x6c6b6a6968605c5b, 0xffff787473727170,
+       0xffffffffffffffff, 0x46453e3d3c3b3aff, 0xff57565549ffff47, 0x6d61ffff5f5e5dff,
+}
+var expandAVX512_44_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x0100ffffffffffff,
+       0x0c0b0a0908040302, 0xffffffffffffff10, 0x20ffffffff1918ff, 0xffff2928ffffff21,
+}
+
+func expandAVX512_44(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_44_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_44_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_44_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_44_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_44_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_44_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_44_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_44_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_44_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_44_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_44_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0xce79fe003fffffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x318601ffc0000000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_48_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_48_inShuf0 = [8]uint64{
+       0x0101000000000000, 0x0101000000000000, 0x0101000000000000, 0xffff000000000000,
+       0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000,
+}
+var expandAVX512_48_mat1 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040404040404,
+       0x0808080808080808, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040,
+}
+var expandAVX512_48_inShuf1 = [8]uint64{
+       0xffffffff01010101, 0xffffffff01010101, 0xffffffffffff0101, 0x0202020202020101,
+       0x0202010101010101, 0x0202010101010101, 0x0202010101010101, 0xffff010101010101,
+}
+var expandAVX512_48_mat2 = [8]uint64{
+       0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0808080808080808,
+       0x1010101010101010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_48_inShuf2 = [8]uint64{
+       0xffff010101010101, 0xffff020202020202, 0xffff020202020202, 0xffffffff02020202,
+       0xffffffff02020202, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_48_outShufLo = [8]uint64{
+       0x0908050403020100, 0x131211100d0c0b0a, 0x1d1c1b1a19181514, 0x2928252423222120,
+       0x333231302d2c2b2a, 0x3d3c3b3a39383534, 0x0f0e434241400706, 0x515017164b4a4948,
+}
+var expandAVX512_48_outShufHi = [8]uint64{
+       0x2524232221201918, 0x31302d2c2b2a2928, 0x3b3a393835343332, 0x4544434241403d3c,
+       0x51504d4c4b4a4948, 0x1d1c1b1a55545352, 0x5b5a595827261f1e, 0x3736636261602f2e,
+}
+
+func expandAVX512_48(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_48_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_48_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_48_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_48_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_48_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_48_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_48_outShufLo).AsUint8x64()
+       v15 := simd.LoadUint64x8(&expandAVX512_48_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v14 := v4.ConcatPermute(v8, v13)
+       v16 := v8.ConcatPermute(v12, v15)
+       return v14.AsUint64x8(), v16.AsUint64x8()
+}
+
+var expandAVX512_52_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_52_inShuf0 = [8]uint64{
+       0x0101000000000000, 0xffffffffffff0100, 0x0101000000000000, 0xffff000000000000,
+       0xffffffffffffff00, 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00,
+}
+var expandAVX512_52_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0101010101010101, 0x0202020202020202, 0x0202020202020202, 0x0404040404040404,
+}
+var expandAVX512_52_inShuf1 = [8]uint64{
+       0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00, 0xffff000000000000,
+       0xffffffff01010101, 0xffffffffff010101, 0xff02020202020201, 0x0202010101010101,
+}
+var expandAVX512_52_mat2 = [8]uint64{
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+}
+var expandAVX512_52_inShuf2 = [8]uint64{
+       0xffffffffffff0201, 0x0202010101010101, 0xffff010101010101, 0xffffffffffffff01,
+       0xffff010101010101, 0xffff010101010101, 0xffffffffffffff01, 0xffff010101010101,
+}
+var expandAVX512_52_mat3 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0404040404040404, 0x0808080808080808,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_52_inShuf3 = [8]uint64{
+       0xffff020202020202, 0xffffffffffffff02, 0xffffffff02020202, 0xffffffffffff0202,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_52_outShufLo = [8]uint64{
+       0x1008050403020100, 0x1a19181514131211, 0x2b2a2928201d1c1b, 0x3534333231302d2c,
+       0x4845444342414038, 0x5958504d4c4b4a49, 0x616007065d5c5b5a, 0x6a69681716096362,
+}
+var expandAVX512_52_outShufHi0 = [8]uint64{
+       0x403d3c3b3a393830, 0x51504d4c4b4a4948, 0x6261605855545352, 0x6c6b6a6968656463,
+       0x7d7c7b7a7978706d, 0x31ffffffffffffff, 0xff3f3e3635343332, 0xffff4f4e41ffffff,
+}
+var expandAVX512_52_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xff08050403020100, 0x10ffffffffffffff, 0x1918ffffff131211,
+}
+
+func expandAVX512_52(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_52_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_52_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_52_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_52_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_52_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_52_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_52_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_52_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_52_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_52_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_52_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0x387f80ffffffffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0xc7807f0000000000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_56_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_56_inShuf0 = [8]uint64{
+       0x0100000000000000, 0x0100000000000000, 0xff00000000000000, 0xff00000000000000,
+       0xff00000000000000, 0xff00000000000000, 0xff00000000000000, 0xff00000000000000,
+}
+var expandAVX512_56_inShuf1 = [8]uint64{
+       0xffff010101010101, 0x0202010101010101, 0x0201010101010101, 0xff01010101010101,
+       0xff01010101010101, 0xff01010101010101, 0xff01010101010101, 0xff01010101010101,
+}
+var expandAVX512_56_mat2 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0000000000000000,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_56_inShuf2 = [8]uint64{
+       0xff02020202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_56_outShufLo = [8]uint64{
+       0x0806050403020100, 0x11100e0d0c0b0a09, 0x1a19181615141312, 0x232221201e1d1c1b,
+       0x2c2b2a2928262524, 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0f45444342414007,
+}
+var expandAVX512_56_outShufHi = [8]uint64{
+       0x11100d0c0b0a0908, 0x1a19181615141312, 0x232221201e1d1c1b, 0x2c2b2a2928262524,
+       0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0e46454443424140, 0x50174c4b4a49480f,
+}
+
+func expandAVX512_56(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_56_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_56_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_56_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_56_mat2).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_56_inShuf2).AsUint8x64()
+       v12 := simd.LoadUint64x8(&expandAVX512_56_outShufLo).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_56_outShufHi).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v10 := v0.Permute(v9)
+       v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0)
+       v13 := v4.ConcatPermute(v7, v12)
+       v15 := v7.ConcatPermute(v11, v14)
+       return v13.AsUint64x8(), v15.AsUint64x8()
+}
+
+var expandAVX512_60_mat0 = [8]uint64{
+       0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404,
+       0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020,
+}
+var expandAVX512_60_inShuf0 = [8]uint64{
+       0x0100000000000000, 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000,
+       0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00,
+}
+var expandAVX512_60_mat1 = [8]uint64{
+       0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080,
+       0x0101010101010101, 0x0101010101010101, 0x0101010102020202, 0x0202020202020202,
+}
+var expandAVX512_60_inShuf1 = [8]uint64{
+       0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00, 0xff00000000000000,
+       0xffffffffff010101, 0x0202020202010101, 0xffffffffffff0201, 0xff01010101010101,
+}
+var expandAVX512_60_mat2 = [8]uint64{
+       0x0404040404040404, 0x0404040408080808, 0x0808080808080808, 0x1010101010101010,
+       0x1010101020202020, 0x2020202020202020, 0x4040404040404040, 0x4040404080808080,
+}
+var expandAVX512_60_inShuf2 = [8]uint64{
+       0xff01010101010101, 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101,
+       0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101, 0xffffffffffffff01,
+}
+var expandAVX512_60_mat3 = [8]uint64{
+       0x8080808080808080, 0x0101010101010101, 0x0000000000000000, 0x0000000000000000,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_60_inShuf3 = [8]uint64{
+       0xff01010101010101, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+}
+var expandAVX512_60_outShufLo = [8]uint64{
+       0x0806050403020100, 0x1816151413121110, 0x28201e1d1c1b1a19, 0x31302e2d2c2b2a29,
+       0x4140383635343332, 0x4a49484645444342, 0x5a5958504e4d4c4b, 0x626160075e5d5c5b,
+}
+var expandAVX512_60_outShufHi0 = [8]uint64{
+       0x3b3a3938302a2928, 0x44434241403e3d3c, 0x5453525150484645, 0x5d5c5b5a59585655,
+       0x6d6c6b6a6968605e, 0x767574737271706e, 0xffffffffffffff78, 0x31ffff2f2e2d2c2b,
+}
+var expandAVX512_60_outShufHi1 = [8]uint64{
+       0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
+       0xffffffffffffffff, 0xffffffffffffffff, 0x06050403020100ff, 0xff0908ffffffffff,
+}
+
+func expandAVX512_60(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_60_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_60_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_60_mat1).AsUint8x64()
+       v6 := simd.LoadUint64x8(&expandAVX512_60_inShuf1).AsUint8x64()
+       v9 := simd.LoadUint64x8(&expandAVX512_60_mat2).AsUint8x64()
+       v10 := simd.LoadUint64x8(&expandAVX512_60_inShuf2).AsUint8x64()
+       v13 := simd.LoadUint64x8(&expandAVX512_60_mat3).AsUint8x64()
+       v14 := simd.LoadUint64x8(&expandAVX512_60_inShuf3).AsUint8x64()
+       v17 := simd.LoadUint64x8(&expandAVX512_60_outShufLo).AsUint8x64()
+       v19 := simd.LoadUint64x8(&expandAVX512_60_outShufHi0).AsUint8x64()
+       v20 := simd.LoadUint64x8(&expandAVX512_60_outShufHi1).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v7 := v0.Permute(v6)
+       v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0)
+       v11 := v0.Permute(v10)
+       v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0)
+       v15 := v0.Permute(v14)
+       v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0)
+       v18 := v4.ConcatPermute(v8, v17)
+       u0 := uint64(0x9f01ffffffffffff)
+       m0 := simd.Mask8x64FromBits(u0)
+       v21 := v8.ConcatPermute(v12, v19).Masked(m0)
+       u1 := uint64(0x60fe000000000000)
+       m1 := simd.Mask8x64FromBits(u1)
+       v22 := v16.Permute(v20).Masked(m1)
+       v23 := v21.Or(v22)
+       return v18.AsUint64x8(), v23.AsUint64x8()
+}
+
+var expandAVX512_64_mat0 = [8]uint64{
+       0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808,
+       0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080,
+}
+var expandAVX512_64_inShuf0 = [8]uint64{
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+       0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+}
+var expandAVX512_64_inShuf1 = [8]uint64{
+       0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101,
+       0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101,
+}
+var expandAVX512_64_outShufLo = [8]uint64{
+       0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x1716151413121110, 0x1f1e1d1c1b1a1918,
+       0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x3736353433323130, 0x3f3e3d3c3b3a3938,
+}
+
+func expandAVX512_64(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       v1 := simd.LoadUint64x8(&expandAVX512_64_mat0).AsUint8x64()
+       v2 := simd.LoadUint64x8(&expandAVX512_64_inShuf0).AsUint8x64()
+       v5 := simd.LoadUint64x8(&expandAVX512_64_inShuf1).AsUint8x64()
+       v8 := simd.LoadUint64x8(&expandAVX512_64_outShufLo).AsUint8x64()
+       v3 := v0.Permute(v2)
+       v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v6 := v0.Permute(v5)
+       v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0)
+       v9 := v4.Permute(v8)
+       v10 := v7.Permute(v8)
+       return v9.AsUint64x8(), v10.AsUint64x8()
+}
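
Each generated kernel above follows the same shape: a byte permute lines up the packed input bits, a Galois-field affine transform (GaloisFieldAffineTransform in the Go SIMD code, VGF2P8AFFINEQB in the assembly below) replicates the selected bits across whole output bytes, and one or two output shuffles, merged under a mask where two are needed, assemble the two 512-bit results. As a hedged illustration only, the scalar sketch below shows the expansion an expandAVX512_n kernel appears to compute: each of the first 1024/n input bits is repeated n times across the 1024 output bits. The name expandScalarRef and the treatment of a trailing partial run when n does not divide 1024 are assumptions made for this sketch, not part of the CL.

	// expandScalarRef is an illustrative scalar reference, not part of this CL:
	// each of the first 1024/n bits of *src is replicated n times, filling the
	// 512-bit lo half first and then the 512-bit hi half.
	func expandScalarRef(src *[8]uint64, n int) (lo, hi [8]uint64) {
		for i := 0; i < 1024/n; i++ {
			if src[i/64]>>(i%64)&1 == 0 {
				continue // source bit i is clear; its n output bits stay zero
			}
			for j := i * n; j < (i+1)*n; j++ {
				if j < 512 {
					lo[j/64] |= 1 << (j % 64)
				} else {
					hi[(j-512)/64] |= 1 << ((j - 512) % 64)
				}
			}
		}
		return lo, hi
	}

The assembly forms that follow mirror the Go SIMD versions above, taking the source pointer in AX and leaving the two 512-bit results in Z1 and Z2.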
diff --git a/src/internal/runtime/gc/scan/expanders_amd64.s b/src/internal/runtime/gc/scan/expanders_amd64.s
new file mode 100644 (file)
index 0000000..c90d715
--- /dev/null
@@ -0,0 +1,2631 @@
+// Code generated by mkasm.go. DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+GLOBL ·gcExpandersAVX512Asm(SB), RODATA, $0x220
+DATA  ·gcExpandersAVX512Asm+0x00(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x08(SB)/8, $expandAVX512Asm_1<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x10(SB)/8, $expandAVX512Asm_2<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x18(SB)/8, $expandAVX512Asm_3<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x20(SB)/8, $expandAVX512Asm_4<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x28(SB)/8, $expandAVX512Asm_6<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x30(SB)/8, $expandAVX512Asm_8<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x38(SB)/8, $expandAVX512Asm_10<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x40(SB)/8, $expandAVX512Asm_12<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x48(SB)/8, $expandAVX512Asm_14<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x50(SB)/8, $expandAVX512Asm_16<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x58(SB)/8, $expandAVX512Asm_18<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x60(SB)/8, $expandAVX512Asm_20<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x68(SB)/8, $expandAVX512Asm_22<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x70(SB)/8, $expandAVX512Asm_24<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x78(SB)/8, $expandAVX512Asm_26<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x80(SB)/8, $expandAVX512Asm_28<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x88(SB)/8, $expandAVX512Asm_30<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x90(SB)/8, $expandAVX512Asm_32<>(SB)
+DATA  ·gcExpandersAVX512Asm+0x98(SB)/8, $expandAVX512Asm_36<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xa0(SB)/8, $expandAVX512Asm_40<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xa8(SB)/8, $expandAVX512Asm_44<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xb0(SB)/8, $expandAVX512Asm_48<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xb8(SB)/8, $expandAVX512Asm_52<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xc0(SB)/8, $expandAVX512Asm_56<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xc8(SB)/8, $expandAVX512Asm_60<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xd0(SB)/8, $expandAVX512Asm_64<>(SB)
+DATA  ·gcExpandersAVX512Asm+0xd8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0xe0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0xe8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0xf0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0xf8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x100(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x108(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x110(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x118(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x120(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x128(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x130(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x138(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x140(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x148(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x150(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x158(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x160(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x168(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x170(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x178(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x180(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x188(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x190(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x198(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1a0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1a8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1b0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1b8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1c0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1c8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1d0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1d8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1e0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1e8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1f0(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x1f8(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x200(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x208(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x210(SB)/8, $0
+DATA  ·gcExpandersAVX512Asm+0x218(SB)/8, $0
+
+TEXT expandAVX512Asm_1<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 (AX), Z1
+       VMOVDQU64 64(AX), Z2
+       RET
+
+GLOBL expandAVX512Asm_2_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_2_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_2_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_2_inShuf0<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_2_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_2_inShuf0<>+0x20(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_2_inShuf0<>+0x28(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_2_inShuf0<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_2_inShuf0<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918
+
+GLOBL expandAVX512Asm_2_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_2_mat0<>+0x00(SB)/8, $0x0101020204040808
+DATA  expandAVX512Asm_2_mat0<>+0x08(SB)/8, $0x1010202040408080
+DATA  expandAVX512Asm_2_mat0<>+0x10(SB)/8, $0x0101020204040808
+DATA  expandAVX512Asm_2_mat0<>+0x18(SB)/8, $0x1010202040408080
+DATA  expandAVX512Asm_2_mat0<>+0x20(SB)/8, $0x0101020204040808
+DATA  expandAVX512Asm_2_mat0<>+0x28(SB)/8, $0x1010202040408080
+DATA  expandAVX512Asm_2_mat0<>+0x30(SB)/8, $0x0101020204040808
+DATA  expandAVX512Asm_2_mat0<>+0x38(SB)/8, $0x1010202040408080
+
+GLOBL expandAVX512Asm_2_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_2_inShuf1<>+0x00(SB)/8, $0x2726252423222120
+DATA  expandAVX512Asm_2_inShuf1<>+0x08(SB)/8, $0x2726252423222120
+DATA  expandAVX512Asm_2_inShuf1<>+0x10(SB)/8, $0x2f2e2d2c2b2a2928
+DATA  expandAVX512Asm_2_inShuf1<>+0x18(SB)/8, $0x2f2e2d2c2b2a2928
+DATA  expandAVX512Asm_2_inShuf1<>+0x20(SB)/8, $0x3736353433323130
+DATA  expandAVX512Asm_2_inShuf1<>+0x28(SB)/8, $0x3736353433323130
+DATA  expandAVX512Asm_2_inShuf1<>+0x30(SB)/8, $0x3f3e3d3c3b3a3938
+DATA  expandAVX512Asm_2_inShuf1<>+0x38(SB)/8, $0x3f3e3d3c3b3a3938
+
+GLOBL expandAVX512Asm_2_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_2_outShufLo+0x00(SB)/8, $0x0b030a0209010800
+DATA  expandAVX512Asm_2_outShufLo+0x08(SB)/8, $0x0f070e060d050c04
+DATA  expandAVX512Asm_2_outShufLo+0x10(SB)/8, $0x1b131a1219111810
+DATA  expandAVX512Asm_2_outShufLo+0x18(SB)/8, $0x1f171e161d151c14
+DATA  expandAVX512Asm_2_outShufLo+0x20(SB)/8, $0x2b232a2229212820
+DATA  expandAVX512Asm_2_outShufLo+0x28(SB)/8, $0x2f272e262d252c24
+DATA  expandAVX512Asm_2_outShufLo+0x30(SB)/8, $0x3b333a3239313830
+DATA  expandAVX512Asm_2_outShufLo+0x38(SB)/8, $0x3f373e363d353c34
+
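+// expandAVX512Asm_2 and the fixed-factor kernels that follow share one
+// generated shape (roughly): VPERMB with the inShuf tables fans the input
+// bytes at (AX) out to wherever their bits are needed, VGF2P8AFFINEQB against
+// the mat constants performs the GF(2) bit-level replication that turns each
+// input bit into N consecutive output bits, and a final byte permute with the
+// outShuf tables orders the result into the Z1/Z2 output vectors. Here N=2,
+// so 64 input bytes become 128 output bytes.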
+TEXT expandAVX512Asm_2<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_2_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_2_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512Asm_2_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_2_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_3_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_3_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_3_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_3_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_3_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_3_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_3_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_3_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_3_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_3_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_3_mat0<>+0x00(SB)/8, $0x0101010202020404
+DATA  expandAVX512Asm_3_mat0<>+0x08(SB)/8, $0x0408080810101020
+DATA  expandAVX512Asm_3_mat0<>+0x10(SB)/8, $0x2020404040808080
+DATA  expandAVX512Asm_3_mat0<>+0x18(SB)/8, $0x0101010202020404
+DATA  expandAVX512Asm_3_mat0<>+0x20(SB)/8, $0x0408080810101020
+DATA  expandAVX512Asm_3_mat0<>+0x28(SB)/8, $0x2020404040808080
+DATA  expandAVX512Asm_3_mat0<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_3_mat0<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_3_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_3_inShuf1<>+0x00(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_3_inShuf1<>+0x08(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_3_inShuf1<>+0x10(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_3_inShuf1<>+0x18(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_3_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_3_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_3_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_3_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_3_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_3_inShuf2<>+0x00(SB)/8, $0x2726252423222120
+DATA  expandAVX512Asm_3_inShuf2<>+0x08(SB)/8, $0x2726252423222120
+DATA  expandAVX512Asm_3_inShuf2<>+0x10(SB)/8, $0x2726252423222120
+DATA  expandAVX512Asm_3_inShuf2<>+0x18(SB)/8, $0xffffffffff2a2928
+DATA  expandAVX512Asm_3_inShuf2<>+0x20(SB)/8, $0xffffffffff2a2928
+DATA  expandAVX512Asm_3_inShuf2<>+0x28(SB)/8, $0xffffffffffff2928
+DATA  expandAVX512Asm_3_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_3_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_3_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_3_outShufLo+0x00(SB)/8, $0x0a02110901100800
+DATA  expandAVX512Asm_3_outShufLo+0x08(SB)/8, $0x05140c04130b0312
+DATA  expandAVX512Asm_3_outShufLo+0x10(SB)/8, $0x170f07160e06150d
+DATA  expandAVX512Asm_3_outShufLo+0x18(SB)/8, $0x221a292119282018
+DATA  expandAVX512Asm_3_outShufLo+0x20(SB)/8, $0x1d2c241c2b231b2a
+DATA  expandAVX512Asm_3_outShufLo+0x28(SB)/8, $0x2f271f2e261e2d25
+DATA  expandAVX512Asm_3_outShufLo+0x30(SB)/8, $0x4a42514941504840
+DATA  expandAVX512Asm_3_outShufLo+0x38(SB)/8, $0x45544c44534b4352
+
+GLOBL expandAVX512Asm_3_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_3_outShufHi+0x00(SB)/8, $0x170f07160e06150d
+DATA  expandAVX512Asm_3_outShufHi+0x08(SB)/8, $0x221a292119282018
+DATA  expandAVX512Asm_3_outShufHi+0x10(SB)/8, $0x1d2c241c2b231b2a
+DATA  expandAVX512Asm_3_outShufHi+0x18(SB)/8, $0x2f271f2e261e2d25
+DATA  expandAVX512Asm_3_outShufHi+0x20(SB)/8, $0x4a42514941504840
+DATA  expandAVX512Asm_3_outShufHi+0x28(SB)/8, $0x45544c44534b4352
+DATA  expandAVX512Asm_3_outShufHi+0x30(SB)/8, $0x574f47564e46554d
+DATA  expandAVX512Asm_3_outShufHi+0x38(SB)/8, $0x625a696159686058
+
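+// expandAVX512Asm_3: for factors whose output no longer maps onto the simple
+// two-stage layout above, the generator appears to emit three inShuf/affine
+// stages and to merge them with two-source permutes (VPERMI2B), using separate
+// outShufLo and outShufHi index tables for the low (Z1) and high (Z2) outputs.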
+TEXT expandAVX512Asm_3<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_3_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_3_mat0<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_3_inShuf1<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_3_inShuf2<>(SB), Z5
+       VMOVDQU64 expandAVX512Asm_3_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_3_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z6
+       VPERMB Z6, Z0, Z0
+       VGF2P8AFFINEQB $0, Z3, Z0, Z0
+       VPERMB Z6, Z4, Z4
+       VGF2P8AFFINEQB $0, Z3, Z4, Z4
+       VPERMB Z6, Z5, Z5
+       VGF2P8AFFINEQB $0, Z3, Z5, Z3
+       VPERMI2B Z4, Z0, Z1
+       VPERMI2B Z3, Z4, Z2
+       RET
+
+GLOBL expandAVX512Asm_4_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_4_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_4_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_4_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_4_inShuf0<>+0x18(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_4_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_4_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_4_inShuf0<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_4_inShuf0<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908
+
+GLOBL expandAVX512Asm_4_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_4_mat0<>+0x00(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_4_mat0<>+0x08(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_4_mat0<>+0x10(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_4_mat0<>+0x18(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_4_mat0<>+0x20(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_4_mat0<>+0x28(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_4_mat0<>+0x30(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_4_mat0<>+0x38(SB)/8, $0x4040404080808080
+
+GLOBL expandAVX512Asm_4_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_4_inShuf1<>+0x00(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_4_inShuf1<>+0x08(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_4_inShuf1<>+0x10(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_4_inShuf1<>+0x18(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_4_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_4_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_4_inShuf1<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_4_inShuf1<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918
+
+GLOBL expandAVX512Asm_4_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_4_outShufLo+0x00(SB)/8, $0x1911090118100800
+DATA  expandAVX512Asm_4_outShufLo+0x08(SB)/8, $0x1b130b031a120a02
+DATA  expandAVX512Asm_4_outShufLo+0x10(SB)/8, $0x1d150d051c140c04
+DATA  expandAVX512Asm_4_outShufLo+0x18(SB)/8, $0x1f170f071e160e06
+DATA  expandAVX512Asm_4_outShufLo+0x20(SB)/8, $0x3931292138302820
+DATA  expandAVX512Asm_4_outShufLo+0x28(SB)/8, $0x3b332b233a322a22
+DATA  expandAVX512Asm_4_outShufLo+0x30(SB)/8, $0x3d352d253c342c24
+DATA  expandAVX512Asm_4_outShufLo+0x38(SB)/8, $0x3f372f273e362e26
+
+TEXT expandAVX512Asm_4<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_4_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_4_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512Asm_4_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_4_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_6_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_6_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_6_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_6_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_6_inShuf0<>+0x18(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_6_inShuf0<>+0x20(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_6_inShuf0<>+0x28(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_6_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_6_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_6_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_6_mat0<>+0x00(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_6_mat0<>+0x08(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_6_mat0<>+0x10(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_6_mat0<>+0x18(SB)/8, $0x1010101010102020
+DATA  expandAVX512Asm_6_mat0<>+0x20(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_6_mat0<>+0x28(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_6_mat0<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_6_mat0<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_6_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_6_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_6_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_6_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_6_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_6_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_6_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_6_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_6_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_6_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_6_inShuf2<>+0x00(SB)/8, $0xffff151413121110
+DATA  expandAVX512Asm_6_inShuf2<>+0x08(SB)/8, $0xffff151413121110
+DATA  expandAVX512Asm_6_inShuf2<>+0x10(SB)/8, $0xffffff1413121110
+DATA  expandAVX512Asm_6_inShuf2<>+0x18(SB)/8, $0xffffff1413121110
+DATA  expandAVX512Asm_6_inShuf2<>+0x20(SB)/8, $0xffffff1413121110
+DATA  expandAVX512Asm_6_inShuf2<>+0x28(SB)/8, $0xffffff1413121110
+DATA  expandAVX512Asm_6_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_6_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_6_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_6_outShufLo+0x00(SB)/8, $0x0901282018100800
+DATA  expandAVX512Asm_6_outShufLo+0x08(SB)/8, $0x1a120a0229211911
+DATA  expandAVX512Asm_6_outShufLo+0x10(SB)/8, $0x2b231b130b032a22
+DATA  expandAVX512Asm_6_outShufLo+0x18(SB)/8, $0x0d052c241c140c04
+DATA  expandAVX512Asm_6_outShufLo+0x20(SB)/8, $0x1e160e062d251d15
+DATA  expandAVX512Asm_6_outShufLo+0x28(SB)/8, $0x2f271f170f072e26
+DATA  expandAVX512Asm_6_outShufLo+0x30(SB)/8, $0x4941686058504840
+DATA  expandAVX512Asm_6_outShufLo+0x38(SB)/8, $0x5a524a4269615951
+
+GLOBL expandAVX512Asm_6_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_6_outShufHi+0x00(SB)/8, $0x2b231b130b032a22
+DATA  expandAVX512Asm_6_outShufHi+0x08(SB)/8, $0x0d052c241c140c04
+DATA  expandAVX512Asm_6_outShufHi+0x10(SB)/8, $0x1e160e062d251d15
+DATA  expandAVX512Asm_6_outShufHi+0x18(SB)/8, $0x2f271f170f072e26
+DATA  expandAVX512Asm_6_outShufHi+0x20(SB)/8, $0x4941686058504840
+DATA  expandAVX512Asm_6_outShufHi+0x28(SB)/8, $0x5a524a4269615951
+DATA  expandAVX512Asm_6_outShufHi+0x30(SB)/8, $0x6b635b534b436a62
+DATA  expandAVX512Asm_6_outShufHi+0x38(SB)/8, $0x4d456c645c544c44
+
+TEXT expandAVX512Asm_6<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_6_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_6_mat0<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_6_inShuf1<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_6_inShuf2<>(SB), Z5
+       VMOVDQU64 expandAVX512Asm_6_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_6_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z6
+       VPERMB Z6, Z0, Z0
+       VGF2P8AFFINEQB $0, Z3, Z0, Z0
+       VPERMB Z6, Z4, Z4
+       VGF2P8AFFINEQB $0, Z3, Z4, Z4
+       VPERMB Z6, Z5, Z5
+       VGF2P8AFFINEQB $0, Z3, Z5, Z3
+       VPERMI2B Z4, Z0, Z1
+       VPERMI2B Z3, Z4, Z2
+       RET
+
+GLOBL expandAVX512Asm_8_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_8_inShuf0<>+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x08(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x10(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x18(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x20(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x28(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x30(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_8_inShuf0<>+0x38(SB)/8, $0x0706050403020100
+
+GLOBL expandAVX512Asm_8_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_8_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_8_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_8_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_8_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_8_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_8_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_8_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_8_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_8_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_8_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_8_inShuf1<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908
+
+GLOBL expandAVX512Asm_8_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_8_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512Asm_8_outShufLo+0x08(SB)/8, $0x3931292119110901
+DATA  expandAVX512Asm_8_outShufLo+0x10(SB)/8, $0x3a322a221a120a02
+DATA  expandAVX512Asm_8_outShufLo+0x18(SB)/8, $0x3b332b231b130b03
+DATA  expandAVX512Asm_8_outShufLo+0x20(SB)/8, $0x3c342c241c140c04
+DATA  expandAVX512Asm_8_outShufLo+0x28(SB)/8, $0x3d352d251d150d05
+DATA  expandAVX512Asm_8_outShufLo+0x30(SB)/8, $0x3e362e261e160e06
+DATA  expandAVX512Asm_8_outShufLo+0x38(SB)/8, $0x3f372f271f170f07
+
+TEXT expandAVX512Asm_8<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_8_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_8_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512Asm_8_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_8_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_10_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_inShuf0<>+0x00(SB)/8, $0xff06050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x08(SB)/8, $0xff06050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x10(SB)/8, $0xff06050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x18(SB)/8, $0xff06050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x20(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x28(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x30(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_10_inShuf0<>+0x38(SB)/8, $0xffff050403020100
+
+GLOBL expandAVX512Asm_10_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_10_mat0<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512Asm_10_mat0<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_10_mat0<>+0x18(SB)/8, $0x0404040404040808
+DATA  expandAVX512Asm_10_mat0<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_10_mat0<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_10_mat0<>+0x30(SB)/8, $0x1010202020202020
+DATA  expandAVX512Asm_10_mat0<>+0x38(SB)/8, $0x2020202040404040
+
+GLOBL expandAVX512Asm_10_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_inShuf1<>+0x00(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_10_inShuf1<>+0x08(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_10_inShuf1<>+0x10(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512Asm_10_inShuf1<>+0x18(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512Asm_10_inShuf1<>+0x20(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512Asm_10_inShuf1<>+0x28(SB)/8, $0xff0c0b0a09080706
+DATA  expandAVX512Asm_10_inShuf1<>+0x30(SB)/8, $0xffff0b0a09080706
+DATA  expandAVX512Asm_10_inShuf1<>+0x38(SB)/8, $0xffff0b0a09080706
+
+GLOBL expandAVX512Asm_10_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_mat1<>+0x00(SB)/8, $0x4040404040408080
+DATA  expandAVX512Asm_10_mat1<>+0x08(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_10_mat1<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_10_mat1<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_10_mat1<>+0x20(SB)/8, $0x1010202020202020
+DATA  expandAVX512Asm_10_mat1<>+0x28(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_10_mat1<>+0x30(SB)/8, $0x4040404040408080
+DATA  expandAVX512Asm_10_mat1<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_10_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_inShuf2<>+0x00(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512Asm_10_inShuf2<>+0x08(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512Asm_10_inShuf2<>+0x10(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512Asm_10_inShuf2<>+0x18(SB)/8, $0xffff0c0b0a090807
+DATA  expandAVX512Asm_10_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_10_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_10_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_10_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_10_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_mat2<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_10_mat2<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512Asm_10_mat2<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_10_mat2<>+0x18(SB)/8, $0x0404040404040808
+DATA  expandAVX512Asm_10_mat2<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_10_mat2<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_10_mat2<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_10_mat2<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_10_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512Asm_10_outShufLo+0x08(SB)/8, $0x2921191109014840
+DATA  expandAVX512Asm_10_outShufLo+0x10(SB)/8, $0x1a120a0249413931
+DATA  expandAVX512Asm_10_outShufLo+0x18(SB)/8, $0x0b034a423a322a22
+DATA  expandAVX512Asm_10_outShufLo+0x20(SB)/8, $0x4b433b332b231b13
+DATA  expandAVX512Asm_10_outShufLo+0x28(SB)/8, $0x3c342c241c140c04
+DATA  expandAVX512Asm_10_outShufLo+0x30(SB)/8, $0x2d251d150d054c44
+DATA  expandAVX512Asm_10_outShufLo+0x38(SB)/8, $0x1e160e064d453d35
+
+GLOBL expandAVX512Asm_10_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_10_outShufHi+0x00(SB)/8, $0x4840383028201810
+DATA  expandAVX512Asm_10_outShufHi+0x08(SB)/8, $0x3931292119115850
+DATA  expandAVX512Asm_10_outShufHi+0x10(SB)/8, $0x2a221a1259514941
+DATA  expandAVX512Asm_10_outShufHi+0x18(SB)/8, $0x1b135a524a423a32
+DATA  expandAVX512Asm_10_outShufHi+0x20(SB)/8, $0x5b534b433b332b23
+DATA  expandAVX512Asm_10_outShufHi+0x28(SB)/8, $0x4c443c342c241c14
+DATA  expandAVX512Asm_10_outShufHi+0x30(SB)/8, $0x3d352d251d155c54
+DATA  expandAVX512Asm_10_outShufHi+0x38(SB)/8, $0x2e261e165d554d45
+
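+// expandAVX512Asm_10: unlike the smaller factors above, each affine stage here
+// has its own replication constant (mat0/mat1/mat2), used directly as a memory
+// operand of VGF2P8AFFINEQB instead of being held in a register, presumably
+// because the bit pattern does not repeat identically across the three stages.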
+TEXT expandAVX512Asm_10<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_10_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_10_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_10_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_10_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_10_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_10_mat0<>(SB), Z0, Z0
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_10_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_10_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_12_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_inShuf0<>+0x00(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x08(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x10(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x18(SB)/8, $0xffff050403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x20(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x28(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x30(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf0<>+0x38(SB)/8, $0xffffff0403020100
+
+GLOBL expandAVX512Asm_12_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_12_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_12_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_12_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_12_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_12_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_12_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_12_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_12_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_inShuf1<>+0x00(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf1<>+0x08(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf1<>+0x10(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf1<>+0x18(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_12_inShuf1<>+0x20(SB)/8, $0xffff0a0908070605
+DATA  expandAVX512Asm_12_inShuf1<>+0x28(SB)/8, $0xffff0a0908070605
+DATA  expandAVX512Asm_12_inShuf1<>+0x30(SB)/8, $0xffff0a0908070605
+DATA  expandAVX512Asm_12_inShuf1<>+0x38(SB)/8, $0xffff0a0908070605
+
+GLOBL expandAVX512Asm_12_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_12_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_12_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_12_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_12_mat1<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_12_mat1<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_12_mat1<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_12_mat1<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_12_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_inShuf2<>+0x00(SB)/8, $0xffffff0908070605
+DATA  expandAVX512Asm_12_inShuf2<>+0x08(SB)/8, $0xffffff0908070605
+DATA  expandAVX512Asm_12_inShuf2<>+0x10(SB)/8, $0xffffff0908070605
+DATA  expandAVX512Asm_12_inShuf2<>+0x18(SB)/8, $0xffffff0908070605
+DATA  expandAVX512Asm_12_inShuf2<>+0x20(SB)/8, $0xffffff0a09080706
+DATA  expandAVX512Asm_12_inShuf2<>+0x28(SB)/8, $0xffffff0a09080706
+DATA  expandAVX512Asm_12_inShuf2<>+0x30(SB)/8, $0xffffff0a09080706
+DATA  expandAVX512Asm_12_inShuf2<>+0x38(SB)/8, $0xffffff0a09080706
+
+GLOBL expandAVX512Asm_12_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_mat2<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_12_mat2<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_12_mat2<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_12_mat2<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_12_mat2<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_12_mat2<>+0x28(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_12_mat2<>+0x30(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_12_mat2<>+0x38(SB)/8, $0x0404040404040404
+
+GLOBL expandAVX512Asm_12_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512Asm_12_outShufLo+0x08(SB)/8, $0x1911090158504840
+DATA  expandAVX512Asm_12_outShufLo+0x10(SB)/8, $0x5951494139312921
+DATA  expandAVX512Asm_12_outShufLo+0x18(SB)/8, $0x3a322a221a120a02
+DATA  expandAVX512Asm_12_outShufLo+0x20(SB)/8, $0x1b130b035a524a42
+DATA  expandAVX512Asm_12_outShufLo+0x28(SB)/8, $0x5b534b433b332b23
+DATA  expandAVX512Asm_12_outShufLo+0x30(SB)/8, $0x3c342c241c140c04
+DATA  expandAVX512Asm_12_outShufLo+0x38(SB)/8, $0x1d150d055c544c44
+
+GLOBL expandAVX512Asm_12_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_12_outShufHi+0x00(SB)/8, $0x5850484038302820
+DATA  expandAVX512Asm_12_outShufHi+0x08(SB)/8, $0x3931292178706860
+DATA  expandAVX512Asm_12_outShufHi+0x10(SB)/8, $0x7971696159514941
+DATA  expandAVX512Asm_12_outShufHi+0x18(SB)/8, $0x5a524a423a322a22
+DATA  expandAVX512Asm_12_outShufHi+0x20(SB)/8, $0x3b332b237a726a62
+DATA  expandAVX512Asm_12_outShufHi+0x28(SB)/8, $0x7b736b635b534b43
+DATA  expandAVX512Asm_12_outShufHi+0x30(SB)/8, $0x5c544c443c342c24
+DATA  expandAVX512Asm_12_outShufHi+0x38(SB)/8, $0x3d352d257c746c64
+
+TEXT expandAVX512Asm_12<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_12_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_12_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_12_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_12_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_12_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_12_mat0<>(SB), Z0, Z0
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_12_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_12_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_14_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_inShuf0<>+0x00(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x08(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x10(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x18(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x20(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x28(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x30(SB)/8, $0xffffff0403020100
+DATA  expandAVX512Asm_14_inShuf0<>+0x38(SB)/8, $0xffffff0403020100
+
+GLOBL expandAVX512Asm_14_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_14_mat0<>+0x08(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_14_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_14_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_14_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_14_mat0<>+0x28(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_14_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_14_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512Asm_14_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_inShuf1<>+0x00(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_14_inShuf1<>+0x08(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_14_inShuf1<>+0x10(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_14_inShuf1<>+0x18(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_14_inShuf1<>+0x20(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_14_inShuf1<>+0x28(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_14_inShuf1<>+0x30(SB)/8, $0xffffff0807060504
+DATA  expandAVX512Asm_14_inShuf1<>+0x38(SB)/8, $0xffffff0807060504
+
+GLOBL expandAVX512Asm_14_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_mat1<>+0x00(SB)/8, $0x1010101010102020
+DATA  expandAVX512Asm_14_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_14_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_14_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_14_mat1<>+0x20(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_14_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_14_mat1<>+0x30(SB)/8, $0x1010101010102020
+DATA  expandAVX512Asm_14_mat1<>+0x38(SB)/8, $0x2020202020202020
+
+GLOBL expandAVX512Asm_14_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_inShuf2<>+0x00(SB)/8, $0xffffff0807060504
+DATA  expandAVX512Asm_14_inShuf2<>+0x08(SB)/8, $0xffffff0807060504
+DATA  expandAVX512Asm_14_inShuf2<>+0x10(SB)/8, $0xffffff0807060504
+DATA  expandAVX512Asm_14_inShuf2<>+0x18(SB)/8, $0xffffff0807060504
+DATA  expandAVX512Asm_14_inShuf2<>+0x20(SB)/8, $0xffffff0908070605
+DATA  expandAVX512Asm_14_inShuf2<>+0x28(SB)/8, $0xffffff0908070605
+DATA  expandAVX512Asm_14_inShuf2<>+0x30(SB)/8, $0xffffffff08070605
+DATA  expandAVX512Asm_14_inShuf2<>+0x38(SB)/8, $0xffffffff08070605
+
+GLOBL expandAVX512Asm_14_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_mat2<>+0x00(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_14_mat2<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_14_mat2<>+0x10(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_14_mat2<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_14_mat2<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_14_mat2<>+0x28(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_14_mat2<>+0x30(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_14_mat2<>+0x38(SB)/8, $0x0202020204040404
+
+GLOBL expandAVX512Asm_14_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_inShuf3<>+0x00(SB)/8, $0xffffffff08070605
+DATA  expandAVX512Asm_14_inShuf3<>+0x08(SB)/8, $0xffffffff08070605
+DATA  expandAVX512Asm_14_inShuf3<>+0x10(SB)/8, $0xffffffff08070605
+DATA  expandAVX512Asm_14_inShuf3<>+0x18(SB)/8, $0xffffffff08070605
+DATA  expandAVX512Asm_14_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_14_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_14_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_14_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_14_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_mat3<>+0x00(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_14_mat3<>+0x08(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_14_mat3<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_14_mat3<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_14_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_14_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_14_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_14_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_14_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_outShufLo+0x00(SB)/8, $0x3830282018100800
+DATA  expandAVX512Asm_14_outShufLo+0x08(SB)/8, $0x0901686058504840
+DATA  expandAVX512Asm_14_outShufLo+0x10(SB)/8, $0x4941393129211911
+DATA  expandAVX512Asm_14_outShufLo+0x18(SB)/8, $0x1a120a0269615951
+DATA  expandAVX512Asm_14_outShufLo+0x20(SB)/8, $0x5a524a423a322a22
+DATA  expandAVX512Asm_14_outShufLo+0x28(SB)/8, $0x2b231b130b036a62
+DATA  expandAVX512Asm_14_outShufLo+0x30(SB)/8, $0x6b635b534b433b33
+DATA  expandAVX512Asm_14_outShufLo+0x38(SB)/8, $0x3c342c241c140c04
+
+GLOBL expandAVX512Asm_14_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_outShufHi0+0x00(SB)/8, $0x6860585048403830
+DATA  expandAVX512Asm_14_outShufHi0+0x08(SB)/8, $0x3931ffffffff7870
+DATA  expandAVX512Asm_14_outShufHi0+0x10(SB)/8, $0x7971696159514941
+DATA  expandAVX512Asm_14_outShufHi0+0x18(SB)/8, $0x4a423a32ffffffff
+DATA  expandAVX512Asm_14_outShufHi0+0x20(SB)/8, $0xffff7a726a625a52
+DATA  expandAVX512Asm_14_outShufHi0+0x28(SB)/8, $0x5b534b433b33ffff
+DATA  expandAVX512Asm_14_outShufHi0+0x30(SB)/8, $0xffffffff7b736b63
+DATA  expandAVX512Asm_14_outShufHi0+0x38(SB)/8, $0x6c645c544c443c34
+
+GLOBL expandAVX512Asm_14_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_14_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_14_outShufHi1+0x08(SB)/8, $0xffff18100800ffff
+DATA  expandAVX512Asm_14_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_14_outShufHi1+0x18(SB)/8, $0xffffffff19110901
+DATA  expandAVX512Asm_14_outShufHi1+0x20(SB)/8, $0x0a02ffffffffffff
+DATA  expandAVX512Asm_14_outShufHi1+0x28(SB)/8, $0xffffffffffff1a12
+DATA  expandAVX512Asm_14_outShufHi1+0x30(SB)/8, $0x1b130b03ffffffff
+DATA  expandAVX512Asm_14_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
+
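+// expandAVX512Asm_14: the high output needs bytes from three intermediate
+// vectors, more than a single two-source VPERMI2B can index, so it is built
+// from a masked two-source permute (outShufHi0 under K1) plus a masked
+// one-source permute (outShufHi1 under the complementary mask), both
+// zero-masking the unselected lanes, and the two halves are merged into Z2
+// with VPORQ.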
+TEXT expandAVX512Asm_14<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_14_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_14_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_14_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_14_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_14_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_14_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_14_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xff0ffc3ff0ffc3ff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0xf003c00f003c00, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_16_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_16_inShuf0<>+0x00(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x08(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x10(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x18(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x20(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x28(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x30(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_16_inShuf0<>+0x38(SB)/8, $0x0303020201010000
+
+GLOBL expandAVX512Asm_16_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_16_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_16_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_16_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_16_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_16_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_16_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_16_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_16_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_16_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_16_inShuf1<>+0x00(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x08(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x10(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x18(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x20(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x28(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x30(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_16_inShuf1<>+0x38(SB)/8, $0x0707060605050404
+
+GLOBL expandAVX512Asm_16_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_16_outShufLo+0x00(SB)/8, $0x1918111009080100
+DATA  expandAVX512Asm_16_outShufLo+0x08(SB)/8, $0x3938313029282120
+DATA  expandAVX512Asm_16_outShufLo+0x10(SB)/8, $0x1b1a13120b0a0302
+DATA  expandAVX512Asm_16_outShufLo+0x18(SB)/8, $0x3b3a33322b2a2322
+DATA  expandAVX512Asm_16_outShufLo+0x20(SB)/8, $0x1d1c15140d0c0504
+DATA  expandAVX512Asm_16_outShufLo+0x28(SB)/8, $0x3d3c35342d2c2524
+DATA  expandAVX512Asm_16_outShufLo+0x30(SB)/8, $0x1f1e17160f0e0706
+DATA  expandAVX512Asm_16_outShufLo+0x38(SB)/8, $0x3f3e37362f2e2726
+
+TEXT expandAVX512Asm_16<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_16_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_16_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512Asm_16_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_16_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_18_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_inShuf0<>+0x00(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_18_inShuf0<>+0x08(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_18_inShuf0<>+0x10(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_18_inShuf0<>+0x18(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_18_inShuf0<>+0x20(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_18_inShuf0<>+0x28(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_18_inShuf0<>+0x30(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_18_inShuf0<>+0x38(SB)/8, $0xff03020201010000
+
+GLOBL expandAVX512Asm_18_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_18_mat0<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512Asm_18_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_18_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_18_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_18_mat0<>+0x28(SB)/8, $0x0404040404040808
+DATA  expandAVX512Asm_18_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_18_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512Asm_18_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_inShuf1<>+0x00(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_18_inShuf1<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_18_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_18_inShuf1<>+0x18(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_18_inShuf1<>+0x20(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_18_inShuf1<>+0x28(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_18_inShuf1<>+0x30(SB)/8, $0xff06060505040403
+DATA  expandAVX512Asm_18_inShuf1<>+0x38(SB)/8, $0xffffffff06050403
+
+GLOBL expandAVX512Asm_18_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_mat1<>+0x00(SB)/8, $0x1010202020202020
+DATA  expandAVX512Asm_18_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_18_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_18_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_18_mat1<>+0x20(SB)/8, $0x4040404040408080
+DATA  expandAVX512Asm_18_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_18_mat1<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_18_mat1<>+0x38(SB)/8, $0x1010202020202020
+
+GLOBL expandAVX512Asm_18_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_inShuf2<>+0x00(SB)/8, $0xffffffff06050403
+DATA  expandAVX512Asm_18_inShuf2<>+0x08(SB)/8, $0xffffffff06050403
+DATA  expandAVX512Asm_18_inShuf2<>+0x10(SB)/8, $0xffffffff06050403
+DATA  expandAVX512Asm_18_inShuf2<>+0x18(SB)/8, $0xffffffff06050403
+DATA  expandAVX512Asm_18_inShuf2<>+0x20(SB)/8, $0x0606050504040303
+DATA  expandAVX512Asm_18_inShuf2<>+0x28(SB)/8, $0x0707060605050404
+DATA  expandAVX512Asm_18_inShuf2<>+0x30(SB)/8, $0xffffffffff060504
+DATA  expandAVX512Asm_18_inShuf2<>+0x38(SB)/8, $0xffffffffff060504
+
+GLOBL expandAVX512Asm_18_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_mat2<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_18_mat2<>+0x08(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_18_mat2<>+0x10(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_18_mat2<>+0x18(SB)/8, $0x4040404040408080
+DATA  expandAVX512Asm_18_mat2<>+0x20(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_18_mat2<>+0x28(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_18_mat2<>+0x30(SB)/8, $0x0101020202020202
+DATA  expandAVX512Asm_18_mat2<>+0x38(SB)/8, $0x0202020202020202
+
+GLOBL expandAVX512Asm_18_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_inShuf3<>+0x00(SB)/8, $0xffffffffff060504
+DATA  expandAVX512Asm_18_inShuf3<>+0x08(SB)/8, $0xffffffffff060504
+DATA  expandAVX512Asm_18_inShuf3<>+0x10(SB)/8, $0xffffffffff060504
+DATA  expandAVX512Asm_18_inShuf3<>+0x18(SB)/8, $0xffff060605050404
+DATA  expandAVX512Asm_18_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_18_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_18_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_18_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_18_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_mat3<>+0x00(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_18_mat3<>+0x08(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_18_mat3<>+0x10(SB)/8, $0x0404040404040808
+DATA  expandAVX512Asm_18_mat3<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_18_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_18_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_18_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_18_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_18_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_outShufLo+0x00(SB)/8, $0x3028201810080100
+DATA  expandAVX512Asm_18_outShufLo+0x08(SB)/8, $0x6058504840393831
+DATA  expandAVX512Asm_18_outShufLo+0x10(SB)/8, $0x2119110903026968
+DATA  expandAVX512Asm_18_outShufLo+0x18(SB)/8, $0x5149413b3a333229
+DATA  expandAVX512Asm_18_outShufLo+0x20(SB)/8, $0x120a05046b6a6159
+DATA  expandAVX512Asm_18_outShufLo+0x28(SB)/8, $0x423d3c35342a221a
+DATA  expandAVX512Asm_18_outShufLo+0x30(SB)/8, $0x07066d6c625a524a
+DATA  expandAVX512Asm_18_outShufLo+0x38(SB)/8, $0x3e37362b231b130b
+
+GLOBL expandAVX512Asm_18_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_outShufHi0+0x00(SB)/8, $0x6160585048403830
+DATA  expandAVX512Asm_18_outShufHi0+0x08(SB)/8, $0xffffffff78706968
+DATA  expandAVX512Asm_18_outShufHi0+0x10(SB)/8, $0x59514941393231ff
+DATA  expandAVX512Asm_18_outShufHi0+0x18(SB)/8, $0xffff79716b6a6362
+DATA  expandAVX512Asm_18_outShufHi0+0x20(SB)/8, $0x4a423a3433ffffff
+DATA  expandAVX512Asm_18_outShufHi0+0x28(SB)/8, $0x7a726d6c65645a52
+DATA  expandAVX512Asm_18_outShufHi0+0x30(SB)/8, $0x3b3635ffffffffff
+DATA  expandAVX512Asm_18_outShufHi0+0x38(SB)/8, $0x6f6e67665b534b43
+
+GLOBL expandAVX512Asm_18_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_18_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_18_outShufHi1+0x08(SB)/8, $0x18100800ffffffff
+DATA  expandAVX512Asm_18_outShufHi1+0x10(SB)/8, $0xffffffffffffff19
+DATA  expandAVX512Asm_18_outShufHi1+0x18(SB)/8, $0x0901ffffffffffff
+DATA  expandAVX512Asm_18_outShufHi1+0x20(SB)/8, $0xffffffffff1b1a11
+DATA  expandAVX512Asm_18_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_18_outShufHi1+0x30(SB)/8, $0xffffff1d1c120a02
+DATA  expandAVX512Asm_18_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
+
+TEXT expandAVX512Asm_18<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_18_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_18_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_18_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_18_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_18_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_18_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_18_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xffe0fff83ffe0fff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x1f0007c001f000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_20_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_inShuf0<>+0x00(SB)/8, $0x0303020201010000
+DATA  expandAVX512Asm_20_inShuf0<>+0x08(SB)/8, $0xffffffff03020100
+DATA  expandAVX512Asm_20_inShuf0<>+0x10(SB)/8, $0xff03020201010000
+DATA  expandAVX512Asm_20_inShuf0<>+0x18(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_20_inShuf0<>+0x20(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_20_inShuf0<>+0x28(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_20_inShuf0<>+0x30(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_20_inShuf0<>+0x38(SB)/8, $0xffffffffff020100
+
+GLOBL expandAVX512Asm_20_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_20_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_20_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_20_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_20_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_20_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_20_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_20_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_20_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_inShuf1<>+0x00(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_20_inShuf1<>+0x08(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_20_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_20_inShuf1<>+0x18(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_20_inShuf1<>+0x20(SB)/8, $0xff06060505040403
+DATA  expandAVX512Asm_20_inShuf1<>+0x28(SB)/8, $0x0606050504040303
+DATA  expandAVX512Asm_20_inShuf1<>+0x30(SB)/8, $0xffffffff06050403
+DATA  expandAVX512Asm_20_inShuf1<>+0x38(SB)/8, $0xffff050504040303
+
+GLOBL expandAVX512Asm_20_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_20_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_20_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_20_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_20_mat1<>+0x20(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_20_mat1<>+0x28(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_20_mat1<>+0x30(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_20_mat1<>+0x38(SB)/8, $0x0808080808080808
+
+GLOBL expandAVX512Asm_20_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_inShuf2<>+0x00(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_20_inShuf2<>+0x08(SB)/8, $0xffffffffff050403
+DATA  expandAVX512Asm_20_inShuf2<>+0x10(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_20_inShuf2<>+0x18(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_20_inShuf2<>+0x20(SB)/8, $0xffffffffff050403
+DATA  expandAVX512Asm_20_inShuf2<>+0x28(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_20_inShuf2<>+0x30(SB)/8, $0xffff060605050404
+DATA  expandAVX512Asm_20_inShuf2<>+0x38(SB)/8, $0xffffffffff060504
+
+GLOBL expandAVX512Asm_20_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_mat2<>+0x00(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_20_mat2<>+0x08(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_20_mat2<>+0x10(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_20_mat2<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_20_mat2<>+0x20(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_20_mat2<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_20_mat2<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_20_mat2<>+0x38(SB)/8, $0x0101010102020202
+
+GLOBL expandAVX512Asm_20_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_outShufLo+0x00(SB)/8, $0x2019181110080100
+DATA  expandAVX512Asm_20_outShufLo+0x08(SB)/8, $0x4841403831302928
+DATA  expandAVX512Asm_20_outShufLo+0x10(SB)/8, $0x1209030259585049
+DATA  expandAVX512Asm_20_outShufLo+0x18(SB)/8, $0x33322b2a211b1a13
+DATA  expandAVX512Asm_20_outShufLo+0x20(SB)/8, $0x5b5a514b4a434239
+DATA  expandAVX512Asm_20_outShufLo+0x28(SB)/8, $0x221d1c15140a0504
+DATA  expandAVX512Asm_20_outShufLo+0x30(SB)/8, $0x4c45443a35342d2c
+DATA  expandAVX512Asm_20_outShufLo+0x38(SB)/8, $0x160b07065d5c524d
+
+GLOBL expandAVX512Asm_20_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_20_outShufHi+0x00(SB)/8, $0x4140393830292820
+DATA  expandAVX512Asm_20_outShufHi+0x08(SB)/8, $0x6968605958515048
+DATA  expandAVX512Asm_20_outShufHi+0x10(SB)/8, $0x312b2a2221787170
+DATA  expandAVX512Asm_20_outShufHi+0x18(SB)/8, $0x5a53524943423b3a
+DATA  expandAVX512Asm_20_outShufHi+0x20(SB)/8, $0x237973726b6a615b
+DATA  expandAVX512Asm_20_outShufHi+0x28(SB)/8, $0x45443d3c322d2c24
+DATA  expandAVX512Asm_20_outShufHi+0x30(SB)/8, $0x6d6c625d5c55544a
+DATA  expandAVX512Asm_20_outShufHi+0x38(SB)/8, $0x332f2e26257a7574
+
+TEXT expandAVX512Asm_20<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_20_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_20_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_20_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_20_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_20_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_20_mat0<>(SB), Z0, Z0
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_20_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_20_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_22_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_inShuf0<>+0x00(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_22_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_22_inShuf0<>+0x10(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_22_inShuf0<>+0x18(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_22_inShuf0<>+0x20(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_22_inShuf0<>+0x28(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_22_inShuf0<>+0x30(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_22_inShuf0<>+0x38(SB)/8, $0xffff020201010000
+
+GLOBL expandAVX512Asm_22_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_22_mat0<>+0x08(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_22_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_22_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_22_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_22_mat0<>+0x28(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_22_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_22_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512Asm_22_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_inShuf1<>+0x00(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_22_inShuf1<>+0x08(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_22_inShuf1<>+0x10(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_22_inShuf1<>+0x18(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_22_inShuf1<>+0x20(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_22_inShuf1<>+0x28(SB)/8, $0xffffffff01010000
+DATA  expandAVX512Asm_22_inShuf1<>+0x30(SB)/8, $0xffff040403030202
+DATA  expandAVX512Asm_22_inShuf1<>+0x38(SB)/8, $0xffff050504040303
+
+GLOBL expandAVX512Asm_22_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_mat1<>+0x00(SB)/8, $0x1010101010102020
+DATA  expandAVX512Asm_22_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_22_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_22_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_22_mat1<>+0x20(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_22_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_22_mat1<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_22_mat1<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512Asm_22_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_inShuf2<>+0x00(SB)/8, $0xffffffffff050403
+DATA  expandAVX512Asm_22_inShuf2<>+0x08(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_22_inShuf2<>+0x10(SB)/8, $0xffffffffff050403
+DATA  expandAVX512Asm_22_inShuf2<>+0x18(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_22_inShuf2<>+0x20(SB)/8, $0xffffffffff050403
+DATA  expandAVX512Asm_22_inShuf2<>+0x28(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_22_inShuf2<>+0x30(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_22_inShuf2<>+0x38(SB)/8, $0xffffffffff050403
+
+GLOBL expandAVX512Asm_22_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_mat2<>+0x00(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_22_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_22_mat2<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_22_mat2<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_22_mat2<>+0x20(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_22_mat2<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_22_mat2<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_22_mat2<>+0x38(SB)/8, $0x1010101010102020
+
+GLOBL expandAVX512Asm_22_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_inShuf3<>+0x00(SB)/8, $0xffff050504040303
+DATA  expandAVX512Asm_22_inShuf3<>+0x08(SB)/8, $0xffffffffff050403
+DATA  expandAVX512Asm_22_inShuf3<>+0x10(SB)/8, $0xffffff0504040303
+DATA  expandAVX512Asm_22_inShuf3<>+0x18(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512Asm_22_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_22_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_mat3<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_22_mat3<>+0x08(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_22_mat3<>+0x10(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_22_mat3<>+0x18(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_22_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_22_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_22_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_22_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_22_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_outShufLo+0x00(SB)/8, $0x2120181110080100
+DATA  expandAVX512Asm_22_outShufLo+0x08(SB)/8, $0x4948403938313028
+DATA  expandAVX512Asm_22_outShufLo+0x10(SB)/8, $0x0302696860595850
+DATA  expandAVX512Asm_22_outShufLo+0x18(SB)/8, $0x3229232219131209
+DATA  expandAVX512Asm_22_outShufLo+0x20(SB)/8, $0x5a514b4a413b3a33
+DATA  expandAVX512Asm_22_outShufLo+0x28(SB)/8, $0x140a05046b6a615b
+DATA  expandAVX512Asm_22_outShufLo+0x30(SB)/8, $0x3c35342a25241a15
+DATA  expandAVX512Asm_22_outShufLo+0x38(SB)/8, $0x625d5c524d4c423d
+
+GLOBL expandAVX512Asm_22_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_outShufHi0+0x00(SB)/8, $0x5049484039383130
+DATA  expandAVX512Asm_22_outShufHi0+0x08(SB)/8, $0x7871706968605958
+DATA  expandAVX512Asm_22_outShufHi0+0x10(SB)/8, $0x3332ffffffffffff
+DATA  expandAVX512Asm_22_outShufHi0+0x18(SB)/8, $0x5b5a514b4a413b3a
+DATA  expandAVX512Asm_22_outShufHi0+0x20(SB)/8, $0xffff7973726b6a61
+DATA  expandAVX512Asm_22_outShufHi0+0x28(SB)/8, $0x3d3c3534ffffffff
+DATA  expandAVX512Asm_22_outShufHi0+0x30(SB)/8, $0x6c625d5c524d4c42
+DATA  expandAVX512Asm_22_outShufHi0+0x38(SB)/8, $0xffffffff7a75746d
+
+GLOBL expandAVX512Asm_22_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_22_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_outShufHi1+0x10(SB)/8, $0xffff181110080100
+DATA  expandAVX512Asm_22_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_outShufHi1+0x20(SB)/8, $0x0302ffffffffffff
+DATA  expandAVX512Asm_22_outShufHi1+0x28(SB)/8, $0xffffffff19131209
+DATA  expandAVX512Asm_22_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_22_outShufHi1+0x38(SB)/8, $0x140a0504ffffffff
+
+TEXT expandAVX512Asm_22<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_22_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_22_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_22_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_22_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_22_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_22_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_22_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xffff03fffc0ffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0xf0000fc0003f0000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_24_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x08(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x10(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x18(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x20(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x28(SB)/8, $0xff02010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x30(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_24_inShuf0<>+0x38(SB)/8, $0xffff010101000000
+
+GLOBL expandAVX512Asm_24_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_24_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_24_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_24_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_24_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_24_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_24_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_24_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_24_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_inShuf1<>+0x00(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_24_inShuf1<>+0x08(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_24_inShuf1<>+0x10(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_24_inShuf1<>+0x18(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_24_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_24_inShuf1<>+0x28(SB)/8, $0x0404040303030202
+DATA  expandAVX512Asm_24_inShuf1<>+0x30(SB)/8, $0x0404030303020202
+DATA  expandAVX512Asm_24_inShuf1<>+0x38(SB)/8, $0x0404030303020202
+
+GLOBL expandAVX512Asm_24_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_inShuf2<>+0x00(SB)/8, $0x0505040404030303
+DATA  expandAVX512Asm_24_inShuf2<>+0x08(SB)/8, $0x0505040404030303
+DATA  expandAVX512Asm_24_inShuf2<>+0x10(SB)/8, $0x0505040404030303
+DATA  expandAVX512Asm_24_inShuf2<>+0x18(SB)/8, $0xffff040404030303
+DATA  expandAVX512Asm_24_inShuf2<>+0x20(SB)/8, $0xffff040404030303
+DATA  expandAVX512Asm_24_inShuf2<>+0x28(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512Asm_24_inShuf2<>+0x30(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512Asm_24_inShuf2<>+0x38(SB)/8, $0xffffffffffffff05
+
+GLOBL expandAVX512Asm_24_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_mat2<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_24_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_24_mat2<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_24_mat2<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_24_mat2<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_24_mat2<>+0x28(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_24_mat2<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_24_mat2<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512Asm_24_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_inShuf3<>+0x00(SB)/8, $0xffffffffffffff05
+DATA  expandAVX512Asm_24_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_24_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_mat3<>+0x00(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_24_mat3<>+0x08(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_24_mat3<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_24_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_24_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_24_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_24_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_24_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_24_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_outShufLo+0x00(SB)/8, $0x11100a0908020100
+DATA  expandAVX512Asm_24_outShufLo+0x08(SB)/8, $0x282221201a191812
+DATA  expandAVX512Asm_24_outShufLo+0x10(SB)/8, $0x3a39383231302a29
+DATA  expandAVX512Asm_24_outShufLo+0x18(SB)/8, $0x14130d0c0b050403
+DATA  expandAVX512Asm_24_outShufLo+0x20(SB)/8, $0x2b2524231d1c1b15
+DATA  expandAVX512Asm_24_outShufLo+0x28(SB)/8, $0x3d3c3b3534332d2c
+DATA  expandAVX512Asm_24_outShufLo+0x30(SB)/8, $0x1716480f0e400706
+DATA  expandAVX512Asm_24_outShufLo+0x38(SB)/8, $0x2e602726581f1e50
+
+GLOBL expandAVX512Asm_24_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_outShufHi0+0x00(SB)/8, $0x3a39383231302928
+DATA  expandAVX512Asm_24_outShufHi0+0x08(SB)/8, $0x51504a4948424140
+DATA  expandAVX512Asm_24_outShufHi0+0x10(SB)/8, $0x2a6261605a595852
+DATA  expandAVX512Asm_24_outShufHi0+0x18(SB)/8, $0x3d3c3b3534332c2b
+DATA  expandAVX512Asm_24_outShufHi0+0x20(SB)/8, $0x54534d4c4b454443
+DATA  expandAVX512Asm_24_outShufHi0+0x28(SB)/8, $0x2d6564635d5c5b55
+DATA  expandAVX512Asm_24_outShufHi0+0x30(SB)/8, $0x703f3e6837362f2e
+DATA  expandAVX512Asm_24_outShufHi0+0x38(SB)/8, $0x5756ff4f4e784746
+
+GLOBL expandAVX512Asm_24_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_24_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_24_outShufHi1+0x38(SB)/8, $0xffff00ffffffffff
+
+TEXT expandAVX512Asm_24<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_24_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_24_mat0<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_24_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_24_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_24_inShuf3<>(SB), Z5
+       VMOVDQU64 expandAVX512Asm_24_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_24_outShufHi0(SB), Z6
+       VMOVDQU64 expandAVX512Asm_24_outShufHi1(SB), Z7
+       VMOVDQU64 (AX), Z8
+       VPERMB Z8, Z0, Z0
+       VGF2P8AFFINEQB $0, Z2, Z0, Z0
+       VPERMB Z8, Z3, Z3
+       VGF2P8AFFINEQB $0, Z2, Z3, Z2
+       VPERMB Z8, Z4, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_24_mat2<>(SB), Z3, Z3
+       VPERMB Z8, Z5, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_24_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xdfffffffffffffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z6
+       MOVQ $0x2000000000000000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z7, K1, Z0
+       VPORQ Z0, Z6, Z2
+       RET
+
+GLOBL expandAVX512Asm_26_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_26_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_26_inShuf0<>+0x10(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_26_inShuf0<>+0x18(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_26_inShuf0<>+0x20(SB)/8, $0xffff020201010000
+DATA  expandAVX512Asm_26_inShuf0<>+0x28(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_26_inShuf0<>+0x30(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_26_inShuf0<>+0x38(SB)/8, $0xffff010101000000
+
+GLOBL expandAVX512Asm_26_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_26_mat0<>+0x08(SB)/8, $0x0101020202020202
+DATA  expandAVX512Asm_26_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_26_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_26_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_26_mat0<>+0x28(SB)/8, $0x0404040404040808
+DATA  expandAVX512Asm_26_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_26_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512Asm_26_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_26_inShuf1<>+0x08(SB)/8, $0xffffffff01010000
+DATA  expandAVX512Asm_26_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_26_inShuf1<>+0x18(SB)/8, $0xffffffff01010000
+DATA  expandAVX512Asm_26_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_26_inShuf1<>+0x28(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_26_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_26_inShuf1<>+0x38(SB)/8, $0xff04040403030302
+
+GLOBL expandAVX512Asm_26_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_mat1<>+0x00(SB)/8, $0x1010202020202020
+DATA  expandAVX512Asm_26_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_26_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_26_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_26_mat1<>+0x20(SB)/8, $0x4040404040408080
+DATA  expandAVX512Asm_26_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_26_mat1<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_26_mat1<>+0x38(SB)/8, $0x0808080808080808
+
+GLOBL expandAVX512Asm_26_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_inShuf2<>+0x00(SB)/8, $0x0404030303020202
+DATA  expandAVX512Asm_26_inShuf2<>+0x08(SB)/8, $0xffffffffff040302
+DATA  expandAVX512Asm_26_inShuf2<>+0x10(SB)/8, $0xffff040403030202
+DATA  expandAVX512Asm_26_inShuf2<>+0x18(SB)/8, $0xffffffffff040302
+DATA  expandAVX512Asm_26_inShuf2<>+0x20(SB)/8, $0xffff040403030202
+DATA  expandAVX512Asm_26_inShuf2<>+0x28(SB)/8, $0xffffffffff040302
+DATA  expandAVX512Asm_26_inShuf2<>+0x30(SB)/8, $0xff04030303020202
+DATA  expandAVX512Asm_26_inShuf2<>+0x38(SB)/8, $0xffff040404030303
+
+GLOBL expandAVX512Asm_26_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_mat2<>+0x00(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_26_mat2<>+0x08(SB)/8, $0x1010202020202020
+DATA  expandAVX512Asm_26_mat2<>+0x10(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_26_mat2<>+0x18(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_26_mat2<>+0x20(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_26_mat2<>+0x28(SB)/8, $0x4040404040408080
+DATA  expandAVX512Asm_26_mat2<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_26_mat2<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512Asm_26_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512Asm_26_inShuf3<>+0x08(SB)/8, $0xffffffff04040303
+DATA  expandAVX512Asm_26_inShuf3<>+0x10(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512Asm_26_inShuf3<>+0x18(SB)/8, $0xffffffff04040303
+DATA  expandAVX512Asm_26_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512Asm_26_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512Asm_26_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_26_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_26_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_mat3<>+0x00(SB)/8, $0x0101020202020202
+DATA  expandAVX512Asm_26_mat3<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_26_mat3<>+0x10(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_26_mat3<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_26_mat3<>+0x20(SB)/8, $0x0404040404040808
+DATA  expandAVX512Asm_26_mat3<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_26_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_26_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_26_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_outShufLo+0x00(SB)/8, $0x2018111008020100
+DATA  expandAVX512Asm_26_outShufLo+0x08(SB)/8, $0x3a39383231302821
+DATA  expandAVX512Asm_26_outShufLo+0x10(SB)/8, $0x6860595850494840
+DATA  expandAVX512Asm_26_outShufLo+0x18(SB)/8, $0x1312090504036a69
+DATA  expandAVX512Asm_26_outShufLo+0x20(SB)/8, $0x3b35343329232219
+DATA  expandAVX512Asm_26_outShufLo+0x28(SB)/8, $0x5b5a514b4a413d3c
+DATA  expandAVX512Asm_26_outShufLo+0x30(SB)/8, $0x0a7007066d6c6b61
+DATA  expandAVX512Asm_26_outShufLo+0x38(SB)/8, $0x37362a25241a1514
+
+GLOBL expandAVX512Asm_26_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_outShufHi0+0x00(SB)/8, $0x5851504842414038
+DATA  expandAVX512Asm_26_outShufHi0+0x08(SB)/8, $0x7978727170686160
+DATA  expandAVX512Asm_26_outShufHi0+0x10(SB)/8, $0xffffffffffffff7a
+DATA  expandAVX512Asm_26_outShufHi0+0x18(SB)/8, $0x52494544433b3a39
+DATA  expandAVX512Asm_26_outShufHi0+0x20(SB)/8, $0x7574736963625953
+DATA  expandAVX512Asm_26_outShufHi0+0x28(SB)/8, $0xffffffffff7d7c7b
+DATA  expandAVX512Asm_26_outShufHi0+0x30(SB)/8, $0xff47463e3d3cffff
+DATA  expandAVX512Asm_26_outShufHi0+0x38(SB)/8, $0x766a65645a55544a
+
+GLOBL expandAVX512Asm_26_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_26_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_26_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_26_outShufHi1+0x10(SB)/8, $0x20191810090800ff
+DATA  expandAVX512Asm_26_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_26_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_26_outShufHi1+0x28(SB)/8, $0x1a110b0a01ffffff
+DATA  expandAVX512Asm_26_outShufHi1+0x30(SB)/8, $0x28ffffffffff211b
+DATA  expandAVX512Asm_26_outShufHi1+0x38(SB)/8, $0xffffffffffffffff
+
+TEXT expandAVX512Asm_26<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_26_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_26_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_26_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_26_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_26_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_26_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_26_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xff7c07ffff01ffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x83f80000fe0000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_28_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_28_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_28_inShuf0<>+0x10(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_28_inShuf0<>+0x18(SB)/8, $0xff02010101000000
+DATA  expandAVX512Asm_28_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_28_inShuf0<>+0x28(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_28_inShuf0<>+0x30(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_28_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100
+
+GLOBL expandAVX512Asm_28_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_28_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_28_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_28_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_28_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_28_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_28_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_28_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_28_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_inShuf1<>+0x00(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_28_inShuf1<>+0x08(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_28_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_28_inShuf1<>+0x18(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_28_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_28_inShuf1<>+0x28(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_28_inShuf1<>+0x30(SB)/8, $0x0404040303030202
+DATA  expandAVX512Asm_28_inShuf1<>+0x38(SB)/8, $0xffffffffff040302
+
+GLOBL expandAVX512Asm_28_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_28_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_28_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_28_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_28_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_28_mat1<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_28_mat1<>+0x30(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_28_mat1<>+0x38(SB)/8, $0x0404040408080808
+
+GLOBL expandAVX512Asm_28_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_inShuf2<>+0x00(SB)/8, $0x0404030303020202
+DATA  expandAVX512Asm_28_inShuf2<>+0x08(SB)/8, $0x0404030303020202
+DATA  expandAVX512Asm_28_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_28_inShuf2<>+0x18(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_28_inShuf2<>+0x20(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_28_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_28_inShuf2<>+0x30(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_28_inShuf2<>+0x38(SB)/8, $0xffff040404030303
+
+GLOBL expandAVX512Asm_28_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_mat2<>+0x00(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_28_mat2<>+0x08(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_28_mat2<>+0x10(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_28_mat2<>+0x18(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_28_mat2<>+0x20(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_28_mat2<>+0x28(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_28_mat2<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_28_mat2<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512Asm_28_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512Asm_28_inShuf3<>+0x08(SB)/8, $0xffff040404030303
+DATA  expandAVX512Asm_28_inShuf3<>+0x10(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512Asm_28_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_28_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_mat3<>+0x00(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_28_mat3<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_28_mat3<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_28_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_28_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_28_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_28_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_28_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_28_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_outShufLo+0x00(SB)/8, $0x1812111008020100
+DATA  expandAVX512Asm_28_outShufLo+0x08(SB)/8, $0x31302a2928201a19
+DATA  expandAVX512Asm_28_outShufLo+0x10(SB)/8, $0x4a49484241403832
+DATA  expandAVX512Asm_28_outShufLo+0x18(SB)/8, $0x090504035a595850
+DATA  expandAVX512Asm_28_outShufLo+0x20(SB)/8, $0x2b211d1c1b151413
+DATA  expandAVX512Asm_28_outShufLo+0x28(SB)/8, $0x4443393534332d2c
+DATA  expandAVX512Asm_28_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b45
+DATA  expandAVX512Asm_28_outShufLo+0x38(SB)/8, $0x1e6817160a600706
+
+GLOBL expandAVX512Asm_28_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_outShufHi0+0x00(SB)/8, $0x4948424140383130
+DATA  expandAVX512Asm_28_outShufHi0+0x08(SB)/8, $0x6261605a5958504a
+DATA  expandAVX512Asm_28_outShufHi0+0x10(SB)/8, $0xff7a797872717068
+DATA  expandAVX512Asm_28_outShufHi0+0x18(SB)/8, $0x4339343332ffffff
+DATA  expandAVX512Asm_28_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b4544
+DATA  expandAVX512Asm_28_outShufHi0+0x28(SB)/8, $0x757473696564635d
+DATA  expandAVX512Asm_28_outShufHi0+0x30(SB)/8, $0x35ffffffff7d7c7b
+DATA  expandAVX512Asm_28_outShufHi0+0x38(SB)/8, $0x4f4eff47463a3736
+
+GLOBL expandAVX512Asm_28_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_28_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_outShufHi1+0x10(SB)/8, $0x00ffffffffffffff
+DATA  expandAVX512Asm_28_outShufHi1+0x18(SB)/8, $0xffffffffff0a0908
+DATA  expandAVX512Asm_28_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_28_outShufHi1+0x30(SB)/8, $0xff0d0c0b01ffffff
+DATA  expandAVX512Asm_28_outShufHi1+0x38(SB)/8, $0xffff10ffffffffff
+
+TEXT expandAVX512Asm_28<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_28_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_28_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_28_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_28_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_28_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_28_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_28_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xdf87fffff87fffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x2078000007800000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_30_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_inShuf0<>+0x00(SB)/8, $0x0202010101000000
+DATA  expandAVX512Asm_30_inShuf0<>+0x08(SB)/8, $0xffffffffff020100
+DATA  expandAVX512Asm_30_inShuf0<>+0x10(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_30_inShuf0<>+0x18(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_30_inShuf0<>+0x20(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_30_inShuf0<>+0x28(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_30_inShuf0<>+0x30(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_30_inShuf0<>+0x38(SB)/8, $0xffff010101000000
+
+GLOBL expandAVX512Asm_30_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_30_mat0<>+0x08(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_30_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_30_mat0<>+0x18(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_30_mat0<>+0x20(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_30_mat0<>+0x28(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_30_mat0<>+0x30(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_30_mat0<>+0x38(SB)/8, $0x1010101010101010
+
+GLOBL expandAVX512Asm_30_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_30_inShuf1<>+0x08(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_30_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_30_inShuf1<>+0x18(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_30_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_30_inShuf1<>+0x28(SB)/8, $0xffff010101000000
+DATA  expandAVX512Asm_30_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_30_inShuf1<>+0x38(SB)/8, $0x0404030303020202
+
+GLOBL expandAVX512Asm_30_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_mat1<>+0x00(SB)/8, $0x1010101010102020
+DATA  expandAVX512Asm_30_mat1<>+0x08(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_30_mat1<>+0x10(SB)/8, $0x2020202040404040
+DATA  expandAVX512Asm_30_mat1<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_30_mat1<>+0x20(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_30_mat1<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_30_mat1<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_30_mat1<>+0x38(SB)/8, $0x0202020202020202
+
+GLOBL expandAVX512Asm_30_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_inShuf2<>+0x00(SB)/8, $0xffffffffff040302
+DATA  expandAVX512Asm_30_inShuf2<>+0x08(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_30_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_30_inShuf2<>+0x18(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_30_inShuf2<>+0x20(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_30_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_30_inShuf2<>+0x30(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_30_inShuf2<>+0x38(SB)/8, $0xffffffffffff0302
+
+GLOBL expandAVX512Asm_30_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_mat2<>+0x00(SB)/8, $0x0202020204040404
+DATA  expandAVX512Asm_30_mat2<>+0x08(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_30_mat2<>+0x10(SB)/8, $0x0404080808080808
+DATA  expandAVX512Asm_30_mat2<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_30_mat2<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_30_mat2<>+0x28(SB)/8, $0x1010101010102020
+DATA  expandAVX512Asm_30_mat2<>+0x30(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_30_mat2<>+0x38(SB)/8, $0x2020202040404040
+
+GLOBL expandAVX512Asm_30_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_inShuf3<>+0x00(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_30_inShuf3<>+0x08(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_30_inShuf3<>+0x10(SB)/8, $0xffff030303020202
+DATA  expandAVX512Asm_30_inShuf3<>+0x18(SB)/8, $0xffff040404030303
+DATA  expandAVX512Asm_30_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403
+DATA  expandAVX512Asm_30_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04
+DATA  expandAVX512Asm_30_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_30_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_30_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_mat3<>+0x00(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_30_mat3<>+0x08(SB)/8, $0x4040808080808080
+DATA  expandAVX512Asm_30_mat3<>+0x10(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_30_mat3<>+0x18(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_30_mat3<>+0x20(SB)/8, $0x0101010101010202
+DATA  expandAVX512Asm_30_mat3<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_30_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_30_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_30_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_outShufLo+0x00(SB)/8, $0x1812111008020100
+DATA  expandAVX512Asm_30_outShufLo+0x08(SB)/8, $0x3832313028222120
+DATA  expandAVX512Asm_30_outShufLo+0x10(SB)/8, $0x58504a4948403a39
+DATA  expandAVX512Asm_30_outShufLo+0x18(SB)/8, $0x04036a6968605a59
+DATA  expandAVX512Asm_30_outShufLo+0x20(SB)/8, $0x2423191514130905
+DATA  expandAVX512Asm_30_outShufLo+0x28(SB)/8, $0x3d3c3b3534332925
+DATA  expandAVX512Asm_30_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b41
+DATA  expandAVX512Asm_30_outShufLo+0x38(SB)/8, $0x0a7007066d6c6b61
+
+GLOBL expandAVX512Asm_30_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_outShufHi0+0x00(SB)/8, $0x504a4948403a3938
+DATA  expandAVX512Asm_30_outShufHi0+0x08(SB)/8, $0x70686261605a5958
+DATA  expandAVX512Asm_30_outShufHi0+0x10(SB)/8, $0xffffffffff787271
+DATA  expandAVX512Asm_30_outShufHi0+0x18(SB)/8, $0x3c3bffffffffffff
+DATA  expandAVX512Asm_30_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b413d
+DATA  expandAVX512Asm_30_outShufHi0+0x28(SB)/8, $0x757473696564635d
+DATA  expandAVX512Asm_30_outShufHi0+0x30(SB)/8, $0xffffffffffffff79
+DATA  expandAVX512Asm_30_outShufHi0+0x38(SB)/8, $0x42ff3f3effffffff
+
+GLOBL expandAVX512Asm_30_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_30_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_30_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_30_outShufHi1+0x10(SB)/8, $0x1008020100ffffff
+DATA  expandAVX512Asm_30_outShufHi1+0x18(SB)/8, $0xffff201a19181211
+DATA  expandAVX512Asm_30_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_30_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_30_outShufHi1+0x30(SB)/8, $0x15141309050403ff
+DATA  expandAVX512Asm_30_outShufHi1+0x38(SB)/8, $0xff28ffff211d1c1b
+
+TEXT expandAVX512Asm_30<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_30_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_30_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_30_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_30_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_30_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_30_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_30_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xb001ffffc007ffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x4ffe00003ff80000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_32_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_32_inShuf0<>+0x00(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x08(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x10(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x18(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x20(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x28(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x30(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_32_inShuf0<>+0x38(SB)/8, $0x0101010100000000
+
+GLOBL expandAVX512Asm_32_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_32_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_32_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_32_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_32_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_32_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_32_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_32_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_32_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_32_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_32_inShuf1<>+0x00(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x08(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x10(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x18(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x20(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x28(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x30(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_32_inShuf1<>+0x38(SB)/8, $0x0303030302020202
+
+GLOBL expandAVX512Asm_32_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_32_outShufLo+0x00(SB)/8, $0x0b0a090803020100
+DATA  expandAVX512Asm_32_outShufLo+0x08(SB)/8, $0x1b1a191813121110
+DATA  expandAVX512Asm_32_outShufLo+0x10(SB)/8, $0x2b2a292823222120
+DATA  expandAVX512Asm_32_outShufLo+0x18(SB)/8, $0x3b3a393833323130
+DATA  expandAVX512Asm_32_outShufLo+0x20(SB)/8, $0x0f0e0d0c07060504
+DATA  expandAVX512Asm_32_outShufLo+0x28(SB)/8, $0x1f1e1d1c17161514
+DATA  expandAVX512Asm_32_outShufLo+0x30(SB)/8, $0x2f2e2d2c27262524
+DATA  expandAVX512Asm_32_outShufLo+0x38(SB)/8, $0x3f3e3d3c37363534
+
+TEXT expandAVX512Asm_32<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_32_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_32_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512Asm_32_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_32_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_36_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_inShuf0<>+0x00(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_36_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_36_inShuf0<>+0x10(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_36_inShuf0<>+0x18(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_36_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_36_inShuf0<>+0x28(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_36_inShuf0<>+0x30(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_36_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100
+
+GLOBL expandAVX512Asm_36_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_36_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_36_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_36_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_36_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_36_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_36_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_36_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_36_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_inShuf1<>+0x00(SB)/8, $0x0101010100000000
+DATA  expandAVX512Asm_36_inShuf1<>+0x08(SB)/8, $0xffffff0100000000
+DATA  expandAVX512Asm_36_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_36_inShuf1<>+0x18(SB)/8, $0xffffffff00000000
+DATA  expandAVX512Asm_36_inShuf1<>+0x20(SB)/8, $0xff02020202010101
+DATA  expandAVX512Asm_36_inShuf1<>+0x28(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512Asm_36_inShuf1<>+0x30(SB)/8, $0x0202020201010101
+DATA  expandAVX512Asm_36_inShuf1<>+0x38(SB)/8, $0x0303030302020202
+
+GLOBL expandAVX512Asm_36_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_36_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_36_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_36_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_36_mat1<>+0x20(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_36_mat1<>+0x28(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_36_mat1<>+0x30(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_36_mat1<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512Asm_36_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_inShuf2<>+0x00(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_36_inShuf2<>+0x08(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_36_inShuf2<>+0x10(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_36_inShuf2<>+0x18(SB)/8, $0xffffffffffff0302
+DATA  expandAVX512Asm_36_inShuf2<>+0x20(SB)/8, $0x0303030302020202
+DATA  expandAVX512Asm_36_inShuf2<>+0x28(SB)/8, $0xffff030302020202
+DATA  expandAVX512Asm_36_inShuf2<>+0x30(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_36_inShuf2<>+0x38(SB)/8, $0xffffffff02020202
+
+GLOBL expandAVX512Asm_36_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_mat2<>+0x00(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_36_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_36_mat2<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_36_mat2<>+0x18(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_36_mat2<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_36_mat2<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_36_mat2<>+0x30(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_36_mat2<>+0x38(SB)/8, $0x2020202020202020
+
+GLOBL expandAVX512Asm_36_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_outShufLo+0x00(SB)/8, $0x1211100803020100
+DATA  expandAVX512Asm_36_outShufLo+0x08(SB)/8, $0x2928201b1a191813
+DATA  expandAVX512Asm_36_outShufLo+0x10(SB)/8, $0x4038333231302b2a
+DATA  expandAVX512Asm_36_outShufLo+0x18(SB)/8, $0x504b4a4948434241
+DATA  expandAVX512Asm_36_outShufLo+0x20(SB)/8, $0x070605045b5a5958
+DATA  expandAVX512Asm_36_outShufLo+0x28(SB)/8, $0x1e1d1c1716151409
+DATA  expandAVX512Asm_36_outShufLo+0x30(SB)/8, $0x35342f2e2d2c211f
+DATA  expandAVX512Asm_36_outShufLo+0x38(SB)/8, $0x4c47464544393736
+
+GLOBL expandAVX512Asm_36_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_36_outShufHi+0x00(SB)/8, $0x3332313028222120
+DATA  expandAVX512Asm_36_outShufHi+0x08(SB)/8, $0x4a4948403b3a3938
+DATA  expandAVX512Asm_36_outShufHi+0x10(SB)/8, $0x616058535251504b
+DATA  expandAVX512Asm_36_outShufHi+0x18(SB)/8, $0x78706b6a69686362
+DATA  expandAVX512Asm_36_outShufHi+0x20(SB)/8, $0x29262524237b7a79
+DATA  expandAVX512Asm_36_outShufHi+0x28(SB)/8, $0x3f3e3d3c37363534
+DATA  expandAVX512Asm_36_outShufHi+0x30(SB)/8, $0x5655544f4e4d4c41
+DATA  expandAVX512Asm_36_outShufHi+0x38(SB)/8, $0x6d6c676665645957
+
+TEXT expandAVX512Asm_36<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_36_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_36_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_36_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_36_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_36_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_36_mat0<>(SB), Z0, Z0
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_36_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_36_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_40_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_inShuf0<>+0x00(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x08(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x10(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x18(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x20(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x28(SB)/8, $0xffffff0000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x30(SB)/8, $0xffffff0000000000
+DATA  expandAVX512Asm_40_inShuf0<>+0x38(SB)/8, $0xffffff0000000000
+
+GLOBL expandAVX512Asm_40_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_40_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_40_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_40_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_40_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_40_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_40_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_40_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_40_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_inShuf1<>+0x00(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_40_inShuf1<>+0x08(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_40_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_40_inShuf1<>+0x18(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_40_inShuf1<>+0x20(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512Asm_40_inShuf1<>+0x28(SB)/8, $0xffff020202020201
+DATA  expandAVX512Asm_40_inShuf1<>+0x30(SB)/8, $0x0202020101010101
+DATA  expandAVX512Asm_40_inShuf1<>+0x38(SB)/8, $0x0202020101010101
+
+GLOBL expandAVX512Asm_40_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_mat1<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_40_mat1<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_40_mat1<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_40_mat1<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_40_mat1<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_40_mat1<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_40_mat1<>+0x30(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_40_mat1<>+0x38(SB)/8, $0x4040404040404040
+
+GLOBL expandAVX512Asm_40_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_inShuf2<>+0x00(SB)/8, $0x0202020101010101
+DATA  expandAVX512Asm_40_inShuf2<>+0x08(SB)/8, $0x0303030202020202
+DATA  expandAVX512Asm_40_inShuf2<>+0x10(SB)/8, $0x0303030202020202
+DATA  expandAVX512Asm_40_inShuf2<>+0x18(SB)/8, $0xffffff0202020202
+DATA  expandAVX512Asm_40_inShuf2<>+0x20(SB)/8, $0xffffff0202020202
+DATA  expandAVX512Asm_40_inShuf2<>+0x28(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_40_inShuf2<>+0x30(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_40_inShuf2<>+0x38(SB)/8, $0xffffffffffff0202
+
+GLOBL expandAVX512Asm_40_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_mat2<>+0x00(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_40_mat2<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_40_mat2<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_40_mat2<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_40_mat2<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_40_mat2<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_40_mat2<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_40_mat2<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_40_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_inShuf3<>+0x00(SB)/8, $0xffffffffffff0303
+DATA  expandAVX512Asm_40_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_40_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_mat3<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_40_mat3<>+0x08(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_40_mat3<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_40_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_40_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_40_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_40_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_40_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_40_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_outShufLo+0x00(SB)/8, $0x0a09080403020100
+DATA  expandAVX512Asm_40_outShufLo+0x08(SB)/8, $0x1814131211100c0b
+DATA  expandAVX512Asm_40_outShufLo+0x10(SB)/8, $0x232221201c1b1a19
+DATA  expandAVX512Asm_40_outShufLo+0x18(SB)/8, $0x31302c2b2a292824
+DATA  expandAVX512Asm_40_outShufLo+0x20(SB)/8, $0x3c3b3a3938343332
+DATA  expandAVX512Asm_40_outShufLo+0x28(SB)/8, $0x0f0e0d4140070605
+DATA  expandAVX512Asm_40_outShufLo+0x30(SB)/8, $0x1d51501716154948
+DATA  expandAVX512Asm_40_outShufLo+0x38(SB)/8, $0x6027262559581f1e
+
+GLOBL expandAVX512Asm_40_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_outShufHi0+0x00(SB)/8, $0x3938343332313028
+DATA  expandAVX512Asm_40_outShufHi0+0x08(SB)/8, $0x44434241403c3b3a
+DATA  expandAVX512Asm_40_outShufHi0+0x10(SB)/8, $0x5251504c4b4a4948
+DATA  expandAVX512Asm_40_outShufHi0+0x18(SB)/8, $0x605c5b5a59585453
+DATA  expandAVX512Asm_40_outShufHi0+0x20(SB)/8, $0x2c2b2a2964636261
+DATA  expandAVX512Asm_40_outShufHi0+0x28(SB)/8, $0x3e3d69683736352d
+DATA  expandAVX512Asm_40_outShufHi0+0x30(SB)/8, $0x797847464571703f
+DATA  expandAVX512Asm_40_outShufHi0+0x38(SB)/8, $0x575655ffff4f4e4d
+
+GLOBL expandAVX512Asm_40_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_40_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_40_outShufHi1+0x38(SB)/8, $0xffffff0100ffffff
+
+TEXT expandAVX512Asm_40<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_40_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_40_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_40_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_40_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_40_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_40_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_40_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xe7ffffffffffffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x1800000000000000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
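Editorial note on the masked merge in kernels like expandAVX512Asm_40 just above: the high 64 output bytes draw on three expanded registers, so the output shuffle is split in two, a zeroing VPERMI2B over {Z2, Z3} under one k-mask and a zeroing VPERMB over Z4 under the complementary mask (for this kernel, 0xe7ffffffffffffff and 0x1800000000000000 are exact bit-complements), with VPORQ stitching the halves together. A minimal scalar sketch of that merge, assuming complementary masks as here (illustration only, not generated code):

// mergeMasked is a scalar sketch of the masked high-half merge above:
// two zeroing byte shuffles under complementary 64-bit masks, OR-ed so
// that every output byte comes from exactly one of the two shuffles.
func mergeMasked(permA, permB [64]byte, maskA uint64) (out [64]byte) {
	maskB := ^maskA // the generated code loads the complement as a second constant
	for i := 0; i < 64; i++ {
		var a, b byte
		if maskA>>uint(i)&1 == 1 { // VPERMI2B.Z ..., K1, Z5 keeps this lane
			a = permA[i]
		}
		if maskB>>uint(i)&1 == 1 { // VPERMB.Z ..., K1, Z0 keeps this lane
			b = permB[i]
		}
		out[i] = a | b // VPORQ Z0, Z5, Z2
	}
	return out
}

Because the masks never overlap, the OR never mixes data; it only fills the zeroed lanes of one permute with the live lanes of the other.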
+GLOBL expandAVX512Asm_44_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_inShuf0<>+0x00(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_44_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_44_inShuf0<>+0x10(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_44_inShuf0<>+0x18(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_44_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_44_inShuf0<>+0x28(SB)/8, $0x0101010000000000
+DATA  expandAVX512Asm_44_inShuf0<>+0x30(SB)/8, $0xffffff0000000000
+DATA  expandAVX512Asm_44_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
+
+GLOBL expandAVX512Asm_44_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_44_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_44_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_44_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_44_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_44_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_44_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_44_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_44_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_inShuf1<>+0x00(SB)/8, $0xffffff0000000000
+DATA  expandAVX512Asm_44_inShuf1<>+0x08(SB)/8, $0xffffff0000000000
+DATA  expandAVX512Asm_44_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_44_inShuf1<>+0x18(SB)/8, $0xffffff0000000000
+DATA  expandAVX512Asm_44_inShuf1<>+0x20(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_44_inShuf1<>+0x28(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_44_inShuf1<>+0x30(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_44_inShuf1<>+0x38(SB)/8, $0xff02020202020101
+
+GLOBL expandAVX512Asm_44_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_44_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_44_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_44_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_44_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_44_mat1<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_44_mat1<>+0x30(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_44_mat1<>+0x38(SB)/8, $0x0808080808080808
+
+GLOBL expandAVX512Asm_44_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_inShuf2<>+0x00(SB)/8, $0x0202020101010101
+DATA  expandAVX512Asm_44_inShuf2<>+0x08(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512Asm_44_inShuf2<>+0x10(SB)/8, $0x0202020101010101
+DATA  expandAVX512Asm_44_inShuf2<>+0x18(SB)/8, $0x0202020101010101
+DATA  expandAVX512Asm_44_inShuf2<>+0x20(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512Asm_44_inShuf2<>+0x28(SB)/8, $0xffff020101010101
+DATA  expandAVX512Asm_44_inShuf2<>+0x30(SB)/8, $0xffffff0202020202
+DATA  expandAVX512Asm_44_inShuf2<>+0x38(SB)/8, $0xffffffffffffff02
+
+GLOBL expandAVX512Asm_44_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_mat2<>+0x00(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_44_mat2<>+0x08(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_44_mat2<>+0x10(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_44_mat2<>+0x18(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_44_mat2<>+0x20(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_44_mat2<>+0x28(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_44_mat2<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_44_mat2<>+0x38(SB)/8, $0x0101010102020202
+
+GLOBL expandAVX512Asm_44_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_inShuf3<>+0x00(SB)/8, $0xffffff0202020202
+DATA  expandAVX512Asm_44_inShuf3<>+0x08(SB)/8, $0xffffff0202020202
+DATA  expandAVX512Asm_44_inShuf3<>+0x10(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_44_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_44_inShuf3<>+0x20(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_44_inShuf3<>+0x28(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_44_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_44_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_44_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_mat3<>+0x00(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_44_mat3<>+0x08(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_44_mat3<>+0x10(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_44_mat3<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_44_mat3<>+0x20(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_44_mat3<>+0x28(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_44_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_44_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_44_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_outShufLo+0x00(SB)/8, $0x1110080403020100
+DATA  expandAVX512Asm_44_outShufLo+0x08(SB)/8, $0x1c1b1a1918141312
+DATA  expandAVX512Asm_44_outShufLo+0x10(SB)/8, $0x31302c2b2a292820
+DATA  expandAVX512Asm_44_outShufLo+0x18(SB)/8, $0x4342414038343332
+DATA  expandAVX512Asm_44_outShufLo+0x20(SB)/8, $0x58504c4b4a494844
+DATA  expandAVX512Asm_44_outShufLo+0x28(SB)/8, $0x600706055c5b5a59
+DATA  expandAVX512Asm_44_outShufLo+0x30(SB)/8, $0x1d69681716150961
+DATA  expandAVX512Asm_44_outShufLo+0x38(SB)/8, $0x2f2e2d2171701f1e
+
+GLOBL expandAVX512Asm_44_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_outShufHi0+0x00(SB)/8, $0x4844434241403938
+DATA  expandAVX512Asm_44_outShufHi0+0x08(SB)/8, $0x5a59585453525150
+DATA  expandAVX512Asm_44_outShufHi0+0x10(SB)/8, $0x6c6b6a6968605c5b
+DATA  expandAVX512Asm_44_outShufHi0+0x18(SB)/8, $0xffff787473727170
+DATA  expandAVX512Asm_44_outShufHi0+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_44_outShufHi0+0x28(SB)/8, $0x46453e3d3c3b3aff
+DATA  expandAVX512Asm_44_outShufHi0+0x30(SB)/8, $0xff57565549ffff47
+DATA  expandAVX512Asm_44_outShufHi0+0x38(SB)/8, $0x6d61ffff5f5e5dff
+
+GLOBL expandAVX512Asm_44_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_44_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_44_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_44_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_44_outShufHi1+0x18(SB)/8, $0x0100ffffffffffff
+DATA  expandAVX512Asm_44_outShufHi1+0x20(SB)/8, $0x0c0b0a0908040302
+DATA  expandAVX512Asm_44_outShufHi1+0x28(SB)/8, $0xffffffffffffff10
+DATA  expandAVX512Asm_44_outShufHi1+0x30(SB)/8, $0x20ffffffff1918ff
+DATA  expandAVX512Asm_44_outShufHi1+0x38(SB)/8, $0xffff2928ffffff21
+
+TEXT expandAVX512Asm_44<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_44_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_44_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_44_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_44_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_44_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_44_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_44_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0xce79fe003fffffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x318601ffc0000000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_48_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_inShuf0<>+0x00(SB)/8, $0x0101000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x08(SB)/8, $0x0101000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x10(SB)/8, $0x0101000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x18(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x20(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x28(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x30(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_48_inShuf0<>+0x38(SB)/8, $0xffff000000000000
+
+GLOBL expandAVX512Asm_48_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_48_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_48_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_48_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_48_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_48_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_48_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_48_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_48_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_inShuf1<>+0x00(SB)/8, $0xffffffff01010101
+DATA  expandAVX512Asm_48_inShuf1<>+0x08(SB)/8, $0xffffffff01010101
+DATA  expandAVX512Asm_48_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101
+DATA  expandAVX512Asm_48_inShuf1<>+0x18(SB)/8, $0x0202020202020101
+DATA  expandAVX512Asm_48_inShuf1<>+0x20(SB)/8, $0x0202010101010101
+DATA  expandAVX512Asm_48_inShuf1<>+0x28(SB)/8, $0x0202010101010101
+DATA  expandAVX512Asm_48_inShuf1<>+0x30(SB)/8, $0x0202010101010101
+DATA  expandAVX512Asm_48_inShuf1<>+0x38(SB)/8, $0xffff010101010101
+
+GLOBL expandAVX512Asm_48_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_mat1<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_48_mat1<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_48_mat1<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_48_mat1<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_48_mat1<>+0x20(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_48_mat1<>+0x28(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_48_mat1<>+0x30(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_48_mat1<>+0x38(SB)/8, $0x4040404040404040
+
+GLOBL expandAVX512Asm_48_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_inShuf2<>+0x00(SB)/8, $0xffff010101010101
+DATA  expandAVX512Asm_48_inShuf2<>+0x08(SB)/8, $0xffff020202020202
+DATA  expandAVX512Asm_48_inShuf2<>+0x10(SB)/8, $0xffff020202020202
+DATA  expandAVX512Asm_48_inShuf2<>+0x18(SB)/8, $0xffffffff02020202
+DATA  expandAVX512Asm_48_inShuf2<>+0x20(SB)/8, $0xffffffff02020202
+DATA  expandAVX512Asm_48_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_48_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_48_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_48_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_mat2<>+0x00(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_48_mat2<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_48_mat2<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_48_mat2<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_48_mat2<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_48_mat2<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_48_mat2<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_48_mat2<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_48_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_outShufLo+0x00(SB)/8, $0x0908050403020100
+DATA  expandAVX512Asm_48_outShufLo+0x08(SB)/8, $0x131211100d0c0b0a
+DATA  expandAVX512Asm_48_outShufLo+0x10(SB)/8, $0x1d1c1b1a19181514
+DATA  expandAVX512Asm_48_outShufLo+0x18(SB)/8, $0x2928252423222120
+DATA  expandAVX512Asm_48_outShufLo+0x20(SB)/8, $0x333231302d2c2b2a
+DATA  expandAVX512Asm_48_outShufLo+0x28(SB)/8, $0x3d3c3b3a39383534
+DATA  expandAVX512Asm_48_outShufLo+0x30(SB)/8, $0x0f0e434241400706
+DATA  expandAVX512Asm_48_outShufLo+0x38(SB)/8, $0x515017164b4a4948
+
+GLOBL expandAVX512Asm_48_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_48_outShufHi+0x00(SB)/8, $0x2524232221201918
+DATA  expandAVX512Asm_48_outShufHi+0x08(SB)/8, $0x31302d2c2b2a2928
+DATA  expandAVX512Asm_48_outShufHi+0x10(SB)/8, $0x3b3a393835343332
+DATA  expandAVX512Asm_48_outShufHi+0x18(SB)/8, $0x4544434241403d3c
+DATA  expandAVX512Asm_48_outShufHi+0x20(SB)/8, $0x51504d4c4b4a4948
+DATA  expandAVX512Asm_48_outShufHi+0x28(SB)/8, $0x1d1c1b1a55545352
+DATA  expandAVX512Asm_48_outShufHi+0x30(SB)/8, $0x5b5a595827261f1e
+DATA  expandAVX512Asm_48_outShufHi+0x38(SB)/8, $0x3736636261602f2e
+
+TEXT expandAVX512Asm_48<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_48_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_48_inShuf1<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_48_inShuf2<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_48_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_48_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z5
+       VPERMB Z5, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_48_mat0<>(SB), Z0, Z0
+       VPERMB Z5, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_48_mat1<>(SB), Z3, Z3
+       VPERMB Z5, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_48_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_52_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_inShuf0<>+0x00(SB)/8, $0x0101000000000000
+DATA  expandAVX512Asm_52_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100
+DATA  expandAVX512Asm_52_inShuf0<>+0x10(SB)/8, $0x0101000000000000
+DATA  expandAVX512Asm_52_inShuf0<>+0x18(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_52_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_52_inShuf0<>+0x28(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_52_inShuf0<>+0x30(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_52_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
+
+GLOBL expandAVX512Asm_52_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_52_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_52_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_52_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_52_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_52_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_52_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_52_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_52_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_inShuf1<>+0x00(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_52_inShuf1<>+0x08(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_52_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_52_inShuf1<>+0x18(SB)/8, $0xffff000000000000
+DATA  expandAVX512Asm_52_inShuf1<>+0x20(SB)/8, $0xffffffff01010101
+DATA  expandAVX512Asm_52_inShuf1<>+0x28(SB)/8, $0xffffffffff010101
+DATA  expandAVX512Asm_52_inShuf1<>+0x30(SB)/8, $0xff02020202020201
+DATA  expandAVX512Asm_52_inShuf1<>+0x38(SB)/8, $0x0202010101010101
+
+GLOBL expandAVX512Asm_52_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_52_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_52_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_52_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_52_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_52_mat1<>+0x28(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_52_mat1<>+0x30(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_52_mat1<>+0x38(SB)/8, $0x0404040404040404
+
+GLOBL expandAVX512Asm_52_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_inShuf2<>+0x00(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512Asm_52_inShuf2<>+0x08(SB)/8, $0x0202010101010101
+DATA  expandAVX512Asm_52_inShuf2<>+0x10(SB)/8, $0xffff010101010101
+DATA  expandAVX512Asm_52_inShuf2<>+0x18(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512Asm_52_inShuf2<>+0x20(SB)/8, $0xffff010101010101
+DATA  expandAVX512Asm_52_inShuf2<>+0x28(SB)/8, $0xffff010101010101
+DATA  expandAVX512Asm_52_inShuf2<>+0x30(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512Asm_52_inShuf2<>+0x38(SB)/8, $0xffff010101010101
+
+GLOBL expandAVX512Asm_52_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_mat2<>+0x00(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_52_mat2<>+0x08(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_52_mat2<>+0x10(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_52_mat2<>+0x18(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_52_mat2<>+0x20(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_52_mat2<>+0x28(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_52_mat2<>+0x30(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_52_mat2<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_52_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_inShuf3<>+0x00(SB)/8, $0xffff020202020202
+DATA  expandAVX512Asm_52_inShuf3<>+0x08(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_52_inShuf3<>+0x10(SB)/8, $0xffffffff02020202
+DATA  expandAVX512Asm_52_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_52_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_52_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_mat3<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_52_mat3<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_52_mat3<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_52_mat3<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_52_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_52_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_52_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_52_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_52_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_outShufLo+0x00(SB)/8, $0x1008050403020100
+DATA  expandAVX512Asm_52_outShufLo+0x08(SB)/8, $0x1a19181514131211
+DATA  expandAVX512Asm_52_outShufLo+0x10(SB)/8, $0x2b2a2928201d1c1b
+DATA  expandAVX512Asm_52_outShufLo+0x18(SB)/8, $0x3534333231302d2c
+DATA  expandAVX512Asm_52_outShufLo+0x20(SB)/8, $0x4845444342414038
+DATA  expandAVX512Asm_52_outShufLo+0x28(SB)/8, $0x5958504d4c4b4a49
+DATA  expandAVX512Asm_52_outShufLo+0x30(SB)/8, $0x616007065d5c5b5a
+DATA  expandAVX512Asm_52_outShufLo+0x38(SB)/8, $0x6a69681716096362
+
+GLOBL expandAVX512Asm_52_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_outShufHi0+0x00(SB)/8, $0x403d3c3b3a393830
+DATA  expandAVX512Asm_52_outShufHi0+0x08(SB)/8, $0x51504d4c4b4a4948
+DATA  expandAVX512Asm_52_outShufHi0+0x10(SB)/8, $0x6261605855545352
+DATA  expandAVX512Asm_52_outShufHi0+0x18(SB)/8, $0x6c6b6a6968656463
+DATA  expandAVX512Asm_52_outShufHi0+0x20(SB)/8, $0x7d7c7b7a7978706d
+DATA  expandAVX512Asm_52_outShufHi0+0x28(SB)/8, $0x31ffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi0+0x30(SB)/8, $0xff3f3e3635343332
+DATA  expandAVX512Asm_52_outShufHi0+0x38(SB)/8, $0xffff4f4e41ffffff
+
+GLOBL expandAVX512Asm_52_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_52_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi1+0x28(SB)/8, $0xff08050403020100
+DATA  expandAVX512Asm_52_outShufHi1+0x30(SB)/8, $0x10ffffffffffffff
+DATA  expandAVX512Asm_52_outShufHi1+0x38(SB)/8, $0x1918ffffff131211
+
+TEXT expandAVX512Asm_52<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_52_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_52_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_52_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_52_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_52_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_52_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_52_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0x387f80ffffffffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0xc7807f0000000000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_56_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_inShuf0<>+0x00(SB)/8, $0x0100000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x08(SB)/8, $0x0100000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x10(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x18(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x20(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x28(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x30(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_56_inShuf0<>+0x38(SB)/8, $0xff00000000000000
+
+GLOBL expandAVX512Asm_56_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_56_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_56_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_56_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_56_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_56_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_56_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_56_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_56_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_inShuf1<>+0x00(SB)/8, $0xffff010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x08(SB)/8, $0x0202010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x10(SB)/8, $0x0201010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x18(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x20(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x28(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x30(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_56_inShuf1<>+0x38(SB)/8, $0xff01010101010101
+
+GLOBL expandAVX512Asm_56_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_inShuf2<>+0x00(SB)/8, $0xff02020202020202
+DATA  expandAVX512Asm_56_inShuf2<>+0x08(SB)/8, $0xffffff0202020202
+DATA  expandAVX512Asm_56_inShuf2<>+0x10(SB)/8, $0xffffffffffffff02
+DATA  expandAVX512Asm_56_inShuf2<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_56_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_56_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_56_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_56_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_56_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_mat2<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_56_mat2<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_56_mat2<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_56_mat2<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_56_mat2<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_56_mat2<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_56_mat2<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_56_mat2<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_56_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_outShufLo+0x00(SB)/8, $0x0806050403020100
+DATA  expandAVX512Asm_56_outShufLo+0x08(SB)/8, $0x11100e0d0c0b0a09
+DATA  expandAVX512Asm_56_outShufLo+0x10(SB)/8, $0x1a19181615141312
+DATA  expandAVX512Asm_56_outShufLo+0x18(SB)/8, $0x232221201e1d1c1b
+DATA  expandAVX512Asm_56_outShufLo+0x20(SB)/8, $0x2c2b2a2928262524
+DATA  expandAVX512Asm_56_outShufLo+0x28(SB)/8, $0x3534333231302e2d
+DATA  expandAVX512Asm_56_outShufLo+0x30(SB)/8, $0x3e3d3c3b3a393836
+DATA  expandAVX512Asm_56_outShufLo+0x38(SB)/8, $0x0f45444342414007
+
+GLOBL expandAVX512Asm_56_outShufHi(SB), RODATA, $0x40
+DATA  expandAVX512Asm_56_outShufHi+0x00(SB)/8, $0x11100d0c0b0a0908
+DATA  expandAVX512Asm_56_outShufHi+0x08(SB)/8, $0x1a19181615141312
+DATA  expandAVX512Asm_56_outShufHi+0x10(SB)/8, $0x232221201e1d1c1b
+DATA  expandAVX512Asm_56_outShufHi+0x18(SB)/8, $0x2c2b2a2928262524
+DATA  expandAVX512Asm_56_outShufHi+0x20(SB)/8, $0x3534333231302e2d
+DATA  expandAVX512Asm_56_outShufHi+0x28(SB)/8, $0x3e3d3c3b3a393836
+DATA  expandAVX512Asm_56_outShufHi+0x30(SB)/8, $0x0e46454443424140
+DATA  expandAVX512Asm_56_outShufHi+0x38(SB)/8, $0x50174c4b4a49480f
+
+TEXT expandAVX512Asm_56<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_56_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_56_mat0<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_56_inShuf1<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_56_inShuf2<>(SB), Z5
+       VMOVDQU64 expandAVX512Asm_56_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_56_outShufHi(SB), Z2
+       VMOVDQU64 (AX), Z6
+       VPERMB Z6, Z0, Z0
+       VGF2P8AFFINEQB $0, Z3, Z0, Z0
+       VPERMB Z6, Z4, Z4
+       VGF2P8AFFINEQB $0, Z3, Z4, Z3
+       VPERMB Z6, Z5, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_56_mat2<>(SB), Z4, Z4
+       VPERMI2B Z3, Z0, Z1
+       VPERMI2B Z4, Z3, Z2
+       RET
+
+GLOBL expandAVX512Asm_60_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_inShuf0<>+0x00(SB)/8, $0x0100000000000000
+DATA  expandAVX512Asm_60_inShuf0<>+0x08(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_60_inShuf0<>+0x10(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf0<>+0x18(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_60_inShuf0<>+0x28(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf0<>+0x30(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00
+
+GLOBL expandAVX512Asm_60_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_60_mat0<>+0x08(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_60_mat0<>+0x10(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_60_mat0<>+0x18(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_60_mat0<>+0x20(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_60_mat0<>+0x28(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_60_mat0<>+0x30(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_60_mat0<>+0x38(SB)/8, $0x1010101020202020
+
+GLOBL expandAVX512Asm_60_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_inShuf1<>+0x00(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf1<>+0x08(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00
+DATA  expandAVX512Asm_60_inShuf1<>+0x18(SB)/8, $0xff00000000000000
+DATA  expandAVX512Asm_60_inShuf1<>+0x20(SB)/8, $0xffffffffff010101
+DATA  expandAVX512Asm_60_inShuf1<>+0x28(SB)/8, $0x0202020202010101
+DATA  expandAVX512Asm_60_inShuf1<>+0x30(SB)/8, $0xffffffffffff0201
+DATA  expandAVX512Asm_60_inShuf1<>+0x38(SB)/8, $0xff01010101010101
+
+GLOBL expandAVX512Asm_60_mat1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_mat1<>+0x00(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_60_mat1<>+0x08(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_60_mat1<>+0x10(SB)/8, $0x4040404080808080
+DATA  expandAVX512Asm_60_mat1<>+0x18(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_60_mat1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_60_mat1<>+0x28(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_60_mat1<>+0x30(SB)/8, $0x0101010102020202
+DATA  expandAVX512Asm_60_mat1<>+0x38(SB)/8, $0x0202020202020202
+
+GLOBL expandAVX512Asm_60_inShuf2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_inShuf2<>+0x00(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_60_inShuf2<>+0x08(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512Asm_60_inShuf2<>+0x10(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_60_inShuf2<>+0x18(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_60_inShuf2<>+0x20(SB)/8, $0xffffffffffffff01
+DATA  expandAVX512Asm_60_inShuf2<>+0x28(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_60_inShuf2<>+0x30(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_60_inShuf2<>+0x38(SB)/8, $0xffffffffffffff01
+
+GLOBL expandAVX512Asm_60_mat2<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_mat2<>+0x00(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_60_mat2<>+0x08(SB)/8, $0x0404040408080808
+DATA  expandAVX512Asm_60_mat2<>+0x10(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_60_mat2<>+0x18(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_60_mat2<>+0x20(SB)/8, $0x1010101020202020
+DATA  expandAVX512Asm_60_mat2<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_60_mat2<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_60_mat2<>+0x38(SB)/8, $0x4040404080808080
+
+GLOBL expandAVX512Asm_60_inShuf3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_inShuf3<>+0x00(SB)/8, $0xff01010101010101
+DATA  expandAVX512Asm_60_inShuf3<>+0x08(SB)/8, $0xffffffffffff0202
+DATA  expandAVX512Asm_60_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff
+
+GLOBL expandAVX512Asm_60_mat3<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_mat3<>+0x00(SB)/8, $0x8080808080808080
+DATA  expandAVX512Asm_60_mat3<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_60_mat3<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_60_mat3<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_60_mat3<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_60_mat3<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_60_mat3<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_60_mat3<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_60_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_outShufLo+0x00(SB)/8, $0x0806050403020100
+DATA  expandAVX512Asm_60_outShufLo+0x08(SB)/8, $0x1816151413121110
+DATA  expandAVX512Asm_60_outShufLo+0x10(SB)/8, $0x28201e1d1c1b1a19
+DATA  expandAVX512Asm_60_outShufLo+0x18(SB)/8, $0x31302e2d2c2b2a29
+DATA  expandAVX512Asm_60_outShufLo+0x20(SB)/8, $0x4140383635343332
+DATA  expandAVX512Asm_60_outShufLo+0x28(SB)/8, $0x4a49484645444342
+DATA  expandAVX512Asm_60_outShufLo+0x30(SB)/8, $0x5a5958504e4d4c4b
+DATA  expandAVX512Asm_60_outShufLo+0x38(SB)/8, $0x626160075e5d5c5b
+
+GLOBL expandAVX512Asm_60_outShufHi0(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_outShufHi0+0x00(SB)/8, $0x3b3a3938302a2928
+DATA  expandAVX512Asm_60_outShufHi0+0x08(SB)/8, $0x44434241403e3d3c
+DATA  expandAVX512Asm_60_outShufHi0+0x10(SB)/8, $0x5453525150484645
+DATA  expandAVX512Asm_60_outShufHi0+0x18(SB)/8, $0x5d5c5b5a59585655
+DATA  expandAVX512Asm_60_outShufHi0+0x20(SB)/8, $0x6d6c6b6a6968605e
+DATA  expandAVX512Asm_60_outShufHi0+0x28(SB)/8, $0x767574737271706e
+DATA  expandAVX512Asm_60_outShufHi0+0x30(SB)/8, $0xffffffffffffff78
+DATA  expandAVX512Asm_60_outShufHi0+0x38(SB)/8, $0x31ffff2f2e2d2c2b
+
+GLOBL expandAVX512Asm_60_outShufHi1(SB), RODATA, $0x40
+DATA  expandAVX512Asm_60_outShufHi1+0x00(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_outShufHi1+0x08(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_outShufHi1+0x10(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_outShufHi1+0x18(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_outShufHi1+0x20(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_outShufHi1+0x28(SB)/8, $0xffffffffffffffff
+DATA  expandAVX512Asm_60_outShufHi1+0x30(SB)/8, $0x06050403020100ff
+DATA  expandAVX512Asm_60_outShufHi1+0x38(SB)/8, $0xff0908ffffffffff
+
+TEXT expandAVX512Asm_60<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_60_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_60_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_60_inShuf2<>(SB), Z3
+       VMOVDQU64 expandAVX512Asm_60_inShuf3<>(SB), Z4
+       VMOVDQU64 expandAVX512Asm_60_outShufLo(SB), Z1
+       VMOVDQU64 expandAVX512Asm_60_outShufHi0(SB), Z5
+       VMOVDQU64 expandAVX512Asm_60_outShufHi1(SB), Z6
+       VMOVDQU64 (AX), Z7
+       VPERMB Z7, Z0, Z0
+       VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat0<>(SB), Z0, Z0
+       VPERMB Z7, Z2, Z2
+       VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat1<>(SB), Z2, Z2
+       VPERMB Z7, Z3, Z3
+       VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat2<>(SB), Z3, Z3
+       VPERMB Z7, Z4, Z4
+       VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat3<>(SB), Z4, Z4
+       VPERMI2B Z2, Z0, Z1
+       MOVQ $0x9f01ffffffffffff, AX
+       KMOVQ AX, K1
+       VPERMI2B.Z Z3, Z2, K1, Z5
+       MOVQ $0x60fe000000000000, AX
+       KMOVQ AX, K1
+       VPERMB.Z Z4, Z6, K1, Z0
+       VPORQ Z0, Z5, Z2
+       RET
+
+GLOBL expandAVX512Asm_64_inShuf0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_64_inShuf0<>+0x00(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x08(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x10(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x18(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x20(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x28(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x30(SB)/8, $0x0000000000000000
+DATA  expandAVX512Asm_64_inShuf0<>+0x38(SB)/8, $0x0000000000000000
+
+GLOBL expandAVX512Asm_64_mat0<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_64_mat0<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_mat0<>+0x08(SB)/8, $0x0202020202020202
+DATA  expandAVX512Asm_64_mat0<>+0x10(SB)/8, $0x0404040404040404
+DATA  expandAVX512Asm_64_mat0<>+0x18(SB)/8, $0x0808080808080808
+DATA  expandAVX512Asm_64_mat0<>+0x20(SB)/8, $0x1010101010101010
+DATA  expandAVX512Asm_64_mat0<>+0x28(SB)/8, $0x2020202020202020
+DATA  expandAVX512Asm_64_mat0<>+0x30(SB)/8, $0x4040404040404040
+DATA  expandAVX512Asm_64_mat0<>+0x38(SB)/8, $0x8080808080808080
+
+GLOBL expandAVX512Asm_64_inShuf1<>(SB), RODATA, $0x40
+DATA  expandAVX512Asm_64_inShuf1<>+0x00(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x08(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x10(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x18(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x20(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x28(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x30(SB)/8, $0x0101010101010101
+DATA  expandAVX512Asm_64_inShuf1<>+0x38(SB)/8, $0x0101010101010101
+
+GLOBL expandAVX512Asm_64_outShufLo(SB), RODATA, $0x40
+DATA  expandAVX512Asm_64_outShufLo+0x00(SB)/8, $0x0706050403020100
+DATA  expandAVX512Asm_64_outShufLo+0x08(SB)/8, $0x0f0e0d0c0b0a0908
+DATA  expandAVX512Asm_64_outShufLo+0x10(SB)/8, $0x1716151413121110
+DATA  expandAVX512Asm_64_outShufLo+0x18(SB)/8, $0x1f1e1d1c1b1a1918
+DATA  expandAVX512Asm_64_outShufLo+0x20(SB)/8, $0x2726252423222120
+DATA  expandAVX512Asm_64_outShufLo+0x28(SB)/8, $0x2f2e2d2c2b2a2928
+DATA  expandAVX512Asm_64_outShufLo+0x30(SB)/8, $0x3736353433323130
+DATA  expandAVX512Asm_64_outShufLo+0x38(SB)/8, $0x3f3e3d3c3b3a3938
+
+TEXT expandAVX512Asm_64<>(SB), NOSPLIT, $0-0
+       VMOVDQU64 expandAVX512Asm_64_inShuf0<>(SB), Z0
+       VMOVDQU64 expandAVX512Asm_64_mat0<>(SB), Z1
+       VMOVDQU64 expandAVX512Asm_64_inShuf1<>(SB), Z2
+       VMOVDQU64 expandAVX512Asm_64_outShufLo(SB), Z3
+       VMOVDQU64 (AX), Z4
+       VPERMB Z4, Z0, Z0
+       VGF2P8AFFINEQB $0, Z1, Z0, Z0
+       VPERMB Z4, Z2, Z2
+       VGF2P8AFFINEQB $0, Z1, Z2, Z2
+       VPERMB Z0, Z3, Z1
+       VPERMB Z2, Z3, Z2
+       RET
+
similarity index 76%
rename from src/internal/runtime/gc/scan/expand_amd64.go
rename to src/internal/runtime/gc/scan/export_amd64_test.go
index 9bea471abec6c5196176b7bdc64185d7515bce91..ea3d86dfbf47d768973b456a7f5a23f56930219b 100644 (file)
@@ -2,9 +2,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+//go:build amd64
+
 package scan
 
-import "internal/runtime/gc"
+import (
+       "internal/runtime/gc"
+)
 
 // ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
 // where f is the word size of objects in sizeClass.
@@ -12,11 +16,11 @@ import "internal/runtime/gc"
 // This is a testing entrypoint to the expanders used by scanSpanPacked*.
 //
 //go:noescape
-func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
+func ExpandAVX512Asm(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
 
 // gcExpandersAVX512 is the PCs of expander functions. These cannot be called directly
 // as they don't follow the Go ABI, but you can use this to check if a given
 // expander PC is 0.
 //
 // It is defined in assembly.
-var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
+var gcExpandersAVX512Asm [len(gc.SizeClassToSize)]uintptr
diff --git a/src/internal/runtime/gc/scan/export_simd_amd64_test.go b/src/internal/runtime/gc/scan/export_simd_amd64_test.go
new file mode 100644 (file)
index 0000000..bb6bc8d
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package scan
+
+import (
+       "internal/runtime/gc"
+       "simd"
+       "unsafe"
+)
+
+// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
+// where f is the word size of objects in sizeClass.
+//
+// This is a testing entrypoint to the expanders used by scanSpanPacked*.
+func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
+       v1, v2 := gcExpandersAVX512[sizeClass](unsafe.Pointer(packed))
+       v1.Store((*[8]uint64)(unsafe.Pointer(unpacked)))
+       v2.Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(unpacked)) + 64)))
+       simd.ClearAVXUpperBits()
+}
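
For reference, the expansion these entrypoints exercise can be written as a plain scalar loop: each set bit i in packed becomes f consecutive set bits starting at bit i*f in unpacked. The sketch below is illustrative only and not part of this change; plain uint64 slices stand in for gc.ObjMask and gc.PtrMask.

package main

import "fmt"

// expandScalar sets f consecutive bits in unpacked, starting at bit i*f,
// for every set bit i in packed.
func expandScalar(packed []uint64, f int, unpacked []uint64) {
	for i := 0; i < len(packed)*64; i++ {
		if packed[i/64]&(1<<(i%64)) == 0 {
			continue
		}
		for j := i * f; j < (i+1)*f && j < len(unpacked)*64; j++ {
			unpacked[j/64] |= 1 << (j % 64)
		}
	}
}

func main() {
	packed := []uint64{0b101}         // objects 0 and 2 are marked
	unpacked := make([]uint64, 2)
	expandScalar(packed, 6, unpacked) // 6-word (48-byte) objects
	fmt.Printf("%b\n", unpacked[0])   // 111111000000111111: bits 0-5 and 12-17 set
}
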
index e36defb2e18056770d16be63a225cf39f974d233..967565297853fbb88b553fed3dee44b0d44afa37 100644 (file)
@@ -22,7 +22,7 @@ import (
 const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n"
 
 func main() {
-       generate("expand_amd64.s", genExpanders)
+       generate("expanders_amd64.s", genExpanders)
 }
 
 func generate(fileName string, genFunc func(*gen.File)) {
@@ -63,7 +63,7 @@ func genExpanders(file *gen.File) {
                xf := int(ob) / 8
                log.Printf("size class %d bytes, expansion %dx", ob, xf)
 
-               fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf))
+               fn := gen.NewFunc(fmt.Sprintf("expandAVX512Asm_%d<>", xf))
                ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn)
 
                if xf == 1 {
@@ -79,7 +79,7 @@ func genExpanders(file *gen.File) {
        }
 
        // Generate table mapping size class to expander PC
-       file.AddConst("·gcExpandersAVX512", gcExpandersAVX512)
+       file.AddConst("·gcExpandersAVX512Asm", gcExpandersAVX512)
 }
 
 // mat8x8 is an 8x8 bit matrix.
diff --git a/src/internal/runtime/gc/scan/mkexpanders.go b/src/internal/runtime/gc/scan/mkexpanders.go
new file mode 100644 (file)
index 0000000..7f8c14c
--- /dev/null
@@ -0,0 +1,638 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file is a fork of mkasm.go. Instead of generating
+// assembly code, it generates Go code that uses the simd
+// package.
+
+//go:build ignore
+
+package main
+
+import (
+       "bytes"
+       "fmt"
+       "go/format"
+       "log"
+       "os"
+       "slices"
+       "strconv"
+       "strings"
+       "text/template"
+       "unsafe"
+
+       "internal/runtime/gc"
+)
+
+var simdTemplate = template.Must(template.New("template").Parse(`
+{{- define "header"}}
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package scan
+
+import (
+       "simd"
+       "unsafe"
+)
+{{- end}}
+{{- define "expandersList"}}
+var gcExpandersAVX512 = [{{- len .}}]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
+{{- range .}}
+       {{.}},
+{{- end}}
+}
+{{- end}}
+
+{{- define "expanderData"}}
+var {{.Name}} = [8]uint64{
+{{.Vals}}
+}
+{{- end}}
+
+{{- define "expander"}}
+func {{.Name}}(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
+       {{- .BodyLoadString }}
+       {{- .BodyString }}
+}
+{{- end}}
+`))
+
+// expanderData describes global data used by the expanders.
+// Each value is emitted as a global array.
+type expanderData struct {
+       Name string // Name of the global array
+       Vals string // The values of the array, already formatted.
+}
+
+// expander is the expander function. It operates on only 3 kinds of values:
+//
+//     uint8x64, mask8x64, uint64.
+//
+// And a limited set of operations.
+type expander struct {
+       Name        string // The name of the expander function
+       BodyLoad    strings.Builder
+       Body        strings.Builder // The actual expand computations, after loads.
+       data        []expanderData
+       dataByVals  map[string]string
+       uint8x64Cnt int
+       mask8x64Cnt int
+       uint64Cnt   int
+}
+
+// Used by text/template.
+// This is needed because text/template cannot call pointer receiver methods.
+func (e expander) BodyLoadString() string {
+       return e.BodyLoad.String()
+}
+
+func (e expander) BodyString() string {
+       return e.Body.String()
+}
+
+// mat8x8 is an 8x8 bit matrix.
+type mat8x8 struct {
+       mat [8]uint8
+}
+
+func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
+       var out [8]uint64
+       for i, mat := range mats {
+               for j, row := range mat.mat {
+                       // For some reason, Intel flips the rows.
+                       out[i] |= uint64(row) << ((7 - j) * 8)
+               }
+       }
+       return out
+}
+
+func (fn *expander) newVec() string {
+       v := fmt.Sprintf("v%d", fn.uint8x64Cnt)
+       fn.uint8x64Cnt++
+       return v
+}
+
+func (fn *expander) newMask() string {
+       v := fmt.Sprintf("m%d", fn.mask8x64Cnt)
+       fn.mask8x64Cnt++
+       return v
+}
+
+func (fn *expander) newU() string {
+       v := fmt.Sprintf("u%d", fn.uint64Cnt)
+       fn.uint64Cnt++
+       return v
+}
+
+// expandIdentity implements 1x expansion (that is, no expansion).
+func (fn *expander) expandIdentity() {
+       fn.Body.WriteString(`
+       x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
+       y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src)+64))).AsUint8x64()
+       return x.AsUint64x8(), y.AsUint64x8()`)
+}
+
+func (fn *expander) loadSrcAsUint8x64() string {
+       v := fn.newVec()
+       fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()\n", v))
+       return v
+}
+
+func (fn *expander) loadGlobalArrAsUint8x64(arrName string) string {
+       v := fn.newVec()
+       fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8(&%s).AsUint8x64()\n", v, arrName))
+       return v
+}
+
+func (fn *expander) permuteUint8x64(data, indices string) string {
+       v := fn.newVec()
+       fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s)\n", v, data, indices))
+       return v
+}
+
+func (fn *expander) permute2Uint8x64(x, y, indices string) string {
+       v := fn.newVec()
+       fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s)\n", v, x, y, indices))
+       return v
+}
+
+func (fn *expander) permuteMaskedUint8x64(data, indices, mask string) string {
+       v := fn.newVec()
+       fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s).Masked(%s)\n", v, data, indices, mask))
+       return v
+}
+
+func (fn *expander) permute2MaskedUint8x64(x, y, indices, mask string) string {
+       v := fn.newVec()
+       fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s).Masked(%s)\n", v, x, y, indices, mask))
+       return v
+}
+
+func (fn *expander) galoisFieldAffineTransformUint8x64(data, matrix string) string {
+       v := fn.newVec()
+       fn.Body.WriteString(fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix))
+       return v
+}
+
+func (fn *expander) returns(x, y string) {
+       fn.Body.WriteString(fmt.Sprintf("return %s.AsUint64x8(), %s.AsUint64x8()", x, y))
+}
+
+func uint8x64Data(data [64]uint8) string {
+       res := ""
+       for i := range 8 {
+               ptr64 := (*uint64)(unsafe.Pointer(&data[i*8]))
+               res += fmt.Sprintf("%#016x,", *ptr64)
+               if i == 3 {
+                       res += "\n"
+               }
+       }
+       return res
+}
+
+func uint64x8Data(data [8]uint64) string {
+       res := ""
+       for i := range 8 {
+               res += fmt.Sprintf("%#016x,", data[i])
+               if i == 3 {
+                       res += "\n"
+               }
+       }
+       return res
+}
+
+func (fn *expander) loadGlobalUint8x64(name string, data [64]uint8) string {
+       val := uint8x64Data(data)
+       if n, ok := fn.dataByVals[val]; !ok {
+               fullName := fmt.Sprintf("%s_%s", fn.Name, name)
+               fn.data = append(fn.data, expanderData{fullName, val})
+               v := fn.loadGlobalArrAsUint8x64(fullName)
+               fn.dataByVals[val] = v
+               return v
+       } else {
+               return n
+       }
+}
+
+func (fn *expander) loadGlobalUint64x8(name string, data [8]uint64) string {
+       val := uint64x8Data(data)
+       if n, ok := fn.dataByVals[val]; !ok {
+               fullName := fmt.Sprintf("%s_%s", fn.Name, name)
+               fn.data = append(fn.data, expanderData{fullName, val})
+               v := fn.loadGlobalArrAsUint8x64(fullName)
+               fn.dataByVals[val] = v
+               return v
+       } else {
+               return n
+       }
+}
+
+func (fn *expander) mask8x64FromBits(data uint64) string {
+       v1 := fn.newU()
+       v2 := fn.newMask()
+       fn.Body.WriteString(fmt.Sprintf("%s := uint64(%#x)\n%s := simd.Mask8x64FromBits(%s)\n",
+               v1, data, v2, v1))
+       return v2
+}
+
+func (fn *expander) orUint8x64(x, y string) string {
+       v := fn.newVec()
+       fn.Body.WriteString(fmt.Sprintf("%s := %s.Or(%s)\n", v, x, y))
+       return v
+}
+
+func main() {
+       generate("expanders_amd64.go", genExpanders)
+}
+
+func generate(fileName string, genFunc func(*bytes.Buffer)) {
+       var buf bytes.Buffer
+       genFunc(&buf)
+       f, err := os.Create(fileName)
+       if err != nil {
+               log.Fatal(err)
+       }
+       defer f.Close()
+       b, err := format.Source(buf.Bytes())
+       if err != nil {
+               log.Print(buf.String())
+               log.Fatal(err)
+       }
+       _, err = f.Write(b)
+       if err != nil {
+               log.Fatal(err)
+       }
+}
+
+func genExpanders(buffer *bytes.Buffer) {
+       if err := simdTemplate.ExecuteTemplate(buffer, "header", nil); err != nil {
+               panic(fmt.Errorf("failed to execute header template: %w", err))
+       }
+       gcExpandersAVX512 := make([]expander, len(gc.SizeClassToSize))
+       for sc, ob := range gc.SizeClassToSize {
+               if gc.SizeClassToNPages[sc] != 1 {
+                       // These functions all produce a bitmap that covers exactly one
+                       // page.
+                       continue
+               }
+               if ob > gc.MinSizeForMallocHeader {
+                       // This size class is too big to have a packed pointer/scalar bitmap.
+                       break
+               }
+
+               xf := int(ob) / 8
+               log.Printf("size class %d bytes, expansion %dx", ob, xf)
+
+               fn := expander{Name: fmt.Sprintf("expandAVX512_%d", xf), dataByVals: make(map[string]string)}
+
+               if xf == 1 {
+                       fn.expandIdentity()
+               } else {
+                       ok := gfExpander(xf, &fn)
+                       if !ok {
+                               log.Printf("failed to generate expander for size class %d", sc)
+                       }
+               }
+               gcExpandersAVX512[sc] = fn
+       }
+       // Fill in the expanders data first
+       eld := make([]string, len(gcExpandersAVX512))
+       for i, gce := range gcExpandersAVX512 {
+               if gce.Name == "" {
+                       eld[i] = "nil"
+               } else {
+                       eld[i] = gce.Name
+               }
+       }
+       if err := simdTemplate.ExecuteTemplate(buffer, "expandersList", eld); err != nil {
+               panic(fmt.Errorf("failed to execute expandersList template: %w", err))
+       }
+       // List out the expander functions and their data
+       for _, gce := range gcExpandersAVX512 {
+               if gce.Name == "" {
+                       continue
+               }
+               for _, data := range gce.data {
+                       if err := simdTemplate.ExecuteTemplate(buffer, "expanderData", data); err != nil {
+                               panic(fmt.Errorf("failed to execute expanderData template: %w", err))
+                       }
+               }
+               if err := simdTemplate.ExecuteTemplate(buffer, "expander", gce); err != nil {
+                       panic(fmt.Errorf("failed to execute expander template: %w", err))
+               }
+       }
+}
+
+// gfExpander produces a function that expands each bit in an input bitmap into
+// f consecutive bits in an output bitmap.
+//
+// The input is
+//
+//     *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
+//
+// The output is
+//
+//     [64]uint8  = The bottom 512 bits of the expanded bitmap
+//     [64]uint8  = The top 512 bits of the expanded bitmap
+func gfExpander(f int, fn *expander) bool {
+       // TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
+
+       // TODO(austin): For f >= 8, I suspect there are better ways to do this.
+       //
+       // For example, we could use a mask expansion to get a full byte for each
+       // input bit, and separately create the bytes that blend adjacent bits, then
+       // shuffle those bytes together. Certainly for f >= 16 this makes sense
+       // because each of those bytes will be used, possibly more than once.
+
+       objBits := fn.loadSrcAsUint8x64()
+
+       type term struct {
+               iByte, oByte int
+               mat          mat8x8
+       }
+       var terms []term
+
+       // Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
+       // the output byte from the appropriate input byte. Gather all of these into
+       // "terms".
+       for oByte := 0; oByte < 1024/8; oByte++ {
+               var byteMat mat8x8
+               iByte := -1
+               for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
+                       iBit := oBit / f
+                       if iByte == -1 {
+                               iByte = iBit / 8
+                       } else if iByte != iBit/8 {
+                               log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
+                               return false
+                       }
+                       // One way to view this is that the i'th row of the matrix will be
+                       // ANDed with the input byte, and the parity of the result will set
+                       // the i'th bit in the output. We use a simple 1 bit mask, so the
+                       // parity is irrelevant beyond selecting out that one bit.
+                       byteMat.mat[oBit%8] = 1 << (iBit % 8)
+               }
+               terms = append(terms, term{iByte, oByte, byteMat})
+       }
+
+       if false {
+               // Print input byte -> output byte as a matrix
+               maxIByte, maxOByte := 0, 0
+               for _, term := range terms {
+                       maxIByte = max(maxIByte, term.iByte)
+                       maxOByte = max(maxOByte, term.oByte)
+               }
+               iToO := make([][]rune, maxIByte+1)
+               for i := range iToO {
+                       iToO[i] = make([]rune, maxOByte+1)
+               }
+               matMap := make(map[mat8x8]int)
+               for _, term := range terms {
+                       i, ok := matMap[term.mat]
+                       if !ok {
+                               i = len(matMap)
+                               matMap[term.mat] = i
+                       }
+                       iToO[term.iByte][term.oByte] = 'A' + rune(i)
+               }
+               for o := range maxOByte + 1 {
+                       fmt.Printf("%d", o)
+                       for i := range maxIByte + 1 {
+                               fmt.Printf(",")
+                               if mat := iToO[i][o]; mat != 0 {
+                                       fmt.Printf("%c", mat)
+                               }
+                       }
+                       fmt.Println()
+               }
+       }
+
+       // In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
+       // and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
+       //
+       //  abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
+       //    mat0     mat1     mat2     mat3     mat4     mat5     mat6     mat7
+
+       // Group the terms by matrix, but limit each group to 8 terms.
+       const termsPerGroup = 8       // Number of terms we can multiply by the same matrix.
+       const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
+
+       matMap := make(map[mat8x8]int)
+       allMats := make(map[mat8x8]bool)
+       var termGroups [][]term
+       for _, term := range terms {
+               allMats[term.mat] = true
+
+               i, ok := matMap[term.mat]
+               if ok && f > groupsPerSuperGroup {
+                       // The output is ultimately produced in two [64]uint8 registers.
+                       // Getting every byte in the right place of each of these requires a
+                       // final permutation that often requires more than one source.
+                       //
+                       // Up to 8x expansion, we can get a really nice grouping so we can use
+                       // the same 8 matrix vector several times, without producing
+                       // permutations that require more than two sources.
+                       //
+                       // Above 8x, however, we can't get nice matrixes anyway, so we
+                       // instead prefer reducing the complexity of the permutations we
+                       // need to produce the final outputs. To do this, avoid grouping
+                       // together terms that are split across the two registers.
+                       outRegister := termGroups[i][0].oByte / 64
+                       if term.oByte/64 != outRegister {
+                               ok = false
+                       }
+               }
+               if !ok {
+                       // Start a new term group.
+                       i = len(termGroups)
+                       matMap[term.mat] = i
+                       termGroups = append(termGroups, nil)
+               }
+
+               termGroups[i] = append(termGroups[i], term)
+
+               if len(termGroups[i]) == termsPerGroup {
+                       // This term group is full.
+                       delete(matMap, term.mat)
+               }
+       }
+
+       for i, termGroup := range termGroups {
+               log.Printf("term group %d:", i)
+               for _, term := range termGroup {
+                       log.Printf("  %+v", term)
+               }
+       }
+
+       // We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
+       // as many term groups as we can into each super-group to minimize the
+       // number of matrix multiplies.
+       //
+       // Ideally, we use the same matrix in each super-group, which might mean
+       // doing fewer than 8 multiplies at a time. That's fine because it never
+       // increases the total number of matrix multiplies.
+       //
+       // TODO: Packing the matrixes less densely may let us use more broadcast
+       // loads instead of general permutations, though. That replaces a load of
+       // the permutation with a load of the matrix, but is probably still slightly
+       // better.
+       var sgSize, nSuperGroups int
+       oneMatVec := f <= groupsPerSuperGroup
+       if oneMatVec {
+               // We can use the same matrix in each multiply by doing sgSize
+               // multiplies at a time.
+               sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
+               nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
+       } else {
+               // We can't use the same matrix for each multiply. Just do as many at a
+               // time as we can.
+               //
+               // TODO: This is going to produce several distinct matrixes, when we
+               // probably only need two. Be smarter about how we create super-groups
+               // in this case. Maybe we build up an array of super-groups and then the
+               // loop below just turns them into ops?
+               sgSize = 8
+               nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
+       }
+
+       // Construct each super-group.
+       var matGroup [8]mat8x8
+       var matMuls []string
+       var perm [128]int
+       for sgi := range nSuperGroups {
+               var iperm [64]uint8
+               for i := range iperm {
+                       iperm[i] = 0xff // "Don't care"
+               }
+               // Pick off sgSize term groups.
+               superGroup := termGroups[:min(len(termGroups), sgSize)]
+               termGroups = termGroups[len(superGroup):]
+               // Build the matrix and permutations for this super-group.
+               var thisMatGroup [8]mat8x8
+               for i, termGroup := range superGroup {
+                       // All terms in this group have the same matrix. Pick one.
+                       thisMatGroup[i] = termGroup[0].mat
+                       for j, term := range termGroup {
+                               // Build the input permutation.
+                               iperm[i*termsPerGroup+j] = uint8(term.iByte)
+                               // Build the output permutation.
+                               perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
+                       }
+               }
+               log.Printf("input permutation %d: %v", sgi, iperm)
+
+               // Check that we're not making more distinct matrixes than expected.
+               if oneMatVec {
+                       if sgi == 0 {
+                               matGroup = thisMatGroup
+                       } else if matGroup != thisMatGroup {
+                               log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
+                               return false
+                       }
+               }
+
+               // Emit matrix op.
+               matConst :=
+                       fn.loadGlobalUint64x8(fmt.Sprintf("mat%d", sgi),
+                               matGroupToVec(&thisMatGroup))
+               inShufConst :=
+                       fn.loadGlobalUint8x64(fmt.Sprintf("inShuf%d", sgi),
+                               iperm)
+               inOp := fn.permuteUint8x64(objBits, inShufConst)
+               matMul := fn.galoisFieldAffineTransformUint8x64(inOp, matConst)
+               matMuls = append(matMuls, matMul)
+       }
+
+       log.Printf("output permutation: %v", perm)
+
+       outLo, ok := genShuffle(fn, "outShufLo", (*[64]int)(perm[:64]), matMuls...)
+       if !ok {
+               log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+               return false
+       }
+       outHi, ok := genShuffle(fn, "outShufHi", (*[64]int)(perm[64:]), matMuls...)
+       if !ok {
+               log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+               return false
+       }
+       fn.returns(outLo, outHi)
+
+       return true
+}
+
+func genShuffle(fn *expander, name string, perm *[64]int, args ...string) (string, bool) {
+       // Construct flattened permutation.
+       var vperm [64]byte
+
+       // Get the inputs used by this permutation.
+       var inputs []int
+       for i, src := range perm {
+               inputIdx := slices.Index(inputs, src/64)
+               if inputIdx == -1 {
+                       inputIdx = len(inputs)
+                       inputs = append(inputs, src/64)
+               }
+               vperm[i] = byte(src%64 | (inputIdx << 6))
+       }
+
+       // Emit instructions for easy cases.
+       switch len(inputs) {
+       case 1:
+               constOp := fn.loadGlobalUint8x64(name, vperm)
+               return fn.permuteUint8x64(args[inputs[0]], constOp), true
+       case 2:
+               constOp := fn.loadGlobalUint8x64(name, vperm)
+               return fn.permute2Uint8x64(args[inputs[0]], args[inputs[1]], constOp), true
+       }
+
+       // Harder case, we need to shuffle in from up to 2 more tables.
+       //
+       // Perform two shuffles. One shuffle gets its data from the first two
+       // inputs; the other gets its data from the remaining one or two
+       // inputs. All values that a shuffle does not care about are zeroed.
+       var vperms [2][64]byte
+       var masks [2]uint64
+       for j, idx := range vperm {
+               for i := range vperms {
+                       vperms[i][j] = 0xff // "Don't care"
+               }
+               if idx == 0xff {
+                       continue
+               }
+               vperms[idx/128][j] = idx % 128
+               masks[idx/128] |= uint64(1) << j
+       }
+
+       // Validate that the masks are disjoint and together cover every lane.
+       if masks[0]^masks[1] != ^uint64(0) {
+               panic("bad shuffle!")
+       }
+
+       // Generate constants.
+       constOps := make([]string, len(vperms))
+       for i, v := range vperms {
+               constOps[i] = fn.loadGlobalUint8x64(name+strconv.Itoa(i), v)
+       }
+
+       // Generate shuffles.
+       switch len(inputs) {
+       case 3:
+               r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+               r1 := fn.permuteMaskedUint8x64(args[inputs[2]], constOps[1], fn.mask8x64FromBits(masks[1]))
+               return fn.orUint8x64(r0, r1), true
+       case 4:
+               r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+               r1 := fn.permute2MaskedUint8x64(args[inputs[2]], args[inputs[3]], constOps[1], fn.mask8x64FromBits(masks[1]))
+               return fn.orUint8x64(r0, r1), true
+       }
+
+       // Too many inputs. To support more, we'd need to separate tables much earlier.
+       // Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
+       return args[0], false
+}
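
The 8x8 bit-matrix view that gfExpander builds on can be modelled in a few lines of scalar Go. The sketch below only illustrates the comment above (row i of the matrix is ANDed with the input byte and the parity of the result sets bit i of the output); it uses the generator's mat8x8 layout directly rather than the row-flipped operand that matGroupToVec packs for the hardware instruction.

package main

import (
	"fmt"
	"math/bits"
)

type mat8x8 struct{ mat [8]uint8 }

// apply multiplies an input byte by an 8x8 GF(2) matrix: bit i of the
// output is the parity of (row i AND input).
func apply(m mat8x8, in uint8) (out uint8) {
	for i, row := range m.mat {
		if bits.OnesCount8(row&in)%2 == 1 {
			out |= 1 << i
		}
	}
	return out
}

func main() {
	// With single-bit rows, the parity simply selects one input bit. A
	// matrix whose rows are all 1 broadcasts input bit 0 into every output
	// bit, which is what an 8x expander needs for its first output byte.
	m := mat8x8{mat: [8]uint8{1, 1, 1, 1, 1, 1, 1, 1}}
	fmt.Printf("%08b\n", apply(m, 0b00000001)) // 11111111
	fmt.Printf("%08b\n", apply(m, 0b00000010)) // 00000000
}
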
index 2ac181f97e5b661b0faf02e89c98a7c57f27fa9b..4af5a81f3190bb309db904300ed2afaebb1b64e2 100644 (file)
@@ -6,13 +6,25 @@ package scan
 
 import (
        "internal/cpu"
+       "internal/goexperiment"
        "internal/runtime/gc"
        "unsafe"
 )
 
 func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
        if CanAVX512() {
-               return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
+               if goexperiment.SIMD {
+                       return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
+               } else {
+                       return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
+               }
+       }
+       panic("not implemented")
+}
+
+func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       if CanAVX512() {
+               return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
        }
        panic("not implemented")
 }
@@ -27,12 +39,12 @@ func CanAVX512() bool {
        return avx512ScanPackedReqsMet
 }
 
-func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
-       return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
+func ScanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       return FilterNil(bufp, scanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask))
 }
 
 //go:noescape
-func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
+func scanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
 
 var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
        cpu.X86.HasAVX512BW &&
index 9b4950a7676985d0650cab2df46317071142d9a6..7430a86294356908f29362e12f4d44ba48f1671c 100644 (file)
@@ -6,12 +6,12 @@
 #include "textflag.h"
 
 // Test-only.
-TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
+TEXT ·ExpandAVX512Asm(SB), NOSPLIT, $0-24
        MOVQ sizeClass+0(FP), CX
        MOVQ packed+8(FP), AX
 
        // Call the expander for this size class
-       LEAQ ·gcExpandersAVX512(SB), BX
+       LEAQ ·gcExpandersAVX512Asm(SB), BX
        CALL (BX)(CX*8)
 
        MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
@@ -20,11 +20,11 @@ TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
        VZEROUPPER
        RET
 
-TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
+TEXT ·scanSpanPackedAVX512Asm(SB), NOSPLIT, $256-44
        // Z1+Z2 = Expand the grey object mask into a grey word mask
        MOVQ objMarks+16(FP), AX
        MOVQ sizeClass+24(FP), CX
-       LEAQ ·gcExpandersAVX512(SB), BX
+       LEAQ ·gcExpandersAVX512Asm(SB), BX
        CALL (BX)(CX*8)
 
        // Z3+Z4 = Load the pointer mask
index a914b4f4d7a64f41e2b9e11916550378fb548332..b628db9cdcb1a82042c2ea681b4fa58ead8dfb36 100644 (file)
@@ -11,6 +11,13 @@ import (
        "testing"
 )
 
+func TestScanSpanPackedAVX512Asm(t *testing.T) {
+       if !scan.CanAVX512() {
+               t.Skip("no AVX512")
+       }
+       testScanSpanPacked(t, scan.ScanSpanPackedAVX512Asm)
+}
+
 func TestScanSpanPackedAVX512(t *testing.T) {
        if !scan.CanAVX512() {
                t.Skip("no AVX512")
index a4d51827cc6a20fe85e2fdd2d56a26c2a6b3a275..68c72182ec7c66e545c9d3fd6ba88303e4308cce 100644 (file)
@@ -21,3 +21,6 @@ func HasFastScanSpanPacked() bool {
 func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
        return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
 }
+func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       panic("not implemented")
+}
diff --git a/src/internal/runtime/gc/scan/scan_nosimd_amd64.go b/src/internal/runtime/gc/scan/scan_nosimd_amd64.go
new file mode 100644 (file)
index 0000000..4d523d5
--- /dev/null
@@ -0,0 +1,16 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !goexperiment.simd
+
+package scan
+
+import (
+       "internal/runtime/gc"
+       "unsafe"
+)
+
+func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       panic("not implemented")
+}
diff --git a/src/internal/runtime/gc/scan/scan_simd_amd64.go b/src/internal/runtime/gc/scan/scan_simd_amd64.go
new file mode 100644 (file)
index 0000000..101358c
--- /dev/null
@@ -0,0 +1,92 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package scan
+
+import (
+       "internal/abi"
+       "internal/runtime/gc"
+       "math/bits"
+       "simd"
+       "unsafe"
+)
+
+func FilterNilAVX512(bufp *uintptr, n int32) (cnt int32) {
+       scanned := 0
+       buf := unsafe.Slice((*uint64)(unsafe.Pointer(bufp)), int(n))
+       // Use the widest vector
+       var zeros simd.Uint64x8
+       for ; scanned+8 <= int(n); scanned += 8 {
+               v := simd.LoadUint64x8Slice(buf[scanned:])
+               m := v.NotEqual(zeros)
+               v.Compress(m).StoreSlice(buf[cnt:])
+               // Count the mask bits
+               mbits := uint64(m.ToBits())
+               mbits &= 0xFF // Only the lower 8 bits are meaningful.
+               nonNilCnt := bits.OnesCount64(mbits)
+               cnt += int32(nonNilCnt)
+       }
+       // Scalar code to clean up tails.
+       for i := scanned; i < int(n); i++ {
+               if buf[i] != 0 {
+                       buf[cnt] = buf[i]
+                       cnt++
+               }
+       }
+       return
+}
+
+func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
+}
+
+func scanSpanPackedAVX512(mem unsafe.Pointer, buf *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+       // Expand the grey object mask into a grey word mask
+       m1, m2 := gcExpandersAVX512[sizeClass](abi.NoEscape(unsafe.Pointer(objMarks)))
+       // Load the pointer mask
+       ptrm := unsafe.Pointer(ptrMask)
+       m3 := simd.LoadUint64x8((*[8]uint64)(ptrm))
+       m4 := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(ptrm) + 64)))
+
+       masks := [128]uint8{}
+       counts := [128]uint8{}
+       // Combine the grey word mask with the pointer mask to get the scan mask
+       m1m3 := m1.And(m3).AsUint8x64()
+       m2m4 := m2.And(m4).AsUint8x64()
+       m1m3.Store((*[64]uint8)(unsafe.Pointer(&masks[0])))
+       m2m4.Store((*[64]uint8)(unsafe.Pointer(&masks[64])))
+       // Now each bit of m1m3 and m2m4 represents one word of the span.
+       // Thus, each byte covers 64 bytes of memory, which is also how
+       // much we can fit in a ZMM register.
+       //
+       // We do a load/compress for each 64 byte frame.
+       //
+       // counts = Number of memory words to scan in each 64 byte frame
+       // TODO: Right now the type casting is done via memory; is it possible to
+       // work around these stores and loads and keep the values in registers?
+       m1m3.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[0])))
+       m2m4.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[64])))
+
+       // Loop over the 64 byte frames in this span.
+       // TODO: is there a way to PCALIGN this loop?
+       for i := range 128 {
+               mv := masks[i]
+               // Skip empty frames.
+               if mv == 0 {
+                       continue
+               }
+               // Build the frame mask and load the 64 byte frame.
+               m := simd.Mask64x8FromBits(mv)
+               ptrs := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(mem) + uintptr(i*64))))
+               // Collect just the pointers from the greyed objects into the scan buffer,
+               // i.e., copy the words selected by the mask bits into contiguous memory.
+               ptrs.Compress(m).Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(count*8))))
+               // Advance the scan buffer position by the number of pointers.
+               count += int32(counts[i])
+       }
+       simd.ClearAVXUpperBits()
+       return
+}
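
For readers comparing this with the deleted assembly kernel: below is a minimal scalar sketch of what scanSpanPackedAVX512 above computes for one 8 KiB span (1024 words, i.e. the 128 x 64-byte frames in the loop). The name scanSpanPackedScalar and the plain [16]uint64 mask arrays are illustrative stand-ins, not part of this CL; they assume the grey word mask and the pointer mask are already expanded to one bit per span word. The sketch only needs math/bits and unsafe, both imported in the file above.

    // Hypothetical scalar reference: AND the grey word mask with the pointer
    // mask, then compress the selected span words into buf, which is what the
    // Compress loop above does eight words at a time. Nil pointers are still
    // copied here; FilterNilAVX512 (or its scalar tail loop) drops them.
    func scanSpanPackedScalar(mem unsafe.Pointer, buf *uintptr, greyWords, ptrBits *[16]uint64) (count int32) {
            words := (*[1024]uintptr)(mem) // the whole span, one word per mask bit
            out := unsafe.Slice(buf, 1024) // worst case: every word is selected
            for i := 0; i < 16; i++ {
                    m := greyWords[i] & ptrBits[i] // scan mask for this 64-word group
                    for m != 0 {
                            j := bits.TrailingZeros64(m) // next selected word in the group
                            out[count] = words[i*64+j]
                            count++
                            m &= m - 1 // clear the lowest set bit
                    }
            }
            return count
    }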
index 1208783b6f7aa14296b2dbd667e5a81350268696..7cadb609bfe4932f95b199c47e2445cc6f8d7252 100644 (file)
@@ -204,6 +204,13 @@ func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
                                                scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
                                        }
                                })
+                               b.Run("impl=PlatformAsm", func(b *testing.B) {
+                                       b.SetBytes(avgBytes)
+                                       for i := range b.N {
+                                               page := pageOrder[i%len(pageOrder)]
+                                               scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
+                                       }
+                               })
                        }
                })
        }
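
To compare the kernels locally, an invocation along these lines should work from a tree with the SIMD experiment enabled; the exact flags are illustrative rather than taken from this CL (the commit's benchstat output used n=6 runs):

    GOEXPERIMENT=simd go test internal/runtime/gc/scan -run='^$' -bench=ScanSpanPacked -count=6

The new impl=PlatformAsm sub-benchmark calls the renamed assembly entry point ScanSpanPackedAsm directly, so its results can be compared against the existing sub-benchmarks with benchstat.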