From da92168ec8cedf08603fd77929a4b9d7e3183275 Mon Sep 17 00:00:00 2001
From: Junyang Shao
Date: Sun, 9 Mar 2025 17:19:48 +0000
Subject: [PATCH] [dev.simd] internal/runtime/gc: add simd package based
 greentea kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This CL adds a new generator to internal/runtime/gc/scan that generates
expander kernels in Go SIMD. This CL also includes a Go SIMD scan kernel
and a Go SIMD filter kernel. This CL also includes the plumbing: the Go
SIMD kernels are used if goexperiment.simd is on.

Benchmark results:
...
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=80-88   354.8n ± 1%   272.4n ± 0%  -23.22% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=90-88   375.7n ± 0%   287.1n ± 0%  -23.58% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=100-88  450.0n ± 1%   327.4n ± 0%  -27.24% (p=0.002 n=6)
geomean                                                    246.5n        199.4n       -19.10%

Throughput +25%.

Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/719520
LUCI-TryBot-Result: Go LUCI
Reviewed-by: David Chase
---
 .../compile/internal/ssa/stmtlines_test.go    |    2 +-
 src/go/build/deps_test.go                     |    4 +-
 src/internal/runtime/gc/scan/expand_amd64.s   | 2631 -----------------
 .../runtime/gc/scan/expand_amd64_test.go      |    4 +-
 .../runtime/gc/scan/expand_simd_amd64_test.go |   19 +
 src/internal/runtime/gc/scan/expand_test.go   |    2 +-
 .../runtime/gc/scan/expanders_amd64.go        | 1530 ++++++++++
 .../runtime/gc/scan/expanders_amd64.s         | 2631 +++++++++++++++++
 .../{expand_amd64.go => export_amd64_test.go} |   10 +-
 .../runtime/gc/scan/export_simd_amd64_test.go |   24 +
 src/internal/runtime/gc/scan/mkasm.go         |    6 +-
 src/internal/runtime/gc/scan/mkexpanders.go   |  638 ++++
 src/internal/runtime/gc/scan/scan_amd64.go    |   20 +-
 src/internal/runtime/gc/scan/scan_amd64.s     |    8 +-
 .../runtime/gc/scan/scan_amd64_test.go        |    7 +
 src/internal/runtime/gc/scan/scan_generic.go  |    3 +
 .../runtime/gc/scan/scan_nosimd_amd64.go      |   16 +
 .../runtime/gc/scan/scan_simd_amd64.go        |   92 +
 src/internal/runtime/gc/scan/scan_test.go     |    7 +
 19 files changed, 5004 insertions(+), 2650 deletions(-)
 delete mode 100644 src/internal/runtime/gc/scan/expand_amd64.s
 create mode 100644 src/internal/runtime/gc/scan/expand_simd_amd64_test.go
 create mode 100644 src/internal/runtime/gc/scan/expanders_amd64.go
 create mode 100644 src/internal/runtime/gc/scan/expanders_amd64.s
 rename src/internal/runtime/gc/scan/{expand_amd64.go => export_amd64_test.go} (76%)
 create mode 100644 src/internal/runtime/gc/scan/export_simd_amd64_test.go
 create mode 100644 src/internal/runtime/gc/scan/mkexpanders.go
 create mode 100644 src/internal/runtime/gc/scan/scan_nosimd_amd64.go
 create mode 100644 src/internal/runtime/gc/scan/scan_simd_amd64.go

diff --git a/src/cmd/compile/internal/ssa/stmtlines_test.go b/src/cmd/compile/internal/ssa/stmtlines_test.go
index 2bdd6c80b2..34c3cf2255 100644
--- a/src/cmd/compile/internal/ssa/stmtlines_test.go
+++ b/src/cmd/compile/internal/ssa/stmtlines_test.go
@@ -140,7 +140,7 @@ func TestStmtLines(t *testing.T) {
 	var m float64
 	switch runtime.GOARCH {
 	case "amd64":
-		m = 0.0111 // > 98.89% obtained on amd64, no backsliding
+		m = 0.0112 // > 98.88% obtained on amd64, no backsliding
 	case "riscv64":
 		m = 0.03 // XXX temporary update threshold to 97% for regabi
 	default:
diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go
index 1b6e32d07c..0725aca43a 100644
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -88,6 +88,7 @@ var depsRules = `
internal/strconv, internal/trace/tracev2, math/bits, + simd, structs < internal/bytealg < internal/stringslite @@ -835,7 +836,8 @@ var depsRules = ` os, reflect, strings, - sync + sync, + regexp < internal/runtime/gc/internal/gen; regexp, internal/txtar, internal/trace, internal/trace/raw diff --git a/src/internal/runtime/gc/scan/expand_amd64.s b/src/internal/runtime/gc/scan/expand_amd64.s deleted file mode 100644 index 6b0be44cc1..0000000000 --- a/src/internal/runtime/gc/scan/expand_amd64.s +++ /dev/null @@ -1,2631 +0,0 @@ -// Code generated by mkasm.go. DO NOT EDIT. - -#include "go_asm.h" -#include "textflag.h" - -GLOBL ·gcExpandersAVX512(SB), RODATA, $0x220 -DATA ·gcExpandersAVX512+0x00(SB)/8, $0 -DATA ·gcExpandersAVX512+0x08(SB)/8, $expandAVX512_1<>(SB) -DATA ·gcExpandersAVX512+0x10(SB)/8, $expandAVX512_2<>(SB) -DATA ·gcExpandersAVX512+0x18(SB)/8, $expandAVX512_3<>(SB) -DATA ·gcExpandersAVX512+0x20(SB)/8, $expandAVX512_4<>(SB) -DATA ·gcExpandersAVX512+0x28(SB)/8, $expandAVX512_6<>(SB) -DATA ·gcExpandersAVX512+0x30(SB)/8, $expandAVX512_8<>(SB) -DATA ·gcExpandersAVX512+0x38(SB)/8, $expandAVX512_10<>(SB) -DATA ·gcExpandersAVX512+0x40(SB)/8, $expandAVX512_12<>(SB) -DATA ·gcExpandersAVX512+0x48(SB)/8, $expandAVX512_14<>(SB) -DATA ·gcExpandersAVX512+0x50(SB)/8, $expandAVX512_16<>(SB) -DATA ·gcExpandersAVX512+0x58(SB)/8, $expandAVX512_18<>(SB) -DATA ·gcExpandersAVX512+0x60(SB)/8, $expandAVX512_20<>(SB) -DATA ·gcExpandersAVX512+0x68(SB)/8, $expandAVX512_22<>(SB) -DATA ·gcExpandersAVX512+0x70(SB)/8, $expandAVX512_24<>(SB) -DATA ·gcExpandersAVX512+0x78(SB)/8, $expandAVX512_26<>(SB) -DATA ·gcExpandersAVX512+0x80(SB)/8, $expandAVX512_28<>(SB) -DATA ·gcExpandersAVX512+0x88(SB)/8, $expandAVX512_30<>(SB) -DATA ·gcExpandersAVX512+0x90(SB)/8, $expandAVX512_32<>(SB) -DATA ·gcExpandersAVX512+0x98(SB)/8, $expandAVX512_36<>(SB) -DATA ·gcExpandersAVX512+0xa0(SB)/8, $expandAVX512_40<>(SB) -DATA ·gcExpandersAVX512+0xa8(SB)/8, $expandAVX512_44<>(SB) -DATA ·gcExpandersAVX512+0xb0(SB)/8, $expandAVX512_48<>(SB) -DATA ·gcExpandersAVX512+0xb8(SB)/8, $expandAVX512_52<>(SB) -DATA ·gcExpandersAVX512+0xc0(SB)/8, $expandAVX512_56<>(SB) -DATA ·gcExpandersAVX512+0xc8(SB)/8, $expandAVX512_60<>(SB) -DATA ·gcExpandersAVX512+0xd0(SB)/8, $expandAVX512_64<>(SB) -DATA ·gcExpandersAVX512+0xd8(SB)/8, $0 -DATA ·gcExpandersAVX512+0xe0(SB)/8, $0 -DATA ·gcExpandersAVX512+0xe8(SB)/8, $0 -DATA ·gcExpandersAVX512+0xf0(SB)/8, $0 -DATA ·gcExpandersAVX512+0xf8(SB)/8, $0 -DATA ·gcExpandersAVX512+0x100(SB)/8, $0 -DATA ·gcExpandersAVX512+0x108(SB)/8, $0 -DATA ·gcExpandersAVX512+0x110(SB)/8, $0 -DATA ·gcExpandersAVX512+0x118(SB)/8, $0 -DATA ·gcExpandersAVX512+0x120(SB)/8, $0 -DATA ·gcExpandersAVX512+0x128(SB)/8, $0 -DATA ·gcExpandersAVX512+0x130(SB)/8, $0 -DATA ·gcExpandersAVX512+0x138(SB)/8, $0 -DATA ·gcExpandersAVX512+0x140(SB)/8, $0 -DATA ·gcExpandersAVX512+0x148(SB)/8, $0 -DATA ·gcExpandersAVX512+0x150(SB)/8, $0 -DATA ·gcExpandersAVX512+0x158(SB)/8, $0 -DATA ·gcExpandersAVX512+0x160(SB)/8, $0 -DATA ·gcExpandersAVX512+0x168(SB)/8, $0 -DATA ·gcExpandersAVX512+0x170(SB)/8, $0 -DATA ·gcExpandersAVX512+0x178(SB)/8, $0 -DATA ·gcExpandersAVX512+0x180(SB)/8, $0 -DATA ·gcExpandersAVX512+0x188(SB)/8, $0 -DATA ·gcExpandersAVX512+0x190(SB)/8, $0 -DATA ·gcExpandersAVX512+0x198(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1a0(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1a8(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1b0(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1b8(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1c0(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1c8(SB)/8, $0 -DATA 
·gcExpandersAVX512+0x1d0(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1d8(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1e0(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1e8(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1f0(SB)/8, $0 -DATA ·gcExpandersAVX512+0x1f8(SB)/8, $0 -DATA ·gcExpandersAVX512+0x200(SB)/8, $0 -DATA ·gcExpandersAVX512+0x208(SB)/8, $0 -DATA ·gcExpandersAVX512+0x210(SB)/8, $0 -DATA ·gcExpandersAVX512+0x218(SB)/8, $0 - -TEXT expandAVX512_1<>(SB), NOSPLIT, $0-0 - VMOVDQU64 (AX), Z1 - VMOVDQU64 64(AX), Z2 - RET - -GLOBL expandAVX512_2_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_2_inShuf0<>+0x00(SB)/8, $0x0706050403020100 -DATA expandAVX512_2_inShuf0<>+0x08(SB)/8, $0x0706050403020100 -DATA expandAVX512_2_inShuf0<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_2_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_2_inShuf0<>+0x20(SB)/8, $0x1716151413121110 -DATA expandAVX512_2_inShuf0<>+0x28(SB)/8, $0x1716151413121110 -DATA expandAVX512_2_inShuf0<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_2_inShuf0<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918 - -GLOBL expandAVX512_2_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_2_mat0<>+0x00(SB)/8, $0x0101020204040808 -DATA expandAVX512_2_mat0<>+0x08(SB)/8, $0x1010202040408080 -DATA expandAVX512_2_mat0<>+0x10(SB)/8, $0x0101020204040808 -DATA expandAVX512_2_mat0<>+0x18(SB)/8, $0x1010202040408080 -DATA expandAVX512_2_mat0<>+0x20(SB)/8, $0x0101020204040808 -DATA expandAVX512_2_mat0<>+0x28(SB)/8, $0x1010202040408080 -DATA expandAVX512_2_mat0<>+0x30(SB)/8, $0x0101020204040808 -DATA expandAVX512_2_mat0<>+0x38(SB)/8, $0x1010202040408080 - -GLOBL expandAVX512_2_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_2_inShuf1<>+0x00(SB)/8, $0x2726252423222120 -DATA expandAVX512_2_inShuf1<>+0x08(SB)/8, $0x2726252423222120 -DATA expandAVX512_2_inShuf1<>+0x10(SB)/8, $0x2f2e2d2c2b2a2928 -DATA expandAVX512_2_inShuf1<>+0x18(SB)/8, $0x2f2e2d2c2b2a2928 -DATA expandAVX512_2_inShuf1<>+0x20(SB)/8, $0x3736353433323130 -DATA expandAVX512_2_inShuf1<>+0x28(SB)/8, $0x3736353433323130 -DATA expandAVX512_2_inShuf1<>+0x30(SB)/8, $0x3f3e3d3c3b3a3938 -DATA expandAVX512_2_inShuf1<>+0x38(SB)/8, $0x3f3e3d3c3b3a3938 - -GLOBL expandAVX512_2_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_2_outShufLo+0x00(SB)/8, $0x0b030a0209010800 -DATA expandAVX512_2_outShufLo+0x08(SB)/8, $0x0f070e060d050c04 -DATA expandAVX512_2_outShufLo+0x10(SB)/8, $0x1b131a1219111810 -DATA expandAVX512_2_outShufLo+0x18(SB)/8, $0x1f171e161d151c14 -DATA expandAVX512_2_outShufLo+0x20(SB)/8, $0x2b232a2229212820 -DATA expandAVX512_2_outShufLo+0x28(SB)/8, $0x2f272e262d252c24 -DATA expandAVX512_2_outShufLo+0x30(SB)/8, $0x3b333a3239313830 -DATA expandAVX512_2_outShufLo+0x38(SB)/8, $0x3f373e363d353c34 - -TEXT expandAVX512_2<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_2_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_2_mat0<>(SB), Z1 - VMOVDQU64 expandAVX512_2_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_2_outShufLo(SB), Z3 - VMOVDQU64 (AX), Z4 - VPERMB Z4, Z0, Z0 - VGF2P8AFFINEQB $0, Z1, Z0, Z0 - VPERMB Z4, Z2, Z2 - VGF2P8AFFINEQB $0, Z1, Z2, Z2 - VPERMB Z0, Z3, Z1 - VPERMB Z2, Z3, Z2 - RET - -GLOBL expandAVX512_3_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_3_inShuf0<>+0x00(SB)/8, $0x0706050403020100 -DATA expandAVX512_3_inShuf0<>+0x08(SB)/8, $0x0706050403020100 -DATA expandAVX512_3_inShuf0<>+0x10(SB)/8, $0x0706050403020100 -DATA expandAVX512_3_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_3_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_3_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 -DATA 
expandAVX512_3_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_3_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_3_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_3_mat0<>+0x00(SB)/8, $0x0101010202020404 -DATA expandAVX512_3_mat0<>+0x08(SB)/8, $0x0408080810101020 -DATA expandAVX512_3_mat0<>+0x10(SB)/8, $0x2020404040808080 -DATA expandAVX512_3_mat0<>+0x18(SB)/8, $0x0101010202020404 -DATA expandAVX512_3_mat0<>+0x20(SB)/8, $0x0408080810101020 -DATA expandAVX512_3_mat0<>+0x28(SB)/8, $0x2020404040808080 -DATA expandAVX512_3_mat0<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_3_mat0<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_3_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_3_inShuf1<>+0x00(SB)/8, $0x1716151413121110 -DATA expandAVX512_3_inShuf1<>+0x08(SB)/8, $0x1716151413121110 -DATA expandAVX512_3_inShuf1<>+0x10(SB)/8, $0x1716151413121110 -DATA expandAVX512_3_inShuf1<>+0x18(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_3_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_3_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_3_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_3_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_3_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_3_inShuf2<>+0x00(SB)/8, $0x2726252423222120 -DATA expandAVX512_3_inShuf2<>+0x08(SB)/8, $0x2726252423222120 -DATA expandAVX512_3_inShuf2<>+0x10(SB)/8, $0x2726252423222120 -DATA expandAVX512_3_inShuf2<>+0x18(SB)/8, $0xffffffffff2a2928 -DATA expandAVX512_3_inShuf2<>+0x20(SB)/8, $0xffffffffff2a2928 -DATA expandAVX512_3_inShuf2<>+0x28(SB)/8, $0xffffffffffff2928 -DATA expandAVX512_3_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_3_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_3_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_3_outShufLo+0x00(SB)/8, $0x0a02110901100800 -DATA expandAVX512_3_outShufLo+0x08(SB)/8, $0x05140c04130b0312 -DATA expandAVX512_3_outShufLo+0x10(SB)/8, $0x170f07160e06150d -DATA expandAVX512_3_outShufLo+0x18(SB)/8, $0x221a292119282018 -DATA expandAVX512_3_outShufLo+0x20(SB)/8, $0x1d2c241c2b231b2a -DATA expandAVX512_3_outShufLo+0x28(SB)/8, $0x2f271f2e261e2d25 -DATA expandAVX512_3_outShufLo+0x30(SB)/8, $0x4a42514941504840 -DATA expandAVX512_3_outShufLo+0x38(SB)/8, $0x45544c44534b4352 - -GLOBL expandAVX512_3_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_3_outShufHi+0x00(SB)/8, $0x170f07160e06150d -DATA expandAVX512_3_outShufHi+0x08(SB)/8, $0x221a292119282018 -DATA expandAVX512_3_outShufHi+0x10(SB)/8, $0x1d2c241c2b231b2a -DATA expandAVX512_3_outShufHi+0x18(SB)/8, $0x2f271f2e261e2d25 -DATA expandAVX512_3_outShufHi+0x20(SB)/8, $0x4a42514941504840 -DATA expandAVX512_3_outShufHi+0x28(SB)/8, $0x45544c44534b4352 -DATA expandAVX512_3_outShufHi+0x30(SB)/8, $0x574f47564e46554d -DATA expandAVX512_3_outShufHi+0x38(SB)/8, $0x625a696159686058 - -TEXT expandAVX512_3<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_3_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_3_mat0<>(SB), Z3 - VMOVDQU64 expandAVX512_3_inShuf1<>(SB), Z4 - VMOVDQU64 expandAVX512_3_inShuf2<>(SB), Z5 - VMOVDQU64 expandAVX512_3_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_3_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z6 - VPERMB Z6, Z0, Z0 - VGF2P8AFFINEQB $0, Z3, Z0, Z0 - VPERMB Z6, Z4, Z4 - VGF2P8AFFINEQB $0, Z3, Z4, Z4 - VPERMB Z6, Z5, Z5 - VGF2P8AFFINEQB $0, Z3, Z5, Z3 - VPERMI2B Z4, Z0, Z1 - VPERMI2B Z3, Z4, Z2 - RET - -GLOBL expandAVX512_4_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_4_inShuf0<>+0x00(SB)/8, $0x0706050403020100 -DATA 
expandAVX512_4_inShuf0<>+0x08(SB)/8, $0x0706050403020100 -DATA expandAVX512_4_inShuf0<>+0x10(SB)/8, $0x0706050403020100 -DATA expandAVX512_4_inShuf0<>+0x18(SB)/8, $0x0706050403020100 -DATA expandAVX512_4_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_4_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_4_inShuf0<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_4_inShuf0<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 - -GLOBL expandAVX512_4_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_4_mat0<>+0x00(SB)/8, $0x0101010102020202 -DATA expandAVX512_4_mat0<>+0x08(SB)/8, $0x0404040408080808 -DATA expandAVX512_4_mat0<>+0x10(SB)/8, $0x1010101020202020 -DATA expandAVX512_4_mat0<>+0x18(SB)/8, $0x4040404080808080 -DATA expandAVX512_4_mat0<>+0x20(SB)/8, $0x0101010102020202 -DATA expandAVX512_4_mat0<>+0x28(SB)/8, $0x0404040408080808 -DATA expandAVX512_4_mat0<>+0x30(SB)/8, $0x1010101020202020 -DATA expandAVX512_4_mat0<>+0x38(SB)/8, $0x4040404080808080 - -GLOBL expandAVX512_4_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_4_inShuf1<>+0x00(SB)/8, $0x1716151413121110 -DATA expandAVX512_4_inShuf1<>+0x08(SB)/8, $0x1716151413121110 -DATA expandAVX512_4_inShuf1<>+0x10(SB)/8, $0x1716151413121110 -DATA expandAVX512_4_inShuf1<>+0x18(SB)/8, $0x1716151413121110 -DATA expandAVX512_4_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_4_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_4_inShuf1<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_4_inShuf1<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918 - -GLOBL expandAVX512_4_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_4_outShufLo+0x00(SB)/8, $0x1911090118100800 -DATA expandAVX512_4_outShufLo+0x08(SB)/8, $0x1b130b031a120a02 -DATA expandAVX512_4_outShufLo+0x10(SB)/8, $0x1d150d051c140c04 -DATA expandAVX512_4_outShufLo+0x18(SB)/8, $0x1f170f071e160e06 -DATA expandAVX512_4_outShufLo+0x20(SB)/8, $0x3931292138302820 -DATA expandAVX512_4_outShufLo+0x28(SB)/8, $0x3b332b233a322a22 -DATA expandAVX512_4_outShufLo+0x30(SB)/8, $0x3d352d253c342c24 -DATA expandAVX512_4_outShufLo+0x38(SB)/8, $0x3f372f273e362e26 - -TEXT expandAVX512_4<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_4_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_4_mat0<>(SB), Z1 - VMOVDQU64 expandAVX512_4_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_4_outShufLo(SB), Z3 - VMOVDQU64 (AX), Z4 - VPERMB Z4, Z0, Z0 - VGF2P8AFFINEQB $0, Z1, Z0, Z0 - VPERMB Z4, Z2, Z2 - VGF2P8AFFINEQB $0, Z1, Z2, Z2 - VPERMB Z0, Z3, Z1 - VPERMB Z2, Z3, Z2 - RET - -GLOBL expandAVX512_6_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_6_inShuf0<>+0x00(SB)/8, $0x0706050403020100 -DATA expandAVX512_6_inShuf0<>+0x08(SB)/8, $0x0706050403020100 -DATA expandAVX512_6_inShuf0<>+0x10(SB)/8, $0x0706050403020100 -DATA expandAVX512_6_inShuf0<>+0x18(SB)/8, $0x0706050403020100 -DATA expandAVX512_6_inShuf0<>+0x20(SB)/8, $0x0706050403020100 -DATA expandAVX512_6_inShuf0<>+0x28(SB)/8, $0x0706050403020100 -DATA expandAVX512_6_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_6_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_6_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_6_mat0<>+0x00(SB)/8, $0x0101010101010202 -DATA expandAVX512_6_mat0<>+0x08(SB)/8, $0x0202020204040404 -DATA expandAVX512_6_mat0<>+0x10(SB)/8, $0x0404080808080808 -DATA expandAVX512_6_mat0<>+0x18(SB)/8, $0x1010101010102020 -DATA expandAVX512_6_mat0<>+0x20(SB)/8, $0x2020202040404040 -DATA expandAVX512_6_mat0<>+0x28(SB)/8, $0x4040808080808080 -DATA expandAVX512_6_mat0<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_6_mat0<>+0x38(SB)/8, 
$0x0000000000000000 - -GLOBL expandAVX512_6_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_6_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_6_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_6_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_6_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_6_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_6_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_6_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_6_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_6_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_6_inShuf2<>+0x00(SB)/8, $0xffff151413121110 -DATA expandAVX512_6_inShuf2<>+0x08(SB)/8, $0xffff151413121110 -DATA expandAVX512_6_inShuf2<>+0x10(SB)/8, $0xffffff1413121110 -DATA expandAVX512_6_inShuf2<>+0x18(SB)/8, $0xffffff1413121110 -DATA expandAVX512_6_inShuf2<>+0x20(SB)/8, $0xffffff1413121110 -DATA expandAVX512_6_inShuf2<>+0x28(SB)/8, $0xffffff1413121110 -DATA expandAVX512_6_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_6_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_6_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_6_outShufLo+0x00(SB)/8, $0x0901282018100800 -DATA expandAVX512_6_outShufLo+0x08(SB)/8, $0x1a120a0229211911 -DATA expandAVX512_6_outShufLo+0x10(SB)/8, $0x2b231b130b032a22 -DATA expandAVX512_6_outShufLo+0x18(SB)/8, $0x0d052c241c140c04 -DATA expandAVX512_6_outShufLo+0x20(SB)/8, $0x1e160e062d251d15 -DATA expandAVX512_6_outShufLo+0x28(SB)/8, $0x2f271f170f072e26 -DATA expandAVX512_6_outShufLo+0x30(SB)/8, $0x4941686058504840 -DATA expandAVX512_6_outShufLo+0x38(SB)/8, $0x5a524a4269615951 - -GLOBL expandAVX512_6_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_6_outShufHi+0x00(SB)/8, $0x2b231b130b032a22 -DATA expandAVX512_6_outShufHi+0x08(SB)/8, $0x0d052c241c140c04 -DATA expandAVX512_6_outShufHi+0x10(SB)/8, $0x1e160e062d251d15 -DATA expandAVX512_6_outShufHi+0x18(SB)/8, $0x2f271f170f072e26 -DATA expandAVX512_6_outShufHi+0x20(SB)/8, $0x4941686058504840 -DATA expandAVX512_6_outShufHi+0x28(SB)/8, $0x5a524a4269615951 -DATA expandAVX512_6_outShufHi+0x30(SB)/8, $0x6b635b534b436a62 -DATA expandAVX512_6_outShufHi+0x38(SB)/8, $0x4d456c645c544c44 - -TEXT expandAVX512_6<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_6_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_6_mat0<>(SB), Z3 - VMOVDQU64 expandAVX512_6_inShuf1<>(SB), Z4 - VMOVDQU64 expandAVX512_6_inShuf2<>(SB), Z5 - VMOVDQU64 expandAVX512_6_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_6_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z6 - VPERMB Z6, Z0, Z0 - VGF2P8AFFINEQB $0, Z3, Z0, Z0 - VPERMB Z6, Z4, Z4 - VGF2P8AFFINEQB $0, Z3, Z4, Z4 - VPERMB Z6, Z5, Z5 - VGF2P8AFFINEQB $0, Z3, Z5, Z3 - VPERMI2B Z4, Z0, Z1 - VPERMI2B Z3, Z4, Z2 - RET - -GLOBL expandAVX512_8_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_8_inShuf0<>+0x00(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x08(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x10(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x18(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x20(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x28(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x30(SB)/8, $0x0706050403020100 -DATA expandAVX512_8_inShuf0<>+0x38(SB)/8, $0x0706050403020100 - -GLOBL expandAVX512_8_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_8_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_8_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_8_mat0<>+0x10(SB)/8, 
$0x0404040404040404 -DATA expandAVX512_8_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_8_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_8_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_8_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_8_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_8_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_8_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_8_inShuf1<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 - -GLOBL expandAVX512_8_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_8_outShufLo+0x00(SB)/8, $0x3830282018100800 -DATA expandAVX512_8_outShufLo+0x08(SB)/8, $0x3931292119110901 -DATA expandAVX512_8_outShufLo+0x10(SB)/8, $0x3a322a221a120a02 -DATA expandAVX512_8_outShufLo+0x18(SB)/8, $0x3b332b231b130b03 -DATA expandAVX512_8_outShufLo+0x20(SB)/8, $0x3c342c241c140c04 -DATA expandAVX512_8_outShufLo+0x28(SB)/8, $0x3d352d251d150d05 -DATA expandAVX512_8_outShufLo+0x30(SB)/8, $0x3e362e261e160e06 -DATA expandAVX512_8_outShufLo+0x38(SB)/8, $0x3f372f271f170f07 - -TEXT expandAVX512_8<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_8_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_8_mat0<>(SB), Z1 - VMOVDQU64 expandAVX512_8_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_8_outShufLo(SB), Z3 - VMOVDQU64 (AX), Z4 - VPERMB Z4, Z0, Z0 - VGF2P8AFFINEQB $0, Z1, Z0, Z0 - VPERMB Z4, Z2, Z2 - VGF2P8AFFINEQB $0, Z1, Z2, Z2 - VPERMB Z0, Z3, Z1 - VPERMB Z2, Z3, Z2 - RET - -GLOBL expandAVX512_10_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_10_inShuf0<>+0x00(SB)/8, $0xff06050403020100 -DATA expandAVX512_10_inShuf0<>+0x08(SB)/8, $0xff06050403020100 -DATA expandAVX512_10_inShuf0<>+0x10(SB)/8, $0xff06050403020100 -DATA expandAVX512_10_inShuf0<>+0x18(SB)/8, $0xff06050403020100 -DATA expandAVX512_10_inShuf0<>+0x20(SB)/8, $0xffff050403020100 -DATA expandAVX512_10_inShuf0<>+0x28(SB)/8, $0xffff050403020100 -DATA expandAVX512_10_inShuf0<>+0x30(SB)/8, $0xffff050403020100 -DATA expandAVX512_10_inShuf0<>+0x38(SB)/8, $0xffff050403020100 - -GLOBL expandAVX512_10_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_10_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_10_mat0<>+0x08(SB)/8, $0x0101020202020202 -DATA expandAVX512_10_mat0<>+0x10(SB)/8, $0x0202020204040404 -DATA expandAVX512_10_mat0<>+0x18(SB)/8, $0x0404040404040808 -DATA expandAVX512_10_mat0<>+0x20(SB)/8, $0x0808080808080808 -DATA expandAVX512_10_mat0<>+0x28(SB)/8, $0x1010101010101010 -DATA expandAVX512_10_mat0<>+0x30(SB)/8, $0x1010202020202020 -DATA expandAVX512_10_mat0<>+0x38(SB)/8, $0x2020202040404040 - -GLOBL expandAVX512_10_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_10_inShuf1<>+0x00(SB)/8, $0xffff050403020100 -DATA expandAVX512_10_inShuf1<>+0x08(SB)/8, $0xffff050403020100 -DATA expandAVX512_10_inShuf1<>+0x10(SB)/8, $0xff0c0b0a09080706 -DATA expandAVX512_10_inShuf1<>+0x18(SB)/8, $0xff0c0b0a09080706 -DATA expandAVX512_10_inShuf1<>+0x20(SB)/8, $0xff0c0b0a09080706 -DATA expandAVX512_10_inShuf1<>+0x28(SB)/8, $0xff0c0b0a09080706 -DATA expandAVX512_10_inShuf1<>+0x30(SB)/8, $0xffff0b0a09080706 -DATA expandAVX512_10_inShuf1<>+0x38(SB)/8, $0xffff0b0a09080706 - -GLOBL expandAVX512_10_mat1<>(SB), RODATA, $0x40 -DATA 
expandAVX512_10_mat1<>+0x00(SB)/8, $0x4040404040408080 -DATA expandAVX512_10_mat1<>+0x08(SB)/8, $0x8080808080808080 -DATA expandAVX512_10_mat1<>+0x10(SB)/8, $0x0808080808080808 -DATA expandAVX512_10_mat1<>+0x18(SB)/8, $0x1010101010101010 -DATA expandAVX512_10_mat1<>+0x20(SB)/8, $0x1010202020202020 -DATA expandAVX512_10_mat1<>+0x28(SB)/8, $0x2020202040404040 -DATA expandAVX512_10_mat1<>+0x30(SB)/8, $0x4040404040408080 -DATA expandAVX512_10_mat1<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_10_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_10_inShuf2<>+0x00(SB)/8, $0xffff0c0b0a090807 -DATA expandAVX512_10_inShuf2<>+0x08(SB)/8, $0xffff0c0b0a090807 -DATA expandAVX512_10_inShuf2<>+0x10(SB)/8, $0xffff0c0b0a090807 -DATA expandAVX512_10_inShuf2<>+0x18(SB)/8, $0xffff0c0b0a090807 -DATA expandAVX512_10_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_10_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_10_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_10_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_10_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_10_mat2<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_10_mat2<>+0x08(SB)/8, $0x0101020202020202 -DATA expandAVX512_10_mat2<>+0x10(SB)/8, $0x0202020204040404 -DATA expandAVX512_10_mat2<>+0x18(SB)/8, $0x0404040404040808 -DATA expandAVX512_10_mat2<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_10_mat2<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_10_mat2<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_10_mat2<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_10_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_10_outShufLo+0x00(SB)/8, $0x3830282018100800 -DATA expandAVX512_10_outShufLo+0x08(SB)/8, $0x2921191109014840 -DATA expandAVX512_10_outShufLo+0x10(SB)/8, $0x1a120a0249413931 -DATA expandAVX512_10_outShufLo+0x18(SB)/8, $0x0b034a423a322a22 -DATA expandAVX512_10_outShufLo+0x20(SB)/8, $0x4b433b332b231b13 -DATA expandAVX512_10_outShufLo+0x28(SB)/8, $0x3c342c241c140c04 -DATA expandAVX512_10_outShufLo+0x30(SB)/8, $0x2d251d150d054c44 -DATA expandAVX512_10_outShufLo+0x38(SB)/8, $0x1e160e064d453d35 - -GLOBL expandAVX512_10_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_10_outShufHi+0x00(SB)/8, $0x4840383028201810 -DATA expandAVX512_10_outShufHi+0x08(SB)/8, $0x3931292119115850 -DATA expandAVX512_10_outShufHi+0x10(SB)/8, $0x2a221a1259514941 -DATA expandAVX512_10_outShufHi+0x18(SB)/8, $0x1b135a524a423a32 -DATA expandAVX512_10_outShufHi+0x20(SB)/8, $0x5b534b433b332b23 -DATA expandAVX512_10_outShufHi+0x28(SB)/8, $0x4c443c342c241c14 -DATA expandAVX512_10_outShufHi+0x30(SB)/8, $0x3d352d251d155c54 -DATA expandAVX512_10_outShufHi+0x38(SB)/8, $0x2e261e165d554d45 - -TEXT expandAVX512_10<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_10_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_10_inShuf1<>(SB), Z3 - VMOVDQU64 expandAVX512_10_inShuf2<>(SB), Z4 - VMOVDQU64 expandAVX512_10_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_10_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z5 - VPERMB Z5, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_10_mat0<>(SB), Z0, Z0 - VPERMB Z5, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_10_mat1<>(SB), Z3, Z3 - VPERMB Z5, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_10_mat2<>(SB), Z4, Z4 - VPERMI2B Z3, Z0, Z1 - VPERMI2B Z4, Z3, Z2 - RET - -GLOBL expandAVX512_12_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_12_inShuf0<>+0x00(SB)/8, $0xffff050403020100 -DATA expandAVX512_12_inShuf0<>+0x08(SB)/8, $0xffff050403020100 -DATA expandAVX512_12_inShuf0<>+0x10(SB)/8, $0xffff050403020100 -DATA 
expandAVX512_12_inShuf0<>+0x18(SB)/8, $0xffff050403020100 -DATA expandAVX512_12_inShuf0<>+0x20(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf0<>+0x28(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf0<>+0x30(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf0<>+0x38(SB)/8, $0xffffff0403020100 - -GLOBL expandAVX512_12_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_12_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_12_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_12_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_12_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_12_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_12_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_12_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_12_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_12_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_12_inShuf1<>+0x00(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf1<>+0x08(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf1<>+0x10(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf1<>+0x18(SB)/8, $0xffffff0403020100 -DATA expandAVX512_12_inShuf1<>+0x20(SB)/8, $0xffff0a0908070605 -DATA expandAVX512_12_inShuf1<>+0x28(SB)/8, $0xffff0a0908070605 -DATA expandAVX512_12_inShuf1<>+0x30(SB)/8, $0xffff0a0908070605 -DATA expandAVX512_12_inShuf1<>+0x38(SB)/8, $0xffff0a0908070605 - -GLOBL expandAVX512_12_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_12_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_12_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_12_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_12_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_12_mat1<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_12_mat1<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_12_mat1<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_12_mat1<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_12_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_12_inShuf2<>+0x00(SB)/8, $0xffffff0908070605 -DATA expandAVX512_12_inShuf2<>+0x08(SB)/8, $0xffffff0908070605 -DATA expandAVX512_12_inShuf2<>+0x10(SB)/8, $0xffffff0908070605 -DATA expandAVX512_12_inShuf2<>+0x18(SB)/8, $0xffffff0908070605 -DATA expandAVX512_12_inShuf2<>+0x20(SB)/8, $0xffffff0a09080706 -DATA expandAVX512_12_inShuf2<>+0x28(SB)/8, $0xffffff0a09080706 -DATA expandAVX512_12_inShuf2<>+0x30(SB)/8, $0xffffff0a09080706 -DATA expandAVX512_12_inShuf2<>+0x38(SB)/8, $0xffffff0a09080706 - -GLOBL expandAVX512_12_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_12_mat2<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_12_mat2<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_12_mat2<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_12_mat2<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_12_mat2<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_12_mat2<>+0x28(SB)/8, $0x0101010102020202 -DATA expandAVX512_12_mat2<>+0x30(SB)/8, $0x0202020202020202 -DATA expandAVX512_12_mat2<>+0x38(SB)/8, $0x0404040404040404 - -GLOBL expandAVX512_12_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_12_outShufLo+0x00(SB)/8, $0x3830282018100800 -DATA expandAVX512_12_outShufLo+0x08(SB)/8, $0x1911090158504840 -DATA expandAVX512_12_outShufLo+0x10(SB)/8, $0x5951494139312921 -DATA expandAVX512_12_outShufLo+0x18(SB)/8, $0x3a322a221a120a02 -DATA expandAVX512_12_outShufLo+0x20(SB)/8, $0x1b130b035a524a42 -DATA expandAVX512_12_outShufLo+0x28(SB)/8, $0x5b534b433b332b23 -DATA expandAVX512_12_outShufLo+0x30(SB)/8, 
$0x3c342c241c140c04 -DATA expandAVX512_12_outShufLo+0x38(SB)/8, $0x1d150d055c544c44 - -GLOBL expandAVX512_12_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_12_outShufHi+0x00(SB)/8, $0x5850484038302820 -DATA expandAVX512_12_outShufHi+0x08(SB)/8, $0x3931292178706860 -DATA expandAVX512_12_outShufHi+0x10(SB)/8, $0x7971696159514941 -DATA expandAVX512_12_outShufHi+0x18(SB)/8, $0x5a524a423a322a22 -DATA expandAVX512_12_outShufHi+0x20(SB)/8, $0x3b332b237a726a62 -DATA expandAVX512_12_outShufHi+0x28(SB)/8, $0x7b736b635b534b43 -DATA expandAVX512_12_outShufHi+0x30(SB)/8, $0x5c544c443c342c24 -DATA expandAVX512_12_outShufHi+0x38(SB)/8, $0x3d352d257c746c64 - -TEXT expandAVX512_12<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_12_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_12_inShuf1<>(SB), Z3 - VMOVDQU64 expandAVX512_12_inShuf2<>(SB), Z4 - VMOVDQU64 expandAVX512_12_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_12_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z5 - VPERMB Z5, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_12_mat0<>(SB), Z0, Z0 - VPERMB Z5, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_12_mat1<>(SB), Z3, Z3 - VPERMB Z5, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_12_mat2<>(SB), Z4, Z4 - VPERMI2B Z3, Z0, Z1 - VPERMI2B Z4, Z3, Z2 - RET - -GLOBL expandAVX512_14_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_14_inShuf0<>+0x00(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x08(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x10(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x18(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x20(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x28(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x30(SB)/8, $0xffffff0403020100 -DATA expandAVX512_14_inShuf0<>+0x38(SB)/8, $0xffffff0403020100 - -GLOBL expandAVX512_14_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_14_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_14_mat0<>+0x08(SB)/8, $0x0101010101010202 -DATA expandAVX512_14_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_14_mat0<>+0x18(SB)/8, $0x0202020204040404 -DATA expandAVX512_14_mat0<>+0x20(SB)/8, $0x0404040404040404 -DATA expandAVX512_14_mat0<>+0x28(SB)/8, $0x0404080808080808 -DATA expandAVX512_14_mat0<>+0x30(SB)/8, $0x0808080808080808 -DATA expandAVX512_14_mat0<>+0x38(SB)/8, $0x1010101010101010 - -GLOBL expandAVX512_14_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_14_inShuf1<>+0x00(SB)/8, $0xffffffff03020100 -DATA expandAVX512_14_inShuf1<>+0x08(SB)/8, $0xffffffff03020100 -DATA expandAVX512_14_inShuf1<>+0x10(SB)/8, $0xffffffff03020100 -DATA expandAVX512_14_inShuf1<>+0x18(SB)/8, $0xffffffff03020100 -DATA expandAVX512_14_inShuf1<>+0x20(SB)/8, $0xffffffff03020100 -DATA expandAVX512_14_inShuf1<>+0x28(SB)/8, $0xffffffff03020100 -DATA expandAVX512_14_inShuf1<>+0x30(SB)/8, $0xffffff0807060504 -DATA expandAVX512_14_inShuf1<>+0x38(SB)/8, $0xffffff0807060504 - -GLOBL expandAVX512_14_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_14_mat1<>+0x00(SB)/8, $0x1010101010102020 -DATA expandAVX512_14_mat1<>+0x08(SB)/8, $0x2020202020202020 -DATA expandAVX512_14_mat1<>+0x10(SB)/8, $0x2020202040404040 -DATA expandAVX512_14_mat1<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_14_mat1<>+0x20(SB)/8, $0x4040808080808080 -DATA expandAVX512_14_mat1<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_14_mat1<>+0x30(SB)/8, $0x1010101010102020 -DATA expandAVX512_14_mat1<>+0x38(SB)/8, $0x2020202020202020 - -GLOBL expandAVX512_14_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_14_inShuf2<>+0x00(SB)/8, $0xffffff0807060504 
-DATA expandAVX512_14_inShuf2<>+0x08(SB)/8, $0xffffff0807060504 -DATA expandAVX512_14_inShuf2<>+0x10(SB)/8, $0xffffff0807060504 -DATA expandAVX512_14_inShuf2<>+0x18(SB)/8, $0xffffff0807060504 -DATA expandAVX512_14_inShuf2<>+0x20(SB)/8, $0xffffff0908070605 -DATA expandAVX512_14_inShuf2<>+0x28(SB)/8, $0xffffff0908070605 -DATA expandAVX512_14_inShuf2<>+0x30(SB)/8, $0xffffffff08070605 -DATA expandAVX512_14_inShuf2<>+0x38(SB)/8, $0xffffffff08070605 - -GLOBL expandAVX512_14_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_14_mat2<>+0x00(SB)/8, $0x2020202040404040 -DATA expandAVX512_14_mat2<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_14_mat2<>+0x10(SB)/8, $0x4040808080808080 -DATA expandAVX512_14_mat2<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_14_mat2<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_14_mat2<>+0x28(SB)/8, $0x0101010101010202 -DATA expandAVX512_14_mat2<>+0x30(SB)/8, $0x0202020202020202 -DATA expandAVX512_14_mat2<>+0x38(SB)/8, $0x0202020204040404 - -GLOBL expandAVX512_14_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_14_inShuf3<>+0x00(SB)/8, $0xffffffff08070605 -DATA expandAVX512_14_inShuf3<>+0x08(SB)/8, $0xffffffff08070605 -DATA expandAVX512_14_inShuf3<>+0x10(SB)/8, $0xffffffff08070605 -DATA expandAVX512_14_inShuf3<>+0x18(SB)/8, $0xffffffff08070605 -DATA expandAVX512_14_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_14_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_14_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_14_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_14_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_14_mat3<>+0x00(SB)/8, $0x0404040404040404 -DATA expandAVX512_14_mat3<>+0x08(SB)/8, $0x0404080808080808 -DATA expandAVX512_14_mat3<>+0x10(SB)/8, $0x0808080808080808 -DATA expandAVX512_14_mat3<>+0x18(SB)/8, $0x1010101010101010 -DATA expandAVX512_14_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_14_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_14_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_14_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_14_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_14_outShufLo+0x00(SB)/8, $0x3830282018100800 -DATA expandAVX512_14_outShufLo+0x08(SB)/8, $0x0901686058504840 -DATA expandAVX512_14_outShufLo+0x10(SB)/8, $0x4941393129211911 -DATA expandAVX512_14_outShufLo+0x18(SB)/8, $0x1a120a0269615951 -DATA expandAVX512_14_outShufLo+0x20(SB)/8, $0x5a524a423a322a22 -DATA expandAVX512_14_outShufLo+0x28(SB)/8, $0x2b231b130b036a62 -DATA expandAVX512_14_outShufLo+0x30(SB)/8, $0x6b635b534b433b33 -DATA expandAVX512_14_outShufLo+0x38(SB)/8, $0x3c342c241c140c04 - -GLOBL expandAVX512_14_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_14_outShufHi0+0x00(SB)/8, $0x6860585048403830 -DATA expandAVX512_14_outShufHi0+0x08(SB)/8, $0x3931ffffffff7870 -DATA expandAVX512_14_outShufHi0+0x10(SB)/8, $0x7971696159514941 -DATA expandAVX512_14_outShufHi0+0x18(SB)/8, $0x4a423a32ffffffff -DATA expandAVX512_14_outShufHi0+0x20(SB)/8, $0xffff7a726a625a52 -DATA expandAVX512_14_outShufHi0+0x28(SB)/8, $0x5b534b433b33ffff -DATA expandAVX512_14_outShufHi0+0x30(SB)/8, $0xffffffff7b736b63 -DATA expandAVX512_14_outShufHi0+0x38(SB)/8, $0x6c645c544c443c34 - -GLOBL expandAVX512_14_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_14_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_14_outShufHi1+0x08(SB)/8, $0xffff18100800ffff -DATA expandAVX512_14_outShufHi1+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_14_outShufHi1+0x18(SB)/8, $0xffffffff19110901 -DATA 
expandAVX512_14_outShufHi1+0x20(SB)/8, $0x0a02ffffffffffff -DATA expandAVX512_14_outShufHi1+0x28(SB)/8, $0xffffffffffff1a12 -DATA expandAVX512_14_outShufHi1+0x30(SB)/8, $0x1b130b03ffffffff -DATA expandAVX512_14_outShufHi1+0x38(SB)/8, $0xffffffffffffffff - -TEXT expandAVX512_14<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_14_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_14_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_14_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_14_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_14_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_14_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_14_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_14_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_14_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_14_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_14_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xff0ffc3ff0ffc3ff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0xf003c00f003c00, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_16_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_16_inShuf0<>+0x00(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x08(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x10(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x18(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x20(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x28(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x30(SB)/8, $0x0303020201010000 -DATA expandAVX512_16_inShuf0<>+0x38(SB)/8, $0x0303020201010000 - -GLOBL expandAVX512_16_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_16_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_16_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_16_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_16_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_16_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_16_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_16_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_16_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_16_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_16_inShuf1<>+0x00(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x08(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x10(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x18(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x20(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x28(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x30(SB)/8, $0x0707060605050404 -DATA expandAVX512_16_inShuf1<>+0x38(SB)/8, $0x0707060605050404 - -GLOBL expandAVX512_16_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_16_outShufLo+0x00(SB)/8, $0x1918111009080100 -DATA expandAVX512_16_outShufLo+0x08(SB)/8, $0x3938313029282120 -DATA expandAVX512_16_outShufLo+0x10(SB)/8, $0x1b1a13120b0a0302 -DATA expandAVX512_16_outShufLo+0x18(SB)/8, $0x3b3a33322b2a2322 -DATA expandAVX512_16_outShufLo+0x20(SB)/8, $0x1d1c15140d0c0504 -DATA expandAVX512_16_outShufLo+0x28(SB)/8, $0x3d3c35342d2c2524 -DATA expandAVX512_16_outShufLo+0x30(SB)/8, $0x1f1e17160f0e0706 -DATA expandAVX512_16_outShufLo+0x38(SB)/8, $0x3f3e37362f2e2726 - -TEXT expandAVX512_16<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_16_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_16_mat0<>(SB), Z1 - VMOVDQU64 expandAVX512_16_inShuf1<>(SB), 
Z2 - VMOVDQU64 expandAVX512_16_outShufLo(SB), Z3 - VMOVDQU64 (AX), Z4 - VPERMB Z4, Z0, Z0 - VGF2P8AFFINEQB $0, Z1, Z0, Z0 - VPERMB Z4, Z2, Z2 - VGF2P8AFFINEQB $0, Z1, Z2, Z2 - VPERMB Z0, Z3, Z1 - VPERMB Z2, Z3, Z2 - RET - -GLOBL expandAVX512_18_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_18_inShuf0<>+0x00(SB)/8, $0x0303020201010000 -DATA expandAVX512_18_inShuf0<>+0x08(SB)/8, $0xffffffff03020100 -DATA expandAVX512_18_inShuf0<>+0x10(SB)/8, $0xffffffff03020100 -DATA expandAVX512_18_inShuf0<>+0x18(SB)/8, $0xffffffff03020100 -DATA expandAVX512_18_inShuf0<>+0x20(SB)/8, $0xffffffff03020100 -DATA expandAVX512_18_inShuf0<>+0x28(SB)/8, $0xffffffff03020100 -DATA expandAVX512_18_inShuf0<>+0x30(SB)/8, $0x0303020201010000 -DATA expandAVX512_18_inShuf0<>+0x38(SB)/8, $0xff03020201010000 - -GLOBL expandAVX512_18_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_18_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_18_mat0<>+0x08(SB)/8, $0x0101020202020202 -DATA expandAVX512_18_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_18_mat0<>+0x18(SB)/8, $0x0202020204040404 -DATA expandAVX512_18_mat0<>+0x20(SB)/8, $0x0404040404040404 -DATA expandAVX512_18_mat0<>+0x28(SB)/8, $0x0404040404040808 -DATA expandAVX512_18_mat0<>+0x30(SB)/8, $0x0808080808080808 -DATA expandAVX512_18_mat0<>+0x38(SB)/8, $0x1010101010101010 - -GLOBL expandAVX512_18_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_18_inShuf1<>+0x00(SB)/8, $0xffffffffff020100 -DATA expandAVX512_18_inShuf1<>+0x08(SB)/8, $0xffffffffff020100 -DATA expandAVX512_18_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 -DATA expandAVX512_18_inShuf1<>+0x18(SB)/8, $0xffffffffff020100 -DATA expandAVX512_18_inShuf1<>+0x20(SB)/8, $0xffffffffff020100 -DATA expandAVX512_18_inShuf1<>+0x28(SB)/8, $0xffff020201010000 -DATA expandAVX512_18_inShuf1<>+0x30(SB)/8, $0xff06060505040403 -DATA expandAVX512_18_inShuf1<>+0x38(SB)/8, $0xffffffff06050403 - -GLOBL expandAVX512_18_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_18_mat1<>+0x00(SB)/8, $0x1010202020202020 -DATA expandAVX512_18_mat1<>+0x08(SB)/8, $0x2020202020202020 -DATA expandAVX512_18_mat1<>+0x10(SB)/8, $0x2020202040404040 -DATA expandAVX512_18_mat1<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_18_mat1<>+0x20(SB)/8, $0x4040404040408080 -DATA expandAVX512_18_mat1<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_18_mat1<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_18_mat1<>+0x38(SB)/8, $0x1010202020202020 - -GLOBL expandAVX512_18_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_18_inShuf2<>+0x00(SB)/8, $0xffffffff06050403 -DATA expandAVX512_18_inShuf2<>+0x08(SB)/8, $0xffffffff06050403 -DATA expandAVX512_18_inShuf2<>+0x10(SB)/8, $0xffffffff06050403 -DATA expandAVX512_18_inShuf2<>+0x18(SB)/8, $0xffffffff06050403 -DATA expandAVX512_18_inShuf2<>+0x20(SB)/8, $0x0606050504040303 -DATA expandAVX512_18_inShuf2<>+0x28(SB)/8, $0x0707060605050404 -DATA expandAVX512_18_inShuf2<>+0x30(SB)/8, $0xffffffffff060504 -DATA expandAVX512_18_inShuf2<>+0x38(SB)/8, $0xffffffffff060504 - -GLOBL expandAVX512_18_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_18_mat2<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_18_mat2<>+0x08(SB)/8, $0x2020202040404040 -DATA expandAVX512_18_mat2<>+0x10(SB)/8, $0x4040404040404040 -DATA expandAVX512_18_mat2<>+0x18(SB)/8, $0x4040404040408080 -DATA expandAVX512_18_mat2<>+0x20(SB)/8, $0x8080808080808080 -DATA expandAVX512_18_mat2<>+0x28(SB)/8, $0x0101010101010101 -DATA expandAVX512_18_mat2<>+0x30(SB)/8, $0x0101020202020202 -DATA expandAVX512_18_mat2<>+0x38(SB)/8, $0x0202020202020202 - -GLOBL 
expandAVX512_18_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_18_inShuf3<>+0x00(SB)/8, $0xffffffffff060504 -DATA expandAVX512_18_inShuf3<>+0x08(SB)/8, $0xffffffffff060504 -DATA expandAVX512_18_inShuf3<>+0x10(SB)/8, $0xffffffffff060504 -DATA expandAVX512_18_inShuf3<>+0x18(SB)/8, $0xffff060605050404 -DATA expandAVX512_18_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_18_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_18_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_18_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_18_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_18_mat3<>+0x00(SB)/8, $0x0202020204040404 -DATA expandAVX512_18_mat3<>+0x08(SB)/8, $0x0404040404040404 -DATA expandAVX512_18_mat3<>+0x10(SB)/8, $0x0404040404040808 -DATA expandAVX512_18_mat3<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_18_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_18_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_18_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_18_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_18_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_18_outShufLo+0x00(SB)/8, $0x3028201810080100 -DATA expandAVX512_18_outShufLo+0x08(SB)/8, $0x6058504840393831 -DATA expandAVX512_18_outShufLo+0x10(SB)/8, $0x2119110903026968 -DATA expandAVX512_18_outShufLo+0x18(SB)/8, $0x5149413b3a333229 -DATA expandAVX512_18_outShufLo+0x20(SB)/8, $0x120a05046b6a6159 -DATA expandAVX512_18_outShufLo+0x28(SB)/8, $0x423d3c35342a221a -DATA expandAVX512_18_outShufLo+0x30(SB)/8, $0x07066d6c625a524a -DATA expandAVX512_18_outShufLo+0x38(SB)/8, $0x3e37362b231b130b - -GLOBL expandAVX512_18_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_18_outShufHi0+0x00(SB)/8, $0x6160585048403830 -DATA expandAVX512_18_outShufHi0+0x08(SB)/8, $0xffffffff78706968 -DATA expandAVX512_18_outShufHi0+0x10(SB)/8, $0x59514941393231ff -DATA expandAVX512_18_outShufHi0+0x18(SB)/8, $0xffff79716b6a6362 -DATA expandAVX512_18_outShufHi0+0x20(SB)/8, $0x4a423a3433ffffff -DATA expandAVX512_18_outShufHi0+0x28(SB)/8, $0x7a726d6c65645a52 -DATA expandAVX512_18_outShufHi0+0x30(SB)/8, $0x3b3635ffffffffff -DATA expandAVX512_18_outShufHi0+0x38(SB)/8, $0x6f6e67665b534b43 - -GLOBL expandAVX512_18_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_18_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_18_outShufHi1+0x08(SB)/8, $0x18100800ffffffff -DATA expandAVX512_18_outShufHi1+0x10(SB)/8, $0xffffffffffffff19 -DATA expandAVX512_18_outShufHi1+0x18(SB)/8, $0x0901ffffffffffff -DATA expandAVX512_18_outShufHi1+0x20(SB)/8, $0xffffffffff1b1a11 -DATA expandAVX512_18_outShufHi1+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_18_outShufHi1+0x30(SB)/8, $0xffffff1d1c120a02 -DATA expandAVX512_18_outShufHi1+0x38(SB)/8, $0xffffffffffffffff - -TEXT expandAVX512_18<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_18_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_18_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_18_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_18_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_18_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_18_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_18_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_18_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_18_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_18_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_18_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ 
$0xffe0fff83ffe0fff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x1f0007c001f000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_20_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_20_inShuf0<>+0x00(SB)/8, $0x0303020201010000 -DATA expandAVX512_20_inShuf0<>+0x08(SB)/8, $0xffffffff03020100 -DATA expandAVX512_20_inShuf0<>+0x10(SB)/8, $0xff03020201010000 -DATA expandAVX512_20_inShuf0<>+0x18(SB)/8, $0xffff020201010000 -DATA expandAVX512_20_inShuf0<>+0x20(SB)/8, $0xffffffffff020100 -DATA expandAVX512_20_inShuf0<>+0x28(SB)/8, $0xffff020201010000 -DATA expandAVX512_20_inShuf0<>+0x30(SB)/8, $0xffff020201010000 -DATA expandAVX512_20_inShuf0<>+0x38(SB)/8, $0xffffffffff020100 - -GLOBL expandAVX512_20_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_20_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_20_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_20_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_20_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_20_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_20_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_20_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_20_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_20_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_20_inShuf1<>+0x00(SB)/8, $0xffff020201010000 -DATA expandAVX512_20_inShuf1<>+0x08(SB)/8, $0xffff020201010000 -DATA expandAVX512_20_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 -DATA expandAVX512_20_inShuf1<>+0x18(SB)/8, $0xffff020201010000 -DATA expandAVX512_20_inShuf1<>+0x20(SB)/8, $0xff06060505040403 -DATA expandAVX512_20_inShuf1<>+0x28(SB)/8, $0x0606050504040303 -DATA expandAVX512_20_inShuf1<>+0x30(SB)/8, $0xffffffff06050403 -DATA expandAVX512_20_inShuf1<>+0x38(SB)/8, $0xffff050504040303 - -GLOBL expandAVX512_20_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_20_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_20_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_20_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_20_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_20_mat1<>+0x20(SB)/8, $0x0202020202020202 -DATA expandAVX512_20_mat1<>+0x28(SB)/8, $0x0404040404040404 -DATA expandAVX512_20_mat1<>+0x30(SB)/8, $0x0404040408080808 -DATA expandAVX512_20_mat1<>+0x38(SB)/8, $0x0808080808080808 - -GLOBL expandAVX512_20_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_20_inShuf2<>+0x00(SB)/8, $0xffff050504040303 -DATA expandAVX512_20_inShuf2<>+0x08(SB)/8, $0xffffffffff050403 -DATA expandAVX512_20_inShuf2<>+0x10(SB)/8, $0xffff050504040303 -DATA expandAVX512_20_inShuf2<>+0x18(SB)/8, $0xffff050504040303 -DATA expandAVX512_20_inShuf2<>+0x20(SB)/8, $0xffffffffff050403 -DATA expandAVX512_20_inShuf2<>+0x28(SB)/8, $0xffff050504040303 -DATA expandAVX512_20_inShuf2<>+0x30(SB)/8, $0xffff060605050404 -DATA expandAVX512_20_inShuf2<>+0x38(SB)/8, $0xffffffffff060504 - -GLOBL expandAVX512_20_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_20_mat2<>+0x00(SB)/8, $0x1010101010101010 -DATA expandAVX512_20_mat2<>+0x08(SB)/8, $0x1010101020202020 -DATA expandAVX512_20_mat2<>+0x10(SB)/8, $0x2020202020202020 -DATA expandAVX512_20_mat2<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_20_mat2<>+0x20(SB)/8, $0x4040404080808080 -DATA expandAVX512_20_mat2<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_20_mat2<>+0x30(SB)/8, $0x0101010101010101 -DATA expandAVX512_20_mat2<>+0x38(SB)/8, $0x0101010102020202 - -GLOBL expandAVX512_20_outShufLo(SB), RODATA, $0x40 -DATA 
expandAVX512_20_outShufLo+0x00(SB)/8, $0x2019181110080100 -DATA expandAVX512_20_outShufLo+0x08(SB)/8, $0x4841403831302928 -DATA expandAVX512_20_outShufLo+0x10(SB)/8, $0x1209030259585049 -DATA expandAVX512_20_outShufLo+0x18(SB)/8, $0x33322b2a211b1a13 -DATA expandAVX512_20_outShufLo+0x20(SB)/8, $0x5b5a514b4a434239 -DATA expandAVX512_20_outShufLo+0x28(SB)/8, $0x221d1c15140a0504 -DATA expandAVX512_20_outShufLo+0x30(SB)/8, $0x4c45443a35342d2c -DATA expandAVX512_20_outShufLo+0x38(SB)/8, $0x160b07065d5c524d - -GLOBL expandAVX512_20_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_20_outShufHi+0x00(SB)/8, $0x4140393830292820 -DATA expandAVX512_20_outShufHi+0x08(SB)/8, $0x6968605958515048 -DATA expandAVX512_20_outShufHi+0x10(SB)/8, $0x312b2a2221787170 -DATA expandAVX512_20_outShufHi+0x18(SB)/8, $0x5a53524943423b3a -DATA expandAVX512_20_outShufHi+0x20(SB)/8, $0x237973726b6a615b -DATA expandAVX512_20_outShufHi+0x28(SB)/8, $0x45443d3c322d2c24 -DATA expandAVX512_20_outShufHi+0x30(SB)/8, $0x6d6c625d5c55544a -DATA expandAVX512_20_outShufHi+0x38(SB)/8, $0x332f2e26257a7574 - -TEXT expandAVX512_20<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_20_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_20_inShuf1<>(SB), Z3 - VMOVDQU64 expandAVX512_20_inShuf2<>(SB), Z4 - VMOVDQU64 expandAVX512_20_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_20_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z5 - VPERMB Z5, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_20_mat0<>(SB), Z0, Z0 - VPERMB Z5, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_20_mat1<>(SB), Z3, Z3 - VPERMB Z5, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_20_mat2<>(SB), Z4, Z4 - VPERMI2B Z3, Z0, Z1 - VPERMI2B Z4, Z3, Z2 - RET - -GLOBL expandAVX512_22_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_22_inShuf0<>+0x00(SB)/8, $0xffff020201010000 -DATA expandAVX512_22_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 -DATA expandAVX512_22_inShuf0<>+0x10(SB)/8, $0xffff020201010000 -DATA expandAVX512_22_inShuf0<>+0x18(SB)/8, $0xffffffffff020100 -DATA expandAVX512_22_inShuf0<>+0x20(SB)/8, $0xffff020201010000 -DATA expandAVX512_22_inShuf0<>+0x28(SB)/8, $0xffffffffff020100 -DATA expandAVX512_22_inShuf0<>+0x30(SB)/8, $0xffff020201010000 -DATA expandAVX512_22_inShuf0<>+0x38(SB)/8, $0xffff020201010000 - -GLOBL expandAVX512_22_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_22_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_22_mat0<>+0x08(SB)/8, $0x0101010101010202 -DATA expandAVX512_22_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_22_mat0<>+0x18(SB)/8, $0x0202020204040404 -DATA expandAVX512_22_mat0<>+0x20(SB)/8, $0x0404040404040404 -DATA expandAVX512_22_mat0<>+0x28(SB)/8, $0x0404080808080808 -DATA expandAVX512_22_mat0<>+0x30(SB)/8, $0x0808080808080808 -DATA expandAVX512_22_mat0<>+0x38(SB)/8, $0x1010101010101010 - -GLOBL expandAVX512_22_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_22_inShuf1<>+0x00(SB)/8, $0xffffffffff020100 -DATA expandAVX512_22_inShuf1<>+0x08(SB)/8, $0xffff020201010000 -DATA expandAVX512_22_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 -DATA expandAVX512_22_inShuf1<>+0x18(SB)/8, $0xffff020201010000 -DATA expandAVX512_22_inShuf1<>+0x20(SB)/8, $0xffffffffff020100 -DATA expandAVX512_22_inShuf1<>+0x28(SB)/8, $0xffffffff01010000 -DATA expandAVX512_22_inShuf1<>+0x30(SB)/8, $0xffff040403030202 -DATA expandAVX512_22_inShuf1<>+0x38(SB)/8, $0xffff050504040303 - -GLOBL expandAVX512_22_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_22_mat1<>+0x00(SB)/8, $0x1010101010102020 -DATA expandAVX512_22_mat1<>+0x08(SB)/8, $0x2020202020202020 -DATA expandAVX512_22_mat1<>+0x10(SB)/8, $0x2020202040404040 
-DATA expandAVX512_22_mat1<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_22_mat1<>+0x20(SB)/8, $0x4040808080808080 -DATA expandAVX512_22_mat1<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_22_mat1<>+0x30(SB)/8, $0x8080808080808080 -DATA expandAVX512_22_mat1<>+0x38(SB)/8, $0x0101010101010101 - -GLOBL expandAVX512_22_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_22_inShuf2<>+0x00(SB)/8, $0xffffffffff050403 -DATA expandAVX512_22_inShuf2<>+0x08(SB)/8, $0xffff050504040303 -DATA expandAVX512_22_inShuf2<>+0x10(SB)/8, $0xffffffffff050403 -DATA expandAVX512_22_inShuf2<>+0x18(SB)/8, $0xffff050504040303 -DATA expandAVX512_22_inShuf2<>+0x20(SB)/8, $0xffffffffff050403 -DATA expandAVX512_22_inShuf2<>+0x28(SB)/8, $0xffff050504040303 -DATA expandAVX512_22_inShuf2<>+0x30(SB)/8, $0xffff050504040303 -DATA expandAVX512_22_inShuf2<>+0x38(SB)/8, $0xffffffffff050403 - -GLOBL expandAVX512_22_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_22_mat2<>+0x00(SB)/8, $0x0101010101010202 -DATA expandAVX512_22_mat2<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_22_mat2<>+0x10(SB)/8, $0x0202020204040404 -DATA expandAVX512_22_mat2<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_22_mat2<>+0x20(SB)/8, $0x0404080808080808 -DATA expandAVX512_22_mat2<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_22_mat2<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_22_mat2<>+0x38(SB)/8, $0x1010101010102020 - -GLOBL expandAVX512_22_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_22_inShuf3<>+0x00(SB)/8, $0xffff050504040303 -DATA expandAVX512_22_inShuf3<>+0x08(SB)/8, $0xffffffffff050403 -DATA expandAVX512_22_inShuf3<>+0x10(SB)/8, $0xffffff0504040303 -DATA expandAVX512_22_inShuf3<>+0x18(SB)/8, $0xffffffffffff0403 -DATA expandAVX512_22_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_22_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_22_mat3<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_22_mat3<>+0x08(SB)/8, $0x2020202040404040 -DATA expandAVX512_22_mat3<>+0x10(SB)/8, $0x4040404040404040 -DATA expandAVX512_22_mat3<>+0x18(SB)/8, $0x4040808080808080 -DATA expandAVX512_22_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_22_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_22_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_22_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_22_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_22_outShufLo+0x00(SB)/8, $0x2120181110080100 -DATA expandAVX512_22_outShufLo+0x08(SB)/8, $0x4948403938313028 -DATA expandAVX512_22_outShufLo+0x10(SB)/8, $0x0302696860595850 -DATA expandAVX512_22_outShufLo+0x18(SB)/8, $0x3229232219131209 -DATA expandAVX512_22_outShufLo+0x20(SB)/8, $0x5a514b4a413b3a33 -DATA expandAVX512_22_outShufLo+0x28(SB)/8, $0x140a05046b6a615b -DATA expandAVX512_22_outShufLo+0x30(SB)/8, $0x3c35342a25241a15 -DATA expandAVX512_22_outShufLo+0x38(SB)/8, $0x625d5c524d4c423d - -GLOBL expandAVX512_22_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_22_outShufHi0+0x00(SB)/8, $0x5049484039383130 -DATA expandAVX512_22_outShufHi0+0x08(SB)/8, $0x7871706968605958 -DATA expandAVX512_22_outShufHi0+0x10(SB)/8, $0x3332ffffffffffff -DATA expandAVX512_22_outShufHi0+0x18(SB)/8, $0x5b5a514b4a413b3a -DATA expandAVX512_22_outShufHi0+0x20(SB)/8, $0xffff7973726b6a61 -DATA expandAVX512_22_outShufHi0+0x28(SB)/8, $0x3d3c3534ffffffff -DATA 
expandAVX512_22_outShufHi0+0x30(SB)/8, $0x6c625d5c524d4c42 -DATA expandAVX512_22_outShufHi0+0x38(SB)/8, $0xffffffff7a75746d - -GLOBL expandAVX512_22_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_22_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_outShufHi1+0x10(SB)/8, $0xffff181110080100 -DATA expandAVX512_22_outShufHi1+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_outShufHi1+0x20(SB)/8, $0x0302ffffffffffff -DATA expandAVX512_22_outShufHi1+0x28(SB)/8, $0xffffffff19131209 -DATA expandAVX512_22_outShufHi1+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_22_outShufHi1+0x38(SB)/8, $0x140a0504ffffffff - -TEXT expandAVX512_22<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_22_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_22_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_22_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_22_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_22_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_22_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_22_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_22_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_22_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_22_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_22_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xffff03fffc0ffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0xf0000fc0003f0000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_24_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_24_inShuf0<>+0x00(SB)/8, $0x0202010101000000 -DATA expandAVX512_24_inShuf0<>+0x08(SB)/8, $0x0202010101000000 -DATA expandAVX512_24_inShuf0<>+0x10(SB)/8, $0x0202010101000000 -DATA expandAVX512_24_inShuf0<>+0x18(SB)/8, $0x0202010101000000 -DATA expandAVX512_24_inShuf0<>+0x20(SB)/8, $0x0202010101000000 -DATA expandAVX512_24_inShuf0<>+0x28(SB)/8, $0xff02010101000000 -DATA expandAVX512_24_inShuf0<>+0x30(SB)/8, $0xffff010101000000 -DATA expandAVX512_24_inShuf0<>+0x38(SB)/8, $0xffff010101000000 - -GLOBL expandAVX512_24_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_24_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_24_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_24_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_24_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_24_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_24_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_24_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_24_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_24_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_24_inShuf1<>+0x00(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_24_inShuf1<>+0x08(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_24_inShuf1<>+0x10(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_24_inShuf1<>+0x18(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_24_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_24_inShuf1<>+0x28(SB)/8, $0x0404040303030202 -DATA expandAVX512_24_inShuf1<>+0x30(SB)/8, $0x0404030303020202 -DATA expandAVX512_24_inShuf1<>+0x38(SB)/8, $0x0404030303020202 - -GLOBL expandAVX512_24_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_24_inShuf2<>+0x00(SB)/8, $0x0505040404030303 -DATA expandAVX512_24_inShuf2<>+0x08(SB)/8, $0x0505040404030303 -DATA expandAVX512_24_inShuf2<>+0x10(SB)/8, $0x0505040404030303 -DATA expandAVX512_24_inShuf2<>+0x18(SB)/8, 
$0xffff040404030303 -DATA expandAVX512_24_inShuf2<>+0x20(SB)/8, $0xffff040404030303 -DATA expandAVX512_24_inShuf2<>+0x28(SB)/8, $0xffffffffffffff04 -DATA expandAVX512_24_inShuf2<>+0x30(SB)/8, $0xffffffffffffff04 -DATA expandAVX512_24_inShuf2<>+0x38(SB)/8, $0xffffffffffffff05 - -GLOBL expandAVX512_24_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_24_mat2<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_24_mat2<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_24_mat2<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_24_mat2<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_24_mat2<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_24_mat2<>+0x28(SB)/8, $0x4040404040404040 -DATA expandAVX512_24_mat2<>+0x30(SB)/8, $0x8080808080808080 -DATA expandAVX512_24_mat2<>+0x38(SB)/8, $0x0101010101010101 - -GLOBL expandAVX512_24_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_24_inShuf3<>+0x00(SB)/8, $0xffffffffffffff05 -DATA expandAVX512_24_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_24_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_24_mat3<>+0x00(SB)/8, $0x0202020202020202 -DATA expandAVX512_24_mat3<>+0x08(SB)/8, $0x0000000000000000 -DATA expandAVX512_24_mat3<>+0x10(SB)/8, $0x0000000000000000 -DATA expandAVX512_24_mat3<>+0x18(SB)/8, $0x0000000000000000 -DATA expandAVX512_24_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_24_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_24_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_24_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_24_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_24_outShufLo+0x00(SB)/8, $0x11100a0908020100 -DATA expandAVX512_24_outShufLo+0x08(SB)/8, $0x282221201a191812 -DATA expandAVX512_24_outShufLo+0x10(SB)/8, $0x3a39383231302a29 -DATA expandAVX512_24_outShufLo+0x18(SB)/8, $0x14130d0c0b050403 -DATA expandAVX512_24_outShufLo+0x20(SB)/8, $0x2b2524231d1c1b15 -DATA expandAVX512_24_outShufLo+0x28(SB)/8, $0x3d3c3b3534332d2c -DATA expandAVX512_24_outShufLo+0x30(SB)/8, $0x1716480f0e400706 -DATA expandAVX512_24_outShufLo+0x38(SB)/8, $0x2e602726581f1e50 - -GLOBL expandAVX512_24_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_24_outShufHi0+0x00(SB)/8, $0x3a39383231302928 -DATA expandAVX512_24_outShufHi0+0x08(SB)/8, $0x51504a4948424140 -DATA expandAVX512_24_outShufHi0+0x10(SB)/8, $0x2a6261605a595852 -DATA expandAVX512_24_outShufHi0+0x18(SB)/8, $0x3d3c3b3534332c2b -DATA expandAVX512_24_outShufHi0+0x20(SB)/8, $0x54534d4c4b454443 -DATA expandAVX512_24_outShufHi0+0x28(SB)/8, $0x2d6564635d5c5b55 -DATA expandAVX512_24_outShufHi0+0x30(SB)/8, $0x703f3e6837362f2e -DATA expandAVX512_24_outShufHi0+0x38(SB)/8, $0x5756ff4f4e784746 - -GLOBL expandAVX512_24_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_24_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x30(SB)/8, 
$0xffffffffffffffff -DATA expandAVX512_24_outShufHi1+0x38(SB)/8, $0xffff00ffffffffff - -TEXT expandAVX512_24<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_24_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_24_mat0<>(SB), Z2 - VMOVDQU64 expandAVX512_24_inShuf1<>(SB), Z3 - VMOVDQU64 expandAVX512_24_inShuf2<>(SB), Z4 - VMOVDQU64 expandAVX512_24_inShuf3<>(SB), Z5 - VMOVDQU64 expandAVX512_24_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_24_outShufHi0(SB), Z6 - VMOVDQU64 expandAVX512_24_outShufHi1(SB), Z7 - VMOVDQU64 (AX), Z8 - VPERMB Z8, Z0, Z0 - VGF2P8AFFINEQB $0, Z2, Z0, Z0 - VPERMB Z8, Z3, Z3 - VGF2P8AFFINEQB $0, Z2, Z3, Z2 - VPERMB Z8, Z4, Z3 - VGF2P8AFFINEQB $0, expandAVX512_24_mat2<>(SB), Z3, Z3 - VPERMB Z8, Z5, Z4 - VGF2P8AFFINEQB $0, expandAVX512_24_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xdfffffffffffffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z6 - MOVQ $0x2000000000000000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z7, K1, Z0 - VPORQ Z0, Z6, Z2 - RET - -GLOBL expandAVX512_26_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_26_inShuf0<>+0x00(SB)/8, $0x0202010101000000 -DATA expandAVX512_26_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 -DATA expandAVX512_26_inShuf0<>+0x10(SB)/8, $0xffff020201010000 -DATA expandAVX512_26_inShuf0<>+0x18(SB)/8, $0xffffffffff020100 -DATA expandAVX512_26_inShuf0<>+0x20(SB)/8, $0xffff020201010000 -DATA expandAVX512_26_inShuf0<>+0x28(SB)/8, $0xffffffffff020100 -DATA expandAVX512_26_inShuf0<>+0x30(SB)/8, $0x0202010101000000 -DATA expandAVX512_26_inShuf0<>+0x38(SB)/8, $0xffff010101000000 - -GLOBL expandAVX512_26_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_26_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_26_mat0<>+0x08(SB)/8, $0x0101020202020202 -DATA expandAVX512_26_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_26_mat0<>+0x18(SB)/8, $0x0202020204040404 -DATA expandAVX512_26_mat0<>+0x20(SB)/8, $0x0404040404040404 -DATA expandAVX512_26_mat0<>+0x28(SB)/8, $0x0404040404040808 -DATA expandAVX512_26_mat0<>+0x30(SB)/8, $0x0808080808080808 -DATA expandAVX512_26_mat0<>+0x38(SB)/8, $0x1010101010101010 - -GLOBL expandAVX512_26_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_26_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_26_inShuf1<>+0x08(SB)/8, $0xffffffff01010000 -DATA expandAVX512_26_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_26_inShuf1<>+0x18(SB)/8, $0xffffffff01010000 -DATA expandAVX512_26_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_26_inShuf1<>+0x28(SB)/8, $0xffff010101000000 -DATA expandAVX512_26_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_26_inShuf1<>+0x38(SB)/8, $0xff04040403030302 - -GLOBL expandAVX512_26_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_26_mat1<>+0x00(SB)/8, $0x1010202020202020 -DATA expandAVX512_26_mat1<>+0x08(SB)/8, $0x2020202020202020 -DATA expandAVX512_26_mat1<>+0x10(SB)/8, $0x2020202040404040 -DATA expandAVX512_26_mat1<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_26_mat1<>+0x20(SB)/8, $0x4040404040408080 -DATA expandAVX512_26_mat1<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_26_mat1<>+0x30(SB)/8, $0x0101010101010101 -DATA expandAVX512_26_mat1<>+0x38(SB)/8, $0x0808080808080808 - -GLOBL expandAVX512_26_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_26_inShuf2<>+0x00(SB)/8, $0x0404030303020202 -DATA expandAVX512_26_inShuf2<>+0x08(SB)/8, $0xffffffffff040302 -DATA expandAVX512_26_inShuf2<>+0x10(SB)/8, $0xffff040403030202 -DATA expandAVX512_26_inShuf2<>+0x18(SB)/8, $0xffffffffff040302 -DATA expandAVX512_26_inShuf2<>+0x20(SB)/8, $0xffff040403030202 
-DATA expandAVX512_26_inShuf2<>+0x28(SB)/8, $0xffffffffff040302 -DATA expandAVX512_26_inShuf2<>+0x30(SB)/8, $0xff04030303020202 -DATA expandAVX512_26_inShuf2<>+0x38(SB)/8, $0xffff040404030303 - -GLOBL expandAVX512_26_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_26_mat2<>+0x00(SB)/8, $0x1010101010101010 -DATA expandAVX512_26_mat2<>+0x08(SB)/8, $0x1010202020202020 -DATA expandAVX512_26_mat2<>+0x10(SB)/8, $0x2020202020202020 -DATA expandAVX512_26_mat2<>+0x18(SB)/8, $0x2020202040404040 -DATA expandAVX512_26_mat2<>+0x20(SB)/8, $0x4040404040404040 -DATA expandAVX512_26_mat2<>+0x28(SB)/8, $0x4040404040408080 -DATA expandAVX512_26_mat2<>+0x30(SB)/8, $0x8080808080808080 -DATA expandAVX512_26_mat2<>+0x38(SB)/8, $0x0101010101010101 - -GLOBL expandAVX512_26_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_26_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403 -DATA expandAVX512_26_inShuf3<>+0x08(SB)/8, $0xffffffff04040303 -DATA expandAVX512_26_inShuf3<>+0x10(SB)/8, $0xffffffffffff0403 -DATA expandAVX512_26_inShuf3<>+0x18(SB)/8, $0xffffffff04040303 -DATA expandAVX512_26_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403 -DATA expandAVX512_26_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04 -DATA expandAVX512_26_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_26_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_26_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_26_mat3<>+0x00(SB)/8, $0x0101020202020202 -DATA expandAVX512_26_mat3<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_26_mat3<>+0x10(SB)/8, $0x0202020204040404 -DATA expandAVX512_26_mat3<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_26_mat3<>+0x20(SB)/8, $0x0404040404040808 -DATA expandAVX512_26_mat3<>+0x28(SB)/8, $0x1010101010101010 -DATA expandAVX512_26_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_26_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_26_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_26_outShufLo+0x00(SB)/8, $0x2018111008020100 -DATA expandAVX512_26_outShufLo+0x08(SB)/8, $0x3a39383231302821 -DATA expandAVX512_26_outShufLo+0x10(SB)/8, $0x6860595850494840 -DATA expandAVX512_26_outShufLo+0x18(SB)/8, $0x1312090504036a69 -DATA expandAVX512_26_outShufLo+0x20(SB)/8, $0x3b35343329232219 -DATA expandAVX512_26_outShufLo+0x28(SB)/8, $0x5b5a514b4a413d3c -DATA expandAVX512_26_outShufLo+0x30(SB)/8, $0x0a7007066d6c6b61 -DATA expandAVX512_26_outShufLo+0x38(SB)/8, $0x37362a25241a1514 - -GLOBL expandAVX512_26_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_26_outShufHi0+0x00(SB)/8, $0x5851504842414038 -DATA expandAVX512_26_outShufHi0+0x08(SB)/8, $0x7978727170686160 -DATA expandAVX512_26_outShufHi0+0x10(SB)/8, $0xffffffffffffff7a -DATA expandAVX512_26_outShufHi0+0x18(SB)/8, $0x52494544433b3a39 -DATA expandAVX512_26_outShufHi0+0x20(SB)/8, $0x7574736963625953 -DATA expandAVX512_26_outShufHi0+0x28(SB)/8, $0xffffffffff7d7c7b -DATA expandAVX512_26_outShufHi0+0x30(SB)/8, $0xff47463e3d3cffff -DATA expandAVX512_26_outShufHi0+0x38(SB)/8, $0x766a65645a55544a - -GLOBL expandAVX512_26_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_26_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_26_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_26_outShufHi1+0x10(SB)/8, $0x20191810090800ff -DATA expandAVX512_26_outShufHi1+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_26_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_26_outShufHi1+0x28(SB)/8, $0x1a110b0a01ffffff -DATA expandAVX512_26_outShufHi1+0x30(SB)/8, $0x28ffffffffff211b -DATA expandAVX512_26_outShufHi1+0x38(SB)/8, $0xffffffffffffffff - 
-TEXT expandAVX512_26<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_26_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_26_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_26_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_26_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_26_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_26_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_26_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_26_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_26_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_26_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_26_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xff7c07ffff01ffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x83f80000fe0000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_28_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_28_inShuf0<>+0x00(SB)/8, $0x0202010101000000 -DATA expandAVX512_28_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 -DATA expandAVX512_28_inShuf0<>+0x10(SB)/8, $0x0202010101000000 -DATA expandAVX512_28_inShuf0<>+0x18(SB)/8, $0xff02010101000000 -DATA expandAVX512_28_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_28_inShuf0<>+0x28(SB)/8, $0xffff010101000000 -DATA expandAVX512_28_inShuf0<>+0x30(SB)/8, $0xffff010101000000 -DATA expandAVX512_28_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100 - -GLOBL expandAVX512_28_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_28_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_28_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_28_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_28_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_28_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_28_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_28_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_28_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_28_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_28_inShuf1<>+0x00(SB)/8, $0xffff010101000000 -DATA expandAVX512_28_inShuf1<>+0x08(SB)/8, $0xffff010101000000 -DATA expandAVX512_28_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_28_inShuf1<>+0x18(SB)/8, $0xffff010101000000 -DATA expandAVX512_28_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_28_inShuf1<>+0x28(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_28_inShuf1<>+0x30(SB)/8, $0x0404040303030202 -DATA expandAVX512_28_inShuf1<>+0x38(SB)/8, $0xffffffffff040302 - -GLOBL expandAVX512_28_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_28_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_28_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_28_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_28_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_28_mat1<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_28_mat1<>+0x28(SB)/8, $0x0202020202020202 -DATA expandAVX512_28_mat1<>+0x30(SB)/8, $0x0404040404040404 -DATA expandAVX512_28_mat1<>+0x38(SB)/8, $0x0404040408080808 - -GLOBL expandAVX512_28_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_28_inShuf2<>+0x00(SB)/8, $0x0404030303020202 -DATA expandAVX512_28_inShuf2<>+0x08(SB)/8, $0x0404030303020202 -DATA expandAVX512_28_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302 -DATA expandAVX512_28_inShuf2<>+0x18(SB)/8, $0xffff030303020202 -DATA expandAVX512_28_inShuf2<>+0x20(SB)/8, $0xffff030303020202 -DATA expandAVX512_28_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302 -DATA 
expandAVX512_28_inShuf2<>+0x30(SB)/8, $0xffff030303020202 -DATA expandAVX512_28_inShuf2<>+0x38(SB)/8, $0xffff040404030303 - -GLOBL expandAVX512_28_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_28_mat2<>+0x00(SB)/8, $0x0808080808080808 -DATA expandAVX512_28_mat2<>+0x08(SB)/8, $0x1010101010101010 -DATA expandAVX512_28_mat2<>+0x10(SB)/8, $0x1010101020202020 -DATA expandAVX512_28_mat2<>+0x18(SB)/8, $0x2020202020202020 -DATA expandAVX512_28_mat2<>+0x20(SB)/8, $0x4040404040404040 -DATA expandAVX512_28_mat2<>+0x28(SB)/8, $0x4040404080808080 -DATA expandAVX512_28_mat2<>+0x30(SB)/8, $0x8080808080808080 -DATA expandAVX512_28_mat2<>+0x38(SB)/8, $0x0101010101010101 - -GLOBL expandAVX512_28_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_28_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403 -DATA expandAVX512_28_inShuf3<>+0x08(SB)/8, $0xffff040404030303 -DATA expandAVX512_28_inShuf3<>+0x10(SB)/8, $0xffffffffffffff04 -DATA expandAVX512_28_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_28_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_28_mat3<>+0x00(SB)/8, $0x0101010102020202 -DATA expandAVX512_28_mat3<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_28_mat3<>+0x10(SB)/8, $0x0808080808080808 -DATA expandAVX512_28_mat3<>+0x18(SB)/8, $0x0000000000000000 -DATA expandAVX512_28_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_28_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_28_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_28_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_28_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_28_outShufLo+0x00(SB)/8, $0x1812111008020100 -DATA expandAVX512_28_outShufLo+0x08(SB)/8, $0x31302a2928201a19 -DATA expandAVX512_28_outShufLo+0x10(SB)/8, $0x4a49484241403832 -DATA expandAVX512_28_outShufLo+0x18(SB)/8, $0x090504035a595850 -DATA expandAVX512_28_outShufLo+0x20(SB)/8, $0x2b211d1c1b151413 -DATA expandAVX512_28_outShufLo+0x28(SB)/8, $0x4443393534332d2c -DATA expandAVX512_28_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b45 -DATA expandAVX512_28_outShufLo+0x38(SB)/8, $0x1e6817160a600706 - -GLOBL expandAVX512_28_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_28_outShufHi0+0x00(SB)/8, $0x4948424140383130 -DATA expandAVX512_28_outShufHi0+0x08(SB)/8, $0x6261605a5958504a -DATA expandAVX512_28_outShufHi0+0x10(SB)/8, $0xff7a797872717068 -DATA expandAVX512_28_outShufHi0+0x18(SB)/8, $0x4339343332ffffff -DATA expandAVX512_28_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b4544 -DATA expandAVX512_28_outShufHi0+0x28(SB)/8, $0x757473696564635d -DATA expandAVX512_28_outShufHi0+0x30(SB)/8, $0x35ffffffff7d7c7b -DATA expandAVX512_28_outShufHi0+0x38(SB)/8, $0x4f4eff47463a3736 - -GLOBL expandAVX512_28_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_28_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_outShufHi1+0x10(SB)/8, $0x00ffffffffffffff -DATA expandAVX512_28_outShufHi1+0x18(SB)/8, $0xffffffffff0a0908 -DATA expandAVX512_28_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_outShufHi1+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_28_outShufHi1+0x30(SB)/8, $0xff0d0c0b01ffffff -DATA expandAVX512_28_outShufHi1+0x38(SB)/8, $0xffff10ffffffffff - -TEXT expandAVX512_28<>(SB), NOSPLIT, $0-0 - VMOVDQU64 
expandAVX512_28_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_28_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_28_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_28_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_28_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_28_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_28_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_28_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_28_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_28_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_28_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xdf87fffff87fffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x2078000007800000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_30_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_30_inShuf0<>+0x00(SB)/8, $0x0202010101000000 -DATA expandAVX512_30_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 -DATA expandAVX512_30_inShuf0<>+0x10(SB)/8, $0xffff010101000000 -DATA expandAVX512_30_inShuf0<>+0x18(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_30_inShuf0<>+0x20(SB)/8, $0xffff010101000000 -DATA expandAVX512_30_inShuf0<>+0x28(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_30_inShuf0<>+0x30(SB)/8, $0xffff010101000000 -DATA expandAVX512_30_inShuf0<>+0x38(SB)/8, $0xffff010101000000 - -GLOBL expandAVX512_30_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_30_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_30_mat0<>+0x08(SB)/8, $0x0101010101010202 -DATA expandAVX512_30_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_30_mat0<>+0x18(SB)/8, $0x0202020204040404 -DATA expandAVX512_30_mat0<>+0x20(SB)/8, $0x0404040404040404 -DATA expandAVX512_30_mat0<>+0x28(SB)/8, $0x0404080808080808 -DATA expandAVX512_30_mat0<>+0x30(SB)/8, $0x0808080808080808 -DATA expandAVX512_30_mat0<>+0x38(SB)/8, $0x1010101010101010 - -GLOBL expandAVX512_30_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_30_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_30_inShuf1<>+0x08(SB)/8, $0xffff010101000000 -DATA expandAVX512_30_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_30_inShuf1<>+0x18(SB)/8, $0xffff010101000000 -DATA expandAVX512_30_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_30_inShuf1<>+0x28(SB)/8, $0xffff010101000000 -DATA expandAVX512_30_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_30_inShuf1<>+0x38(SB)/8, $0x0404030303020202 - -GLOBL expandAVX512_30_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_30_mat1<>+0x00(SB)/8, $0x1010101010102020 -DATA expandAVX512_30_mat1<>+0x08(SB)/8, $0x2020202020202020 -DATA expandAVX512_30_mat1<>+0x10(SB)/8, $0x2020202040404040 -DATA expandAVX512_30_mat1<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_30_mat1<>+0x20(SB)/8, $0x4040808080808080 -DATA expandAVX512_30_mat1<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_30_mat1<>+0x30(SB)/8, $0x0101010101010101 -DATA expandAVX512_30_mat1<>+0x38(SB)/8, $0x0202020202020202 - -GLOBL expandAVX512_30_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_30_inShuf2<>+0x00(SB)/8, $0xffffffffff040302 -DATA expandAVX512_30_inShuf2<>+0x08(SB)/8, $0xffff030303020202 -DATA expandAVX512_30_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302 -DATA expandAVX512_30_inShuf2<>+0x18(SB)/8, $0xffff030303020202 -DATA expandAVX512_30_inShuf2<>+0x20(SB)/8, $0xffff030303020202 -DATA expandAVX512_30_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302 -DATA expandAVX512_30_inShuf2<>+0x30(SB)/8, $0xffff030303020202 -DATA 
expandAVX512_30_inShuf2<>+0x38(SB)/8, $0xffffffffffff0302 - -GLOBL expandAVX512_30_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_30_mat2<>+0x00(SB)/8, $0x0202020204040404 -DATA expandAVX512_30_mat2<>+0x08(SB)/8, $0x0404040404040404 -DATA expandAVX512_30_mat2<>+0x10(SB)/8, $0x0404080808080808 -DATA expandAVX512_30_mat2<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_30_mat2<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_30_mat2<>+0x28(SB)/8, $0x1010101010102020 -DATA expandAVX512_30_mat2<>+0x30(SB)/8, $0x2020202020202020 -DATA expandAVX512_30_mat2<>+0x38(SB)/8, $0x2020202040404040 - -GLOBL expandAVX512_30_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_30_inShuf3<>+0x00(SB)/8, $0xffff030303020202 -DATA expandAVX512_30_inShuf3<>+0x08(SB)/8, $0xffffffffffff0302 -DATA expandAVX512_30_inShuf3<>+0x10(SB)/8, $0xffff030303020202 -DATA expandAVX512_30_inShuf3<>+0x18(SB)/8, $0xffff040404030303 -DATA expandAVX512_30_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403 -DATA expandAVX512_30_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04 -DATA expandAVX512_30_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_30_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_30_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_30_mat3<>+0x00(SB)/8, $0x4040404040404040 -DATA expandAVX512_30_mat3<>+0x08(SB)/8, $0x4040808080808080 -DATA expandAVX512_30_mat3<>+0x10(SB)/8, $0x8080808080808080 -DATA expandAVX512_30_mat3<>+0x18(SB)/8, $0x0101010101010101 -DATA expandAVX512_30_mat3<>+0x20(SB)/8, $0x0101010101010202 -DATA expandAVX512_30_mat3<>+0x28(SB)/8, $0x0202020202020202 -DATA expandAVX512_30_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_30_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_30_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_30_outShufLo+0x00(SB)/8, $0x1812111008020100 -DATA expandAVX512_30_outShufLo+0x08(SB)/8, $0x3832313028222120 -DATA expandAVX512_30_outShufLo+0x10(SB)/8, $0x58504a4948403a39 -DATA expandAVX512_30_outShufLo+0x18(SB)/8, $0x04036a6968605a59 -DATA expandAVX512_30_outShufLo+0x20(SB)/8, $0x2423191514130905 -DATA expandAVX512_30_outShufLo+0x28(SB)/8, $0x3d3c3b3534332925 -DATA expandAVX512_30_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b41 -DATA expandAVX512_30_outShufLo+0x38(SB)/8, $0x0a7007066d6c6b61 - -GLOBL expandAVX512_30_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_30_outShufHi0+0x00(SB)/8, $0x504a4948403a3938 -DATA expandAVX512_30_outShufHi0+0x08(SB)/8, $0x70686261605a5958 -DATA expandAVX512_30_outShufHi0+0x10(SB)/8, $0xffffffffff787271 -DATA expandAVX512_30_outShufHi0+0x18(SB)/8, $0x3c3bffffffffffff -DATA expandAVX512_30_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b413d -DATA expandAVX512_30_outShufHi0+0x28(SB)/8, $0x757473696564635d -DATA expandAVX512_30_outShufHi0+0x30(SB)/8, $0xffffffffffffff79 -DATA expandAVX512_30_outShufHi0+0x38(SB)/8, $0x42ff3f3effffffff - -GLOBL expandAVX512_30_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_30_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_30_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_30_outShufHi1+0x10(SB)/8, $0x1008020100ffffff -DATA expandAVX512_30_outShufHi1+0x18(SB)/8, $0xffff201a19181211 -DATA expandAVX512_30_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_30_outShufHi1+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_30_outShufHi1+0x30(SB)/8, $0x15141309050403ff -DATA expandAVX512_30_outShufHi1+0x38(SB)/8, $0xff28ffff211d1c1b - -TEXT expandAVX512_30<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_30_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_30_inShuf1<>(SB), Z2 
- VMOVDQU64 expandAVX512_30_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_30_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_30_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_30_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_30_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_30_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_30_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_30_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_30_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xb001ffffc007ffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x4ffe00003ff80000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_32_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_32_inShuf0<>+0x00(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x08(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x10(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x18(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x20(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x28(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x30(SB)/8, $0x0101010100000000 -DATA expandAVX512_32_inShuf0<>+0x38(SB)/8, $0x0101010100000000 - -GLOBL expandAVX512_32_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_32_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_32_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_32_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_32_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_32_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_32_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_32_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_32_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_32_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_32_inShuf1<>+0x00(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x08(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x10(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x18(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x20(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x28(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x30(SB)/8, $0x0303030302020202 -DATA expandAVX512_32_inShuf1<>+0x38(SB)/8, $0x0303030302020202 - -GLOBL expandAVX512_32_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_32_outShufLo+0x00(SB)/8, $0x0b0a090803020100 -DATA expandAVX512_32_outShufLo+0x08(SB)/8, $0x1b1a191813121110 -DATA expandAVX512_32_outShufLo+0x10(SB)/8, $0x2b2a292823222120 -DATA expandAVX512_32_outShufLo+0x18(SB)/8, $0x3b3a393833323130 -DATA expandAVX512_32_outShufLo+0x20(SB)/8, $0x0f0e0d0c07060504 -DATA expandAVX512_32_outShufLo+0x28(SB)/8, $0x1f1e1d1c17161514 -DATA expandAVX512_32_outShufLo+0x30(SB)/8, $0x2f2e2d2c27262524 -DATA expandAVX512_32_outShufLo+0x38(SB)/8, $0x3f3e3d3c37363534 - -TEXT expandAVX512_32<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_32_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_32_mat0<>(SB), Z1 - VMOVDQU64 expandAVX512_32_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_32_outShufLo(SB), Z3 - VMOVDQU64 (AX), Z4 - VPERMB Z4, Z0, Z0 - VGF2P8AFFINEQB $0, Z1, Z0, Z0 - VPERMB Z4, Z2, Z2 - VGF2P8AFFINEQB $0, Z1, Z2, Z2 - VPERMB Z0, Z3, Z1 - VPERMB Z2, Z3, Z2 - RET - -GLOBL expandAVX512_36_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_36_inShuf0<>+0x00(SB)/8, $0x0101010100000000 -DATA expandAVX512_36_inShuf0<>+0x08(SB)/8, 
$0xffffffffffff0100 -DATA expandAVX512_36_inShuf0<>+0x10(SB)/8, $0x0101010100000000 -DATA expandAVX512_36_inShuf0<>+0x18(SB)/8, $0x0101010100000000 -DATA expandAVX512_36_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_36_inShuf0<>+0x28(SB)/8, $0x0101010100000000 -DATA expandAVX512_36_inShuf0<>+0x30(SB)/8, $0x0101010100000000 -DATA expandAVX512_36_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100 - -GLOBL expandAVX512_36_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_36_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_36_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_36_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_36_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_36_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_36_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_36_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_36_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_36_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_36_inShuf1<>+0x00(SB)/8, $0x0101010100000000 -DATA expandAVX512_36_inShuf1<>+0x08(SB)/8, $0xffffff0100000000 -DATA expandAVX512_36_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_36_inShuf1<>+0x18(SB)/8, $0xffffffff00000000 -DATA expandAVX512_36_inShuf1<>+0x20(SB)/8, $0xff02020202010101 -DATA expandAVX512_36_inShuf1<>+0x28(SB)/8, $0xffffffffffff0201 -DATA expandAVX512_36_inShuf1<>+0x30(SB)/8, $0x0202020201010101 -DATA expandAVX512_36_inShuf1<>+0x38(SB)/8, $0x0303030302020202 - -GLOBL expandAVX512_36_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_36_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_36_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_36_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_36_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_36_mat1<>+0x20(SB)/8, $0x4040404040404040 -DATA expandAVX512_36_mat1<>+0x28(SB)/8, $0x4040404080808080 -DATA expandAVX512_36_mat1<>+0x30(SB)/8, $0x8080808080808080 -DATA expandAVX512_36_mat1<>+0x38(SB)/8, $0x0101010101010101 - -GLOBL expandAVX512_36_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_36_inShuf2<>+0x00(SB)/8, $0xffffffffffff0302 -DATA expandAVX512_36_inShuf2<>+0x08(SB)/8, $0x0303030302020202 -DATA expandAVX512_36_inShuf2<>+0x10(SB)/8, $0x0303030302020202 -DATA expandAVX512_36_inShuf2<>+0x18(SB)/8, $0xffffffffffff0302 -DATA expandAVX512_36_inShuf2<>+0x20(SB)/8, $0x0303030302020202 -DATA expandAVX512_36_inShuf2<>+0x28(SB)/8, $0xffff030302020202 -DATA expandAVX512_36_inShuf2<>+0x30(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_36_inShuf2<>+0x38(SB)/8, $0xffffffff02020202 - -GLOBL expandAVX512_36_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_36_mat2<>+0x00(SB)/8, $0x0101010102020202 -DATA expandAVX512_36_mat2<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_36_mat2<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_36_mat2<>+0x18(SB)/8, $0x0404040408080808 -DATA expandAVX512_36_mat2<>+0x20(SB)/8, $0x0808080808080808 -DATA expandAVX512_36_mat2<>+0x28(SB)/8, $0x1010101010101010 -DATA expandAVX512_36_mat2<>+0x30(SB)/8, $0x1010101020202020 -DATA expandAVX512_36_mat2<>+0x38(SB)/8, $0x2020202020202020 - -GLOBL expandAVX512_36_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_36_outShufLo+0x00(SB)/8, $0x1211100803020100 -DATA expandAVX512_36_outShufLo+0x08(SB)/8, $0x2928201b1a191813 -DATA expandAVX512_36_outShufLo+0x10(SB)/8, $0x4038333231302b2a -DATA expandAVX512_36_outShufLo+0x18(SB)/8, $0x504b4a4948434241 -DATA expandAVX512_36_outShufLo+0x20(SB)/8, $0x070605045b5a5958 -DATA 
expandAVX512_36_outShufLo+0x28(SB)/8, $0x1e1d1c1716151409 -DATA expandAVX512_36_outShufLo+0x30(SB)/8, $0x35342f2e2d2c211f -DATA expandAVX512_36_outShufLo+0x38(SB)/8, $0x4c47464544393736 - -GLOBL expandAVX512_36_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_36_outShufHi+0x00(SB)/8, $0x3332313028222120 -DATA expandAVX512_36_outShufHi+0x08(SB)/8, $0x4a4948403b3a3938 -DATA expandAVX512_36_outShufHi+0x10(SB)/8, $0x616058535251504b -DATA expandAVX512_36_outShufHi+0x18(SB)/8, $0x78706b6a69686362 -DATA expandAVX512_36_outShufHi+0x20(SB)/8, $0x29262524237b7a79 -DATA expandAVX512_36_outShufHi+0x28(SB)/8, $0x3f3e3d3c37363534 -DATA expandAVX512_36_outShufHi+0x30(SB)/8, $0x5655544f4e4d4c41 -DATA expandAVX512_36_outShufHi+0x38(SB)/8, $0x6d6c676665645957 - -TEXT expandAVX512_36<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_36_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_36_inShuf1<>(SB), Z3 - VMOVDQU64 expandAVX512_36_inShuf2<>(SB), Z4 - VMOVDQU64 expandAVX512_36_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_36_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z5 - VPERMB Z5, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_36_mat0<>(SB), Z0, Z0 - VPERMB Z5, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_36_mat1<>(SB), Z3, Z3 - VPERMB Z5, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_36_mat2<>(SB), Z4, Z4 - VPERMI2B Z3, Z0, Z1 - VPERMI2B Z4, Z3, Z2 - RET - -GLOBL expandAVX512_40_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_40_inShuf0<>+0x00(SB)/8, $0x0101010000000000 -DATA expandAVX512_40_inShuf0<>+0x08(SB)/8, $0x0101010000000000 -DATA expandAVX512_40_inShuf0<>+0x10(SB)/8, $0x0101010000000000 -DATA expandAVX512_40_inShuf0<>+0x18(SB)/8, $0x0101010000000000 -DATA expandAVX512_40_inShuf0<>+0x20(SB)/8, $0x0101010000000000 -DATA expandAVX512_40_inShuf0<>+0x28(SB)/8, $0xffffff0000000000 -DATA expandAVX512_40_inShuf0<>+0x30(SB)/8, $0xffffff0000000000 -DATA expandAVX512_40_inShuf0<>+0x38(SB)/8, $0xffffff0000000000 - -GLOBL expandAVX512_40_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_40_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_40_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_40_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_40_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_40_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_40_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_40_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_40_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_40_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_40_inShuf1<>+0x00(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_40_inShuf1<>+0x08(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_40_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_40_inShuf1<>+0x18(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_40_inShuf1<>+0x20(SB)/8, $0xffffffffffffff01 -DATA expandAVX512_40_inShuf1<>+0x28(SB)/8, $0xffff020202020201 -DATA expandAVX512_40_inShuf1<>+0x30(SB)/8, $0x0202020101010101 -DATA expandAVX512_40_inShuf1<>+0x38(SB)/8, $0x0202020101010101 - -GLOBL expandAVX512_40_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_40_mat1<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_40_mat1<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_40_mat1<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_40_mat1<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_40_mat1<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_40_mat1<>+0x28(SB)/8, $0x1010101010101010 -DATA expandAVX512_40_mat1<>+0x30(SB)/8, $0x2020202020202020 -DATA expandAVX512_40_mat1<>+0x38(SB)/8, $0x4040404040404040 - -GLOBL 
expandAVX512_40_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_40_inShuf2<>+0x00(SB)/8, $0x0202020101010101 -DATA expandAVX512_40_inShuf2<>+0x08(SB)/8, $0x0303030202020202 -DATA expandAVX512_40_inShuf2<>+0x10(SB)/8, $0x0303030202020202 -DATA expandAVX512_40_inShuf2<>+0x18(SB)/8, $0xffffff0202020202 -DATA expandAVX512_40_inShuf2<>+0x20(SB)/8, $0xffffff0202020202 -DATA expandAVX512_40_inShuf2<>+0x28(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_40_inShuf2<>+0x30(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_40_inShuf2<>+0x38(SB)/8, $0xffffffffffff0202 - -GLOBL expandAVX512_40_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_40_mat2<>+0x00(SB)/8, $0x8080808080808080 -DATA expandAVX512_40_mat2<>+0x08(SB)/8, $0x0101010101010101 -DATA expandAVX512_40_mat2<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_40_mat2<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_40_mat2<>+0x20(SB)/8, $0x0808080808080808 -DATA expandAVX512_40_mat2<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_40_mat2<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_40_mat2<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_40_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_40_inShuf3<>+0x00(SB)/8, $0xffffffffffff0303 -DATA expandAVX512_40_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_40_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_40_mat3<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_40_mat3<>+0x08(SB)/8, $0x0000000000000000 -DATA expandAVX512_40_mat3<>+0x10(SB)/8, $0x0000000000000000 -DATA expandAVX512_40_mat3<>+0x18(SB)/8, $0x0000000000000000 -DATA expandAVX512_40_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_40_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_40_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_40_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_40_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_40_outShufLo+0x00(SB)/8, $0x0a09080403020100 -DATA expandAVX512_40_outShufLo+0x08(SB)/8, $0x1814131211100c0b -DATA expandAVX512_40_outShufLo+0x10(SB)/8, $0x232221201c1b1a19 -DATA expandAVX512_40_outShufLo+0x18(SB)/8, $0x31302c2b2a292824 -DATA expandAVX512_40_outShufLo+0x20(SB)/8, $0x3c3b3a3938343332 -DATA expandAVX512_40_outShufLo+0x28(SB)/8, $0x0f0e0d4140070605 -DATA expandAVX512_40_outShufLo+0x30(SB)/8, $0x1d51501716154948 -DATA expandAVX512_40_outShufLo+0x38(SB)/8, $0x6027262559581f1e - -GLOBL expandAVX512_40_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_40_outShufHi0+0x00(SB)/8, $0x3938343332313028 -DATA expandAVX512_40_outShufHi0+0x08(SB)/8, $0x44434241403c3b3a -DATA expandAVX512_40_outShufHi0+0x10(SB)/8, $0x5251504c4b4a4948 -DATA expandAVX512_40_outShufHi0+0x18(SB)/8, $0x605c5b5a59585453 -DATA expandAVX512_40_outShufHi0+0x20(SB)/8, $0x2c2b2a2964636261 -DATA expandAVX512_40_outShufHi0+0x28(SB)/8, $0x3e3d69683736352d -DATA expandAVX512_40_outShufHi0+0x30(SB)/8, $0x797847464571703f -DATA expandAVX512_40_outShufHi0+0x38(SB)/8, $0x575655ffff4f4e4d - -GLOBL expandAVX512_40_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_40_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA 
expandAVX512_40_outShufHi1+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_outShufHi1+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_outShufHi1+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_outShufHi1+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_40_outShufHi1+0x38(SB)/8, $0xffffff0100ffffff - -TEXT expandAVX512_40<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_40_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_40_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_40_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_40_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_40_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_40_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_40_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_40_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_40_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_40_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_40_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xe7ffffffffffffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x1800000000000000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_44_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_44_inShuf0<>+0x00(SB)/8, $0x0101010000000000 -DATA expandAVX512_44_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_44_inShuf0<>+0x10(SB)/8, $0x0101010000000000 -DATA expandAVX512_44_inShuf0<>+0x18(SB)/8, $0x0101010000000000 -DATA expandAVX512_44_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_44_inShuf0<>+0x28(SB)/8, $0x0101010000000000 -DATA expandAVX512_44_inShuf0<>+0x30(SB)/8, $0xffffff0000000000 -DATA expandAVX512_44_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 - -GLOBL expandAVX512_44_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_44_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_44_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_44_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_44_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_44_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_44_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_44_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_44_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_44_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_44_inShuf1<>+0x00(SB)/8, $0xffffff0000000000 -DATA expandAVX512_44_inShuf1<>+0x08(SB)/8, $0xffffff0000000000 -DATA expandAVX512_44_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_44_inShuf1<>+0x18(SB)/8, $0xffffff0000000000 -DATA expandAVX512_44_inShuf1<>+0x20(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_44_inShuf1<>+0x28(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_44_inShuf1<>+0x30(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_44_inShuf1<>+0x38(SB)/8, $0xff02020202020101 - -GLOBL expandAVX512_44_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_44_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_44_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_44_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_44_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_44_mat1<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_44_mat1<>+0x28(SB)/8, $0x0202020202020202 -DATA expandAVX512_44_mat1<>+0x30(SB)/8, $0x0404040404040404 -DATA expandAVX512_44_mat1<>+0x38(SB)/8, $0x0808080808080808 - -GLOBL expandAVX512_44_inShuf2<>(SB), RODATA, $0x40 -DATA 
expandAVX512_44_inShuf2<>+0x00(SB)/8, $0x0202020101010101 -DATA expandAVX512_44_inShuf2<>+0x08(SB)/8, $0xffffffffffff0201 -DATA expandAVX512_44_inShuf2<>+0x10(SB)/8, $0x0202020101010101 -DATA expandAVX512_44_inShuf2<>+0x18(SB)/8, $0x0202020101010101 -DATA expandAVX512_44_inShuf2<>+0x20(SB)/8, $0xffffffffffff0201 -DATA expandAVX512_44_inShuf2<>+0x28(SB)/8, $0xffff020101010101 -DATA expandAVX512_44_inShuf2<>+0x30(SB)/8, $0xffffff0202020202 -DATA expandAVX512_44_inShuf2<>+0x38(SB)/8, $0xffffffffffffff02 - -GLOBL expandAVX512_44_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_44_mat2<>+0x00(SB)/8, $0x1010101010101010 -DATA expandAVX512_44_mat2<>+0x08(SB)/8, $0x1010101020202020 -DATA expandAVX512_44_mat2<>+0x10(SB)/8, $0x2020202020202020 -DATA expandAVX512_44_mat2<>+0x18(SB)/8, $0x4040404040404040 -DATA expandAVX512_44_mat2<>+0x20(SB)/8, $0x4040404080808080 -DATA expandAVX512_44_mat2<>+0x28(SB)/8, $0x8080808080808080 -DATA expandAVX512_44_mat2<>+0x30(SB)/8, $0x0101010101010101 -DATA expandAVX512_44_mat2<>+0x38(SB)/8, $0x0101010102020202 - -GLOBL expandAVX512_44_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_44_inShuf3<>+0x00(SB)/8, $0xffffff0202020202 -DATA expandAVX512_44_inShuf3<>+0x08(SB)/8, $0xffffff0202020202 -DATA expandAVX512_44_inShuf3<>+0x10(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_44_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_44_inShuf3<>+0x20(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_44_inShuf3<>+0x28(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_44_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_44_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_44_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_44_mat3<>+0x00(SB)/8, $0x0202020202020202 -DATA expandAVX512_44_mat3<>+0x08(SB)/8, $0x0404040404040404 -DATA expandAVX512_44_mat3<>+0x10(SB)/8, $0x0404040408080808 -DATA expandAVX512_44_mat3<>+0x18(SB)/8, $0x1010101010101010 -DATA expandAVX512_44_mat3<>+0x20(SB)/8, $0x2020202020202020 -DATA expandAVX512_44_mat3<>+0x28(SB)/8, $0x4040404040404040 -DATA expandAVX512_44_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_44_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_44_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_44_outShufLo+0x00(SB)/8, $0x1110080403020100 -DATA expandAVX512_44_outShufLo+0x08(SB)/8, $0x1c1b1a1918141312 -DATA expandAVX512_44_outShufLo+0x10(SB)/8, $0x31302c2b2a292820 -DATA expandAVX512_44_outShufLo+0x18(SB)/8, $0x4342414038343332 -DATA expandAVX512_44_outShufLo+0x20(SB)/8, $0x58504c4b4a494844 -DATA expandAVX512_44_outShufLo+0x28(SB)/8, $0x600706055c5b5a59 -DATA expandAVX512_44_outShufLo+0x30(SB)/8, $0x1d69681716150961 -DATA expandAVX512_44_outShufLo+0x38(SB)/8, $0x2f2e2d2171701f1e - -GLOBL expandAVX512_44_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_44_outShufHi0+0x00(SB)/8, $0x4844434241403938 -DATA expandAVX512_44_outShufHi0+0x08(SB)/8, $0x5a59585453525150 -DATA expandAVX512_44_outShufHi0+0x10(SB)/8, $0x6c6b6a6968605c5b -DATA expandAVX512_44_outShufHi0+0x18(SB)/8, $0xffff787473727170 -DATA expandAVX512_44_outShufHi0+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_44_outShufHi0+0x28(SB)/8, $0x46453e3d3c3b3aff -DATA expandAVX512_44_outShufHi0+0x30(SB)/8, $0xff57565549ffff47 -DATA expandAVX512_44_outShufHi0+0x38(SB)/8, $0x6d61ffff5f5e5dff - -GLOBL expandAVX512_44_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_44_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_44_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_44_outShufHi1+0x10(SB)/8, $0xffffffffffffffff -DATA 
expandAVX512_44_outShufHi1+0x18(SB)/8, $0x0100ffffffffffff -DATA expandAVX512_44_outShufHi1+0x20(SB)/8, $0x0c0b0a0908040302 -DATA expandAVX512_44_outShufHi1+0x28(SB)/8, $0xffffffffffffff10 -DATA expandAVX512_44_outShufHi1+0x30(SB)/8, $0x20ffffffff1918ff -DATA expandAVX512_44_outShufHi1+0x38(SB)/8, $0xffff2928ffffff21 - -TEXT expandAVX512_44<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_44_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_44_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_44_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_44_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_44_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_44_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_44_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_44_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_44_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_44_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_44_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0xce79fe003fffffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x318601ffc0000000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_48_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_48_inShuf0<>+0x00(SB)/8, $0x0101000000000000 -DATA expandAVX512_48_inShuf0<>+0x08(SB)/8, $0x0101000000000000 -DATA expandAVX512_48_inShuf0<>+0x10(SB)/8, $0x0101000000000000 -DATA expandAVX512_48_inShuf0<>+0x18(SB)/8, $0xffff000000000000 -DATA expandAVX512_48_inShuf0<>+0x20(SB)/8, $0xffff000000000000 -DATA expandAVX512_48_inShuf0<>+0x28(SB)/8, $0xffff000000000000 -DATA expandAVX512_48_inShuf0<>+0x30(SB)/8, $0xffff000000000000 -DATA expandAVX512_48_inShuf0<>+0x38(SB)/8, $0xffff000000000000 - -GLOBL expandAVX512_48_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_48_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_48_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_48_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_48_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_48_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_48_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_48_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_48_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_48_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_48_inShuf1<>+0x00(SB)/8, $0xffffffff01010101 -DATA expandAVX512_48_inShuf1<>+0x08(SB)/8, $0xffffffff01010101 -DATA expandAVX512_48_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101 -DATA expandAVX512_48_inShuf1<>+0x18(SB)/8, $0x0202020202020101 -DATA expandAVX512_48_inShuf1<>+0x20(SB)/8, $0x0202010101010101 -DATA expandAVX512_48_inShuf1<>+0x28(SB)/8, $0x0202010101010101 -DATA expandAVX512_48_inShuf1<>+0x30(SB)/8, $0x0202010101010101 -DATA expandAVX512_48_inShuf1<>+0x38(SB)/8, $0xffff010101010101 - -GLOBL expandAVX512_48_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_48_mat1<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_48_mat1<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_48_mat1<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_48_mat1<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_48_mat1<>+0x20(SB)/8, $0x0808080808080808 -DATA expandAVX512_48_mat1<>+0x28(SB)/8, $0x1010101010101010 -DATA expandAVX512_48_mat1<>+0x30(SB)/8, $0x2020202020202020 -DATA expandAVX512_48_mat1<>+0x38(SB)/8, $0x4040404040404040 - -GLOBL expandAVX512_48_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_48_inShuf2<>+0x00(SB)/8, $0xffff010101010101 -DATA 
expandAVX512_48_inShuf2<>+0x08(SB)/8, $0xffff020202020202 -DATA expandAVX512_48_inShuf2<>+0x10(SB)/8, $0xffff020202020202 -DATA expandAVX512_48_inShuf2<>+0x18(SB)/8, $0xffffffff02020202 -DATA expandAVX512_48_inShuf2<>+0x20(SB)/8, $0xffffffff02020202 -DATA expandAVX512_48_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_48_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_48_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_48_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_48_mat2<>+0x00(SB)/8, $0x8080808080808080 -DATA expandAVX512_48_mat2<>+0x08(SB)/8, $0x0101010101010101 -DATA expandAVX512_48_mat2<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_48_mat2<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_48_mat2<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_48_mat2<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_48_mat2<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_48_mat2<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_48_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_48_outShufLo+0x00(SB)/8, $0x0908050403020100 -DATA expandAVX512_48_outShufLo+0x08(SB)/8, $0x131211100d0c0b0a -DATA expandAVX512_48_outShufLo+0x10(SB)/8, $0x1d1c1b1a19181514 -DATA expandAVX512_48_outShufLo+0x18(SB)/8, $0x2928252423222120 -DATA expandAVX512_48_outShufLo+0x20(SB)/8, $0x333231302d2c2b2a -DATA expandAVX512_48_outShufLo+0x28(SB)/8, $0x3d3c3b3a39383534 -DATA expandAVX512_48_outShufLo+0x30(SB)/8, $0x0f0e434241400706 -DATA expandAVX512_48_outShufLo+0x38(SB)/8, $0x515017164b4a4948 - -GLOBL expandAVX512_48_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_48_outShufHi+0x00(SB)/8, $0x2524232221201918 -DATA expandAVX512_48_outShufHi+0x08(SB)/8, $0x31302d2c2b2a2928 -DATA expandAVX512_48_outShufHi+0x10(SB)/8, $0x3b3a393835343332 -DATA expandAVX512_48_outShufHi+0x18(SB)/8, $0x4544434241403d3c -DATA expandAVX512_48_outShufHi+0x20(SB)/8, $0x51504d4c4b4a4948 -DATA expandAVX512_48_outShufHi+0x28(SB)/8, $0x1d1c1b1a55545352 -DATA expandAVX512_48_outShufHi+0x30(SB)/8, $0x5b5a595827261f1e -DATA expandAVX512_48_outShufHi+0x38(SB)/8, $0x3736636261602f2e - -TEXT expandAVX512_48<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_48_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_48_inShuf1<>(SB), Z3 - VMOVDQU64 expandAVX512_48_inShuf2<>(SB), Z4 - VMOVDQU64 expandAVX512_48_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_48_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z5 - VPERMB Z5, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_48_mat0<>(SB), Z0, Z0 - VPERMB Z5, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_48_mat1<>(SB), Z3, Z3 - VPERMB Z5, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_48_mat2<>(SB), Z4, Z4 - VPERMI2B Z3, Z0, Z1 - VPERMI2B Z4, Z3, Z2 - RET - -GLOBL expandAVX512_52_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_52_inShuf0<>+0x00(SB)/8, $0x0101000000000000 -DATA expandAVX512_52_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 -DATA expandAVX512_52_inShuf0<>+0x10(SB)/8, $0x0101000000000000 -DATA expandAVX512_52_inShuf0<>+0x18(SB)/8, $0xffff000000000000 -DATA expandAVX512_52_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_52_inShuf0<>+0x28(SB)/8, $0xffff000000000000 -DATA expandAVX512_52_inShuf0<>+0x30(SB)/8, $0xffff000000000000 -DATA expandAVX512_52_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 - -GLOBL expandAVX512_52_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_52_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_52_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_52_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_52_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA 
expandAVX512_52_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_52_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_52_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_52_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_52_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_52_inShuf1<>+0x00(SB)/8, $0xffff000000000000 -DATA expandAVX512_52_inShuf1<>+0x08(SB)/8, $0xffff000000000000 -DATA expandAVX512_52_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_52_inShuf1<>+0x18(SB)/8, $0xffff000000000000 -DATA expandAVX512_52_inShuf1<>+0x20(SB)/8, $0xffffffff01010101 -DATA expandAVX512_52_inShuf1<>+0x28(SB)/8, $0xffffffffff010101 -DATA expandAVX512_52_inShuf1<>+0x30(SB)/8, $0xff02020202020201 -DATA expandAVX512_52_inShuf1<>+0x38(SB)/8, $0x0202010101010101 - -GLOBL expandAVX512_52_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_52_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_52_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_52_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_52_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_52_mat1<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_52_mat1<>+0x28(SB)/8, $0x0202020202020202 -DATA expandAVX512_52_mat1<>+0x30(SB)/8, $0x0202020202020202 -DATA expandAVX512_52_mat1<>+0x38(SB)/8, $0x0404040404040404 - -GLOBL expandAVX512_52_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_52_inShuf2<>+0x00(SB)/8, $0xffffffffffff0201 -DATA expandAVX512_52_inShuf2<>+0x08(SB)/8, $0x0202010101010101 -DATA expandAVX512_52_inShuf2<>+0x10(SB)/8, $0xffff010101010101 -DATA expandAVX512_52_inShuf2<>+0x18(SB)/8, $0xffffffffffffff01 -DATA expandAVX512_52_inShuf2<>+0x20(SB)/8, $0xffff010101010101 -DATA expandAVX512_52_inShuf2<>+0x28(SB)/8, $0xffff010101010101 -DATA expandAVX512_52_inShuf2<>+0x30(SB)/8, $0xffffffffffffff01 -DATA expandAVX512_52_inShuf2<>+0x38(SB)/8, $0xffff010101010101 - -GLOBL expandAVX512_52_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_52_mat2<>+0x00(SB)/8, $0x0404040408080808 -DATA expandAVX512_52_mat2<>+0x08(SB)/8, $0x0808080808080808 -DATA expandAVX512_52_mat2<>+0x10(SB)/8, $0x1010101010101010 -DATA expandAVX512_52_mat2<>+0x18(SB)/8, $0x1010101020202020 -DATA expandAVX512_52_mat2<>+0x20(SB)/8, $0x2020202020202020 -DATA expandAVX512_52_mat2<>+0x28(SB)/8, $0x4040404040404040 -DATA expandAVX512_52_mat2<>+0x30(SB)/8, $0x4040404080808080 -DATA expandAVX512_52_mat2<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_52_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_52_inShuf3<>+0x00(SB)/8, $0xffff020202020202 -DATA expandAVX512_52_inShuf3<>+0x08(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_52_inShuf3<>+0x10(SB)/8, $0xffffffff02020202 -DATA expandAVX512_52_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_52_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_52_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_52_mat3<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_52_mat3<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_52_mat3<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_52_mat3<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_52_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_52_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_52_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_52_mat3<>+0x38(SB)/8, $0x0000000000000000 - 
-GLOBL expandAVX512_52_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_52_outShufLo+0x00(SB)/8, $0x1008050403020100 -DATA expandAVX512_52_outShufLo+0x08(SB)/8, $0x1a19181514131211 -DATA expandAVX512_52_outShufLo+0x10(SB)/8, $0x2b2a2928201d1c1b -DATA expandAVX512_52_outShufLo+0x18(SB)/8, $0x3534333231302d2c -DATA expandAVX512_52_outShufLo+0x20(SB)/8, $0x4845444342414038 -DATA expandAVX512_52_outShufLo+0x28(SB)/8, $0x5958504d4c4b4a49 -DATA expandAVX512_52_outShufLo+0x30(SB)/8, $0x616007065d5c5b5a -DATA expandAVX512_52_outShufLo+0x38(SB)/8, $0x6a69681716096362 - -GLOBL expandAVX512_52_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_52_outShufHi0+0x00(SB)/8, $0x403d3c3b3a393830 -DATA expandAVX512_52_outShufHi0+0x08(SB)/8, $0x51504d4c4b4a4948 -DATA expandAVX512_52_outShufHi0+0x10(SB)/8, $0x6261605855545352 -DATA expandAVX512_52_outShufHi0+0x18(SB)/8, $0x6c6b6a6968656463 -DATA expandAVX512_52_outShufHi0+0x20(SB)/8, $0x7d7c7b7a7978706d -DATA expandAVX512_52_outShufHi0+0x28(SB)/8, $0x31ffffffffffffff -DATA expandAVX512_52_outShufHi0+0x30(SB)/8, $0xff3f3e3635343332 -DATA expandAVX512_52_outShufHi0+0x38(SB)/8, $0xffff4f4e41ffffff - -GLOBL expandAVX512_52_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_52_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_outShufHi1+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_outShufHi1+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_52_outShufHi1+0x28(SB)/8, $0xff08050403020100 -DATA expandAVX512_52_outShufHi1+0x30(SB)/8, $0x10ffffffffffffff -DATA expandAVX512_52_outShufHi1+0x38(SB)/8, $0x1918ffffff131211 - -TEXT expandAVX512_52<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_52_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_52_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_52_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_52_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_52_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_52_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_52_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_52_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_52_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_52_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_52_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0x387f80ffffffffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0xc7807f0000000000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_56_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_56_inShuf0<>+0x00(SB)/8, $0x0100000000000000 -DATA expandAVX512_56_inShuf0<>+0x08(SB)/8, $0x0100000000000000 -DATA expandAVX512_56_inShuf0<>+0x10(SB)/8, $0xff00000000000000 -DATA expandAVX512_56_inShuf0<>+0x18(SB)/8, $0xff00000000000000 -DATA expandAVX512_56_inShuf0<>+0x20(SB)/8, $0xff00000000000000 -DATA expandAVX512_56_inShuf0<>+0x28(SB)/8, $0xff00000000000000 -DATA expandAVX512_56_inShuf0<>+0x30(SB)/8, $0xff00000000000000 -DATA expandAVX512_56_inShuf0<>+0x38(SB)/8, $0xff00000000000000 - -GLOBL expandAVX512_56_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_56_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_56_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_56_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_56_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_56_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA 
expandAVX512_56_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_56_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_56_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_56_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_56_inShuf1<>+0x00(SB)/8, $0xffff010101010101 -DATA expandAVX512_56_inShuf1<>+0x08(SB)/8, $0x0202010101010101 -DATA expandAVX512_56_inShuf1<>+0x10(SB)/8, $0x0201010101010101 -DATA expandAVX512_56_inShuf1<>+0x18(SB)/8, $0xff01010101010101 -DATA expandAVX512_56_inShuf1<>+0x20(SB)/8, $0xff01010101010101 -DATA expandAVX512_56_inShuf1<>+0x28(SB)/8, $0xff01010101010101 -DATA expandAVX512_56_inShuf1<>+0x30(SB)/8, $0xff01010101010101 -DATA expandAVX512_56_inShuf1<>+0x38(SB)/8, $0xff01010101010101 - -GLOBL expandAVX512_56_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_56_inShuf2<>+0x00(SB)/8, $0xff02020202020202 -DATA expandAVX512_56_inShuf2<>+0x08(SB)/8, $0xffffff0202020202 -DATA expandAVX512_56_inShuf2<>+0x10(SB)/8, $0xffffffffffffff02 -DATA expandAVX512_56_inShuf2<>+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_56_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_56_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_56_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_56_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_56_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_56_mat2<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_56_mat2<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_56_mat2<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_56_mat2<>+0x18(SB)/8, $0x0000000000000000 -DATA expandAVX512_56_mat2<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_56_mat2<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_56_mat2<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_56_mat2<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_56_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_56_outShufLo+0x00(SB)/8, $0x0806050403020100 -DATA expandAVX512_56_outShufLo+0x08(SB)/8, $0x11100e0d0c0b0a09 -DATA expandAVX512_56_outShufLo+0x10(SB)/8, $0x1a19181615141312 -DATA expandAVX512_56_outShufLo+0x18(SB)/8, $0x232221201e1d1c1b -DATA expandAVX512_56_outShufLo+0x20(SB)/8, $0x2c2b2a2928262524 -DATA expandAVX512_56_outShufLo+0x28(SB)/8, $0x3534333231302e2d -DATA expandAVX512_56_outShufLo+0x30(SB)/8, $0x3e3d3c3b3a393836 -DATA expandAVX512_56_outShufLo+0x38(SB)/8, $0x0f45444342414007 - -GLOBL expandAVX512_56_outShufHi(SB), RODATA, $0x40 -DATA expandAVX512_56_outShufHi+0x00(SB)/8, $0x11100d0c0b0a0908 -DATA expandAVX512_56_outShufHi+0x08(SB)/8, $0x1a19181615141312 -DATA expandAVX512_56_outShufHi+0x10(SB)/8, $0x232221201e1d1c1b -DATA expandAVX512_56_outShufHi+0x18(SB)/8, $0x2c2b2a2928262524 -DATA expandAVX512_56_outShufHi+0x20(SB)/8, $0x3534333231302e2d -DATA expandAVX512_56_outShufHi+0x28(SB)/8, $0x3e3d3c3b3a393836 -DATA expandAVX512_56_outShufHi+0x30(SB)/8, $0x0e46454443424140 -DATA expandAVX512_56_outShufHi+0x38(SB)/8, $0x50174c4b4a49480f - -TEXT expandAVX512_56<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_56_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_56_mat0<>(SB), Z3 - VMOVDQU64 expandAVX512_56_inShuf1<>(SB), Z4 - VMOVDQU64 expandAVX512_56_inShuf2<>(SB), Z5 - VMOVDQU64 expandAVX512_56_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_56_outShufHi(SB), Z2 - VMOVDQU64 (AX), Z6 - VPERMB Z6, Z0, Z0 - VGF2P8AFFINEQB $0, Z3, Z0, Z0 - VPERMB Z6, Z4, Z4 - VGF2P8AFFINEQB $0, Z3, Z4, Z3 - VPERMB Z6, Z5, Z4 - VGF2P8AFFINEQB $0, expandAVX512_56_mat2<>(SB), Z4, Z4 - VPERMI2B Z3, Z0, Z1 - VPERMI2B Z4, Z3, Z2 - RET - 
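
For reference while reading these kernels (both the retired hand-generated assembly above and the new Go SIMD versions added later in this patch): each expandAVX512_N routine appears to replicate every input mark bit N times, turning a one-bit-per-object bitmap into a one-bit-per-word bitmap for a size class whose objects span N pointer-words, using byte permutes plus GF(2) affine transforms to do the replication. A minimal scalar sketch of that contract follows; the names, layout, and 64-bit-word assumption are illustrative only and are not part of the patch.

// Scalar sketch of the assumed expander contract: replicate each of the
// low nObj bits of src n times into dst (one bit per object in, one bit
// per word out). Illustrative only; not the patch's implementation.
package main

import "fmt"

func expandScalar(src []uint64, n, nObj int, dst []uint64) {
	for i := 0; i < nObj; i++ {
		bit := (src[i/64] >> (i % 64)) & 1
		for j := 0; j < n; j++ {
			k := i*n + j
			dst[k/64] |= bit << (k % 64)
		}
	}
}

func main() {
	src := []uint64{0b1011} // objects 0, 1, and 3 are marked
	dst := make([]uint64, 1)
	expandScalar(src, 3, 4, dst)      // 4 objects, 3 words each
	fmt.Printf("%012b\n", dst[0])     // 111000111111: each input bit repeated 3x
}
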
-GLOBL expandAVX512_60_inShuf0<>(SB), RODATA, $0x40 -DATA expandAVX512_60_inShuf0<>+0x00(SB)/8, $0x0100000000000000 -DATA expandAVX512_60_inShuf0<>+0x08(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_60_inShuf0<>+0x10(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf0<>+0x18(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_60_inShuf0<>+0x28(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf0<>+0x30(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 - -GLOBL expandAVX512_60_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_60_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_60_mat0<>+0x08(SB)/8, $0x0101010102020202 -DATA expandAVX512_60_mat0<>+0x10(SB)/8, $0x0202020202020202 -DATA expandAVX512_60_mat0<>+0x18(SB)/8, $0x0404040404040404 -DATA expandAVX512_60_mat0<>+0x20(SB)/8, $0x0404040408080808 -DATA expandAVX512_60_mat0<>+0x28(SB)/8, $0x0808080808080808 -DATA expandAVX512_60_mat0<>+0x30(SB)/8, $0x1010101010101010 -DATA expandAVX512_60_mat0<>+0x38(SB)/8, $0x1010101020202020 - -GLOBL expandAVX512_60_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_60_inShuf1<>+0x00(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf1<>+0x08(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 -DATA expandAVX512_60_inShuf1<>+0x18(SB)/8, $0xff00000000000000 -DATA expandAVX512_60_inShuf1<>+0x20(SB)/8, $0xffffffffff010101 -DATA expandAVX512_60_inShuf1<>+0x28(SB)/8, $0x0202020202010101 -DATA expandAVX512_60_inShuf1<>+0x30(SB)/8, $0xffffffffffff0201 -DATA expandAVX512_60_inShuf1<>+0x38(SB)/8, $0xff01010101010101 - -GLOBL expandAVX512_60_mat1<>(SB), RODATA, $0x40 -DATA expandAVX512_60_mat1<>+0x00(SB)/8, $0x2020202020202020 -DATA expandAVX512_60_mat1<>+0x08(SB)/8, $0x4040404040404040 -DATA expandAVX512_60_mat1<>+0x10(SB)/8, $0x4040404080808080 -DATA expandAVX512_60_mat1<>+0x18(SB)/8, $0x8080808080808080 -DATA expandAVX512_60_mat1<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_60_mat1<>+0x28(SB)/8, $0x0101010101010101 -DATA expandAVX512_60_mat1<>+0x30(SB)/8, $0x0101010102020202 -DATA expandAVX512_60_mat1<>+0x38(SB)/8, $0x0202020202020202 - -GLOBL expandAVX512_60_inShuf2<>(SB), RODATA, $0x40 -DATA expandAVX512_60_inShuf2<>+0x00(SB)/8, $0xff01010101010101 -DATA expandAVX512_60_inShuf2<>+0x08(SB)/8, $0xffffffffffffff01 -DATA expandAVX512_60_inShuf2<>+0x10(SB)/8, $0xff01010101010101 -DATA expandAVX512_60_inShuf2<>+0x18(SB)/8, $0xff01010101010101 -DATA expandAVX512_60_inShuf2<>+0x20(SB)/8, $0xffffffffffffff01 -DATA expandAVX512_60_inShuf2<>+0x28(SB)/8, $0xff01010101010101 -DATA expandAVX512_60_inShuf2<>+0x30(SB)/8, $0xff01010101010101 -DATA expandAVX512_60_inShuf2<>+0x38(SB)/8, $0xffffffffffffff01 - -GLOBL expandAVX512_60_mat2<>(SB), RODATA, $0x40 -DATA expandAVX512_60_mat2<>+0x00(SB)/8, $0x0404040404040404 -DATA expandAVX512_60_mat2<>+0x08(SB)/8, $0x0404040408080808 -DATA expandAVX512_60_mat2<>+0x10(SB)/8, $0x0808080808080808 -DATA expandAVX512_60_mat2<>+0x18(SB)/8, $0x1010101010101010 -DATA expandAVX512_60_mat2<>+0x20(SB)/8, $0x1010101020202020 -DATA expandAVX512_60_mat2<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_60_mat2<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_60_mat2<>+0x38(SB)/8, $0x4040404080808080 - -GLOBL expandAVX512_60_inShuf3<>(SB), RODATA, $0x40 -DATA expandAVX512_60_inShuf3<>+0x00(SB)/8, $0xff01010101010101 -DATA expandAVX512_60_inShuf3<>+0x08(SB)/8, $0xffffffffffff0202 -DATA expandAVX512_60_inShuf3<>+0x10(SB)/8, 
$0xffffffffffffffff -DATA expandAVX512_60_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff - -GLOBL expandAVX512_60_mat3<>(SB), RODATA, $0x40 -DATA expandAVX512_60_mat3<>+0x00(SB)/8, $0x8080808080808080 -DATA expandAVX512_60_mat3<>+0x08(SB)/8, $0x0101010101010101 -DATA expandAVX512_60_mat3<>+0x10(SB)/8, $0x0000000000000000 -DATA expandAVX512_60_mat3<>+0x18(SB)/8, $0x0000000000000000 -DATA expandAVX512_60_mat3<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_60_mat3<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_60_mat3<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_60_mat3<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_60_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_60_outShufLo+0x00(SB)/8, $0x0806050403020100 -DATA expandAVX512_60_outShufLo+0x08(SB)/8, $0x1816151413121110 -DATA expandAVX512_60_outShufLo+0x10(SB)/8, $0x28201e1d1c1b1a19 -DATA expandAVX512_60_outShufLo+0x18(SB)/8, $0x31302e2d2c2b2a29 -DATA expandAVX512_60_outShufLo+0x20(SB)/8, $0x4140383635343332 -DATA expandAVX512_60_outShufLo+0x28(SB)/8, $0x4a49484645444342 -DATA expandAVX512_60_outShufLo+0x30(SB)/8, $0x5a5958504e4d4c4b -DATA expandAVX512_60_outShufLo+0x38(SB)/8, $0x626160075e5d5c5b - -GLOBL expandAVX512_60_outShufHi0(SB), RODATA, $0x40 -DATA expandAVX512_60_outShufHi0+0x00(SB)/8, $0x3b3a3938302a2928 -DATA expandAVX512_60_outShufHi0+0x08(SB)/8, $0x44434241403e3d3c -DATA expandAVX512_60_outShufHi0+0x10(SB)/8, $0x5453525150484645 -DATA expandAVX512_60_outShufHi0+0x18(SB)/8, $0x5d5c5b5a59585655 -DATA expandAVX512_60_outShufHi0+0x20(SB)/8, $0x6d6c6b6a6968605e -DATA expandAVX512_60_outShufHi0+0x28(SB)/8, $0x767574737271706e -DATA expandAVX512_60_outShufHi0+0x30(SB)/8, $0xffffffffffffff78 -DATA expandAVX512_60_outShufHi0+0x38(SB)/8, $0x31ffff2f2e2d2c2b - -GLOBL expandAVX512_60_outShufHi1(SB), RODATA, $0x40 -DATA expandAVX512_60_outShufHi1+0x00(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_outShufHi1+0x08(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_outShufHi1+0x10(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_outShufHi1+0x18(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_outShufHi1+0x20(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_outShufHi1+0x28(SB)/8, $0xffffffffffffffff -DATA expandAVX512_60_outShufHi1+0x30(SB)/8, $0x06050403020100ff -DATA expandAVX512_60_outShufHi1+0x38(SB)/8, $0xff0908ffffffffff - -TEXT expandAVX512_60<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_60_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_60_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_60_inShuf2<>(SB), Z3 - VMOVDQU64 expandAVX512_60_inShuf3<>(SB), Z4 - VMOVDQU64 expandAVX512_60_outShufLo(SB), Z1 - VMOVDQU64 expandAVX512_60_outShufHi0(SB), Z5 - VMOVDQU64 expandAVX512_60_outShufHi1(SB), Z6 - VMOVDQU64 (AX), Z7 - VPERMB Z7, Z0, Z0 - VGF2P8AFFINEQB $0, expandAVX512_60_mat0<>(SB), Z0, Z0 - VPERMB Z7, Z2, Z2 - VGF2P8AFFINEQB $0, expandAVX512_60_mat1<>(SB), Z2, Z2 - VPERMB Z7, Z3, Z3 - VGF2P8AFFINEQB $0, expandAVX512_60_mat2<>(SB), Z3, Z3 - VPERMB Z7, Z4, Z4 - VGF2P8AFFINEQB $0, expandAVX512_60_mat3<>(SB), Z4, Z4 - VPERMI2B Z2, Z0, Z1 - MOVQ $0x9f01ffffffffffff, AX - KMOVQ AX, K1 - VPERMI2B.Z Z3, Z2, K1, Z5 - MOVQ $0x60fe000000000000, AX - KMOVQ AX, K1 - VPERMB.Z Z4, Z6, K1, Z0 - VPORQ Z0, Z5, Z2 - RET - -GLOBL expandAVX512_64_inShuf0<>(SB), RODATA, $0x40 -DATA 
expandAVX512_64_inShuf0<>+0x00(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x08(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x10(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x18(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x20(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x28(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x30(SB)/8, $0x0000000000000000 -DATA expandAVX512_64_inShuf0<>+0x38(SB)/8, $0x0000000000000000 - -GLOBL expandAVX512_64_mat0<>(SB), RODATA, $0x40 -DATA expandAVX512_64_mat0<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_mat0<>+0x08(SB)/8, $0x0202020202020202 -DATA expandAVX512_64_mat0<>+0x10(SB)/8, $0x0404040404040404 -DATA expandAVX512_64_mat0<>+0x18(SB)/8, $0x0808080808080808 -DATA expandAVX512_64_mat0<>+0x20(SB)/8, $0x1010101010101010 -DATA expandAVX512_64_mat0<>+0x28(SB)/8, $0x2020202020202020 -DATA expandAVX512_64_mat0<>+0x30(SB)/8, $0x4040404040404040 -DATA expandAVX512_64_mat0<>+0x38(SB)/8, $0x8080808080808080 - -GLOBL expandAVX512_64_inShuf1<>(SB), RODATA, $0x40 -DATA expandAVX512_64_inShuf1<>+0x00(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x08(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x10(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x18(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x20(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x28(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x30(SB)/8, $0x0101010101010101 -DATA expandAVX512_64_inShuf1<>+0x38(SB)/8, $0x0101010101010101 - -GLOBL expandAVX512_64_outShufLo(SB), RODATA, $0x40 -DATA expandAVX512_64_outShufLo+0x00(SB)/8, $0x0706050403020100 -DATA expandAVX512_64_outShufLo+0x08(SB)/8, $0x0f0e0d0c0b0a0908 -DATA expandAVX512_64_outShufLo+0x10(SB)/8, $0x1716151413121110 -DATA expandAVX512_64_outShufLo+0x18(SB)/8, $0x1f1e1d1c1b1a1918 -DATA expandAVX512_64_outShufLo+0x20(SB)/8, $0x2726252423222120 -DATA expandAVX512_64_outShufLo+0x28(SB)/8, $0x2f2e2d2c2b2a2928 -DATA expandAVX512_64_outShufLo+0x30(SB)/8, $0x3736353433323130 -DATA expandAVX512_64_outShufLo+0x38(SB)/8, $0x3f3e3d3c3b3a3938 - -TEXT expandAVX512_64<>(SB), NOSPLIT, $0-0 - VMOVDQU64 expandAVX512_64_inShuf0<>(SB), Z0 - VMOVDQU64 expandAVX512_64_mat0<>(SB), Z1 - VMOVDQU64 expandAVX512_64_inShuf1<>(SB), Z2 - VMOVDQU64 expandAVX512_64_outShufLo(SB), Z3 - VMOVDQU64 (AX), Z4 - VPERMB Z4, Z0, Z0 - VGF2P8AFFINEQB $0, Z1, Z0, Z0 - VPERMB Z4, Z2, Z2 - VGF2P8AFFINEQB $0, Z1, Z2, Z2 - VPERMB Z0, Z3, Z1 - VPERMB Z2, Z3, Z2 - RET - diff --git a/src/internal/runtime/gc/scan/expand_amd64_test.go b/src/internal/runtime/gc/scan/expand_amd64_test.go index a8f5b88c5c..89736f21da 100644 --- a/src/internal/runtime/gc/scan/expand_amd64_test.go +++ b/src/internal/runtime/gc/scan/expand_amd64_test.go @@ -11,9 +11,9 @@ import ( "testing" ) -func TestExpandAVX512(t *testing.T) { +func TestExpandAVX512Asm(t *testing.T) { if !scan.CanAVX512() { t.Skip("no AVX512") } - testExpand(t, scan.ExpandAVX512) + testExpand(t, scan.ExpandAVX512Asm) } diff --git a/src/internal/runtime/gc/scan/expand_simd_amd64_test.go b/src/internal/runtime/gc/scan/expand_simd_amd64_test.go new file mode 100644 index 0000000000..28f3147787 --- /dev/null +++ b/src/internal/runtime/gc/scan/expand_simd_amd64_test.go @@ -0,0 +1,19 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build amd64 && goexperiment.simd + +package scan_test + +import ( + "internal/runtime/gc/scan" + "testing" +) + +func TestExpandAVX512(t *testing.T) { + if !scan.CanAVX512() { + t.Skip("no AVX512") + } + testExpand(t, scan.ExpandAVX512) +} diff --git a/src/internal/runtime/gc/scan/expand_test.go b/src/internal/runtime/gc/scan/expand_test.go index 692817d8b2..2e75574bab 100644 --- a/src/internal/runtime/gc/scan/expand_test.go +++ b/src/internal/runtime/gc/scan/expand_test.go @@ -23,7 +23,7 @@ func testExpand(t *testing.T, expF expandFunc) { for i := range want { if got[i] != want[i] { - t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize) + t.Errorf("expansion differs from reference at bit %d, sizeClass=%d", i*goarch.PtrSize, sizeClass) if goarch.PtrSize == 4 { t.Logf("got: %032b", got[i]) t.Logf("want: %032b", want[i]) diff --git a/src/internal/runtime/gc/scan/expanders_amd64.go b/src/internal/runtime/gc/scan/expanders_amd64.go new file mode 100644 index 0000000000..878dc5f9f4 --- /dev/null +++ b/src/internal/runtime/gc/scan/expanders_amd64.go @@ -0,0 +1,1530 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package scan + +import ( + "simd" + "unsafe" +) + +var gcExpandersAVX512 = [68]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){ + nil, + expandAVX512_1, + expandAVX512_2, + expandAVX512_3, + expandAVX512_4, + expandAVX512_6, + expandAVX512_8, + expandAVX512_10, + expandAVX512_12, + expandAVX512_14, + expandAVX512_16, + expandAVX512_18, + expandAVX512_20, + expandAVX512_22, + expandAVX512_24, + expandAVX512_26, + expandAVX512_28, + expandAVX512_30, + expandAVX512_32, + expandAVX512_36, + expandAVX512_40, + expandAVX512_44, + expandAVX512_48, + expandAVX512_52, + expandAVX512_56, + expandAVX512_60, + expandAVX512_64, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, + nil, +} + +func expandAVX512_1(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src) + 64))).AsUint8x64() + return x.AsUint64x8(), y.AsUint64x8() +} + +var expandAVX512_2_mat0 = [8]uint64{ + 0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080, + 0x0101020204040808, 0x1010202040408080, 0x0101020204040808, 0x1010202040408080, +} +var expandAVX512_2_inShuf0 = [8]uint64{ + 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, + 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, +} +var expandAVX512_2_inShuf1 = [8]uint64{ + 0x2726252423222120, 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x2f2e2d2c2b2a2928, + 0x3736353433323130, 0x3736353433323130, 0x3f3e3d3c3b3a3938, 0x3f3e3d3c3b3a3938, +} +var expandAVX512_2_outShufLo = [8]uint64{ + 0x0b030a0209010800, 0x0f070e060d050c04, 0x1b131a1219111810, 0x1f171e161d151c14, + 0x2b232a2229212820, 0x2f272e262d252c24, 0x3b333a3239313830, 0x3f373e363d353c34, +} + +func expandAVX512_2(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_2_mat0).AsUint8x64() + v2 := 
simd.LoadUint64x8(&expandAVX512_2_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_2_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_2_outShufLo).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v4.Permute(v8) + v10 := v7.Permute(v8) + return v9.AsUint64x8(), v10.AsUint64x8() +} + +var expandAVX512_3_mat0 = [8]uint64{ + 0x0101010202020404, 0x0408080810101020, 0x2020404040808080, 0x0101010202020404, + 0x0408080810101020, 0x2020404040808080, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_3_inShuf0 = [8]uint64{ + 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0f0e0d0c0b0a0908, + 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_3_inShuf1 = [8]uint64{ + 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1f1e1d1c1b1a1918, + 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_3_inShuf2 = [8]uint64{ + 0x2726252423222120, 0x2726252423222120, 0x2726252423222120, 0xffffffffff2a2928, + 0xffffffffff2a2928, 0xffffffffffff2928, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_3_outShufLo = [8]uint64{ + 0x0a02110901100800, 0x05140c04130b0312, 0x170f07160e06150d, 0x221a292119282018, + 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25, 0x4a42514941504840, 0x45544c44534b4352, +} +var expandAVX512_3_outShufHi = [8]uint64{ + 0x170f07160e06150d, 0x221a292119282018, 0x1d2c241c2b231b2a, 0x2f271f2e261e2d25, + 0x4a42514941504840, 0x45544c44534b4352, 0x574f47564e46554d, 0x625a696159686058, +} + +func expandAVX512_3(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_3_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_3_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_3_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_3_inShuf2).AsUint8x64() + v11 := simd.LoadUint64x8(&expandAVX512_3_outShufLo).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_3_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v0.Permute(v8) + v10 := v9.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v12 := v4.ConcatPermute(v7, v11) + v14 := v7.ConcatPermute(v10, v13) + return v12.AsUint64x8(), v14.AsUint64x8() +} + +var expandAVX512_4_mat0 = [8]uint64{ + 0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080, + 0x0101010102020202, 0x0404040408080808, 0x1010101020202020, 0x4040404080808080, +} +var expandAVX512_4_inShuf0 = [8]uint64{ + 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, + 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, +} +var expandAVX512_4_inShuf1 = [8]uint64{ + 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, 0x1716151413121110, + 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, 0x1f1e1d1c1b1a1918, +} +var expandAVX512_4_outShufLo = [8]uint64{ + 0x1911090118100800, 0x1b130b031a120a02, 0x1d150d051c140c04, 0x1f170f071e160e06, + 0x3931292138302820, 0x3b332b233a322a22, 0x3d352d253c342c24, 0x3f372f273e362e26, +} + +func expandAVX512_4(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := 
simd.LoadUint64x8(&expandAVX512_4_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_4_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_4_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_4_outShufLo).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v4.Permute(v8) + v10 := v7.Permute(v8) + return v9.AsUint64x8(), v10.AsUint64x8() +} + +var expandAVX512_6_mat0 = [8]uint64{ + 0x0101010101010202, 0x0202020204040404, 0x0404080808080808, 0x1010101010102020, + 0x2020202040404040, 0x4040808080808080, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_6_inShuf0 = [8]uint64{ + 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, + 0x0706050403020100, 0x0706050403020100, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_6_inShuf1 = [8]uint64{ + 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, + 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_6_inShuf2 = [8]uint64{ + 0xffff151413121110, 0xffff151413121110, 0xffffff1413121110, 0xffffff1413121110, + 0xffffff1413121110, 0xffffff1413121110, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_6_outShufLo = [8]uint64{ + 0x0901282018100800, 0x1a120a0229211911, 0x2b231b130b032a22, 0x0d052c241c140c04, + 0x1e160e062d251d15, 0x2f271f170f072e26, 0x4941686058504840, 0x5a524a4269615951, +} +var expandAVX512_6_outShufHi = [8]uint64{ + 0x2b231b130b032a22, 0x0d052c241c140c04, 0x1e160e062d251d15, 0x2f271f170f072e26, + 0x4941686058504840, 0x5a524a4269615951, 0x6b635b534b436a62, 0x4d456c645c544c44, +} + +func expandAVX512_6(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_6_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_6_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_6_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_6_inShuf2).AsUint8x64() + v11 := simd.LoadUint64x8(&expandAVX512_6_outShufLo).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_6_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v0.Permute(v8) + v10 := v9.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v12 := v4.ConcatPermute(v7, v11) + v14 := v7.ConcatPermute(v10, v13) + return v12.AsUint64x8(), v14.AsUint64x8() +} + +var expandAVX512_8_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_8_inShuf0 = [8]uint64{ + 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, + 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, 0x0706050403020100, +} +var expandAVX512_8_inShuf1 = [8]uint64{ + 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, + 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, 0x0f0e0d0c0b0a0908, +} +var expandAVX512_8_outShufLo = [8]uint64{ + 0x3830282018100800, 0x3931292119110901, 0x3a322a221a120a02, 0x3b332b231b130b03, + 0x3c342c241c140c04, 0x3d352d251d150d05, 0x3e362e261e160e06, 0x3f372f271f170f07, +} + +func expandAVX512_8(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := 
simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_8_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_8_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_8_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_8_outShufLo).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v4.Permute(v8) + v10 := v7.Permute(v8) + return v9.AsUint64x8(), v10.AsUint64x8() +} + +var expandAVX512_10_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808, + 0x0808080808080808, 0x1010101010101010, 0x1010202020202020, 0x2020202040404040, +} +var expandAVX512_10_inShuf0 = [8]uint64{ + 0xff06050403020100, 0xff06050403020100, 0xff06050403020100, 0xff06050403020100, + 0xffff050403020100, 0xffff050403020100, 0xffff050403020100, 0xffff050403020100, +} +var expandAVX512_10_mat1 = [8]uint64{ + 0x4040404040408080, 0x8080808080808080, 0x0808080808080808, 0x1010101010101010, + 0x1010202020202020, 0x2020202040404040, 0x4040404040408080, 0x8080808080808080, +} +var expandAVX512_10_inShuf1 = [8]uint64{ + 0xffff050403020100, 0xffff050403020100, 0xff0c0b0a09080706, 0xff0c0b0a09080706, + 0xff0c0b0a09080706, 0xff0c0b0a09080706, 0xffff0b0a09080706, 0xffff0b0a09080706, +} +var expandAVX512_10_mat2 = [8]uint64{ + 0x0101010101010101, 0x0101020202020202, 0x0202020204040404, 0x0404040404040808, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_10_inShuf2 = [8]uint64{ + 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807, 0xffff0c0b0a090807, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_10_outShufLo = [8]uint64{ + 0x3830282018100800, 0x2921191109014840, 0x1a120a0249413931, 0x0b034a423a322a22, + 0x4b433b332b231b13, 0x3c342c241c140c04, 0x2d251d150d054c44, 0x1e160e064d453d35, +} +var expandAVX512_10_outShufHi = [8]uint64{ + 0x4840383028201810, 0x3931292119115850, 0x2a221a1259514941, 0x1b135a524a423a32, + 0x5b534b433b332b23, 0x4c443c342c241c14, 0x3d352d251d155c54, 0x2e261e165d554d45, +} + +func expandAVX512_10(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_10_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_10_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_10_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_10_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_10_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_10_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_10_outShufLo).AsUint8x64() + v15 := simd.LoadUint64x8(&expandAVX512_10_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v14 := v4.ConcatPermute(v8, v13) + v16 := v8.ConcatPermute(v12, v15) + return v14.AsUint64x8(), v16.AsUint64x8() +} + +var expandAVX512_12_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_12_inShuf0 = [8]uint64{ + 0xffff050403020100, 0xffff050403020100, 
0xffff050403020100, 0xffff050403020100, + 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, +} +var expandAVX512_12_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_12_inShuf1 = [8]uint64{ + 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, + 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605, 0xffff0a0908070605, +} +var expandAVX512_12_mat2 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, +} +var expandAVX512_12_inShuf2 = [8]uint64{ + 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605, 0xffffff0908070605, + 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706, 0xffffff0a09080706, +} +var expandAVX512_12_outShufLo = [8]uint64{ + 0x3830282018100800, 0x1911090158504840, 0x5951494139312921, 0x3a322a221a120a02, + 0x1b130b035a524a42, 0x5b534b433b332b23, 0x3c342c241c140c04, 0x1d150d055c544c44, +} +var expandAVX512_12_outShufHi = [8]uint64{ + 0x5850484038302820, 0x3931292178706860, 0x7971696159514941, 0x5a524a423a322a22, + 0x3b332b237a726a62, 0x7b736b635b534b43, 0x5c544c443c342c24, 0x3d352d257c746c64, +} + +func expandAVX512_12(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_12_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_12_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_12_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_12_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_12_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_12_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_12_outShufLo).AsUint8x64() + v15 := simd.LoadUint64x8(&expandAVX512_12_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v14 := v4.ConcatPermute(v8, v13) + v16 := v8.ConcatPermute(v12, v15) + return v14.AsUint64x8(), v16.AsUint64x8() +} + +var expandAVX512_14_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404, + 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010, +} +var expandAVX512_14_inShuf0 = [8]uint64{ + 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, + 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, 0xffffff0403020100, +} +var expandAVX512_14_mat1 = [8]uint64{ + 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, + 0x4040808080808080, 0x8080808080808080, 0x1010101010102020, 0x2020202020202020, +} +var expandAVX512_14_inShuf1 = [8]uint64{ + 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100, + 0xffffffff03020100, 0xffffffff03020100, 0xffffff0807060504, 0xffffff0807060504, +} +var expandAVX512_14_mat2 = [8]uint64{ + 0x2020202040404040, 0x4040404040404040, 0x4040808080808080, 0x8080808080808080, + 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404, +} +var expandAVX512_14_inShuf2 = [8]uint64{ + 0xffffff0807060504, 0xffffff0807060504, 0xffffff0807060504, 
0xffffff0807060504, + 0xffffff0908070605, 0xffffff0908070605, 0xffffffff08070605, 0xffffffff08070605, +} +var expandAVX512_14_mat3 = [8]uint64{ + 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_14_inShuf3 = [8]uint64{ + 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605, 0xffffffff08070605, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_14_outShufLo = [8]uint64{ + 0x3830282018100800, 0x0901686058504840, 0x4941393129211911, 0x1a120a0269615951, + 0x5a524a423a322a22, 0x2b231b130b036a62, 0x6b635b534b433b33, 0x3c342c241c140c04, +} +var expandAVX512_14_outShufHi0 = [8]uint64{ + 0x6860585048403830, 0x3931ffffffff7870, 0x7971696159514941, 0x4a423a32ffffffff, + 0xffff7a726a625a52, 0x5b534b433b33ffff, 0xffffffff7b736b63, 0x6c645c544c443c34, +} +var expandAVX512_14_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffff18100800ffff, 0xffffffffffffffff, 0xffffffff19110901, + 0x0a02ffffffffffff, 0xffffffffffff1a12, 0x1b130b03ffffffff, 0xffffffffffffffff, +} + +func expandAVX512_14(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_14_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_14_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_14_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_14_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_14_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_14_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_14_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_14_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_14_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_14_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_14_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xff0ffc3ff0ffc3ff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0xf003c00f003c00) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_16_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_16_inShuf0 = [8]uint64{ + 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, + 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, 0x0303020201010000, +} +var expandAVX512_16_inShuf1 = [8]uint64{ + 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, + 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, 0x0707060605050404, +} +var expandAVX512_16_outShufLo = [8]uint64{ + 0x1918111009080100, 0x3938313029282120, 0x1b1a13120b0a0302, 0x3b3a33322b2a2322, + 0x1d1c15140d0c0504, 0x3d3c35342d2c2524, 0x1f1e17160f0e0706, 0x3f3e37362f2e2726, +} + +func expandAVX512_16(src unsafe.Pointer) 
(simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_16_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_16_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_16_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_16_outShufLo).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v4.Permute(v8) + v10 := v7.Permute(v8) + return v9.AsUint64x8(), v10.AsUint64x8() +} + +var expandAVX512_18_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404, + 0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010, +} +var expandAVX512_18_inShuf0 = [8]uint64{ + 0x0303020201010000, 0xffffffff03020100, 0xffffffff03020100, 0xffffffff03020100, + 0xffffffff03020100, 0xffffffff03020100, 0x0303020201010000, 0xff03020201010000, +} +var expandAVX512_18_mat1 = [8]uint64{ + 0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, + 0x4040404040408080, 0x8080808080808080, 0x1010101010101010, 0x1010202020202020, +} +var expandAVX512_18_inShuf1 = [8]uint64{ + 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100, 0xffffffffff020100, + 0xffffffffff020100, 0xffff020201010000, 0xff06060505040403, 0xffffffff06050403, +} +var expandAVX512_18_mat2 = [8]uint64{ + 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040404040408080, + 0x8080808080808080, 0x0101010101010101, 0x0101020202020202, 0x0202020202020202, +} +var expandAVX512_18_inShuf2 = [8]uint64{ + 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403, 0xffffffff06050403, + 0x0606050504040303, 0x0707060605050404, 0xffffffffff060504, 0xffffffffff060504, +} +var expandAVX512_18_mat3 = [8]uint64{ + 0x0202020204040404, 0x0404040404040404, 0x0404040404040808, 0x0808080808080808, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_18_inShuf3 = [8]uint64{ + 0xffffffffff060504, 0xffffffffff060504, 0xffffffffff060504, 0xffff060605050404, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_18_outShufLo = [8]uint64{ + 0x3028201810080100, 0x6058504840393831, 0x2119110903026968, 0x5149413b3a333229, + 0x120a05046b6a6159, 0x423d3c35342a221a, 0x07066d6c625a524a, 0x3e37362b231b130b, +} +var expandAVX512_18_outShufHi0 = [8]uint64{ + 0x6160585048403830, 0xffffffff78706968, 0x59514941393231ff, 0xffff79716b6a6362, + 0x4a423a3433ffffff, 0x7a726d6c65645a52, 0x3b3635ffffffffff, 0x6f6e67665b534b43, +} +var expandAVX512_18_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0x18100800ffffffff, 0xffffffffffffff19, 0x0901ffffffffffff, + 0xffffffffff1b1a11, 0xffffffffffffffff, 0xffffff1d1c120a02, 0xffffffffffffffff, +} + +func expandAVX512_18(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_18_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_18_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_18_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_18_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_18_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_18_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_18_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_18_inShuf3).AsUint8x64() + v17 
:= simd.LoadUint64x8(&expandAVX512_18_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_18_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_18_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xffe0fff83ffe0fff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x1f0007c001f000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_20_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_20_inShuf0 = [8]uint64{ + 0x0303020201010000, 0xffffffff03020100, 0xff03020201010000, 0xffff020201010000, + 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100, +} +var expandAVX512_20_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x0808080808080808, +} +var expandAVX512_20_inShuf1 = [8]uint64{ + 0xffff020201010000, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, + 0xff06060505040403, 0x0606050504040303, 0xffffffff06050403, 0xffff050504040303, +} +var expandAVX512_20_mat2 = [8]uint64{ + 0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040, + 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202, +} +var expandAVX512_20_inShuf2 = [8]uint64{ + 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303, 0xffff050504040303, + 0xffffffffff050403, 0xffff050504040303, 0xffff060605050404, 0xffffffffff060504, +} +var expandAVX512_20_outShufLo = [8]uint64{ + 0x2019181110080100, 0x4841403831302928, 0x1209030259585049, 0x33322b2a211b1a13, + 0x5b5a514b4a434239, 0x221d1c15140a0504, 0x4c45443a35342d2c, 0x160b07065d5c524d, +} +var expandAVX512_20_outShufHi = [8]uint64{ + 0x4140393830292820, 0x6968605958515048, 0x312b2a2221787170, 0x5a53524943423b3a, + 0x237973726b6a615b, 0x45443d3c322d2c24, 0x6d6c625d5c55544a, 0x332f2e26257a7574, +} + +func expandAVX512_20(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_20_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_20_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_20_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_20_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_20_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_20_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_20_outShufLo).AsUint8x64() + v15 := simd.LoadUint64x8(&expandAVX512_20_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v14 := v4.ConcatPermute(v8, v13) + v16 := v8.ConcatPermute(v12, v15) + return v14.AsUint64x8(), v16.AsUint64x8() +} + +var 
expandAVX512_22_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404, + 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010, +} +var expandAVX512_22_inShuf0 = [8]uint64{ + 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100, + 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, 0xffff020201010000, +} +var expandAVX512_22_mat1 = [8]uint64{ + 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, + 0x4040808080808080, 0x8080808080808080, 0x8080808080808080, 0x0101010101010101, +} +var expandAVX512_22_inShuf1 = [8]uint64{ + 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100, 0xffff020201010000, + 0xffffffffff020100, 0xffffffff01010000, 0xffff040403030202, 0xffff050504040303, +} +var expandAVX512_22_mat2 = [8]uint64{ + 0x0101010101010202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404, + 0x0404080808080808, 0x0808080808080808, 0x1010101010101010, 0x1010101010102020, +} +var expandAVX512_22_inShuf2 = [8]uint64{ + 0xffffffffff050403, 0xffff050504040303, 0xffffffffff050403, 0xffff050504040303, + 0xffffffffff050403, 0xffff050504040303, 0xffff050504040303, 0xffffffffff050403, +} +var expandAVX512_22_mat3 = [8]uint64{ + 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, 0x4040808080808080, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_22_inShuf3 = [8]uint64{ + 0xffff050504040303, 0xffffffffff050403, 0xffffff0504040303, 0xffffffffffff0403, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_22_outShufLo = [8]uint64{ + 0x2120181110080100, 0x4948403938313028, 0x0302696860595850, 0x3229232219131209, + 0x5a514b4a413b3a33, 0x140a05046b6a615b, 0x3c35342a25241a15, 0x625d5c524d4c423d, +} +var expandAVX512_22_outShufHi0 = [8]uint64{ + 0x5049484039383130, 0x7871706968605958, 0x3332ffffffffffff, 0x5b5a514b4a413b3a, + 0xffff7973726b6a61, 0x3d3c3534ffffffff, 0x6c625d5c524d4c42, 0xffffffff7a75746d, +} +var expandAVX512_22_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffff181110080100, 0xffffffffffffffff, + 0x0302ffffffffffff, 0xffffffff19131209, 0xffffffffffffffff, 0x140a0504ffffffff, +} + +func expandAVX512_22(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_22_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_22_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_22_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_22_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_22_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_22_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_22_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_22_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_22_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_22_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_22_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xffff03fffc0ffff) + m0 := 
simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0xf0000fc0003f0000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_24_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_24_inShuf0 = [8]uint64{ + 0x0202010101000000, 0x0202010101000000, 0x0202010101000000, 0x0202010101000000, + 0x0202010101000000, 0xff02010101000000, 0xffff010101000000, 0xffff010101000000, +} +var expandAVX512_24_inShuf1 = [8]uint64{ + 0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02, 0xffffffffffffff02, + 0xffffffffffffff02, 0x0404040303030202, 0x0404030303020202, 0x0404030303020202, +} +var expandAVX512_24_mat2 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x4040404040404040, 0x8080808080808080, 0x0101010101010101, +} +var expandAVX512_24_inShuf2 = [8]uint64{ + 0x0505040404030303, 0x0505040404030303, 0x0505040404030303, 0xffff040404030303, + 0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffff04, 0xffffffffffffff05, +} +var expandAVX512_24_mat3 = [8]uint64{ + 0x0202020202020202, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_24_inShuf3 = [8]uint64{ + 0xffffffffffffff05, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_24_outShufLo = [8]uint64{ + 0x11100a0908020100, 0x282221201a191812, 0x3a39383231302a29, 0x14130d0c0b050403, + 0x2b2524231d1c1b15, 0x3d3c3b3534332d2c, 0x1716480f0e400706, 0x2e602726581f1e50, +} +var expandAVX512_24_outShufHi0 = [8]uint64{ + 0x3a39383231302928, 0x51504a4948424140, 0x2a6261605a595852, 0x3d3c3b3534332c2b, + 0x54534d4c4b454443, 0x2d6564635d5c5b55, 0x703f3e6837362f2e, 0x5756ff4f4e784746, +} +var expandAVX512_24_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffff00ffffffffff, +} + +func expandAVX512_24(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_24_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_24_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_24_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_24_mat2).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_24_inShuf2).AsUint8x64() + v12 := simd.LoadUint64x8(&expandAVX512_24_mat3).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_24_inShuf3).AsUint8x64() + v16 := simd.LoadUint64x8(&expandAVX512_24_outShufLo).AsUint8x64() + v18 := simd.LoadUint64x8(&expandAVX512_24_outShufHi0).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_24_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v10 := v0.Permute(v9) + v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0) + v14 := v0.Permute(v13) + v15 := v14.GaloisFieldAffineTransform(v12.AsUint64x8(), 0) + v17 := v4.ConcatPermute(v7, v16) + u0 := uint64(0xdfffffffffffffff) + m0 := 
simd.Mask8x64FromBits(u0) + v20 := v7.ConcatPermute(v11, v18).Masked(m0) + u1 := uint64(0x2000000000000000) + m1 := simd.Mask8x64FromBits(u1) + v21 := v15.Permute(v19).Masked(m1) + v22 := v20.Or(v21) + return v17.AsUint64x8(), v22.AsUint64x8() +} + +var expandAVX512_26_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101020202020202, 0x0202020202020202, 0x0202020204040404, + 0x0404040404040404, 0x0404040404040808, 0x0808080808080808, 0x1010101010101010, +} +var expandAVX512_26_inShuf0 = [8]uint64{ + 0x0202010101000000, 0xffffffffff020100, 0xffff020201010000, 0xffffffffff020100, + 0xffff020201010000, 0xffffffffff020100, 0x0202010101000000, 0xffff010101000000, +} +var expandAVX512_26_mat1 = [8]uint64{ + 0x1010202020202020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, + 0x4040404040408080, 0x8080808080808080, 0x0101010101010101, 0x0808080808080808, +} +var expandAVX512_26_inShuf1 = [8]uint64{ + 0xffffffffffff0100, 0xffffffff01010000, 0xffffffffffff0100, 0xffffffff01010000, + 0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0xff04040403030302, +} +var expandAVX512_26_mat2 = [8]uint64{ + 0x1010101010101010, 0x1010202020202020, 0x2020202020202020, 0x2020202040404040, + 0x4040404040404040, 0x4040404040408080, 0x8080808080808080, 0x0101010101010101, +} +var expandAVX512_26_inShuf2 = [8]uint64{ + 0x0404030303020202, 0xffffffffff040302, 0xffff040403030202, 0xffffffffff040302, + 0xffff040403030202, 0xffffffffff040302, 0xff04030303020202, 0xffff040404030303, +} +var expandAVX512_26_mat3 = [8]uint64{ + 0x0101020202020202, 0x0202020202020202, 0x0202020204040404, 0x0404040404040404, + 0x0404040404040808, 0x1010101010101010, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_26_inShuf3 = [8]uint64{ + 0xffffffffffff0403, 0xffffffff04040303, 0xffffffffffff0403, 0xffffffff04040303, + 0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_26_outShufLo = [8]uint64{ + 0x2018111008020100, 0x3a39383231302821, 0x6860595850494840, 0x1312090504036a69, + 0x3b35343329232219, 0x5b5a514b4a413d3c, 0x0a7007066d6c6b61, 0x37362a25241a1514, +} +var expandAVX512_26_outShufHi0 = [8]uint64{ + 0x5851504842414038, 0x7978727170686160, 0xffffffffffffff7a, 0x52494544433b3a39, + 0x7574736963625953, 0xffffffffff7d7c7b, 0xff47463e3d3cffff, 0x766a65645a55544a, +} +var expandAVX512_26_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0x20191810090800ff, 0xffffffffffffffff, + 0xffffffffffffffff, 0x1a110b0a01ffffff, 0x28ffffffffff211b, 0xffffffffffffffff, +} + +func expandAVX512_26(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_26_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_26_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_26_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_26_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_26_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_26_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_26_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_26_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_26_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_26_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_26_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 
0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xff7c07ffff01ffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x83f80000fe0000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_28_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_28_inShuf0 = [8]uint64{ + 0x0202010101000000, 0xffffffffff020100, 0x0202010101000000, 0xff02010101000000, + 0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100, +} +var expandAVX512_28_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, +} +var expandAVX512_28_inShuf1 = [8]uint64{ + 0xffff010101000000, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000, + 0xffffffffffffff02, 0xffffffffffffff02, 0x0404040303030202, 0xffffffffff040302, +} +var expandAVX512_28_mat2 = [8]uint64{ + 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 0x2020202020202020, + 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, +} +var expandAVX512_28_inShuf2 = [8]uint64{ + 0x0404030303020202, 0x0404030303020202, 0xffffffffffff0302, 0xffff030303020202, + 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303, +} +var expandAVX512_28_mat3 = [8]uint64{ + 0x0101010102020202, 0x0202020202020202, 0x0808080808080808, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_28_inShuf3 = [8]uint64{ + 0xffffffffffff0403, 0xffff040404030303, 0xffffffffffffff04, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_28_outShufLo = [8]uint64{ + 0x1812111008020100, 0x31302a2928201a19, 0x4a49484241403832, 0x090504035a595850, + 0x2b211d1c1b151413, 0x4443393534332d2c, 0x5d5c5b514d4c4b45, 0x1e6817160a600706, +} +var expandAVX512_28_outShufHi0 = [8]uint64{ + 0x4948424140383130, 0x6261605a5958504a, 0xff7a797872717068, 0x4339343332ffffff, + 0x5c5b514d4c4b4544, 0x757473696564635d, 0x35ffffffff7d7c7b, 0x4f4eff47463a3736, +} +var expandAVX512_28_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0x00ffffffffffffff, 0xffffffffff0a0908, + 0xffffffffffffffff, 0xffffffffffffffff, 0xff0d0c0b01ffffff, 0xffff10ffffffffff, +} + +func expandAVX512_28(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_28_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_28_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_28_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_28_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_28_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_28_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_28_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_28_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_28_outShufLo).AsUint8x64() + v19 := 
simd.LoadUint64x8(&expandAVX512_28_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_28_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xdf87fffff87fffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x2078000007800000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_30_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010101010202, 0x0202020202020202, 0x0202020204040404, + 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, 0x1010101010101010, +} +var expandAVX512_30_inShuf0 = [8]uint64{ + 0x0202010101000000, 0xffffffffff020100, 0xffff010101000000, 0xffffffffffff0100, + 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000, 0xffff010101000000, +} +var expandAVX512_30_mat1 = [8]uint64{ + 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, 0x4040404040404040, + 0x4040808080808080, 0x8080808080808080, 0x0101010101010101, 0x0202020202020202, +} +var expandAVX512_30_inShuf1 = [8]uint64{ + 0xffffffffffff0100, 0xffff010101000000, 0xffffffffffff0100, 0xffff010101000000, + 0xffffffffffff0100, 0xffff010101000000, 0xffffffffffffff02, 0x0404030303020202, +} +var expandAVX512_30_mat2 = [8]uint64{ + 0x0202020204040404, 0x0404040404040404, 0x0404080808080808, 0x0808080808080808, + 0x1010101010101010, 0x1010101010102020, 0x2020202020202020, 0x2020202040404040, +} +var expandAVX512_30_inShuf2 = [8]uint64{ + 0xffffffffff040302, 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, + 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffffffffffff0302, +} +var expandAVX512_30_mat3 = [8]uint64{ + 0x4040404040404040, 0x4040808080808080, 0x8080808080808080, 0x0101010101010101, + 0x0101010101010202, 0x0202020202020202, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_30_inShuf3 = [8]uint64{ + 0xffff030303020202, 0xffffffffffff0302, 0xffff030303020202, 0xffff040404030303, + 0xffffffffffff0403, 0xffffffffffffff04, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_30_outShufLo = [8]uint64{ + 0x1812111008020100, 0x3832313028222120, 0x58504a4948403a39, 0x04036a6968605a59, + 0x2423191514130905, 0x3d3c3b3534332925, 0x5d5c5b514d4c4b41, 0x0a7007066d6c6b61, +} +var expandAVX512_30_outShufHi0 = [8]uint64{ + 0x504a4948403a3938, 0x70686261605a5958, 0xffffffffff787271, 0x3c3bffffffffffff, + 0x5c5b514d4c4b413d, 0x757473696564635d, 0xffffffffffffff79, 0x42ff3f3effffffff, +} +var expandAVX512_30_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0x1008020100ffffff, 0xffff201a19181211, + 0xffffffffffffffff, 0xffffffffffffffff, 0x15141309050403ff, 0xff28ffff211d1c1b, +} + +func expandAVX512_30(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_30_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_30_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_30_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_30_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_30_mat2).AsUint8x64() + v10 := 
simd.LoadUint64x8(&expandAVX512_30_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_30_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_30_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_30_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_30_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_30_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xb001ffffc007ffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x4ffe00003ff80000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_32_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_32_inShuf0 = [8]uint64{ + 0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000, + 0x0101010100000000, 0x0101010100000000, 0x0101010100000000, 0x0101010100000000, +} +var expandAVX512_32_inShuf1 = [8]uint64{ + 0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202, + 0x0303030302020202, 0x0303030302020202, 0x0303030302020202, 0x0303030302020202, +} +var expandAVX512_32_outShufLo = [8]uint64{ + 0x0b0a090803020100, 0x1b1a191813121110, 0x2b2a292823222120, 0x3b3a393833323130, + 0x0f0e0d0c07060504, 0x1f1e1d1c17161514, 0x2f2e2d2c27262524, 0x3f3e3d3c37363534, +} + +func expandAVX512_32(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_32_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_32_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_32_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_32_outShufLo).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v4.Permute(v8) + v10 := v7.Permute(v8) + return v9.AsUint64x8(), v10.AsUint64x8() +} + +var expandAVX512_36_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_36_inShuf0 = [8]uint64{ + 0x0101010100000000, 0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000, + 0xffffffffffff0100, 0x0101010100000000, 0x0101010100000000, 0xffffffffffff0100, +} +var expandAVX512_36_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, +} +var expandAVX512_36_inShuf1 = [8]uint64{ + 0x0101010100000000, 0xffffff0100000000, 0xffffffffffffff00, 0xffffffff00000000, + 0xff02020202010101, 0xffffffffffff0201, 0x0202020201010101, 0x0303030302020202, +} +var expandAVX512_36_mat2 = [8]uint64{ + 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, + 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, 
0x2020202020202020, +} +var expandAVX512_36_inShuf2 = [8]uint64{ + 0xffffffffffff0302, 0x0303030302020202, 0x0303030302020202, 0xffffffffffff0302, + 0x0303030302020202, 0xffff030302020202, 0xffffffffffffff02, 0xffffffff02020202, +} +var expandAVX512_36_outShufLo = [8]uint64{ + 0x1211100803020100, 0x2928201b1a191813, 0x4038333231302b2a, 0x504b4a4948434241, + 0x070605045b5a5958, 0x1e1d1c1716151409, 0x35342f2e2d2c211f, 0x4c47464544393736, +} +var expandAVX512_36_outShufHi = [8]uint64{ + 0x3332313028222120, 0x4a4948403b3a3938, 0x616058535251504b, 0x78706b6a69686362, + 0x29262524237b7a79, 0x3f3e3d3c37363534, 0x5655544f4e4d4c41, 0x6d6c676665645957, +} + +func expandAVX512_36(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_36_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_36_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_36_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_36_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_36_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_36_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_36_outShufLo).AsUint8x64() + v15 := simd.LoadUint64x8(&expandAVX512_36_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v14 := v4.ConcatPermute(v8, v13) + v16 := v8.ConcatPermute(v12, v15) + return v14.AsUint64x8(), v16.AsUint64x8() +} + +var expandAVX512_40_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_40_inShuf0 = [8]uint64{ + 0x0101010000000000, 0x0101010000000000, 0x0101010000000000, 0x0101010000000000, + 0x0101010000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, +} +var expandAVX512_40_mat1 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, +} +var expandAVX512_40_inShuf1 = [8]uint64{ + 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, + 0xffffffffffffff01, 0xffff020202020201, 0x0202020101010101, 0x0202020101010101, +} +var expandAVX512_40_mat2 = [8]uint64{ + 0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, + 0x0808080808080808, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_40_inShuf2 = [8]uint64{ + 0x0202020101010101, 0x0303030202020202, 0x0303030202020202, 0xffffff0202020202, + 0xffffff0202020202, 0xffffffffffff0202, 0xffffffffffff0202, 0xffffffffffff0202, +} +var expandAVX512_40_mat3 = [8]uint64{ + 0x0101010101010101, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_40_inShuf3 = [8]uint64{ + 0xffffffffffff0303, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_40_outShufLo = [8]uint64{ + 0x0a09080403020100, 0x1814131211100c0b, 0x232221201c1b1a19, 0x31302c2b2a292824, + 0x3c3b3a3938343332, 0x0f0e0d4140070605, 0x1d51501716154948, 0x6027262559581f1e, +} +var 
expandAVX512_40_outShufHi0 = [8]uint64{ + 0x3938343332313028, 0x44434241403c3b3a, 0x5251504c4b4a4948, 0x605c5b5a59585453, + 0x2c2b2a2964636261, 0x3e3d69683736352d, 0x797847464571703f, 0x575655ffff4f4e4d, +} +var expandAVX512_40_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffff0100ffffff, +} + +func expandAVX512_40(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_40_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_40_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_40_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_40_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_40_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_40_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_40_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_40_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_40_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_40_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_40_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xe7ffffffffffffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x1800000000000000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_44_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_44_inShuf0 = [8]uint64{ + 0x0101010000000000, 0xffffffffffff0100, 0x0101010000000000, 0x0101010000000000, + 0xffffffffffff0100, 0x0101010000000000, 0xffffff0000000000, 0xffffffffffffff00, +} +var expandAVX512_44_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, +} +var expandAVX512_44_inShuf1 = [8]uint64{ + 0xffffff0000000000, 0xffffff0000000000, 0xffffffffffffff00, 0xffffff0000000000, + 0xffffffffffff0101, 0xffffffffffff0101, 0xffffffffffff0101, 0xff02020202020101, +} +var expandAVX512_44_mat2 = [8]uint64{ + 0x1010101010101010, 0x1010101020202020, 0x2020202020202020, 0x4040404040404040, + 0x4040404080808080, 0x8080808080808080, 0x0101010101010101, 0x0101010102020202, +} +var expandAVX512_44_inShuf2 = [8]uint64{ + 0x0202020101010101, 0xffffffffffff0201, 0x0202020101010101, 0x0202020101010101, + 0xffffffffffff0201, 0xffff020101010101, 0xffffff0202020202, 0xffffffffffffff02, +} +var expandAVX512_44_mat3 = [8]uint64{ + 0x0202020202020202, 0x0404040404040404, 0x0404040408080808, 0x1010101010101010, + 0x2020202020202020, 0x4040404040404040, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_44_inShuf3 = [8]uint64{ + 0xffffff0202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffff0202, + 0xffffffffffff0202, 0xffffffffffff0202, 
0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_44_outShufLo = [8]uint64{ + 0x1110080403020100, 0x1c1b1a1918141312, 0x31302c2b2a292820, 0x4342414038343332, + 0x58504c4b4a494844, 0x600706055c5b5a59, 0x1d69681716150961, 0x2f2e2d2171701f1e, +} +var expandAVX512_44_outShufHi0 = [8]uint64{ + 0x4844434241403938, 0x5a59585453525150, 0x6c6b6a6968605c5b, 0xffff787473727170, + 0xffffffffffffffff, 0x46453e3d3c3b3aff, 0xff57565549ffff47, 0x6d61ffff5f5e5dff, +} +var expandAVX512_44_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0x0100ffffffffffff, + 0x0c0b0a0908040302, 0xffffffffffffff10, 0x20ffffffff1918ff, 0xffff2928ffffff21, +} + +func expandAVX512_44(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_44_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_44_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_44_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_44_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_44_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_44_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_44_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_44_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_44_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_44_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_44_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0xce79fe003fffffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x318601ffc0000000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_48_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_48_inShuf0 = [8]uint64{ + 0x0101000000000000, 0x0101000000000000, 0x0101000000000000, 0xffff000000000000, + 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, +} +var expandAVX512_48_mat1 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0404040404040404, + 0x0808080808080808, 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, +} +var expandAVX512_48_inShuf1 = [8]uint64{ + 0xffffffff01010101, 0xffffffff01010101, 0xffffffffffff0101, 0x0202020202020101, + 0x0202010101010101, 0x0202010101010101, 0x0202010101010101, 0xffff010101010101, +} +var expandAVX512_48_mat2 = [8]uint64{ + 0x8080808080808080, 0x0101010101010101, 0x0202020202020202, 0x0808080808080808, + 0x1010101010101010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_48_inShuf2 = [8]uint64{ + 0xffff010101010101, 0xffff020202020202, 0xffff020202020202, 0xffffffff02020202, + 0xffffffff02020202, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_48_outShufLo = [8]uint64{ + 0x0908050403020100, 0x131211100d0c0b0a, 0x1d1c1b1a19181514, 
0x2928252423222120, + 0x333231302d2c2b2a, 0x3d3c3b3a39383534, 0x0f0e434241400706, 0x515017164b4a4948, +} +var expandAVX512_48_outShufHi = [8]uint64{ + 0x2524232221201918, 0x31302d2c2b2a2928, 0x3b3a393835343332, 0x4544434241403d3c, + 0x51504d4c4b4a4948, 0x1d1c1b1a55545352, 0x5b5a595827261f1e, 0x3736636261602f2e, +} + +func expandAVX512_48(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_48_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_48_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_48_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_48_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_48_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_48_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_48_outShufLo).AsUint8x64() + v15 := simd.LoadUint64x8(&expandAVX512_48_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v14 := v4.ConcatPermute(v8, v13) + v16 := v8.ConcatPermute(v12, v15) + return v14.AsUint64x8(), v16.AsUint64x8() +} + +var expandAVX512_52_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_52_inShuf0 = [8]uint64{ + 0x0101000000000000, 0xffffffffffff0100, 0x0101000000000000, 0xffff000000000000, + 0xffffffffffffff00, 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00, +} +var expandAVX512_52_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0101010101010101, 0x0202020202020202, 0x0202020202020202, 0x0404040404040404, +} +var expandAVX512_52_inShuf1 = [8]uint64{ + 0xffff000000000000, 0xffff000000000000, 0xffffffffffffff00, 0xffff000000000000, + 0xffffffff01010101, 0xffffffffff010101, 0xff02020202020201, 0x0202010101010101, +} +var expandAVX512_52_mat2 = [8]uint64{ + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, +} +var expandAVX512_52_inShuf2 = [8]uint64{ + 0xffffffffffff0201, 0x0202010101010101, 0xffff010101010101, 0xffffffffffffff01, + 0xffff010101010101, 0xffff010101010101, 0xffffffffffffff01, 0xffff010101010101, +} +var expandAVX512_52_mat3 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0404040404040404, 0x0808080808080808, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_52_inShuf3 = [8]uint64{ + 0xffff020202020202, 0xffffffffffffff02, 0xffffffff02020202, 0xffffffffffff0202, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_52_outShufLo = [8]uint64{ + 0x1008050403020100, 0x1a19181514131211, 0x2b2a2928201d1c1b, 0x3534333231302d2c, + 0x4845444342414038, 0x5958504d4c4b4a49, 0x616007065d5c5b5a, 0x6a69681716096362, +} +var expandAVX512_52_outShufHi0 = [8]uint64{ + 0x403d3c3b3a393830, 0x51504d4c4b4a4948, 0x6261605855545352, 0x6c6b6a6968656463, + 0x7d7c7b7a7978706d, 0x31ffffffffffffff, 0xff3f3e3635343332, 0xffff4f4e41ffffff, +} +var expandAVX512_52_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 
0xffffffffffffffff, 0xff08050403020100, 0x10ffffffffffffff, 0x1918ffffff131211, +} + +func expandAVX512_52(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_52_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_52_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_52_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_52_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_52_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_52_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_52_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_52_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_52_outShufLo).AsUint8x64() + v19 := simd.LoadUint64x8(&expandAVX512_52_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_52_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0x387f80ffffffffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0xc7807f0000000000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_56_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_56_inShuf0 = [8]uint64{ + 0x0100000000000000, 0x0100000000000000, 0xff00000000000000, 0xff00000000000000, + 0xff00000000000000, 0xff00000000000000, 0xff00000000000000, 0xff00000000000000, +} +var expandAVX512_56_inShuf1 = [8]uint64{ + 0xffff010101010101, 0x0202010101010101, 0x0201010101010101, 0xff01010101010101, + 0xff01010101010101, 0xff01010101010101, 0xff01010101010101, 0xff01010101010101, +} +var expandAVX512_56_mat2 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_56_inShuf2 = [8]uint64{ + 0xff02020202020202, 0xffffff0202020202, 0xffffffffffffff02, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_56_outShufLo = [8]uint64{ + 0x0806050403020100, 0x11100e0d0c0b0a09, 0x1a19181615141312, 0x232221201e1d1c1b, + 0x2c2b2a2928262524, 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0f45444342414007, +} +var expandAVX512_56_outShufHi = [8]uint64{ + 0x11100d0c0b0a0908, 0x1a19181615141312, 0x232221201e1d1c1b, 0x2c2b2a2928262524, + 0x3534333231302e2d, 0x3e3d3c3b3a393836, 0x0e46454443424140, 0x50174c4b4a49480f, +} + +func expandAVX512_56(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_56_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_56_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_56_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_56_mat2).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_56_inShuf2).AsUint8x64() + v12 := 
simd.LoadUint64x8(&expandAVX512_56_outShufLo).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_56_outShufHi).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v10 := v0.Permute(v9) + v11 := v10.GaloisFieldAffineTransform(v8.AsUint64x8(), 0) + v13 := v4.ConcatPermute(v7, v12) + v15 := v7.ConcatPermute(v11, v14) + return v13.AsUint64x8(), v15.AsUint64x8() +} + +var expandAVX512_60_mat0 = [8]uint64{ + 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, 0x0404040404040404, + 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, 0x1010101020202020, +} +var expandAVX512_60_inShuf0 = [8]uint64{ + 0x0100000000000000, 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000, + 0xffffffffffffff00, 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00, +} +var expandAVX512_60_mat1 = [8]uint64{ + 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, 0x8080808080808080, + 0x0101010101010101, 0x0101010101010101, 0x0101010102020202, 0x0202020202020202, +} +var expandAVX512_60_inShuf1 = [8]uint64{ + 0xff00000000000000, 0xff00000000000000, 0xffffffffffffff00, 0xff00000000000000, + 0xffffffffff010101, 0x0202020202010101, 0xffffffffffff0201, 0xff01010101010101, +} +var expandAVX512_60_mat2 = [8]uint64{ + 0x0404040404040404, 0x0404040408080808, 0x0808080808080808, 0x1010101010101010, + 0x1010101020202020, 0x2020202020202020, 0x4040404040404040, 0x4040404080808080, +} +var expandAVX512_60_inShuf2 = [8]uint64{ + 0xff01010101010101, 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101, + 0xffffffffffffff01, 0xff01010101010101, 0xff01010101010101, 0xffffffffffffff01, +} +var expandAVX512_60_mat3 = [8]uint64{ + 0x8080808080808080, 0x0101010101010101, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_60_inShuf3 = [8]uint64{ + 0xff01010101010101, 0xffffffffffff0202, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, +} +var expandAVX512_60_outShufLo = [8]uint64{ + 0x0806050403020100, 0x1816151413121110, 0x28201e1d1c1b1a19, 0x31302e2d2c2b2a29, + 0x4140383635343332, 0x4a49484645444342, 0x5a5958504e4d4c4b, 0x626160075e5d5c5b, +} +var expandAVX512_60_outShufHi0 = [8]uint64{ + 0x3b3a3938302a2928, 0x44434241403e3d3c, 0x5453525150484645, 0x5d5c5b5a59585655, + 0x6d6c6b6a6968605e, 0x767574737271706e, 0xffffffffffffff78, 0x31ffff2f2e2d2c2b, +} +var expandAVX512_60_outShufHi1 = [8]uint64{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0x06050403020100ff, 0xff0908ffffffffff, +} + +func expandAVX512_60(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_60_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_60_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_60_mat1).AsUint8x64() + v6 := simd.LoadUint64x8(&expandAVX512_60_inShuf1).AsUint8x64() + v9 := simd.LoadUint64x8(&expandAVX512_60_mat2).AsUint8x64() + v10 := simd.LoadUint64x8(&expandAVX512_60_inShuf2).AsUint8x64() + v13 := simd.LoadUint64x8(&expandAVX512_60_mat3).AsUint8x64() + v14 := simd.LoadUint64x8(&expandAVX512_60_inShuf3).AsUint8x64() + v17 := simd.LoadUint64x8(&expandAVX512_60_outShufLo).AsUint8x64() + v19 := 
simd.LoadUint64x8(&expandAVX512_60_outShufHi0).AsUint8x64() + v20 := simd.LoadUint64x8(&expandAVX512_60_outShufHi1).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v7 := v0.Permute(v6) + v8 := v7.GaloisFieldAffineTransform(v5.AsUint64x8(), 0) + v11 := v0.Permute(v10) + v12 := v11.GaloisFieldAffineTransform(v9.AsUint64x8(), 0) + v15 := v0.Permute(v14) + v16 := v15.GaloisFieldAffineTransform(v13.AsUint64x8(), 0) + v18 := v4.ConcatPermute(v8, v17) + u0 := uint64(0x9f01ffffffffffff) + m0 := simd.Mask8x64FromBits(u0) + v21 := v8.ConcatPermute(v12, v19).Masked(m0) + u1 := uint64(0x60fe000000000000) + m1 := simd.Mask8x64FromBits(u1) + v22 := v16.Permute(v20).Masked(m1) + v23 := v21.Or(v22) + return v18.AsUint64x8(), v23.AsUint64x8() +} + +var expandAVX512_64_mat0 = [8]uint64{ + 0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808, + 0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080, +} +var expandAVX512_64_inShuf0 = [8]uint64{ + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, +} +var expandAVX512_64_inShuf1 = [8]uint64{ + 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, + 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, 0x0101010101010101, +} +var expandAVX512_64_outShufLo = [8]uint64{ + 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x1716151413121110, 0x1f1e1d1c1b1a1918, + 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x3736353433323130, 0x3f3e3d3c3b3a3938, +} + +func expandAVX512_64(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + v0 := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + v1 := simd.LoadUint64x8(&expandAVX512_64_mat0).AsUint8x64() + v2 := simd.LoadUint64x8(&expandAVX512_64_inShuf0).AsUint8x64() + v5 := simd.LoadUint64x8(&expandAVX512_64_inShuf1).AsUint8x64() + v8 := simd.LoadUint64x8(&expandAVX512_64_outShufLo).AsUint8x64() + v3 := v0.Permute(v2) + v4 := v3.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v6 := v0.Permute(v5) + v7 := v6.GaloisFieldAffineTransform(v1.AsUint64x8(), 0) + v9 := v4.Permute(v8) + v10 := v7.Permute(v8) + return v9.AsUint64x8(), v10.AsUint64x8() +} diff --git a/src/internal/runtime/gc/scan/expanders_amd64.s b/src/internal/runtime/gc/scan/expanders_amd64.s new file mode 100644 index 0000000000..c90d715673 --- /dev/null +++ b/src/internal/runtime/gc/scan/expanders_amd64.s @@ -0,0 +1,2631 @@ +// Code generated by mkasm.go. DO NOT EDIT. 
+ +#include "go_asm.h" +#include "textflag.h" + +GLOBL ·gcExpandersAVX512Asm(SB), RODATA, $0x220 +DATA ·gcExpandersAVX512Asm+0x00(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x08(SB)/8, $expandAVX512Asm_1<>(SB) +DATA ·gcExpandersAVX512Asm+0x10(SB)/8, $expandAVX512Asm_2<>(SB) +DATA ·gcExpandersAVX512Asm+0x18(SB)/8, $expandAVX512Asm_3<>(SB) +DATA ·gcExpandersAVX512Asm+0x20(SB)/8, $expandAVX512Asm_4<>(SB) +DATA ·gcExpandersAVX512Asm+0x28(SB)/8, $expandAVX512Asm_6<>(SB) +DATA ·gcExpandersAVX512Asm+0x30(SB)/8, $expandAVX512Asm_8<>(SB) +DATA ·gcExpandersAVX512Asm+0x38(SB)/8, $expandAVX512Asm_10<>(SB) +DATA ·gcExpandersAVX512Asm+0x40(SB)/8, $expandAVX512Asm_12<>(SB) +DATA ·gcExpandersAVX512Asm+0x48(SB)/8, $expandAVX512Asm_14<>(SB) +DATA ·gcExpandersAVX512Asm+0x50(SB)/8, $expandAVX512Asm_16<>(SB) +DATA ·gcExpandersAVX512Asm+0x58(SB)/8, $expandAVX512Asm_18<>(SB) +DATA ·gcExpandersAVX512Asm+0x60(SB)/8, $expandAVX512Asm_20<>(SB) +DATA ·gcExpandersAVX512Asm+0x68(SB)/8, $expandAVX512Asm_22<>(SB) +DATA ·gcExpandersAVX512Asm+0x70(SB)/8, $expandAVX512Asm_24<>(SB) +DATA ·gcExpandersAVX512Asm+0x78(SB)/8, $expandAVX512Asm_26<>(SB) +DATA ·gcExpandersAVX512Asm+0x80(SB)/8, $expandAVX512Asm_28<>(SB) +DATA ·gcExpandersAVX512Asm+0x88(SB)/8, $expandAVX512Asm_30<>(SB) +DATA ·gcExpandersAVX512Asm+0x90(SB)/8, $expandAVX512Asm_32<>(SB) +DATA ·gcExpandersAVX512Asm+0x98(SB)/8, $expandAVX512Asm_36<>(SB) +DATA ·gcExpandersAVX512Asm+0xa0(SB)/8, $expandAVX512Asm_40<>(SB) +DATA ·gcExpandersAVX512Asm+0xa8(SB)/8, $expandAVX512Asm_44<>(SB) +DATA ·gcExpandersAVX512Asm+0xb0(SB)/8, $expandAVX512Asm_48<>(SB) +DATA ·gcExpandersAVX512Asm+0xb8(SB)/8, $expandAVX512Asm_52<>(SB) +DATA ·gcExpandersAVX512Asm+0xc0(SB)/8, $expandAVX512Asm_56<>(SB) +DATA ·gcExpandersAVX512Asm+0xc8(SB)/8, $expandAVX512Asm_60<>(SB) +DATA ·gcExpandersAVX512Asm+0xd0(SB)/8, $expandAVX512Asm_64<>(SB) +DATA ·gcExpandersAVX512Asm+0xd8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0xe0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0xe8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0xf0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0xf8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x100(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x108(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x110(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x118(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x120(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x128(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x130(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x138(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x140(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x148(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x150(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x158(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x160(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x168(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x170(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x178(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x180(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x188(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x190(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x198(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1a0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1a8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1b0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1b8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1c0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1c8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1d0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1d8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1e0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1e8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1f0(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x1f8(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x200(SB)/8, $0 +DATA 
·gcExpandersAVX512Asm+0x208(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x210(SB)/8, $0 +DATA ·gcExpandersAVX512Asm+0x218(SB)/8, $0 + +TEXT expandAVX512Asm_1<>(SB), NOSPLIT, $0-0 + VMOVDQU64 (AX), Z1 + VMOVDQU64 64(AX), Z2 + RET + +GLOBL expandAVX512Asm_2_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_2_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_2_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_2_inShuf0<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_2_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_2_inShuf0<>+0x20(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_2_inShuf0<>+0x28(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_2_inShuf0<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_2_inShuf0<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918 + +GLOBL expandAVX512Asm_2_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_2_mat0<>+0x00(SB)/8, $0x0101020204040808 +DATA expandAVX512Asm_2_mat0<>+0x08(SB)/8, $0x1010202040408080 +DATA expandAVX512Asm_2_mat0<>+0x10(SB)/8, $0x0101020204040808 +DATA expandAVX512Asm_2_mat0<>+0x18(SB)/8, $0x1010202040408080 +DATA expandAVX512Asm_2_mat0<>+0x20(SB)/8, $0x0101020204040808 +DATA expandAVX512Asm_2_mat0<>+0x28(SB)/8, $0x1010202040408080 +DATA expandAVX512Asm_2_mat0<>+0x30(SB)/8, $0x0101020204040808 +DATA expandAVX512Asm_2_mat0<>+0x38(SB)/8, $0x1010202040408080 + +GLOBL expandAVX512Asm_2_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_2_inShuf1<>+0x00(SB)/8, $0x2726252423222120 +DATA expandAVX512Asm_2_inShuf1<>+0x08(SB)/8, $0x2726252423222120 +DATA expandAVX512Asm_2_inShuf1<>+0x10(SB)/8, $0x2f2e2d2c2b2a2928 +DATA expandAVX512Asm_2_inShuf1<>+0x18(SB)/8, $0x2f2e2d2c2b2a2928 +DATA expandAVX512Asm_2_inShuf1<>+0x20(SB)/8, $0x3736353433323130 +DATA expandAVX512Asm_2_inShuf1<>+0x28(SB)/8, $0x3736353433323130 +DATA expandAVX512Asm_2_inShuf1<>+0x30(SB)/8, $0x3f3e3d3c3b3a3938 +DATA expandAVX512Asm_2_inShuf1<>+0x38(SB)/8, $0x3f3e3d3c3b3a3938 + +GLOBL expandAVX512Asm_2_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_2_outShufLo+0x00(SB)/8, $0x0b030a0209010800 +DATA expandAVX512Asm_2_outShufLo+0x08(SB)/8, $0x0f070e060d050c04 +DATA expandAVX512Asm_2_outShufLo+0x10(SB)/8, $0x1b131a1219111810 +DATA expandAVX512Asm_2_outShufLo+0x18(SB)/8, $0x1f171e161d151c14 +DATA expandAVX512Asm_2_outShufLo+0x20(SB)/8, $0x2b232a2229212820 +DATA expandAVX512Asm_2_outShufLo+0x28(SB)/8, $0x2f272e262d252c24 +DATA expandAVX512Asm_2_outShufLo+0x30(SB)/8, $0x3b333a3239313830 +DATA expandAVX512Asm_2_outShufLo+0x38(SB)/8, $0x3f373e363d353c34 + +TEXT expandAVX512Asm_2<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_2_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_2_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512Asm_2_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_2_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512Asm_3_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_3_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_3_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_3_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_3_inShuf0<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_3_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_3_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_3_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_3_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL 
expandAVX512Asm_3_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_3_mat0<>+0x00(SB)/8, $0x0101010202020404 +DATA expandAVX512Asm_3_mat0<>+0x08(SB)/8, $0x0408080810101020 +DATA expandAVX512Asm_3_mat0<>+0x10(SB)/8, $0x2020404040808080 +DATA expandAVX512Asm_3_mat0<>+0x18(SB)/8, $0x0101010202020404 +DATA expandAVX512Asm_3_mat0<>+0x20(SB)/8, $0x0408080810101020 +DATA expandAVX512Asm_3_mat0<>+0x28(SB)/8, $0x2020404040808080 +DATA expandAVX512Asm_3_mat0<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_3_mat0<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_3_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_3_inShuf1<>+0x00(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_3_inShuf1<>+0x08(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_3_inShuf1<>+0x10(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_3_inShuf1<>+0x18(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_3_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_3_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_3_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_3_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_3_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_3_inShuf2<>+0x00(SB)/8, $0x2726252423222120 +DATA expandAVX512Asm_3_inShuf2<>+0x08(SB)/8, $0x2726252423222120 +DATA expandAVX512Asm_3_inShuf2<>+0x10(SB)/8, $0x2726252423222120 +DATA expandAVX512Asm_3_inShuf2<>+0x18(SB)/8, $0xffffffffff2a2928 +DATA expandAVX512Asm_3_inShuf2<>+0x20(SB)/8, $0xffffffffff2a2928 +DATA expandAVX512Asm_3_inShuf2<>+0x28(SB)/8, $0xffffffffffff2928 +DATA expandAVX512Asm_3_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_3_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_3_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_3_outShufLo+0x00(SB)/8, $0x0a02110901100800 +DATA expandAVX512Asm_3_outShufLo+0x08(SB)/8, $0x05140c04130b0312 +DATA expandAVX512Asm_3_outShufLo+0x10(SB)/8, $0x170f07160e06150d +DATA expandAVX512Asm_3_outShufLo+0x18(SB)/8, $0x221a292119282018 +DATA expandAVX512Asm_3_outShufLo+0x20(SB)/8, $0x1d2c241c2b231b2a +DATA expandAVX512Asm_3_outShufLo+0x28(SB)/8, $0x2f271f2e261e2d25 +DATA expandAVX512Asm_3_outShufLo+0x30(SB)/8, $0x4a42514941504840 +DATA expandAVX512Asm_3_outShufLo+0x38(SB)/8, $0x45544c44534b4352 + +GLOBL expandAVX512Asm_3_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_3_outShufHi+0x00(SB)/8, $0x170f07160e06150d +DATA expandAVX512Asm_3_outShufHi+0x08(SB)/8, $0x221a292119282018 +DATA expandAVX512Asm_3_outShufHi+0x10(SB)/8, $0x1d2c241c2b231b2a +DATA expandAVX512Asm_3_outShufHi+0x18(SB)/8, $0x2f271f2e261e2d25 +DATA expandAVX512Asm_3_outShufHi+0x20(SB)/8, $0x4a42514941504840 +DATA expandAVX512Asm_3_outShufHi+0x28(SB)/8, $0x45544c44534b4352 +DATA expandAVX512Asm_3_outShufHi+0x30(SB)/8, $0x574f47564e46554d +DATA expandAVX512Asm_3_outShufHi+0x38(SB)/8, $0x625a696159686058 + +TEXT expandAVX512Asm_3<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_3_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_3_mat0<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_3_inShuf1<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_3_inShuf2<>(SB), Z5 + VMOVDQU64 expandAVX512Asm_3_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_3_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z6 + VPERMB Z6, Z0, Z0 + VGF2P8AFFINEQB $0, Z3, Z0, Z0 + VPERMB Z6, Z4, Z4 + VGF2P8AFFINEQB $0, Z3, Z4, Z4 + VPERMB Z6, Z5, Z5 + VGF2P8AFFINEQB $0, Z3, Z5, Z3 + VPERMI2B Z4, Z0, Z1 + VPERMI2B Z3, Z4, Z2 + RET + +GLOBL expandAVX512Asm_4_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_4_inShuf0<>+0x00(SB)/8, 
$0x0706050403020100 +DATA expandAVX512Asm_4_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_4_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_4_inShuf0<>+0x18(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_4_inShuf0<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_4_inShuf0<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_4_inShuf0<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_4_inShuf0<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 + +GLOBL expandAVX512Asm_4_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_4_mat0<>+0x00(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_4_mat0<>+0x08(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_4_mat0<>+0x10(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_4_mat0<>+0x18(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_4_mat0<>+0x20(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_4_mat0<>+0x28(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_4_mat0<>+0x30(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_4_mat0<>+0x38(SB)/8, $0x4040404080808080 + +GLOBL expandAVX512Asm_4_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_4_inShuf1<>+0x00(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_4_inShuf1<>+0x08(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_4_inShuf1<>+0x10(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_4_inShuf1<>+0x18(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_4_inShuf1<>+0x20(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_4_inShuf1<>+0x28(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_4_inShuf1<>+0x30(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_4_inShuf1<>+0x38(SB)/8, $0x1f1e1d1c1b1a1918 + +GLOBL expandAVX512Asm_4_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_4_outShufLo+0x00(SB)/8, $0x1911090118100800 +DATA expandAVX512Asm_4_outShufLo+0x08(SB)/8, $0x1b130b031a120a02 +DATA expandAVX512Asm_4_outShufLo+0x10(SB)/8, $0x1d150d051c140c04 +DATA expandAVX512Asm_4_outShufLo+0x18(SB)/8, $0x1f170f071e160e06 +DATA expandAVX512Asm_4_outShufLo+0x20(SB)/8, $0x3931292138302820 +DATA expandAVX512Asm_4_outShufLo+0x28(SB)/8, $0x3b332b233a322a22 +DATA expandAVX512Asm_4_outShufLo+0x30(SB)/8, $0x3d352d253c342c24 +DATA expandAVX512Asm_4_outShufLo+0x38(SB)/8, $0x3f372f273e362e26 + +TEXT expandAVX512Asm_4<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_4_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_4_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512Asm_4_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_4_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512Asm_6_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_6_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_6_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_6_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_6_inShuf0<>+0x18(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_6_inShuf0<>+0x20(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_6_inShuf0<>+0x28(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_6_inShuf0<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_6_inShuf0<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_6_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_6_mat0<>+0x00(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_6_mat0<>+0x08(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_6_mat0<>+0x10(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_6_mat0<>+0x18(SB)/8, $0x1010101010102020 +DATA expandAVX512Asm_6_mat0<>+0x20(SB)/8, 
$0x2020202040404040 +DATA expandAVX512Asm_6_mat0<>+0x28(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_6_mat0<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_6_mat0<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_6_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_6_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_6_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_6_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_6_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_6_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_6_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_6_inShuf1<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_6_inShuf1<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_6_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_6_inShuf2<>+0x00(SB)/8, $0xffff151413121110 +DATA expandAVX512Asm_6_inShuf2<>+0x08(SB)/8, $0xffff151413121110 +DATA expandAVX512Asm_6_inShuf2<>+0x10(SB)/8, $0xffffff1413121110 +DATA expandAVX512Asm_6_inShuf2<>+0x18(SB)/8, $0xffffff1413121110 +DATA expandAVX512Asm_6_inShuf2<>+0x20(SB)/8, $0xffffff1413121110 +DATA expandAVX512Asm_6_inShuf2<>+0x28(SB)/8, $0xffffff1413121110 +DATA expandAVX512Asm_6_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_6_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_6_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_6_outShufLo+0x00(SB)/8, $0x0901282018100800 +DATA expandAVX512Asm_6_outShufLo+0x08(SB)/8, $0x1a120a0229211911 +DATA expandAVX512Asm_6_outShufLo+0x10(SB)/8, $0x2b231b130b032a22 +DATA expandAVX512Asm_6_outShufLo+0x18(SB)/8, $0x0d052c241c140c04 +DATA expandAVX512Asm_6_outShufLo+0x20(SB)/8, $0x1e160e062d251d15 +DATA expandAVX512Asm_6_outShufLo+0x28(SB)/8, $0x2f271f170f072e26 +DATA expandAVX512Asm_6_outShufLo+0x30(SB)/8, $0x4941686058504840 +DATA expandAVX512Asm_6_outShufLo+0x38(SB)/8, $0x5a524a4269615951 + +GLOBL expandAVX512Asm_6_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_6_outShufHi+0x00(SB)/8, $0x2b231b130b032a22 +DATA expandAVX512Asm_6_outShufHi+0x08(SB)/8, $0x0d052c241c140c04 +DATA expandAVX512Asm_6_outShufHi+0x10(SB)/8, $0x1e160e062d251d15 +DATA expandAVX512Asm_6_outShufHi+0x18(SB)/8, $0x2f271f170f072e26 +DATA expandAVX512Asm_6_outShufHi+0x20(SB)/8, $0x4941686058504840 +DATA expandAVX512Asm_6_outShufHi+0x28(SB)/8, $0x5a524a4269615951 +DATA expandAVX512Asm_6_outShufHi+0x30(SB)/8, $0x6b635b534b436a62 +DATA expandAVX512Asm_6_outShufHi+0x38(SB)/8, $0x4d456c645c544c44 + +TEXT expandAVX512Asm_6<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_6_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_6_mat0<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_6_inShuf1<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_6_inShuf2<>(SB), Z5 + VMOVDQU64 expandAVX512Asm_6_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_6_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z6 + VPERMB Z6, Z0, Z0 + VGF2P8AFFINEQB $0, Z3, Z0, Z0 + VPERMB Z6, Z4, Z4 + VGF2P8AFFINEQB $0, Z3, Z4, Z4 + VPERMB Z6, Z5, Z5 + VGF2P8AFFINEQB $0, Z3, Z5, Z3 + VPERMI2B Z4, Z0, Z1 + VPERMI2B Z3, Z4, Z2 + RET + +GLOBL expandAVX512Asm_8_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_8_inShuf0<>+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x08(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x10(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x18(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x20(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x28(SB)/8, 
$0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x30(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_8_inShuf0<>+0x38(SB)/8, $0x0706050403020100 + +GLOBL expandAVX512Asm_8_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_8_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_8_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_8_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_8_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_8_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_8_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_8_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_8_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_8_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_8_inShuf1<>+0x00(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x10(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x18(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x20(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x28(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x30(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_8_inShuf1<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 + +GLOBL expandAVX512Asm_8_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_8_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512Asm_8_outShufLo+0x08(SB)/8, $0x3931292119110901 +DATA expandAVX512Asm_8_outShufLo+0x10(SB)/8, $0x3a322a221a120a02 +DATA expandAVX512Asm_8_outShufLo+0x18(SB)/8, $0x3b332b231b130b03 +DATA expandAVX512Asm_8_outShufLo+0x20(SB)/8, $0x3c342c241c140c04 +DATA expandAVX512Asm_8_outShufLo+0x28(SB)/8, $0x3d352d251d150d05 +DATA expandAVX512Asm_8_outShufLo+0x30(SB)/8, $0x3e362e261e160e06 +DATA expandAVX512Asm_8_outShufLo+0x38(SB)/8, $0x3f372f271f170f07 + +TEXT expandAVX512Asm_8<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_8_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_8_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512Asm_8_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_8_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512Asm_10_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_inShuf0<>+0x00(SB)/8, $0xff06050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x08(SB)/8, $0xff06050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x10(SB)/8, $0xff06050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x18(SB)/8, $0xff06050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x20(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x28(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x30(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_10_inShuf0<>+0x38(SB)/8, $0xffff050403020100 + +GLOBL expandAVX512Asm_10_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_10_mat0<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512Asm_10_mat0<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_10_mat0<>+0x18(SB)/8, $0x0404040404040808 +DATA expandAVX512Asm_10_mat0<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_10_mat0<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_10_mat0<>+0x30(SB)/8, $0x1010202020202020 +DATA expandAVX512Asm_10_mat0<>+0x38(SB)/8, $0x2020202040404040 + +GLOBL expandAVX512Asm_10_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_inShuf1<>+0x00(SB)/8, 
$0xffff050403020100 +DATA expandAVX512Asm_10_inShuf1<>+0x08(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_10_inShuf1<>+0x10(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512Asm_10_inShuf1<>+0x18(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512Asm_10_inShuf1<>+0x20(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512Asm_10_inShuf1<>+0x28(SB)/8, $0xff0c0b0a09080706 +DATA expandAVX512Asm_10_inShuf1<>+0x30(SB)/8, $0xffff0b0a09080706 +DATA expandAVX512Asm_10_inShuf1<>+0x38(SB)/8, $0xffff0b0a09080706 + +GLOBL expandAVX512Asm_10_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_mat1<>+0x00(SB)/8, $0x4040404040408080 +DATA expandAVX512Asm_10_mat1<>+0x08(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_10_mat1<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_10_mat1<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_10_mat1<>+0x20(SB)/8, $0x1010202020202020 +DATA expandAVX512Asm_10_mat1<>+0x28(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_10_mat1<>+0x30(SB)/8, $0x4040404040408080 +DATA expandAVX512Asm_10_mat1<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_10_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_inShuf2<>+0x00(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512Asm_10_inShuf2<>+0x08(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512Asm_10_inShuf2<>+0x10(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512Asm_10_inShuf2<>+0x18(SB)/8, $0xffff0c0b0a090807 +DATA expandAVX512Asm_10_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_10_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_10_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_10_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_10_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_mat2<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_10_mat2<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512Asm_10_mat2<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_10_mat2<>+0x18(SB)/8, $0x0404040404040808 +DATA expandAVX512Asm_10_mat2<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_10_mat2<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_10_mat2<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_10_mat2<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_10_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512Asm_10_outShufLo+0x08(SB)/8, $0x2921191109014840 +DATA expandAVX512Asm_10_outShufLo+0x10(SB)/8, $0x1a120a0249413931 +DATA expandAVX512Asm_10_outShufLo+0x18(SB)/8, $0x0b034a423a322a22 +DATA expandAVX512Asm_10_outShufLo+0x20(SB)/8, $0x4b433b332b231b13 +DATA expandAVX512Asm_10_outShufLo+0x28(SB)/8, $0x3c342c241c140c04 +DATA expandAVX512Asm_10_outShufLo+0x30(SB)/8, $0x2d251d150d054c44 +DATA expandAVX512Asm_10_outShufLo+0x38(SB)/8, $0x1e160e064d453d35 + +GLOBL expandAVX512Asm_10_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_10_outShufHi+0x00(SB)/8, $0x4840383028201810 +DATA expandAVX512Asm_10_outShufHi+0x08(SB)/8, $0x3931292119115850 +DATA expandAVX512Asm_10_outShufHi+0x10(SB)/8, $0x2a221a1259514941 +DATA expandAVX512Asm_10_outShufHi+0x18(SB)/8, $0x1b135a524a423a32 +DATA expandAVX512Asm_10_outShufHi+0x20(SB)/8, $0x5b534b433b332b23 +DATA expandAVX512Asm_10_outShufHi+0x28(SB)/8, $0x4c443c342c241c14 +DATA expandAVX512Asm_10_outShufHi+0x30(SB)/8, $0x3d352d251d155c54 +DATA expandAVX512Asm_10_outShufHi+0x38(SB)/8, $0x2e261e165d554d45 + +TEXT expandAVX512Asm_10<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_10_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_10_inShuf1<>(SB), Z3 + 
VMOVDQU64 expandAVX512Asm_10_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_10_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_10_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_10_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_10_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_10_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512Asm_12_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_inShuf0<>+0x00(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x08(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x10(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x18(SB)/8, $0xffff050403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x20(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x28(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x30(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf0<>+0x38(SB)/8, $0xffffff0403020100 + +GLOBL expandAVX512Asm_12_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_12_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_12_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_12_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_12_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_12_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_12_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_12_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_12_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_inShuf1<>+0x00(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf1<>+0x08(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf1<>+0x10(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf1<>+0x18(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_12_inShuf1<>+0x20(SB)/8, $0xffff0a0908070605 +DATA expandAVX512Asm_12_inShuf1<>+0x28(SB)/8, $0xffff0a0908070605 +DATA expandAVX512Asm_12_inShuf1<>+0x30(SB)/8, $0xffff0a0908070605 +DATA expandAVX512Asm_12_inShuf1<>+0x38(SB)/8, $0xffff0a0908070605 + +GLOBL expandAVX512Asm_12_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_12_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_12_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_12_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_12_mat1<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_12_mat1<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_12_mat1<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_12_mat1<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_12_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_inShuf2<>+0x00(SB)/8, $0xffffff0908070605 +DATA expandAVX512Asm_12_inShuf2<>+0x08(SB)/8, $0xffffff0908070605 +DATA expandAVX512Asm_12_inShuf2<>+0x10(SB)/8, $0xffffff0908070605 +DATA expandAVX512Asm_12_inShuf2<>+0x18(SB)/8, $0xffffff0908070605 +DATA expandAVX512Asm_12_inShuf2<>+0x20(SB)/8, $0xffffff0a09080706 +DATA expandAVX512Asm_12_inShuf2<>+0x28(SB)/8, $0xffffff0a09080706 +DATA expandAVX512Asm_12_inShuf2<>+0x30(SB)/8, $0xffffff0a09080706 +DATA expandAVX512Asm_12_inShuf2<>+0x38(SB)/8, $0xffffff0a09080706 + +GLOBL expandAVX512Asm_12_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_mat2<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_12_mat2<>+0x08(SB)/8, $0x4040404040404040 +DATA 
expandAVX512Asm_12_mat2<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_12_mat2<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_12_mat2<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_12_mat2<>+0x28(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_12_mat2<>+0x30(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_12_mat2<>+0x38(SB)/8, $0x0404040404040404 + +GLOBL expandAVX512Asm_12_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512Asm_12_outShufLo+0x08(SB)/8, $0x1911090158504840 +DATA expandAVX512Asm_12_outShufLo+0x10(SB)/8, $0x5951494139312921 +DATA expandAVX512Asm_12_outShufLo+0x18(SB)/8, $0x3a322a221a120a02 +DATA expandAVX512Asm_12_outShufLo+0x20(SB)/8, $0x1b130b035a524a42 +DATA expandAVX512Asm_12_outShufLo+0x28(SB)/8, $0x5b534b433b332b23 +DATA expandAVX512Asm_12_outShufLo+0x30(SB)/8, $0x3c342c241c140c04 +DATA expandAVX512Asm_12_outShufLo+0x38(SB)/8, $0x1d150d055c544c44 + +GLOBL expandAVX512Asm_12_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_12_outShufHi+0x00(SB)/8, $0x5850484038302820 +DATA expandAVX512Asm_12_outShufHi+0x08(SB)/8, $0x3931292178706860 +DATA expandAVX512Asm_12_outShufHi+0x10(SB)/8, $0x7971696159514941 +DATA expandAVX512Asm_12_outShufHi+0x18(SB)/8, $0x5a524a423a322a22 +DATA expandAVX512Asm_12_outShufHi+0x20(SB)/8, $0x3b332b237a726a62 +DATA expandAVX512Asm_12_outShufHi+0x28(SB)/8, $0x7b736b635b534b43 +DATA expandAVX512Asm_12_outShufHi+0x30(SB)/8, $0x5c544c443c342c24 +DATA expandAVX512Asm_12_outShufHi+0x38(SB)/8, $0x3d352d257c746c64 + +TEXT expandAVX512Asm_12<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_12_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_12_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_12_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_12_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_12_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_12_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_12_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_12_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512Asm_14_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_inShuf0<>+0x00(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x08(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x10(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x18(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x20(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x28(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x30(SB)/8, $0xffffff0403020100 +DATA expandAVX512Asm_14_inShuf0<>+0x38(SB)/8, $0xffffff0403020100 + +GLOBL expandAVX512Asm_14_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_14_mat0<>+0x08(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_14_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_14_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_14_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_14_mat0<>+0x28(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_14_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_14_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512Asm_14_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_inShuf1<>+0x00(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_14_inShuf1<>+0x08(SB)/8, $0xffffffff03020100 +DATA 
expandAVX512Asm_14_inShuf1<>+0x10(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_14_inShuf1<>+0x18(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_14_inShuf1<>+0x20(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_14_inShuf1<>+0x28(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_14_inShuf1<>+0x30(SB)/8, $0xffffff0807060504 +DATA expandAVX512Asm_14_inShuf1<>+0x38(SB)/8, $0xffffff0807060504 + +GLOBL expandAVX512Asm_14_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_mat1<>+0x00(SB)/8, $0x1010101010102020 +DATA expandAVX512Asm_14_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_14_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_14_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_14_mat1<>+0x20(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_14_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_14_mat1<>+0x30(SB)/8, $0x1010101010102020 +DATA expandAVX512Asm_14_mat1<>+0x38(SB)/8, $0x2020202020202020 + +GLOBL expandAVX512Asm_14_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_inShuf2<>+0x00(SB)/8, $0xffffff0807060504 +DATA expandAVX512Asm_14_inShuf2<>+0x08(SB)/8, $0xffffff0807060504 +DATA expandAVX512Asm_14_inShuf2<>+0x10(SB)/8, $0xffffff0807060504 +DATA expandAVX512Asm_14_inShuf2<>+0x18(SB)/8, $0xffffff0807060504 +DATA expandAVX512Asm_14_inShuf2<>+0x20(SB)/8, $0xffffff0908070605 +DATA expandAVX512Asm_14_inShuf2<>+0x28(SB)/8, $0xffffff0908070605 +DATA expandAVX512Asm_14_inShuf2<>+0x30(SB)/8, $0xffffffff08070605 +DATA expandAVX512Asm_14_inShuf2<>+0x38(SB)/8, $0xffffffff08070605 + +GLOBL expandAVX512Asm_14_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_mat2<>+0x00(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_14_mat2<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_14_mat2<>+0x10(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_14_mat2<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_14_mat2<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_14_mat2<>+0x28(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_14_mat2<>+0x30(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_14_mat2<>+0x38(SB)/8, $0x0202020204040404 + +GLOBL expandAVX512Asm_14_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_inShuf3<>+0x00(SB)/8, $0xffffffff08070605 +DATA expandAVX512Asm_14_inShuf3<>+0x08(SB)/8, $0xffffffff08070605 +DATA expandAVX512Asm_14_inShuf3<>+0x10(SB)/8, $0xffffffff08070605 +DATA expandAVX512Asm_14_inShuf3<>+0x18(SB)/8, $0xffffffff08070605 +DATA expandAVX512Asm_14_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_14_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_14_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_14_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_14_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_mat3<>+0x00(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_14_mat3<>+0x08(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_14_mat3<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_14_mat3<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_14_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_14_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_14_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_14_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_14_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_outShufLo+0x00(SB)/8, $0x3830282018100800 +DATA expandAVX512Asm_14_outShufLo+0x08(SB)/8, $0x0901686058504840 +DATA expandAVX512Asm_14_outShufLo+0x10(SB)/8, $0x4941393129211911 +DATA 
expandAVX512Asm_14_outShufLo+0x18(SB)/8, $0x1a120a0269615951 +DATA expandAVX512Asm_14_outShufLo+0x20(SB)/8, $0x5a524a423a322a22 +DATA expandAVX512Asm_14_outShufLo+0x28(SB)/8, $0x2b231b130b036a62 +DATA expandAVX512Asm_14_outShufLo+0x30(SB)/8, $0x6b635b534b433b33 +DATA expandAVX512Asm_14_outShufLo+0x38(SB)/8, $0x3c342c241c140c04 + +GLOBL expandAVX512Asm_14_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_outShufHi0+0x00(SB)/8, $0x6860585048403830 +DATA expandAVX512Asm_14_outShufHi0+0x08(SB)/8, $0x3931ffffffff7870 +DATA expandAVX512Asm_14_outShufHi0+0x10(SB)/8, $0x7971696159514941 +DATA expandAVX512Asm_14_outShufHi0+0x18(SB)/8, $0x4a423a32ffffffff +DATA expandAVX512Asm_14_outShufHi0+0x20(SB)/8, $0xffff7a726a625a52 +DATA expandAVX512Asm_14_outShufHi0+0x28(SB)/8, $0x5b534b433b33ffff +DATA expandAVX512Asm_14_outShufHi0+0x30(SB)/8, $0xffffffff7b736b63 +DATA expandAVX512Asm_14_outShufHi0+0x38(SB)/8, $0x6c645c544c443c34 + +GLOBL expandAVX512Asm_14_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_14_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_14_outShufHi1+0x08(SB)/8, $0xffff18100800ffff +DATA expandAVX512Asm_14_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_14_outShufHi1+0x18(SB)/8, $0xffffffff19110901 +DATA expandAVX512Asm_14_outShufHi1+0x20(SB)/8, $0x0a02ffffffffffff +DATA expandAVX512Asm_14_outShufHi1+0x28(SB)/8, $0xffffffffffff1a12 +DATA expandAVX512Asm_14_outShufHi1+0x30(SB)/8, $0x1b130b03ffffffff +DATA expandAVX512Asm_14_outShufHi1+0x38(SB)/8, $0xffffffffffffffff + +TEXT expandAVX512Asm_14<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_14_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_14_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_14_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_14_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_14_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_14_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_14_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_14_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xff0ffc3ff0ffc3ff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0xf003c00f003c00, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_16_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_16_inShuf0<>+0x00(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x08(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x10(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x18(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x20(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x28(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x30(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_16_inShuf0<>+0x38(SB)/8, $0x0303020201010000 + +GLOBL expandAVX512Asm_16_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_16_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_16_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_16_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_16_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_16_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_16_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_16_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA 
expandAVX512Asm_16_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_16_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_16_inShuf1<>+0x00(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x08(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x10(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x18(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x20(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x28(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x30(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_16_inShuf1<>+0x38(SB)/8, $0x0707060605050404 + +GLOBL expandAVX512Asm_16_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_16_outShufLo+0x00(SB)/8, $0x1918111009080100 +DATA expandAVX512Asm_16_outShufLo+0x08(SB)/8, $0x3938313029282120 +DATA expandAVX512Asm_16_outShufLo+0x10(SB)/8, $0x1b1a13120b0a0302 +DATA expandAVX512Asm_16_outShufLo+0x18(SB)/8, $0x3b3a33322b2a2322 +DATA expandAVX512Asm_16_outShufLo+0x20(SB)/8, $0x1d1c15140d0c0504 +DATA expandAVX512Asm_16_outShufLo+0x28(SB)/8, $0x3d3c35342d2c2524 +DATA expandAVX512Asm_16_outShufLo+0x30(SB)/8, $0x1f1e17160f0e0706 +DATA expandAVX512Asm_16_outShufLo+0x38(SB)/8, $0x3f3e37362f2e2726 + +TEXT expandAVX512Asm_16<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_16_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_16_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512Asm_16_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_16_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512Asm_18_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_inShuf0<>+0x00(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_18_inShuf0<>+0x08(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_18_inShuf0<>+0x10(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_18_inShuf0<>+0x18(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_18_inShuf0<>+0x20(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_18_inShuf0<>+0x28(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_18_inShuf0<>+0x30(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_18_inShuf0<>+0x38(SB)/8, $0xff03020201010000 + +GLOBL expandAVX512Asm_18_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_18_mat0<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512Asm_18_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_18_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_18_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_18_mat0<>+0x28(SB)/8, $0x0404040404040808 +DATA expandAVX512Asm_18_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_18_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512Asm_18_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_inShuf1<>+0x00(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_18_inShuf1<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_18_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_18_inShuf1<>+0x18(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_18_inShuf1<>+0x20(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_18_inShuf1<>+0x28(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_18_inShuf1<>+0x30(SB)/8, $0xff06060505040403 +DATA expandAVX512Asm_18_inShuf1<>+0x38(SB)/8, $0xffffffff06050403 + +GLOBL expandAVX512Asm_18_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_mat1<>+0x00(SB)/8, $0x1010202020202020 +DATA 
expandAVX512Asm_18_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_18_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_18_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_18_mat1<>+0x20(SB)/8, $0x4040404040408080 +DATA expandAVX512Asm_18_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_18_mat1<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_18_mat1<>+0x38(SB)/8, $0x1010202020202020 + +GLOBL expandAVX512Asm_18_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_inShuf2<>+0x00(SB)/8, $0xffffffff06050403 +DATA expandAVX512Asm_18_inShuf2<>+0x08(SB)/8, $0xffffffff06050403 +DATA expandAVX512Asm_18_inShuf2<>+0x10(SB)/8, $0xffffffff06050403 +DATA expandAVX512Asm_18_inShuf2<>+0x18(SB)/8, $0xffffffff06050403 +DATA expandAVX512Asm_18_inShuf2<>+0x20(SB)/8, $0x0606050504040303 +DATA expandAVX512Asm_18_inShuf2<>+0x28(SB)/8, $0x0707060605050404 +DATA expandAVX512Asm_18_inShuf2<>+0x30(SB)/8, $0xffffffffff060504 +DATA expandAVX512Asm_18_inShuf2<>+0x38(SB)/8, $0xffffffffff060504 + +GLOBL expandAVX512Asm_18_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_mat2<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_18_mat2<>+0x08(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_18_mat2<>+0x10(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_18_mat2<>+0x18(SB)/8, $0x4040404040408080 +DATA expandAVX512Asm_18_mat2<>+0x20(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_18_mat2<>+0x28(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_18_mat2<>+0x30(SB)/8, $0x0101020202020202 +DATA expandAVX512Asm_18_mat2<>+0x38(SB)/8, $0x0202020202020202 + +GLOBL expandAVX512Asm_18_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_inShuf3<>+0x00(SB)/8, $0xffffffffff060504 +DATA expandAVX512Asm_18_inShuf3<>+0x08(SB)/8, $0xffffffffff060504 +DATA expandAVX512Asm_18_inShuf3<>+0x10(SB)/8, $0xffffffffff060504 +DATA expandAVX512Asm_18_inShuf3<>+0x18(SB)/8, $0xffff060605050404 +DATA expandAVX512Asm_18_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_18_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_18_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_18_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_18_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_mat3<>+0x00(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_18_mat3<>+0x08(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_18_mat3<>+0x10(SB)/8, $0x0404040404040808 +DATA expandAVX512Asm_18_mat3<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_18_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_18_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_18_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_18_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_18_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_outShufLo+0x00(SB)/8, $0x3028201810080100 +DATA expandAVX512Asm_18_outShufLo+0x08(SB)/8, $0x6058504840393831 +DATA expandAVX512Asm_18_outShufLo+0x10(SB)/8, $0x2119110903026968 +DATA expandAVX512Asm_18_outShufLo+0x18(SB)/8, $0x5149413b3a333229 +DATA expandAVX512Asm_18_outShufLo+0x20(SB)/8, $0x120a05046b6a6159 +DATA expandAVX512Asm_18_outShufLo+0x28(SB)/8, $0x423d3c35342a221a +DATA expandAVX512Asm_18_outShufLo+0x30(SB)/8, $0x07066d6c625a524a +DATA expandAVX512Asm_18_outShufLo+0x38(SB)/8, $0x3e37362b231b130b + +GLOBL expandAVX512Asm_18_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_outShufHi0+0x00(SB)/8, $0x6160585048403830 +DATA expandAVX512Asm_18_outShufHi0+0x08(SB)/8, $0xffffffff78706968 +DATA 
expandAVX512Asm_18_outShufHi0+0x10(SB)/8, $0x59514941393231ff +DATA expandAVX512Asm_18_outShufHi0+0x18(SB)/8, $0xffff79716b6a6362 +DATA expandAVX512Asm_18_outShufHi0+0x20(SB)/8, $0x4a423a3433ffffff +DATA expandAVX512Asm_18_outShufHi0+0x28(SB)/8, $0x7a726d6c65645a52 +DATA expandAVX512Asm_18_outShufHi0+0x30(SB)/8, $0x3b3635ffffffffff +DATA expandAVX512Asm_18_outShufHi0+0x38(SB)/8, $0x6f6e67665b534b43 + +GLOBL expandAVX512Asm_18_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_18_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_18_outShufHi1+0x08(SB)/8, $0x18100800ffffffff +DATA expandAVX512Asm_18_outShufHi1+0x10(SB)/8, $0xffffffffffffff19 +DATA expandAVX512Asm_18_outShufHi1+0x18(SB)/8, $0x0901ffffffffffff +DATA expandAVX512Asm_18_outShufHi1+0x20(SB)/8, $0xffffffffff1b1a11 +DATA expandAVX512Asm_18_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_18_outShufHi1+0x30(SB)/8, $0xffffff1d1c120a02 +DATA expandAVX512Asm_18_outShufHi1+0x38(SB)/8, $0xffffffffffffffff + +TEXT expandAVX512Asm_18<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_18_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_18_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_18_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_18_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_18_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_18_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_18_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_18_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xffe0fff83ffe0fff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x1f0007c001f000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_20_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_inShuf0<>+0x00(SB)/8, $0x0303020201010000 +DATA expandAVX512Asm_20_inShuf0<>+0x08(SB)/8, $0xffffffff03020100 +DATA expandAVX512Asm_20_inShuf0<>+0x10(SB)/8, $0xff03020201010000 +DATA expandAVX512Asm_20_inShuf0<>+0x18(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_20_inShuf0<>+0x20(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_20_inShuf0<>+0x28(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_20_inShuf0<>+0x30(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_20_inShuf0<>+0x38(SB)/8, $0xffffffffff020100 + +GLOBL expandAVX512Asm_20_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_20_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_20_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_20_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_20_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_20_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_20_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_20_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_20_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_inShuf1<>+0x00(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_20_inShuf1<>+0x08(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_20_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_20_inShuf1<>+0x18(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_20_inShuf1<>+0x20(SB)/8, $0xff06060505040403 +DATA expandAVX512Asm_20_inShuf1<>+0x28(SB)/8, $0x0606050504040303 +DATA 
expandAVX512Asm_20_inShuf1<>+0x30(SB)/8, $0xffffffff06050403 +DATA expandAVX512Asm_20_inShuf1<>+0x38(SB)/8, $0xffff050504040303 + +GLOBL expandAVX512Asm_20_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_20_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_20_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_20_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_20_mat1<>+0x20(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_20_mat1<>+0x28(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_20_mat1<>+0x30(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_20_mat1<>+0x38(SB)/8, $0x0808080808080808 + +GLOBL expandAVX512Asm_20_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_inShuf2<>+0x00(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_20_inShuf2<>+0x08(SB)/8, $0xffffffffff050403 +DATA expandAVX512Asm_20_inShuf2<>+0x10(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_20_inShuf2<>+0x18(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_20_inShuf2<>+0x20(SB)/8, $0xffffffffff050403 +DATA expandAVX512Asm_20_inShuf2<>+0x28(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_20_inShuf2<>+0x30(SB)/8, $0xffff060605050404 +DATA expandAVX512Asm_20_inShuf2<>+0x38(SB)/8, $0xffffffffff060504 + +GLOBL expandAVX512Asm_20_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_mat2<>+0x00(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_20_mat2<>+0x08(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_20_mat2<>+0x10(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_20_mat2<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_20_mat2<>+0x20(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_20_mat2<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_20_mat2<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_20_mat2<>+0x38(SB)/8, $0x0101010102020202 + +GLOBL expandAVX512Asm_20_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_outShufLo+0x00(SB)/8, $0x2019181110080100 +DATA expandAVX512Asm_20_outShufLo+0x08(SB)/8, $0x4841403831302928 +DATA expandAVX512Asm_20_outShufLo+0x10(SB)/8, $0x1209030259585049 +DATA expandAVX512Asm_20_outShufLo+0x18(SB)/8, $0x33322b2a211b1a13 +DATA expandAVX512Asm_20_outShufLo+0x20(SB)/8, $0x5b5a514b4a434239 +DATA expandAVX512Asm_20_outShufLo+0x28(SB)/8, $0x221d1c15140a0504 +DATA expandAVX512Asm_20_outShufLo+0x30(SB)/8, $0x4c45443a35342d2c +DATA expandAVX512Asm_20_outShufLo+0x38(SB)/8, $0x160b07065d5c524d + +GLOBL expandAVX512Asm_20_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_20_outShufHi+0x00(SB)/8, $0x4140393830292820 +DATA expandAVX512Asm_20_outShufHi+0x08(SB)/8, $0x6968605958515048 +DATA expandAVX512Asm_20_outShufHi+0x10(SB)/8, $0x312b2a2221787170 +DATA expandAVX512Asm_20_outShufHi+0x18(SB)/8, $0x5a53524943423b3a +DATA expandAVX512Asm_20_outShufHi+0x20(SB)/8, $0x237973726b6a615b +DATA expandAVX512Asm_20_outShufHi+0x28(SB)/8, $0x45443d3c322d2c24 +DATA expandAVX512Asm_20_outShufHi+0x30(SB)/8, $0x6d6c625d5c55544a +DATA expandAVX512Asm_20_outShufHi+0x38(SB)/8, $0x332f2e26257a7574 + +TEXT expandAVX512Asm_20<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_20_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_20_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_20_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_20_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_20_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_20_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_20_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, 
expandAVX512Asm_20_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512Asm_22_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_inShuf0<>+0x00(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_22_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_22_inShuf0<>+0x10(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_22_inShuf0<>+0x18(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_22_inShuf0<>+0x20(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_22_inShuf0<>+0x28(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_22_inShuf0<>+0x30(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_22_inShuf0<>+0x38(SB)/8, $0xffff020201010000 + +GLOBL expandAVX512Asm_22_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_22_mat0<>+0x08(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_22_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_22_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_22_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_22_mat0<>+0x28(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_22_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_22_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512Asm_22_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_inShuf1<>+0x00(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_22_inShuf1<>+0x08(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_22_inShuf1<>+0x10(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_22_inShuf1<>+0x18(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_22_inShuf1<>+0x20(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_22_inShuf1<>+0x28(SB)/8, $0xffffffff01010000 +DATA expandAVX512Asm_22_inShuf1<>+0x30(SB)/8, $0xffff040403030202 +DATA expandAVX512Asm_22_inShuf1<>+0x38(SB)/8, $0xffff050504040303 + +GLOBL expandAVX512Asm_22_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_mat1<>+0x00(SB)/8, $0x1010101010102020 +DATA expandAVX512Asm_22_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_22_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_22_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_22_mat1<>+0x20(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_22_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_22_mat1<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_22_mat1<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512Asm_22_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_inShuf2<>+0x00(SB)/8, $0xffffffffff050403 +DATA expandAVX512Asm_22_inShuf2<>+0x08(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_22_inShuf2<>+0x10(SB)/8, $0xffffffffff050403 +DATA expandAVX512Asm_22_inShuf2<>+0x18(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_22_inShuf2<>+0x20(SB)/8, $0xffffffffff050403 +DATA expandAVX512Asm_22_inShuf2<>+0x28(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_22_inShuf2<>+0x30(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_22_inShuf2<>+0x38(SB)/8, $0xffffffffff050403 + +GLOBL expandAVX512Asm_22_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_mat2<>+0x00(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_22_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_22_mat2<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_22_mat2<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_22_mat2<>+0x20(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_22_mat2<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_22_mat2<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_22_mat2<>+0x38(SB)/8, 
$0x1010101010102020 + +GLOBL expandAVX512Asm_22_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_inShuf3<>+0x00(SB)/8, $0xffff050504040303 +DATA expandAVX512Asm_22_inShuf3<>+0x08(SB)/8, $0xffffffffff050403 +DATA expandAVX512Asm_22_inShuf3<>+0x10(SB)/8, $0xffffff0504040303 +DATA expandAVX512Asm_22_inShuf3<>+0x18(SB)/8, $0xffffffffffff0403 +DATA expandAVX512Asm_22_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_22_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_mat3<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_22_mat3<>+0x08(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_22_mat3<>+0x10(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_22_mat3<>+0x18(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_22_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_22_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_22_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_22_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_22_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_outShufLo+0x00(SB)/8, $0x2120181110080100 +DATA expandAVX512Asm_22_outShufLo+0x08(SB)/8, $0x4948403938313028 +DATA expandAVX512Asm_22_outShufLo+0x10(SB)/8, $0x0302696860595850 +DATA expandAVX512Asm_22_outShufLo+0x18(SB)/8, $0x3229232219131209 +DATA expandAVX512Asm_22_outShufLo+0x20(SB)/8, $0x5a514b4a413b3a33 +DATA expandAVX512Asm_22_outShufLo+0x28(SB)/8, $0x140a05046b6a615b +DATA expandAVX512Asm_22_outShufLo+0x30(SB)/8, $0x3c35342a25241a15 +DATA expandAVX512Asm_22_outShufLo+0x38(SB)/8, $0x625d5c524d4c423d + +GLOBL expandAVX512Asm_22_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_outShufHi0+0x00(SB)/8, $0x5049484039383130 +DATA expandAVX512Asm_22_outShufHi0+0x08(SB)/8, $0x7871706968605958 +DATA expandAVX512Asm_22_outShufHi0+0x10(SB)/8, $0x3332ffffffffffff +DATA expandAVX512Asm_22_outShufHi0+0x18(SB)/8, $0x5b5a514b4a413b3a +DATA expandAVX512Asm_22_outShufHi0+0x20(SB)/8, $0xffff7973726b6a61 +DATA expandAVX512Asm_22_outShufHi0+0x28(SB)/8, $0x3d3c3534ffffffff +DATA expandAVX512Asm_22_outShufHi0+0x30(SB)/8, $0x6c625d5c524d4c42 +DATA expandAVX512Asm_22_outShufHi0+0x38(SB)/8, $0xffffffff7a75746d + +GLOBL expandAVX512Asm_22_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_22_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_outShufHi1+0x10(SB)/8, $0xffff181110080100 +DATA expandAVX512Asm_22_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_outShufHi1+0x20(SB)/8, $0x0302ffffffffffff +DATA expandAVX512Asm_22_outShufHi1+0x28(SB)/8, $0xffffffff19131209 +DATA expandAVX512Asm_22_outShufHi1+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_22_outShufHi1+0x38(SB)/8, $0x140a0504ffffffff + +TEXT expandAVX512Asm_22<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_22_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_22_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_22_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_22_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_22_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_22_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_22_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat1<>(SB), Z2, Z2 + 
VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_22_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xffff03fffc0ffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0xf0000fc0003f0000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_24_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x08(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x10(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x18(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x20(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x28(SB)/8, $0xff02010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x30(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_24_inShuf0<>+0x38(SB)/8, $0xffff010101000000 + +GLOBL expandAVX512Asm_24_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_24_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_24_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_24_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_24_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_24_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_24_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_24_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_24_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_inShuf1<>+0x00(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_24_inShuf1<>+0x08(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_24_inShuf1<>+0x10(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_24_inShuf1<>+0x18(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_24_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_24_inShuf1<>+0x28(SB)/8, $0x0404040303030202 +DATA expandAVX512Asm_24_inShuf1<>+0x30(SB)/8, $0x0404030303020202 +DATA expandAVX512Asm_24_inShuf1<>+0x38(SB)/8, $0x0404030303020202 + +GLOBL expandAVX512Asm_24_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_inShuf2<>+0x00(SB)/8, $0x0505040404030303 +DATA expandAVX512Asm_24_inShuf2<>+0x08(SB)/8, $0x0505040404030303 +DATA expandAVX512Asm_24_inShuf2<>+0x10(SB)/8, $0x0505040404030303 +DATA expandAVX512Asm_24_inShuf2<>+0x18(SB)/8, $0xffff040404030303 +DATA expandAVX512Asm_24_inShuf2<>+0x20(SB)/8, $0xffff040404030303 +DATA expandAVX512Asm_24_inShuf2<>+0x28(SB)/8, $0xffffffffffffff04 +DATA expandAVX512Asm_24_inShuf2<>+0x30(SB)/8, $0xffffffffffffff04 +DATA expandAVX512Asm_24_inShuf2<>+0x38(SB)/8, $0xffffffffffffff05 + +GLOBL expandAVX512Asm_24_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_mat2<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_24_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_24_mat2<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_24_mat2<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_24_mat2<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_24_mat2<>+0x28(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_24_mat2<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_24_mat2<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512Asm_24_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_inShuf3<>+0x00(SB)/8, $0xffffffffffffff05 +DATA expandAVX512Asm_24_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff +DATA 
expandAVX512Asm_24_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_24_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_mat3<>+0x00(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_24_mat3<>+0x08(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_24_mat3<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_24_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_24_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_24_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_24_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_24_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_24_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_outShufLo+0x00(SB)/8, $0x11100a0908020100 +DATA expandAVX512Asm_24_outShufLo+0x08(SB)/8, $0x282221201a191812 +DATA expandAVX512Asm_24_outShufLo+0x10(SB)/8, $0x3a39383231302a29 +DATA expandAVX512Asm_24_outShufLo+0x18(SB)/8, $0x14130d0c0b050403 +DATA expandAVX512Asm_24_outShufLo+0x20(SB)/8, $0x2b2524231d1c1b15 +DATA expandAVX512Asm_24_outShufLo+0x28(SB)/8, $0x3d3c3b3534332d2c +DATA expandAVX512Asm_24_outShufLo+0x30(SB)/8, $0x1716480f0e400706 +DATA expandAVX512Asm_24_outShufLo+0x38(SB)/8, $0x2e602726581f1e50 + +GLOBL expandAVX512Asm_24_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_outShufHi0+0x00(SB)/8, $0x3a39383231302928 +DATA expandAVX512Asm_24_outShufHi0+0x08(SB)/8, $0x51504a4948424140 +DATA expandAVX512Asm_24_outShufHi0+0x10(SB)/8, $0x2a6261605a595852 +DATA expandAVX512Asm_24_outShufHi0+0x18(SB)/8, $0x3d3c3b3534332c2b +DATA expandAVX512Asm_24_outShufHi0+0x20(SB)/8, $0x54534d4c4b454443 +DATA expandAVX512Asm_24_outShufHi0+0x28(SB)/8, $0x2d6564635d5c5b55 +DATA expandAVX512Asm_24_outShufHi0+0x30(SB)/8, $0x703f3e6837362f2e +DATA expandAVX512Asm_24_outShufHi0+0x38(SB)/8, $0x5756ff4f4e784746 + +GLOBL expandAVX512Asm_24_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_24_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_24_outShufHi1+0x38(SB)/8, $0xffff00ffffffffff + +TEXT expandAVX512Asm_24<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_24_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_24_mat0<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_24_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_24_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_24_inShuf3<>(SB), Z5 + VMOVDQU64 expandAVX512Asm_24_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_24_outShufHi0(SB), Z6 + VMOVDQU64 expandAVX512Asm_24_outShufHi1(SB), Z7 + VMOVDQU64 (AX), Z8 + VPERMB Z8, Z0, Z0 + VGF2P8AFFINEQB $0, Z2, Z0, Z0 + VPERMB Z8, Z3, Z3 + VGF2P8AFFINEQB $0, Z2, Z3, Z2 + VPERMB Z8, Z4, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_24_mat2<>(SB), Z3, Z3 + VPERMB Z8, Z5, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_24_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xdfffffffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z6 + MOVQ $0x2000000000000000, AX + KMOVQ AX, 
K1 + VPERMB.Z Z4, Z7, K1, Z0 + VPORQ Z0, Z6, Z2 + RET + +GLOBL expandAVX512Asm_26_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_26_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_26_inShuf0<>+0x10(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_26_inShuf0<>+0x18(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_26_inShuf0<>+0x20(SB)/8, $0xffff020201010000 +DATA expandAVX512Asm_26_inShuf0<>+0x28(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_26_inShuf0<>+0x30(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_26_inShuf0<>+0x38(SB)/8, $0xffff010101000000 + +GLOBL expandAVX512Asm_26_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_26_mat0<>+0x08(SB)/8, $0x0101020202020202 +DATA expandAVX512Asm_26_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_26_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_26_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_26_mat0<>+0x28(SB)/8, $0x0404040404040808 +DATA expandAVX512Asm_26_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_26_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512Asm_26_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_26_inShuf1<>+0x08(SB)/8, $0xffffffff01010000 +DATA expandAVX512Asm_26_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_26_inShuf1<>+0x18(SB)/8, $0xffffffff01010000 +DATA expandAVX512Asm_26_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_26_inShuf1<>+0x28(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_26_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_26_inShuf1<>+0x38(SB)/8, $0xff04040403030302 + +GLOBL expandAVX512Asm_26_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_mat1<>+0x00(SB)/8, $0x1010202020202020 +DATA expandAVX512Asm_26_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_26_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_26_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_26_mat1<>+0x20(SB)/8, $0x4040404040408080 +DATA expandAVX512Asm_26_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_26_mat1<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_26_mat1<>+0x38(SB)/8, $0x0808080808080808 + +GLOBL expandAVX512Asm_26_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_inShuf2<>+0x00(SB)/8, $0x0404030303020202 +DATA expandAVX512Asm_26_inShuf2<>+0x08(SB)/8, $0xffffffffff040302 +DATA expandAVX512Asm_26_inShuf2<>+0x10(SB)/8, $0xffff040403030202 +DATA expandAVX512Asm_26_inShuf2<>+0x18(SB)/8, $0xffffffffff040302 +DATA expandAVX512Asm_26_inShuf2<>+0x20(SB)/8, $0xffff040403030202 +DATA expandAVX512Asm_26_inShuf2<>+0x28(SB)/8, $0xffffffffff040302 +DATA expandAVX512Asm_26_inShuf2<>+0x30(SB)/8, $0xff04030303020202 +DATA expandAVX512Asm_26_inShuf2<>+0x38(SB)/8, $0xffff040404030303 + +GLOBL expandAVX512Asm_26_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_mat2<>+0x00(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_26_mat2<>+0x08(SB)/8, $0x1010202020202020 +DATA expandAVX512Asm_26_mat2<>+0x10(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_26_mat2<>+0x18(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_26_mat2<>+0x20(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_26_mat2<>+0x28(SB)/8, $0x4040404040408080 +DATA expandAVX512Asm_26_mat2<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_26_mat2<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL 
expandAVX512Asm_26_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403 +DATA expandAVX512Asm_26_inShuf3<>+0x08(SB)/8, $0xffffffff04040303 +DATA expandAVX512Asm_26_inShuf3<>+0x10(SB)/8, $0xffffffffffff0403 +DATA expandAVX512Asm_26_inShuf3<>+0x18(SB)/8, $0xffffffff04040303 +DATA expandAVX512Asm_26_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403 +DATA expandAVX512Asm_26_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04 +DATA expandAVX512Asm_26_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_26_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_26_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_mat3<>+0x00(SB)/8, $0x0101020202020202 +DATA expandAVX512Asm_26_mat3<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_26_mat3<>+0x10(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_26_mat3<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_26_mat3<>+0x20(SB)/8, $0x0404040404040808 +DATA expandAVX512Asm_26_mat3<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_26_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_26_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_26_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_outShufLo+0x00(SB)/8, $0x2018111008020100 +DATA expandAVX512Asm_26_outShufLo+0x08(SB)/8, $0x3a39383231302821 +DATA expandAVX512Asm_26_outShufLo+0x10(SB)/8, $0x6860595850494840 +DATA expandAVX512Asm_26_outShufLo+0x18(SB)/8, $0x1312090504036a69 +DATA expandAVX512Asm_26_outShufLo+0x20(SB)/8, $0x3b35343329232219 +DATA expandAVX512Asm_26_outShufLo+0x28(SB)/8, $0x5b5a514b4a413d3c +DATA expandAVX512Asm_26_outShufLo+0x30(SB)/8, $0x0a7007066d6c6b61 +DATA expandAVX512Asm_26_outShufLo+0x38(SB)/8, $0x37362a25241a1514 + +GLOBL expandAVX512Asm_26_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_outShufHi0+0x00(SB)/8, $0x5851504842414038 +DATA expandAVX512Asm_26_outShufHi0+0x08(SB)/8, $0x7978727170686160 +DATA expandAVX512Asm_26_outShufHi0+0x10(SB)/8, $0xffffffffffffff7a +DATA expandAVX512Asm_26_outShufHi0+0x18(SB)/8, $0x52494544433b3a39 +DATA expandAVX512Asm_26_outShufHi0+0x20(SB)/8, $0x7574736963625953 +DATA expandAVX512Asm_26_outShufHi0+0x28(SB)/8, $0xffffffffff7d7c7b +DATA expandAVX512Asm_26_outShufHi0+0x30(SB)/8, $0xff47463e3d3cffff +DATA expandAVX512Asm_26_outShufHi0+0x38(SB)/8, $0x766a65645a55544a + +GLOBL expandAVX512Asm_26_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_26_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_26_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_26_outShufHi1+0x10(SB)/8, $0x20191810090800ff +DATA expandAVX512Asm_26_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_26_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_26_outShufHi1+0x28(SB)/8, $0x1a110b0a01ffffff +DATA expandAVX512Asm_26_outShufHi1+0x30(SB)/8, $0x28ffffffffff211b +DATA expandAVX512Asm_26_outShufHi1+0x38(SB)/8, $0xffffffffffffffff + +TEXT expandAVX512Asm_26<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_26_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_26_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_26_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_26_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_26_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_26_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_26_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + 
VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_26_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xff7c07ffff01ffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x83f80000fe0000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_28_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_28_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_28_inShuf0<>+0x10(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_28_inShuf0<>+0x18(SB)/8, $0xff02010101000000 +DATA expandAVX512Asm_28_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_28_inShuf0<>+0x28(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_28_inShuf0<>+0x30(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_28_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100 + +GLOBL expandAVX512Asm_28_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_28_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_28_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_28_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_28_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_28_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_28_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_28_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_28_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_inShuf1<>+0x00(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_28_inShuf1<>+0x08(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_28_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_28_inShuf1<>+0x18(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_28_inShuf1<>+0x20(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_28_inShuf1<>+0x28(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_28_inShuf1<>+0x30(SB)/8, $0x0404040303030202 +DATA expandAVX512Asm_28_inShuf1<>+0x38(SB)/8, $0xffffffffff040302 + +GLOBL expandAVX512Asm_28_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_28_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_28_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_28_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_28_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_28_mat1<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_28_mat1<>+0x30(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_28_mat1<>+0x38(SB)/8, $0x0404040408080808 + +GLOBL expandAVX512Asm_28_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_inShuf2<>+0x00(SB)/8, $0x0404030303020202 +DATA expandAVX512Asm_28_inShuf2<>+0x08(SB)/8, $0x0404030303020202 +DATA expandAVX512Asm_28_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_28_inShuf2<>+0x18(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_28_inShuf2<>+0x20(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_28_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_28_inShuf2<>+0x30(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_28_inShuf2<>+0x38(SB)/8, $0xffff040404030303 + +GLOBL expandAVX512Asm_28_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_mat2<>+0x00(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_28_mat2<>+0x08(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_28_mat2<>+0x10(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_28_mat2<>+0x18(SB)/8, $0x2020202020202020 +DATA 
expandAVX512Asm_28_mat2<>+0x20(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_28_mat2<>+0x28(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_28_mat2<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_28_mat2<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512Asm_28_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_inShuf3<>+0x00(SB)/8, $0xffffffffffff0403 +DATA expandAVX512Asm_28_inShuf3<>+0x08(SB)/8, $0xffff040404030303 +DATA expandAVX512Asm_28_inShuf3<>+0x10(SB)/8, $0xffffffffffffff04 +DATA expandAVX512Asm_28_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_28_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_mat3<>+0x00(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_28_mat3<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_28_mat3<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_28_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_28_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_28_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_28_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_28_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_28_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_outShufLo+0x00(SB)/8, $0x1812111008020100 +DATA expandAVX512Asm_28_outShufLo+0x08(SB)/8, $0x31302a2928201a19 +DATA expandAVX512Asm_28_outShufLo+0x10(SB)/8, $0x4a49484241403832 +DATA expandAVX512Asm_28_outShufLo+0x18(SB)/8, $0x090504035a595850 +DATA expandAVX512Asm_28_outShufLo+0x20(SB)/8, $0x2b211d1c1b151413 +DATA expandAVX512Asm_28_outShufLo+0x28(SB)/8, $0x4443393534332d2c +DATA expandAVX512Asm_28_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b45 +DATA expandAVX512Asm_28_outShufLo+0x38(SB)/8, $0x1e6817160a600706 + +GLOBL expandAVX512Asm_28_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_outShufHi0+0x00(SB)/8, $0x4948424140383130 +DATA expandAVX512Asm_28_outShufHi0+0x08(SB)/8, $0x6261605a5958504a +DATA expandAVX512Asm_28_outShufHi0+0x10(SB)/8, $0xff7a797872717068 +DATA expandAVX512Asm_28_outShufHi0+0x18(SB)/8, $0x4339343332ffffff +DATA expandAVX512Asm_28_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b4544 +DATA expandAVX512Asm_28_outShufHi0+0x28(SB)/8, $0x757473696564635d +DATA expandAVX512Asm_28_outShufHi0+0x30(SB)/8, $0x35ffffffff7d7c7b +DATA expandAVX512Asm_28_outShufHi0+0x38(SB)/8, $0x4f4eff47463a3736 + +GLOBL expandAVX512Asm_28_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_28_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_outShufHi1+0x10(SB)/8, $0x00ffffffffffffff +DATA expandAVX512Asm_28_outShufHi1+0x18(SB)/8, $0xffffffffff0a0908 +DATA expandAVX512Asm_28_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_28_outShufHi1+0x30(SB)/8, $0xff0d0c0b01ffffff +DATA expandAVX512Asm_28_outShufHi1+0x38(SB)/8, $0xffff10ffffffffff + +TEXT expandAVX512Asm_28<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_28_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_28_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_28_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_28_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_28_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_28_outShufHi0(SB), Z5 + 
VMOVDQU64 expandAVX512Asm_28_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_28_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xdf87fffff87fffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x2078000007800000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_30_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_inShuf0<>+0x00(SB)/8, $0x0202010101000000 +DATA expandAVX512Asm_30_inShuf0<>+0x08(SB)/8, $0xffffffffff020100 +DATA expandAVX512Asm_30_inShuf0<>+0x10(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_30_inShuf0<>+0x18(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_30_inShuf0<>+0x20(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_30_inShuf0<>+0x28(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_30_inShuf0<>+0x30(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_30_inShuf0<>+0x38(SB)/8, $0xffff010101000000 + +GLOBL expandAVX512Asm_30_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_30_mat0<>+0x08(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_30_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_30_mat0<>+0x18(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_30_mat0<>+0x20(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_30_mat0<>+0x28(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_30_mat0<>+0x30(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_30_mat0<>+0x38(SB)/8, $0x1010101010101010 + +GLOBL expandAVX512Asm_30_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_inShuf1<>+0x00(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_30_inShuf1<>+0x08(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_30_inShuf1<>+0x10(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_30_inShuf1<>+0x18(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_30_inShuf1<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_30_inShuf1<>+0x28(SB)/8, $0xffff010101000000 +DATA expandAVX512Asm_30_inShuf1<>+0x30(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_30_inShuf1<>+0x38(SB)/8, $0x0404030303020202 + +GLOBL expandAVX512Asm_30_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_mat1<>+0x00(SB)/8, $0x1010101010102020 +DATA expandAVX512Asm_30_mat1<>+0x08(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_30_mat1<>+0x10(SB)/8, $0x2020202040404040 +DATA expandAVX512Asm_30_mat1<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_30_mat1<>+0x20(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_30_mat1<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_30_mat1<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_30_mat1<>+0x38(SB)/8, $0x0202020202020202 + +GLOBL expandAVX512Asm_30_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_inShuf2<>+0x00(SB)/8, $0xffffffffff040302 +DATA expandAVX512Asm_30_inShuf2<>+0x08(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_30_inShuf2<>+0x10(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_30_inShuf2<>+0x18(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_30_inShuf2<>+0x20(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_30_inShuf2<>+0x28(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_30_inShuf2<>+0x30(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_30_inShuf2<>+0x38(SB)/8, $0xffffffffffff0302 + +GLOBL expandAVX512Asm_30_mat2<>(SB), RODATA, $0x40 +DATA 
expandAVX512Asm_30_mat2<>+0x00(SB)/8, $0x0202020204040404 +DATA expandAVX512Asm_30_mat2<>+0x08(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_30_mat2<>+0x10(SB)/8, $0x0404080808080808 +DATA expandAVX512Asm_30_mat2<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_30_mat2<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_30_mat2<>+0x28(SB)/8, $0x1010101010102020 +DATA expandAVX512Asm_30_mat2<>+0x30(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_30_mat2<>+0x38(SB)/8, $0x2020202040404040 + +GLOBL expandAVX512Asm_30_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_inShuf3<>+0x00(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_30_inShuf3<>+0x08(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_30_inShuf3<>+0x10(SB)/8, $0xffff030303020202 +DATA expandAVX512Asm_30_inShuf3<>+0x18(SB)/8, $0xffff040404030303 +DATA expandAVX512Asm_30_inShuf3<>+0x20(SB)/8, $0xffffffffffff0403 +DATA expandAVX512Asm_30_inShuf3<>+0x28(SB)/8, $0xffffffffffffff04 +DATA expandAVX512Asm_30_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_30_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_30_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_mat3<>+0x00(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_30_mat3<>+0x08(SB)/8, $0x4040808080808080 +DATA expandAVX512Asm_30_mat3<>+0x10(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_30_mat3<>+0x18(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_30_mat3<>+0x20(SB)/8, $0x0101010101010202 +DATA expandAVX512Asm_30_mat3<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_30_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_30_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_30_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_outShufLo+0x00(SB)/8, $0x1812111008020100 +DATA expandAVX512Asm_30_outShufLo+0x08(SB)/8, $0x3832313028222120 +DATA expandAVX512Asm_30_outShufLo+0x10(SB)/8, $0x58504a4948403a39 +DATA expandAVX512Asm_30_outShufLo+0x18(SB)/8, $0x04036a6968605a59 +DATA expandAVX512Asm_30_outShufLo+0x20(SB)/8, $0x2423191514130905 +DATA expandAVX512Asm_30_outShufLo+0x28(SB)/8, $0x3d3c3b3534332925 +DATA expandAVX512Asm_30_outShufLo+0x30(SB)/8, $0x5d5c5b514d4c4b41 +DATA expandAVX512Asm_30_outShufLo+0x38(SB)/8, $0x0a7007066d6c6b61 + +GLOBL expandAVX512Asm_30_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_outShufHi0+0x00(SB)/8, $0x504a4948403a3938 +DATA expandAVX512Asm_30_outShufHi0+0x08(SB)/8, $0x70686261605a5958 +DATA expandAVX512Asm_30_outShufHi0+0x10(SB)/8, $0xffffffffff787271 +DATA expandAVX512Asm_30_outShufHi0+0x18(SB)/8, $0x3c3bffffffffffff +DATA expandAVX512Asm_30_outShufHi0+0x20(SB)/8, $0x5c5b514d4c4b413d +DATA expandAVX512Asm_30_outShufHi0+0x28(SB)/8, $0x757473696564635d +DATA expandAVX512Asm_30_outShufHi0+0x30(SB)/8, $0xffffffffffffff79 +DATA expandAVX512Asm_30_outShufHi0+0x38(SB)/8, $0x42ff3f3effffffff + +GLOBL expandAVX512Asm_30_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_30_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_30_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_30_outShufHi1+0x10(SB)/8, $0x1008020100ffffff +DATA expandAVX512Asm_30_outShufHi1+0x18(SB)/8, $0xffff201a19181211 +DATA expandAVX512Asm_30_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_30_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_30_outShufHi1+0x30(SB)/8, $0x15141309050403ff +DATA expandAVX512Asm_30_outShufHi1+0x38(SB)/8, $0xff28ffff211d1c1b + +TEXT expandAVX512Asm_30<>(SB), NOSPLIT, $0-0 + VMOVDQU64 
expandAVX512Asm_30_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_30_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_30_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_30_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_30_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_30_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_30_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_30_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xb001ffffc007ffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x4ffe00003ff80000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_32_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_32_inShuf0<>+0x00(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x08(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x10(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x18(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x20(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x28(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x30(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_32_inShuf0<>+0x38(SB)/8, $0x0101010100000000 + +GLOBL expandAVX512Asm_32_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_32_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_32_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_32_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_32_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_32_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_32_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_32_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_32_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_32_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_32_inShuf1<>+0x00(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x08(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x10(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x18(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x20(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x28(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x30(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_32_inShuf1<>+0x38(SB)/8, $0x0303030302020202 + +GLOBL expandAVX512Asm_32_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_32_outShufLo+0x00(SB)/8, $0x0b0a090803020100 +DATA expandAVX512Asm_32_outShufLo+0x08(SB)/8, $0x1b1a191813121110 +DATA expandAVX512Asm_32_outShufLo+0x10(SB)/8, $0x2b2a292823222120 +DATA expandAVX512Asm_32_outShufLo+0x18(SB)/8, $0x3b3a393833323130 +DATA expandAVX512Asm_32_outShufLo+0x20(SB)/8, $0x0f0e0d0c07060504 +DATA expandAVX512Asm_32_outShufLo+0x28(SB)/8, $0x1f1e1d1c17161514 +DATA expandAVX512Asm_32_outShufLo+0x30(SB)/8, $0x2f2e2d2c27262524 +DATA expandAVX512Asm_32_outShufLo+0x38(SB)/8, $0x3f3e3d3c37363534 + +TEXT expandAVX512Asm_32<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_32_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_32_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512Asm_32_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_32_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + 
VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + +GLOBL expandAVX512Asm_36_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_inShuf0<>+0x00(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_36_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_36_inShuf0<>+0x10(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_36_inShuf0<>+0x18(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_36_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_36_inShuf0<>+0x28(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_36_inShuf0<>+0x30(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_36_inShuf0<>+0x38(SB)/8, $0xffffffffffff0100 + +GLOBL expandAVX512Asm_36_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_36_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_36_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_36_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_36_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_36_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_36_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_36_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_36_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_inShuf1<>+0x00(SB)/8, $0x0101010100000000 +DATA expandAVX512Asm_36_inShuf1<>+0x08(SB)/8, $0xffffff0100000000 +DATA expandAVX512Asm_36_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_36_inShuf1<>+0x18(SB)/8, $0xffffffff00000000 +DATA expandAVX512Asm_36_inShuf1<>+0x20(SB)/8, $0xff02020202010101 +DATA expandAVX512Asm_36_inShuf1<>+0x28(SB)/8, $0xffffffffffff0201 +DATA expandAVX512Asm_36_inShuf1<>+0x30(SB)/8, $0x0202020201010101 +DATA expandAVX512Asm_36_inShuf1<>+0x38(SB)/8, $0x0303030302020202 + +GLOBL expandAVX512Asm_36_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_36_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_36_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_36_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_36_mat1<>+0x20(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_36_mat1<>+0x28(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_36_mat1<>+0x30(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_36_mat1<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512Asm_36_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_inShuf2<>+0x00(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_36_inShuf2<>+0x08(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_36_inShuf2<>+0x10(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_36_inShuf2<>+0x18(SB)/8, $0xffffffffffff0302 +DATA expandAVX512Asm_36_inShuf2<>+0x20(SB)/8, $0x0303030302020202 +DATA expandAVX512Asm_36_inShuf2<>+0x28(SB)/8, $0xffff030302020202 +DATA expandAVX512Asm_36_inShuf2<>+0x30(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_36_inShuf2<>+0x38(SB)/8, $0xffffffff02020202 + +GLOBL expandAVX512Asm_36_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_mat2<>+0x00(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_36_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_36_mat2<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_36_mat2<>+0x18(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_36_mat2<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_36_mat2<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_36_mat2<>+0x30(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_36_mat2<>+0x38(SB)/8, $0x2020202020202020 + 
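For readers of these generated tables: each expander follows the same three-step shape. VPERMB gathers input bytes according to the inShuf tables, VGF2P8AFFINEQB multiplies each gathered byte by an 8x8 bit matrix from the mat tables, and the outShuf tables (together with VPERMI2B/VPERMB and the KMOVQ masks) merge the intermediate vectors into the two 512-bit output registers. A minimal scalar sketch of the per-byte affine step, assuming the logical row order used by the generator (matGroupToVec flips rows into the instruction's layout); affineByte is illustrative only and not part of this CL:

	package main

	import "math/bits"

	// affineByte models the per-byte effect of VGF2P8AFFINEQB with immediate 0
	// as the generator describes it: output bit i is the parity of
	// (row i of the matrix AND the input byte). The expansion matrices use rows
	// with a single 1 bit, so each output bit simply selects one input bit.
	func affineByte(mat [8]uint8, in uint8) uint8 {
		var out uint8
		for i := range mat {
			out |= uint8(bits.OnesCount8(mat[i]&in)&1) << i
		}
		return out
	}
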
+GLOBL expandAVX512Asm_36_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_outShufLo+0x00(SB)/8, $0x1211100803020100 +DATA expandAVX512Asm_36_outShufLo+0x08(SB)/8, $0x2928201b1a191813 +DATA expandAVX512Asm_36_outShufLo+0x10(SB)/8, $0x4038333231302b2a +DATA expandAVX512Asm_36_outShufLo+0x18(SB)/8, $0x504b4a4948434241 +DATA expandAVX512Asm_36_outShufLo+0x20(SB)/8, $0x070605045b5a5958 +DATA expandAVX512Asm_36_outShufLo+0x28(SB)/8, $0x1e1d1c1716151409 +DATA expandAVX512Asm_36_outShufLo+0x30(SB)/8, $0x35342f2e2d2c211f +DATA expandAVX512Asm_36_outShufLo+0x38(SB)/8, $0x4c47464544393736 + +GLOBL expandAVX512Asm_36_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_36_outShufHi+0x00(SB)/8, $0x3332313028222120 +DATA expandAVX512Asm_36_outShufHi+0x08(SB)/8, $0x4a4948403b3a3938 +DATA expandAVX512Asm_36_outShufHi+0x10(SB)/8, $0x616058535251504b +DATA expandAVX512Asm_36_outShufHi+0x18(SB)/8, $0x78706b6a69686362 +DATA expandAVX512Asm_36_outShufHi+0x20(SB)/8, $0x29262524237b7a79 +DATA expandAVX512Asm_36_outShufHi+0x28(SB)/8, $0x3f3e3d3c37363534 +DATA expandAVX512Asm_36_outShufHi+0x30(SB)/8, $0x5655544f4e4d4c41 +DATA expandAVX512Asm_36_outShufHi+0x38(SB)/8, $0x6d6c676665645957 + +TEXT expandAVX512Asm_36<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_36_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_36_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_36_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_36_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_36_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_36_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_36_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_36_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512Asm_40_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_inShuf0<>+0x00(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x08(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x10(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x18(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x20(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x28(SB)/8, $0xffffff0000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x30(SB)/8, $0xffffff0000000000 +DATA expandAVX512Asm_40_inShuf0<>+0x38(SB)/8, $0xffffff0000000000 + +GLOBL expandAVX512Asm_40_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_40_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_40_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_40_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_40_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_40_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_40_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_40_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_40_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_inShuf1<>+0x00(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_40_inShuf1<>+0x08(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_40_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_40_inShuf1<>+0x18(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_40_inShuf1<>+0x20(SB)/8, $0xffffffffffffff01 +DATA expandAVX512Asm_40_inShuf1<>+0x28(SB)/8, $0xffff020202020201 +DATA expandAVX512Asm_40_inShuf1<>+0x30(SB)/8, $0x0202020101010101 +DATA expandAVX512Asm_40_inShuf1<>+0x38(SB)/8, $0x0202020101010101 + +GLOBL 
expandAVX512Asm_40_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_mat1<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_40_mat1<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_40_mat1<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_40_mat1<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_40_mat1<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_40_mat1<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_40_mat1<>+0x30(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_40_mat1<>+0x38(SB)/8, $0x4040404040404040 + +GLOBL expandAVX512Asm_40_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_inShuf2<>+0x00(SB)/8, $0x0202020101010101 +DATA expandAVX512Asm_40_inShuf2<>+0x08(SB)/8, $0x0303030202020202 +DATA expandAVX512Asm_40_inShuf2<>+0x10(SB)/8, $0x0303030202020202 +DATA expandAVX512Asm_40_inShuf2<>+0x18(SB)/8, $0xffffff0202020202 +DATA expandAVX512Asm_40_inShuf2<>+0x20(SB)/8, $0xffffff0202020202 +DATA expandAVX512Asm_40_inShuf2<>+0x28(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_40_inShuf2<>+0x30(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_40_inShuf2<>+0x38(SB)/8, $0xffffffffffff0202 + +GLOBL expandAVX512Asm_40_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_mat2<>+0x00(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_40_mat2<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_40_mat2<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_40_mat2<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_40_mat2<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_40_mat2<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_40_mat2<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_40_mat2<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_40_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_inShuf3<>+0x00(SB)/8, $0xffffffffffff0303 +DATA expandAVX512Asm_40_inShuf3<>+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_40_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_mat3<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_40_mat3<>+0x08(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_40_mat3<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_40_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_40_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_40_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_40_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_40_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_40_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_outShufLo+0x00(SB)/8, $0x0a09080403020100 +DATA expandAVX512Asm_40_outShufLo+0x08(SB)/8, $0x1814131211100c0b +DATA expandAVX512Asm_40_outShufLo+0x10(SB)/8, $0x232221201c1b1a19 +DATA expandAVX512Asm_40_outShufLo+0x18(SB)/8, $0x31302c2b2a292824 +DATA expandAVX512Asm_40_outShufLo+0x20(SB)/8, $0x3c3b3a3938343332 +DATA expandAVX512Asm_40_outShufLo+0x28(SB)/8, $0x0f0e0d4140070605 +DATA expandAVX512Asm_40_outShufLo+0x30(SB)/8, $0x1d51501716154948 +DATA expandAVX512Asm_40_outShufLo+0x38(SB)/8, $0x6027262559581f1e + +GLOBL expandAVX512Asm_40_outShufHi0(SB), RODATA, $0x40 +DATA 
expandAVX512Asm_40_outShufHi0+0x00(SB)/8, $0x3938343332313028 +DATA expandAVX512Asm_40_outShufHi0+0x08(SB)/8, $0x44434241403c3b3a +DATA expandAVX512Asm_40_outShufHi0+0x10(SB)/8, $0x5251504c4b4a4948 +DATA expandAVX512Asm_40_outShufHi0+0x18(SB)/8, $0x605c5b5a59585453 +DATA expandAVX512Asm_40_outShufHi0+0x20(SB)/8, $0x2c2b2a2964636261 +DATA expandAVX512Asm_40_outShufHi0+0x28(SB)/8, $0x3e3d69683736352d +DATA expandAVX512Asm_40_outShufHi0+0x30(SB)/8, $0x797847464571703f +DATA expandAVX512Asm_40_outShufHi0+0x38(SB)/8, $0x575655ffff4f4e4d + +GLOBL expandAVX512Asm_40_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_40_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_40_outShufHi1+0x38(SB)/8, $0xffffff0100ffffff + +TEXT expandAVX512Asm_40<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_40_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_40_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_40_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_40_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_40_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_40_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_40_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_40_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xe7ffffffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x1800000000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_44_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_inShuf0<>+0x00(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_44_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_44_inShuf0<>+0x10(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_44_inShuf0<>+0x18(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_44_inShuf0<>+0x20(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_44_inShuf0<>+0x28(SB)/8, $0x0101010000000000 +DATA expandAVX512Asm_44_inShuf0<>+0x30(SB)/8, $0xffffff0000000000 +DATA expandAVX512Asm_44_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 + +GLOBL expandAVX512Asm_44_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_44_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_44_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_44_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_44_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_44_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_44_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_44_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_44_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_inShuf1<>+0x00(SB)/8, $0xffffff0000000000 +DATA expandAVX512Asm_44_inShuf1<>+0x08(SB)/8, $0xffffff0000000000 +DATA expandAVX512Asm_44_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_44_inShuf1<>+0x18(SB)/8, $0xffffff0000000000 +DATA 
expandAVX512Asm_44_inShuf1<>+0x20(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_44_inShuf1<>+0x28(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_44_inShuf1<>+0x30(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_44_inShuf1<>+0x38(SB)/8, $0xff02020202020101 + +GLOBL expandAVX512Asm_44_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_44_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_44_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_44_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_44_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_44_mat1<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_44_mat1<>+0x30(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_44_mat1<>+0x38(SB)/8, $0x0808080808080808 + +GLOBL expandAVX512Asm_44_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_inShuf2<>+0x00(SB)/8, $0x0202020101010101 +DATA expandAVX512Asm_44_inShuf2<>+0x08(SB)/8, $0xffffffffffff0201 +DATA expandAVX512Asm_44_inShuf2<>+0x10(SB)/8, $0x0202020101010101 +DATA expandAVX512Asm_44_inShuf2<>+0x18(SB)/8, $0x0202020101010101 +DATA expandAVX512Asm_44_inShuf2<>+0x20(SB)/8, $0xffffffffffff0201 +DATA expandAVX512Asm_44_inShuf2<>+0x28(SB)/8, $0xffff020101010101 +DATA expandAVX512Asm_44_inShuf2<>+0x30(SB)/8, $0xffffff0202020202 +DATA expandAVX512Asm_44_inShuf2<>+0x38(SB)/8, $0xffffffffffffff02 + +GLOBL expandAVX512Asm_44_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_mat2<>+0x00(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_44_mat2<>+0x08(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_44_mat2<>+0x10(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_44_mat2<>+0x18(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_44_mat2<>+0x20(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_44_mat2<>+0x28(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_44_mat2<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_44_mat2<>+0x38(SB)/8, $0x0101010102020202 + +GLOBL expandAVX512Asm_44_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_inShuf3<>+0x00(SB)/8, $0xffffff0202020202 +DATA expandAVX512Asm_44_inShuf3<>+0x08(SB)/8, $0xffffff0202020202 +DATA expandAVX512Asm_44_inShuf3<>+0x10(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_44_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_44_inShuf3<>+0x20(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_44_inShuf3<>+0x28(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_44_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_44_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_44_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_mat3<>+0x00(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_44_mat3<>+0x08(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_44_mat3<>+0x10(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_44_mat3<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_44_mat3<>+0x20(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_44_mat3<>+0x28(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_44_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_44_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_44_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_outShufLo+0x00(SB)/8, $0x1110080403020100 +DATA expandAVX512Asm_44_outShufLo+0x08(SB)/8, $0x1c1b1a1918141312 +DATA expandAVX512Asm_44_outShufLo+0x10(SB)/8, $0x31302c2b2a292820 +DATA expandAVX512Asm_44_outShufLo+0x18(SB)/8, $0x4342414038343332 +DATA expandAVX512Asm_44_outShufLo+0x20(SB)/8, $0x58504c4b4a494844 +DATA 
expandAVX512Asm_44_outShufLo+0x28(SB)/8, $0x600706055c5b5a59 +DATA expandAVX512Asm_44_outShufLo+0x30(SB)/8, $0x1d69681716150961 +DATA expandAVX512Asm_44_outShufLo+0x38(SB)/8, $0x2f2e2d2171701f1e + +GLOBL expandAVX512Asm_44_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_outShufHi0+0x00(SB)/8, $0x4844434241403938 +DATA expandAVX512Asm_44_outShufHi0+0x08(SB)/8, $0x5a59585453525150 +DATA expandAVX512Asm_44_outShufHi0+0x10(SB)/8, $0x6c6b6a6968605c5b +DATA expandAVX512Asm_44_outShufHi0+0x18(SB)/8, $0xffff787473727170 +DATA expandAVX512Asm_44_outShufHi0+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_44_outShufHi0+0x28(SB)/8, $0x46453e3d3c3b3aff +DATA expandAVX512Asm_44_outShufHi0+0x30(SB)/8, $0xff57565549ffff47 +DATA expandAVX512Asm_44_outShufHi0+0x38(SB)/8, $0x6d61ffff5f5e5dff + +GLOBL expandAVX512Asm_44_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_44_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_44_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_44_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_44_outShufHi1+0x18(SB)/8, $0x0100ffffffffffff +DATA expandAVX512Asm_44_outShufHi1+0x20(SB)/8, $0x0c0b0a0908040302 +DATA expandAVX512Asm_44_outShufHi1+0x28(SB)/8, $0xffffffffffffff10 +DATA expandAVX512Asm_44_outShufHi1+0x30(SB)/8, $0x20ffffffff1918ff +DATA expandAVX512Asm_44_outShufHi1+0x38(SB)/8, $0xffff2928ffffff21 + +TEXT expandAVX512Asm_44<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_44_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_44_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_44_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_44_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_44_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_44_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_44_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_44_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0xce79fe003fffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x318601ffc0000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_48_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_inShuf0<>+0x00(SB)/8, $0x0101000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x08(SB)/8, $0x0101000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x10(SB)/8, $0x0101000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x18(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x20(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x28(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x30(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_48_inShuf0<>+0x38(SB)/8, $0xffff000000000000 + +GLOBL expandAVX512Asm_48_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_48_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_48_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_48_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_48_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_48_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_48_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_48_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_48_inShuf1<>(SB), RODATA, $0x40 +DATA 
expandAVX512Asm_48_inShuf1<>+0x00(SB)/8, $0xffffffff01010101 +DATA expandAVX512Asm_48_inShuf1<>+0x08(SB)/8, $0xffffffff01010101 +DATA expandAVX512Asm_48_inShuf1<>+0x10(SB)/8, $0xffffffffffff0101 +DATA expandAVX512Asm_48_inShuf1<>+0x18(SB)/8, $0x0202020202020101 +DATA expandAVX512Asm_48_inShuf1<>+0x20(SB)/8, $0x0202010101010101 +DATA expandAVX512Asm_48_inShuf1<>+0x28(SB)/8, $0x0202010101010101 +DATA expandAVX512Asm_48_inShuf1<>+0x30(SB)/8, $0x0202010101010101 +DATA expandAVX512Asm_48_inShuf1<>+0x38(SB)/8, $0xffff010101010101 + +GLOBL expandAVX512Asm_48_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_mat1<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_48_mat1<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_48_mat1<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_48_mat1<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_48_mat1<>+0x20(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_48_mat1<>+0x28(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_48_mat1<>+0x30(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_48_mat1<>+0x38(SB)/8, $0x4040404040404040 + +GLOBL expandAVX512Asm_48_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_inShuf2<>+0x00(SB)/8, $0xffff010101010101 +DATA expandAVX512Asm_48_inShuf2<>+0x08(SB)/8, $0xffff020202020202 +DATA expandAVX512Asm_48_inShuf2<>+0x10(SB)/8, $0xffff020202020202 +DATA expandAVX512Asm_48_inShuf2<>+0x18(SB)/8, $0xffffffff02020202 +DATA expandAVX512Asm_48_inShuf2<>+0x20(SB)/8, $0xffffffff02020202 +DATA expandAVX512Asm_48_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_48_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_48_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_48_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_mat2<>+0x00(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_48_mat2<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_48_mat2<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_48_mat2<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_48_mat2<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_48_mat2<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_48_mat2<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_48_mat2<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_48_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_outShufLo+0x00(SB)/8, $0x0908050403020100 +DATA expandAVX512Asm_48_outShufLo+0x08(SB)/8, $0x131211100d0c0b0a +DATA expandAVX512Asm_48_outShufLo+0x10(SB)/8, $0x1d1c1b1a19181514 +DATA expandAVX512Asm_48_outShufLo+0x18(SB)/8, $0x2928252423222120 +DATA expandAVX512Asm_48_outShufLo+0x20(SB)/8, $0x333231302d2c2b2a +DATA expandAVX512Asm_48_outShufLo+0x28(SB)/8, $0x3d3c3b3a39383534 +DATA expandAVX512Asm_48_outShufLo+0x30(SB)/8, $0x0f0e434241400706 +DATA expandAVX512Asm_48_outShufLo+0x38(SB)/8, $0x515017164b4a4948 + +GLOBL expandAVX512Asm_48_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_48_outShufHi+0x00(SB)/8, $0x2524232221201918 +DATA expandAVX512Asm_48_outShufHi+0x08(SB)/8, $0x31302d2c2b2a2928 +DATA expandAVX512Asm_48_outShufHi+0x10(SB)/8, $0x3b3a393835343332 +DATA expandAVX512Asm_48_outShufHi+0x18(SB)/8, $0x4544434241403d3c +DATA expandAVX512Asm_48_outShufHi+0x20(SB)/8, $0x51504d4c4b4a4948 +DATA expandAVX512Asm_48_outShufHi+0x28(SB)/8, $0x1d1c1b1a55545352 +DATA expandAVX512Asm_48_outShufHi+0x30(SB)/8, $0x5b5a595827261f1e +DATA expandAVX512Asm_48_outShufHi+0x38(SB)/8, $0x3736636261602f2e + +TEXT expandAVX512Asm_48<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_48_inShuf0<>(SB), Z0 + VMOVDQU64 
expandAVX512Asm_48_inShuf1<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_48_inShuf2<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_48_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_48_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z5 + VPERMB Z5, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_48_mat0<>(SB), Z0, Z0 + VPERMB Z5, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_48_mat1<>(SB), Z3, Z3 + VPERMB Z5, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_48_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512Asm_52_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_inShuf0<>+0x00(SB)/8, $0x0101000000000000 +DATA expandAVX512Asm_52_inShuf0<>+0x08(SB)/8, $0xffffffffffff0100 +DATA expandAVX512Asm_52_inShuf0<>+0x10(SB)/8, $0x0101000000000000 +DATA expandAVX512Asm_52_inShuf0<>+0x18(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_52_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_52_inShuf0<>+0x28(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_52_inShuf0<>+0x30(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_52_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 + +GLOBL expandAVX512Asm_52_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_52_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_52_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_52_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_52_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_52_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_52_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_52_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_52_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_inShuf1<>+0x00(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_52_inShuf1<>+0x08(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_52_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_52_inShuf1<>+0x18(SB)/8, $0xffff000000000000 +DATA expandAVX512Asm_52_inShuf1<>+0x20(SB)/8, $0xffffffff01010101 +DATA expandAVX512Asm_52_inShuf1<>+0x28(SB)/8, $0xffffffffff010101 +DATA expandAVX512Asm_52_inShuf1<>+0x30(SB)/8, $0xff02020202020201 +DATA expandAVX512Asm_52_inShuf1<>+0x38(SB)/8, $0x0202010101010101 + +GLOBL expandAVX512Asm_52_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_52_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_52_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_52_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_52_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_52_mat1<>+0x28(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_52_mat1<>+0x30(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_52_mat1<>+0x38(SB)/8, $0x0404040404040404 + +GLOBL expandAVX512Asm_52_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_inShuf2<>+0x00(SB)/8, $0xffffffffffff0201 +DATA expandAVX512Asm_52_inShuf2<>+0x08(SB)/8, $0x0202010101010101 +DATA expandAVX512Asm_52_inShuf2<>+0x10(SB)/8, $0xffff010101010101 +DATA expandAVX512Asm_52_inShuf2<>+0x18(SB)/8, $0xffffffffffffff01 +DATA expandAVX512Asm_52_inShuf2<>+0x20(SB)/8, $0xffff010101010101 +DATA expandAVX512Asm_52_inShuf2<>+0x28(SB)/8, $0xffff010101010101 +DATA expandAVX512Asm_52_inShuf2<>+0x30(SB)/8, $0xffffffffffffff01 +DATA expandAVX512Asm_52_inShuf2<>+0x38(SB)/8, $0xffff010101010101 + +GLOBL expandAVX512Asm_52_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_mat2<>+0x00(SB)/8, $0x0404040408080808 +DATA 
expandAVX512Asm_52_mat2<>+0x08(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_52_mat2<>+0x10(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_52_mat2<>+0x18(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_52_mat2<>+0x20(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_52_mat2<>+0x28(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_52_mat2<>+0x30(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_52_mat2<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_52_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_inShuf3<>+0x00(SB)/8, $0xffff020202020202 +DATA expandAVX512Asm_52_inShuf3<>+0x08(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_52_inShuf3<>+0x10(SB)/8, $0xffffffff02020202 +DATA expandAVX512Asm_52_inShuf3<>+0x18(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_52_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_52_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_mat3<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_52_mat3<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_52_mat3<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_52_mat3<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_52_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_52_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_52_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_52_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_52_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_outShufLo+0x00(SB)/8, $0x1008050403020100 +DATA expandAVX512Asm_52_outShufLo+0x08(SB)/8, $0x1a19181514131211 +DATA expandAVX512Asm_52_outShufLo+0x10(SB)/8, $0x2b2a2928201d1c1b +DATA expandAVX512Asm_52_outShufLo+0x18(SB)/8, $0x3534333231302d2c +DATA expandAVX512Asm_52_outShufLo+0x20(SB)/8, $0x4845444342414038 +DATA expandAVX512Asm_52_outShufLo+0x28(SB)/8, $0x5958504d4c4b4a49 +DATA expandAVX512Asm_52_outShufLo+0x30(SB)/8, $0x616007065d5c5b5a +DATA expandAVX512Asm_52_outShufLo+0x38(SB)/8, $0x6a69681716096362 + +GLOBL expandAVX512Asm_52_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_outShufHi0+0x00(SB)/8, $0x403d3c3b3a393830 +DATA expandAVX512Asm_52_outShufHi0+0x08(SB)/8, $0x51504d4c4b4a4948 +DATA expandAVX512Asm_52_outShufHi0+0x10(SB)/8, $0x6261605855545352 +DATA expandAVX512Asm_52_outShufHi0+0x18(SB)/8, $0x6c6b6a6968656463 +DATA expandAVX512Asm_52_outShufHi0+0x20(SB)/8, $0x7d7c7b7a7978706d +DATA expandAVX512Asm_52_outShufHi0+0x28(SB)/8, $0x31ffffffffffffff +DATA expandAVX512Asm_52_outShufHi0+0x30(SB)/8, $0xff3f3e3635343332 +DATA expandAVX512Asm_52_outShufHi0+0x38(SB)/8, $0xffff4f4e41ffffff + +GLOBL expandAVX512Asm_52_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_52_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_52_outShufHi1+0x28(SB)/8, $0xff08050403020100 +DATA expandAVX512Asm_52_outShufHi1+0x30(SB)/8, $0x10ffffffffffffff +DATA expandAVX512Asm_52_outShufHi1+0x38(SB)/8, $0x1918ffffff131211 + +TEXT expandAVX512Asm_52<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_52_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_52_inShuf1<>(SB), Z2 + 
VMOVDQU64 expandAVX512Asm_52_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_52_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_52_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_52_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_52_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_52_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0x387f80ffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0xc7807f0000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_56_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_inShuf0<>+0x00(SB)/8, $0x0100000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x08(SB)/8, $0x0100000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x10(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x18(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x20(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x28(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x30(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_56_inShuf0<>+0x38(SB)/8, $0xff00000000000000 + +GLOBL expandAVX512Asm_56_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_56_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_56_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_56_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_56_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_56_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_56_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_56_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_56_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_inShuf1<>+0x00(SB)/8, $0xffff010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x08(SB)/8, $0x0202010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x10(SB)/8, $0x0201010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x18(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x20(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x28(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x30(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_56_inShuf1<>+0x38(SB)/8, $0xff01010101010101 + +GLOBL expandAVX512Asm_56_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_inShuf2<>+0x00(SB)/8, $0xff02020202020202 +DATA expandAVX512Asm_56_inShuf2<>+0x08(SB)/8, $0xffffff0202020202 +DATA expandAVX512Asm_56_inShuf2<>+0x10(SB)/8, $0xffffffffffffff02 +DATA expandAVX512Asm_56_inShuf2<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_56_inShuf2<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_56_inShuf2<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_56_inShuf2<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_56_inShuf2<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_56_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_mat2<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_56_mat2<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_56_mat2<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_56_mat2<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_56_mat2<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_56_mat2<>+0x28(SB)/8, 
$0x0000000000000000 +DATA expandAVX512Asm_56_mat2<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_56_mat2<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_56_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_outShufLo+0x00(SB)/8, $0x0806050403020100 +DATA expandAVX512Asm_56_outShufLo+0x08(SB)/8, $0x11100e0d0c0b0a09 +DATA expandAVX512Asm_56_outShufLo+0x10(SB)/8, $0x1a19181615141312 +DATA expandAVX512Asm_56_outShufLo+0x18(SB)/8, $0x232221201e1d1c1b +DATA expandAVX512Asm_56_outShufLo+0x20(SB)/8, $0x2c2b2a2928262524 +DATA expandAVX512Asm_56_outShufLo+0x28(SB)/8, $0x3534333231302e2d +DATA expandAVX512Asm_56_outShufLo+0x30(SB)/8, $0x3e3d3c3b3a393836 +DATA expandAVX512Asm_56_outShufLo+0x38(SB)/8, $0x0f45444342414007 + +GLOBL expandAVX512Asm_56_outShufHi(SB), RODATA, $0x40 +DATA expandAVX512Asm_56_outShufHi+0x00(SB)/8, $0x11100d0c0b0a0908 +DATA expandAVX512Asm_56_outShufHi+0x08(SB)/8, $0x1a19181615141312 +DATA expandAVX512Asm_56_outShufHi+0x10(SB)/8, $0x232221201e1d1c1b +DATA expandAVX512Asm_56_outShufHi+0x18(SB)/8, $0x2c2b2a2928262524 +DATA expandAVX512Asm_56_outShufHi+0x20(SB)/8, $0x3534333231302e2d +DATA expandAVX512Asm_56_outShufHi+0x28(SB)/8, $0x3e3d3c3b3a393836 +DATA expandAVX512Asm_56_outShufHi+0x30(SB)/8, $0x0e46454443424140 +DATA expandAVX512Asm_56_outShufHi+0x38(SB)/8, $0x50174c4b4a49480f + +TEXT expandAVX512Asm_56<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_56_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_56_mat0<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_56_inShuf1<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_56_inShuf2<>(SB), Z5 + VMOVDQU64 expandAVX512Asm_56_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_56_outShufHi(SB), Z2 + VMOVDQU64 (AX), Z6 + VPERMB Z6, Z0, Z0 + VGF2P8AFFINEQB $0, Z3, Z0, Z0 + VPERMB Z6, Z4, Z4 + VGF2P8AFFINEQB $0, Z3, Z4, Z3 + VPERMB Z6, Z5, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_56_mat2<>(SB), Z4, Z4 + VPERMI2B Z3, Z0, Z1 + VPERMI2B Z4, Z3, Z2 + RET + +GLOBL expandAVX512Asm_60_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_inShuf0<>+0x00(SB)/8, $0x0100000000000000 +DATA expandAVX512Asm_60_inShuf0<>+0x08(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_60_inShuf0<>+0x10(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf0<>+0x18(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf0<>+0x20(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_60_inShuf0<>+0x28(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf0<>+0x30(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf0<>+0x38(SB)/8, $0xffffffffffffff00 + +GLOBL expandAVX512Asm_60_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_60_mat0<>+0x08(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_60_mat0<>+0x10(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_60_mat0<>+0x18(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_60_mat0<>+0x20(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_60_mat0<>+0x28(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_60_mat0<>+0x30(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_60_mat0<>+0x38(SB)/8, $0x1010101020202020 + +GLOBL expandAVX512Asm_60_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_inShuf1<>+0x00(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf1<>+0x08(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf1<>+0x10(SB)/8, $0xffffffffffffff00 +DATA expandAVX512Asm_60_inShuf1<>+0x18(SB)/8, $0xff00000000000000 +DATA expandAVX512Asm_60_inShuf1<>+0x20(SB)/8, $0xffffffffff010101 +DATA expandAVX512Asm_60_inShuf1<>+0x28(SB)/8, $0x0202020202010101 +DATA 
expandAVX512Asm_60_inShuf1<>+0x30(SB)/8, $0xffffffffffff0201 +DATA expandAVX512Asm_60_inShuf1<>+0x38(SB)/8, $0xff01010101010101 + +GLOBL expandAVX512Asm_60_mat1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_mat1<>+0x00(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_60_mat1<>+0x08(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_60_mat1<>+0x10(SB)/8, $0x4040404080808080 +DATA expandAVX512Asm_60_mat1<>+0x18(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_60_mat1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_60_mat1<>+0x28(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_60_mat1<>+0x30(SB)/8, $0x0101010102020202 +DATA expandAVX512Asm_60_mat1<>+0x38(SB)/8, $0x0202020202020202 + +GLOBL expandAVX512Asm_60_inShuf2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_inShuf2<>+0x00(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_60_inShuf2<>+0x08(SB)/8, $0xffffffffffffff01 +DATA expandAVX512Asm_60_inShuf2<>+0x10(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_60_inShuf2<>+0x18(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_60_inShuf2<>+0x20(SB)/8, $0xffffffffffffff01 +DATA expandAVX512Asm_60_inShuf2<>+0x28(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_60_inShuf2<>+0x30(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_60_inShuf2<>+0x38(SB)/8, $0xffffffffffffff01 + +GLOBL expandAVX512Asm_60_mat2<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_mat2<>+0x00(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_60_mat2<>+0x08(SB)/8, $0x0404040408080808 +DATA expandAVX512Asm_60_mat2<>+0x10(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_60_mat2<>+0x18(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_60_mat2<>+0x20(SB)/8, $0x1010101020202020 +DATA expandAVX512Asm_60_mat2<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_60_mat2<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_60_mat2<>+0x38(SB)/8, $0x4040404080808080 + +GLOBL expandAVX512Asm_60_inShuf3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_inShuf3<>+0x00(SB)/8, $0xff01010101010101 +DATA expandAVX512Asm_60_inShuf3<>+0x08(SB)/8, $0xffffffffffff0202 +DATA expandAVX512Asm_60_inShuf3<>+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_inShuf3<>+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_inShuf3<>+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_inShuf3<>+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_inShuf3<>+0x30(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_inShuf3<>+0x38(SB)/8, $0xffffffffffffffff + +GLOBL expandAVX512Asm_60_mat3<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_mat3<>+0x00(SB)/8, $0x8080808080808080 +DATA expandAVX512Asm_60_mat3<>+0x08(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_60_mat3<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_60_mat3<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_60_mat3<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_60_mat3<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_60_mat3<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_60_mat3<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_60_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_outShufLo+0x00(SB)/8, $0x0806050403020100 +DATA expandAVX512Asm_60_outShufLo+0x08(SB)/8, $0x1816151413121110 +DATA expandAVX512Asm_60_outShufLo+0x10(SB)/8, $0x28201e1d1c1b1a19 +DATA expandAVX512Asm_60_outShufLo+0x18(SB)/8, $0x31302e2d2c2b2a29 +DATA expandAVX512Asm_60_outShufLo+0x20(SB)/8, $0x4140383635343332 +DATA expandAVX512Asm_60_outShufLo+0x28(SB)/8, $0x4a49484645444342 +DATA expandAVX512Asm_60_outShufLo+0x30(SB)/8, $0x5a5958504e4d4c4b +DATA 
expandAVX512Asm_60_outShufLo+0x38(SB)/8, $0x626160075e5d5c5b + +GLOBL expandAVX512Asm_60_outShufHi0(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_outShufHi0+0x00(SB)/8, $0x3b3a3938302a2928 +DATA expandAVX512Asm_60_outShufHi0+0x08(SB)/8, $0x44434241403e3d3c +DATA expandAVX512Asm_60_outShufHi0+0x10(SB)/8, $0x5453525150484645 +DATA expandAVX512Asm_60_outShufHi0+0x18(SB)/8, $0x5d5c5b5a59585655 +DATA expandAVX512Asm_60_outShufHi0+0x20(SB)/8, $0x6d6c6b6a6968605e +DATA expandAVX512Asm_60_outShufHi0+0x28(SB)/8, $0x767574737271706e +DATA expandAVX512Asm_60_outShufHi0+0x30(SB)/8, $0xffffffffffffff78 +DATA expandAVX512Asm_60_outShufHi0+0x38(SB)/8, $0x31ffff2f2e2d2c2b + +GLOBL expandAVX512Asm_60_outShufHi1(SB), RODATA, $0x40 +DATA expandAVX512Asm_60_outShufHi1+0x00(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_outShufHi1+0x08(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_outShufHi1+0x10(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_outShufHi1+0x18(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_outShufHi1+0x20(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_outShufHi1+0x28(SB)/8, $0xffffffffffffffff +DATA expandAVX512Asm_60_outShufHi1+0x30(SB)/8, $0x06050403020100ff +DATA expandAVX512Asm_60_outShufHi1+0x38(SB)/8, $0xff0908ffffffffff + +TEXT expandAVX512Asm_60<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_60_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_60_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_60_inShuf2<>(SB), Z3 + VMOVDQU64 expandAVX512Asm_60_inShuf3<>(SB), Z4 + VMOVDQU64 expandAVX512Asm_60_outShufLo(SB), Z1 + VMOVDQU64 expandAVX512Asm_60_outShufHi0(SB), Z5 + VMOVDQU64 expandAVX512Asm_60_outShufHi1(SB), Z6 + VMOVDQU64 (AX), Z7 + VPERMB Z7, Z0, Z0 + VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat0<>(SB), Z0, Z0 + VPERMB Z7, Z2, Z2 + VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat1<>(SB), Z2, Z2 + VPERMB Z7, Z3, Z3 + VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat2<>(SB), Z3, Z3 + VPERMB Z7, Z4, Z4 + VGF2P8AFFINEQB $0, expandAVX512Asm_60_mat3<>(SB), Z4, Z4 + VPERMI2B Z2, Z0, Z1 + MOVQ $0x9f01ffffffffffff, AX + KMOVQ AX, K1 + VPERMI2B.Z Z3, Z2, K1, Z5 + MOVQ $0x60fe000000000000, AX + KMOVQ AX, K1 + VPERMB.Z Z4, Z6, K1, Z0 + VPORQ Z0, Z5, Z2 + RET + +GLOBL expandAVX512Asm_64_inShuf0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_64_inShuf0<>+0x00(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x08(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x10(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x18(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x20(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x28(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x30(SB)/8, $0x0000000000000000 +DATA expandAVX512Asm_64_inShuf0<>+0x38(SB)/8, $0x0000000000000000 + +GLOBL expandAVX512Asm_64_mat0<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_64_mat0<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_mat0<>+0x08(SB)/8, $0x0202020202020202 +DATA expandAVX512Asm_64_mat0<>+0x10(SB)/8, $0x0404040404040404 +DATA expandAVX512Asm_64_mat0<>+0x18(SB)/8, $0x0808080808080808 +DATA expandAVX512Asm_64_mat0<>+0x20(SB)/8, $0x1010101010101010 +DATA expandAVX512Asm_64_mat0<>+0x28(SB)/8, $0x2020202020202020 +DATA expandAVX512Asm_64_mat0<>+0x30(SB)/8, $0x4040404040404040 +DATA expandAVX512Asm_64_mat0<>+0x38(SB)/8, $0x8080808080808080 + +GLOBL expandAVX512Asm_64_inShuf1<>(SB), RODATA, $0x40 +DATA expandAVX512Asm_64_inShuf1<>+0x00(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_inShuf1<>+0x08(SB)/8, $0x0101010101010101 +DATA 
expandAVX512Asm_64_inShuf1<>+0x10(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_inShuf1<>+0x18(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_inShuf1<>+0x20(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_inShuf1<>+0x28(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_inShuf1<>+0x30(SB)/8, $0x0101010101010101 +DATA expandAVX512Asm_64_inShuf1<>+0x38(SB)/8, $0x0101010101010101 + +GLOBL expandAVX512Asm_64_outShufLo(SB), RODATA, $0x40 +DATA expandAVX512Asm_64_outShufLo+0x00(SB)/8, $0x0706050403020100 +DATA expandAVX512Asm_64_outShufLo+0x08(SB)/8, $0x0f0e0d0c0b0a0908 +DATA expandAVX512Asm_64_outShufLo+0x10(SB)/8, $0x1716151413121110 +DATA expandAVX512Asm_64_outShufLo+0x18(SB)/8, $0x1f1e1d1c1b1a1918 +DATA expandAVX512Asm_64_outShufLo+0x20(SB)/8, $0x2726252423222120 +DATA expandAVX512Asm_64_outShufLo+0x28(SB)/8, $0x2f2e2d2c2b2a2928 +DATA expandAVX512Asm_64_outShufLo+0x30(SB)/8, $0x3736353433323130 +DATA expandAVX512Asm_64_outShufLo+0x38(SB)/8, $0x3f3e3d3c3b3a3938 + +TEXT expandAVX512Asm_64<>(SB), NOSPLIT, $0-0 + VMOVDQU64 expandAVX512Asm_64_inShuf0<>(SB), Z0 + VMOVDQU64 expandAVX512Asm_64_mat0<>(SB), Z1 + VMOVDQU64 expandAVX512Asm_64_inShuf1<>(SB), Z2 + VMOVDQU64 expandAVX512Asm_64_outShufLo(SB), Z3 + VMOVDQU64 (AX), Z4 + VPERMB Z4, Z0, Z0 + VGF2P8AFFINEQB $0, Z1, Z0, Z0 + VPERMB Z4, Z2, Z2 + VGF2P8AFFINEQB $0, Z1, Z2, Z2 + VPERMB Z0, Z3, Z1 + VPERMB Z2, Z3, Z2 + RET + diff --git a/src/internal/runtime/gc/scan/expand_amd64.go b/src/internal/runtime/gc/scan/export_amd64_test.go similarity index 76% rename from src/internal/runtime/gc/scan/expand_amd64.go rename to src/internal/runtime/gc/scan/export_amd64_test.go index 9bea471abe..ea3d86dfbf 100644 --- a/src/internal/runtime/gc/scan/expand_amd64.go +++ b/src/internal/runtime/gc/scan/export_amd64_test.go @@ -2,9 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build amd64 + package scan -import "internal/runtime/gc" +import ( + "internal/runtime/gc" +) // ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked, // where f is the word size of objects in sizeClass. @@ -12,11 +16,11 @@ import "internal/runtime/gc" // This is a testing entrypoint to the expanders used by scanSpanPacked*. // //go:noescape -func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) +func ExpandAVX512Asm(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) // gcExpandersAVX512 is the PCs of expander functions. These cannot be called directly // as they don't follow the Go ABI, but you can use this to check if a given // expander PC is 0. // // It is defined in assembly. -var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr +var gcExpandersAVX512Asm [len(gc.SizeClassToSize)]uintptr diff --git a/src/internal/runtime/gc/scan/export_simd_amd64_test.go b/src/internal/runtime/gc/scan/export_simd_amd64_test.go new file mode 100644 index 0000000000..bb6bc8d4cc --- /dev/null +++ b/src/internal/runtime/gc/scan/export_simd_amd64_test.go @@ -0,0 +1,24 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package scan + +import ( + "internal/runtime/gc" + "simd" + "unsafe" +) + +// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked, +// where f is the word size of objects in sizeClass. +// +// This is a testing entrypoint to the expanders used by scanSpanPacked*. 
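+//
+// As a scalar reference (an editor's sketch, not part of this CL), with
+// f = gc.SizeClassToSize[sizeClass] / 8 the expansion is:
+//
+//	for j := 0; j < len(unpacked)*64; j++ {
+//		i := j / f
+//		if packed[i/64]>>(i%64)&1 != 0 {
+//			unpacked[j/64] |= 1 << (j % 64)
+//		}
+//	}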
+func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) { + v1, v2 := gcExpandersAVX512[sizeClass](unsafe.Pointer(packed)) + v1.Store((*[8]uint64)(unsafe.Pointer(unpacked))) + v2.Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(unpacked)) + 64))) + simd.ClearAVXUpperBits() +} diff --git a/src/internal/runtime/gc/scan/mkasm.go b/src/internal/runtime/gc/scan/mkasm.go index e36defb2e1..9675652978 100644 --- a/src/internal/runtime/gc/scan/mkasm.go +++ b/src/internal/runtime/gc/scan/mkasm.go @@ -22,7 +22,7 @@ import ( const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n" func main() { - generate("expand_amd64.s", genExpanders) + generate("expanders_amd64.s", genExpanders) } func generate(fileName string, genFunc func(*gen.File)) { @@ -63,7 +63,7 @@ func genExpanders(file *gen.File) { xf := int(ob) / 8 log.Printf("size class %d bytes, expansion %dx", ob, xf) - fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf)) + fn := gen.NewFunc(fmt.Sprintf("expandAVX512Asm_%d<>", xf)) ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn) if xf == 1 { @@ -79,7 +79,7 @@ func genExpanders(file *gen.File) { } // Generate table mapping size class to expander PC - file.AddConst("·gcExpandersAVX512", gcExpandersAVX512) + file.AddConst("·gcExpandersAVX512Asm", gcExpandersAVX512) } // mat8x8 is an 8x8 bit matrix. diff --git a/src/internal/runtime/gc/scan/mkexpanders.go b/src/internal/runtime/gc/scan/mkexpanders.go new file mode 100644 index 0000000000..7f8c14cf6f --- /dev/null +++ b/src/internal/runtime/gc/scan/mkexpanders.go @@ -0,0 +1,638 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file is a fork of mkasm.go, instead of generating +// assembly code, this file generates Go code that uses +// the simd package. + +//go:build ignore + +package main + +import ( + "bytes" + "fmt" + "go/format" + "log" + "os" + "slices" + "strconv" + "strings" + "text/template" + "unsafe" + + "internal/runtime/gc" +) + +var simdTemplate = template.Must(template.New("template").Parse(` +{{- define "header"}} +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build goexperiment.simd && amd64 + +package scan + +import ( + "simd" + "unsafe" +) +{{- end}} +{{- define "expandersList"}} +var gcExpandersAVX512 = [{{- len .}}]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){ +{{- range .}} + {{.}}, +{{- end}} +} +{{- end}} + +{{- define "expanderData"}} +var {{.Name}} = [8]uint64{ +{{.Vals}} +} +{{- end}} + +{{- define "expander"}} +func {{.Name}}(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) { + {{- .BodyLoadString }} + {{- .BodyString }} +} +{{- end}} +`)) + +// expanderData is global data used by the expanders. +// They will be generated as global arrays. +type expanderData struct { + Name string // Name of the global array + Vals string // The values of the arrays, should already be formatted. +} + +// expander is the expander function, it only operates on 3 kinds of values: +// +// uint8x64, mask8x64, uint64. +// +// And a limited set of operations. +type expander struct { + Name string // The name of the expander function + BodyLoad strings.Builder + Body strings.Builder // The actual expand computations, after loads. 
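+	// data and dataByVals track the global arrays referenced by the generated
+	// function; dataByVals de-duplicates identical tables by value. The counters
+	// below mint fresh names for generated vector, mask, and uint64 variables
+	// (v0, v1, ..., m0, ..., u0, ...).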
+ data []expanderData + dataByVals map[string]string + uint8x64Cnt int + mask8x64Cnt int + uint64Cnt int +} + +// Used by text/template. +// This is needed because tex/template cannot call pointer receiver methods. +func (e expander) BodyLoadString() string { + return e.BodyLoad.String() +} + +func (e expander) BodyString() string { + return e.Body.String() +} + +// mat8x8 is an 8x8 bit matrix. +type mat8x8 struct { + mat [8]uint8 +} + +func matGroupToVec(mats *[8]mat8x8) [8]uint64 { + var out [8]uint64 + for i, mat := range mats { + for j, row := range mat.mat { + // For some reason, Intel flips the rows. + out[i] |= uint64(row) << ((7 - j) * 8) + } + } + return out +} + +func (fn *expander) newVec() string { + v := fmt.Sprintf("v%d", fn.uint8x64Cnt) + fn.uint8x64Cnt++ + return v +} + +func (fn *expander) newMask() string { + v := fmt.Sprintf("m%d", fn.mask8x64Cnt) + fn.mask8x64Cnt++ + return v +} + +func (fn *expander) newU() string { + v := fmt.Sprintf("u%d", fn.uint64Cnt) + fn.uint64Cnt++ + return v +} + +// expandIdentity implements 1x expansion (that is, no expansion). +func (fn *expander) expandIdentity() { + fn.Body.WriteString(` + x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64() + y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src)+64))).AsUint8x64() + return x.AsUint64x8(), y.AsUint64x8()`) +} + +func (fn *expander) loadSrcAsUint8x64() string { + v := fn.newVec() + fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()\n", v)) + return v +} + +func (fn *expander) loadGlobalArrAsUint8x64(arrName string) string { + v := fn.newVec() + fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8(&%s).AsUint8x64()\n", v, arrName)) + return v +} + +func (fn *expander) permuteUint8x64(data, indices string) string { + v := fn.newVec() + fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s)\n", v, data, indices)) + return v +} + +func (fn *expander) permute2Uint8x64(x, y, indices string) string { + v := fn.newVec() + fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s)\n", v, x, y, indices)) + return v +} + +func (fn *expander) permuteMaskedUint8x64(data, indices, mask string) string { + v := fn.newVec() + fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s).Masked(%s)\n", v, data, indices, mask)) + return v +} + +func (fn *expander) permute2MaskedUint8x64(x, y, indices, mask string) string { + v := fn.newVec() + fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s).Masked(%s)\n", v, x, y, indices, mask)) + return v +} + +func (fn *expander) galoisFieldAffineTransformUint8x64(data, matrix string) string { + v := fn.newVec() + fn.Body.WriteString(fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix)) + return v +} + +func (fn *expander) returns(x, y string) { + fn.Body.WriteString(fmt.Sprintf("return %s.AsUint64x8(), %s.AsUint64x8()", x, y)) +} + +func uint8x64Data(data [64]uint8) string { + res := "" + for i := range 8 { + ptr64 := (*uint64)(unsafe.Pointer(&data[i*8])) + res += fmt.Sprintf("%#016x,", *ptr64) + if i == 3 { + res += "\n" + } + } + return res +} + +func uint64x8Data(data [8]uint64) string { + res := "" + for i := range 8 { + res += fmt.Sprintf("%#016x,", data[i]) + if i == 3 { + res += "\n" + } + } + return res +} + +func (fn *expander) loadGlobalUint8x64(name string, data [64]uint8) string { + val := uint8x64Data(data) + if n, ok := fn.dataByVals[val]; !ok { + fullName := fmt.Sprintf("%s_%s", fn.Name, name) + fn.data = append(fn.data, expanderData{fullName, 
val}) + v := fn.loadGlobalArrAsUint8x64(fullName) + fn.dataByVals[val] = v + return v + } else { + return n + } +} + +func (fn *expander) loadGlobalUint64x8(name string, data [8]uint64) string { + val := uint64x8Data(data) + if n, ok := fn.dataByVals[val]; !ok { + fullName := fmt.Sprintf("%s_%s", fn.Name, name) + fn.data = append(fn.data, expanderData{fullName, val}) + v := fn.loadGlobalArrAsUint8x64(fullName) + fn.dataByVals[val] = v + return v + } else { + return n + } +} + +func (fn *expander) mask8x64FromBits(data uint64) string { + v1 := fn.newU() + v2 := fn.newMask() + fn.Body.WriteString(fmt.Sprintf("%s := uint64(%#x)\n%s := simd.Mask8x64FromBits(%s)\n", + v1, data, v2, v1)) + return v2 +} + +func (fn *expander) orUint8x64(x, y string) string { + v := fn.newVec() + fn.Body.WriteString(fmt.Sprintf("%s := %s.Or(%s)\n", v, x, y)) + return v +} + +func main() { + generate("expanders_amd64.go", genExpanders) +} + +func generate(fileName string, genFunc func(*bytes.Buffer)) { + var buf bytes.Buffer + genFunc(&buf) + f, err := os.Create(fileName) + if err != nil { + log.Fatal(err) + } + defer f.Close() + b, err := format.Source(buf.Bytes()) + if err != nil { + log.Printf(string(buf.Bytes())) + log.Fatal(err) + } + _, err = f.Write(b) + if err != nil { + log.Fatal(err) + } +} + +func genExpanders(buffer *bytes.Buffer) { + if err := simdTemplate.ExecuteTemplate(buffer, "header", nil); err != nil { + panic(fmt.Errorf("failed to execute header template: %w", err)) + } + gcExpandersAVX512 := make([]expander, len(gc.SizeClassToSize)) + for sc, ob := range gc.SizeClassToSize { + if gc.SizeClassToNPages[sc] != 1 { + // These functions all produce a bitmap that covers exactly one + // page. + continue + } + if ob > gc.MinSizeForMallocHeader { + // This size class is too big to have a packed pointer/scalar bitmap. + break + } + + xf := int(ob) / 8 + log.Printf("size class %d bytes, expansion %dx", ob, xf) + + fn := expander{Name: fmt.Sprintf("expandAVX512_%d", xf), dataByVals: make(map[string]string)} + + if xf == 1 { + fn.expandIdentity() + } else { + ok := gfExpander(xf, &fn) + if !ok { + log.Printf("failed to generate expander for size class %d", sc) + } + } + gcExpandersAVX512[sc] = fn + } + // Fill in the expanders data first + eld := make([]string, len(gcExpandersAVX512)) + for i, gce := range gcExpandersAVX512 { + if gce.Name == "" { + eld[i] = "nil" + } else { + eld[i] = gce.Name + } + } + if err := simdTemplate.ExecuteTemplate(buffer, "expandersList", eld); err != nil { + panic(fmt.Errorf("failed to execute expandersList template: %w", err)) + } + // List out the expander functions and their data + for _, gce := range gcExpandersAVX512 { + if gce.Name == "" { + continue + } + for _, data := range gce.data { + if err := simdTemplate.ExecuteTemplate(buffer, "expanderData", data); err != nil { + panic(fmt.Errorf("failed to execute expanderData template: %w", err)) + } + } + if err := simdTemplate.ExecuteTemplate(buffer, "expander", gce); err != nil { + panic(fmt.Errorf("failed to execute expander template: %w", err)) + } + } +} + +// gfExpander produces a function that expands each bit in an input bitmap into +// f consecutive bits in an output bitmap. 
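+// For example, with f = 4 a set bit at input position 3 yields set bits at
+// output positions 12 through 15.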
+// +// The input is +// +// *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits) +// +// The output is +// +// [64]uint8 = The bottom 512 bits of the expanded bitmap +// [64]uint8 = The top 512 bits of the expanded bitmap +func gfExpander(f int, fn *expander) bool { + // TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler. + + // TODO(austin): For f >= 8, I suspect there are better ways to do this. + // + // For example, we could use a mask expansion to get a full byte for each + // input bit, and separately create the bytes that blend adjacent bits, then + // shuffle those bytes together. Certainly for f >= 16 this makes sense + // because each of those bytes will be used, possibly more than once. + + objBits := fn.loadSrcAsUint8x64() + + type term struct { + iByte, oByte int + mat mat8x8 + } + var terms []term + + // Iterate over all output bytes and construct the 8x8 GF2 matrix to compute + // the output byte from the appropriate input byte. Gather all of these into + // "terms". + for oByte := 0; oByte < 1024/8; oByte++ { + var byteMat mat8x8 + iByte := -1 + for oBit := oByte * 8; oBit < oByte*8+8; oBit++ { + iBit := oBit / f + if iByte == -1 { + iByte = iBit / 8 + } else if iByte != iBit/8 { + log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8) + return false + } + // One way to view this is that the i'th row of the matrix will be + // ANDed with the input byte, and the parity of the result will set + // the i'th bit in the output. We use a simple 1 bit mask, so the + // parity is irrelevant beyond selecting out that one bit. + byteMat.mat[oBit%8] = 1 << (iBit % 8) + } + terms = append(terms, term{iByte, oByte, byteMat}) + } + + if false { + // Print input byte -> output byte as a matrix + maxIByte, maxOByte := 0, 0 + for _, term := range terms { + maxIByte = max(maxIByte, term.iByte) + maxOByte = max(maxOByte, term.oByte) + } + iToO := make([][]rune, maxIByte+1) + for i := range iToO { + iToO[i] = make([]rune, maxOByte+1) + } + matMap := make(map[mat8x8]int) + for _, term := range terms { + i, ok := matMap[term.mat] + if !ok { + i = len(matMap) + matMap[term.mat] = i + } + iToO[term.iByte][term.oByte] = 'A' + rune(i) + } + for o := range maxOByte + 1 { + fmt.Printf("%d", o) + for i := range maxIByte + 1 { + fmt.Printf(",") + if mat := iToO[i][o]; mat != 0 { + fmt.Printf("%c", mat) + } + } + fmt.Println() + } + } + + // In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel, + // and we get to operate on up to 8 matrixes in parallel (or 64 values). That is: + // + // abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+ + // mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7 + + // Group the terms by matrix, but limit each group to 8 terms. + const termsPerGroup = 8 // Number of terms we can multiply by the same matrix. + const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector. + + matMap := make(map[mat8x8]int) + allMats := make(map[mat8x8]bool) + var termGroups [][]term + for _, term := range terms { + allMats[term.mat] = true + + i, ok := matMap[term.mat] + if ok && f > groupsPerSuperGroup { + // The output is ultimately produced in two [64]uint8 registers. + // Getting every byte in the right place of each of these requires a + // final permutation that often requires more than one source. 
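+ // (The permutes we can emit read from at most two source vectors, so a
+ // destination register that needs bytes from three or four of the matrix
+ // products falls back to the masked two-step shuffle in genShuffle.)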
+ // + // Up to 8x expansion, we can get a really nice grouping so we can use + // the same 8 matrix vector several times, without producing + // permutations that require more than two sources. + // + // Above 8x, however, we can't get nice matrixes anyway, so we + // instead prefer reducing the complexity of the permutations we + // need to produce the final outputs. To do this, avoid grouping + // together terms that are split across the two registers. + outRegister := termGroups[i][0].oByte / 64 + if term.oByte/64 != outRegister { + ok = false + } + } + if !ok { + // Start a new term group. + i = len(termGroups) + matMap[term.mat] = i + termGroups = append(termGroups, nil) + } + + termGroups[i] = append(termGroups[i], term) + + if len(termGroups[i]) == termsPerGroup { + // This term group is full. + delete(matMap, term.mat) + } + } + + for i, termGroup := range termGroups { + log.Printf("term group %d:", i) + for _, term := range termGroup { + log.Printf(" %+v", term) + } + } + + // We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack + // as many term groups as we can into each super-group to minimize the + // number of matrix multiplies. + // + // Ideally, we use the same matrix in each super-group, which might mean + // doing fewer than 8 multiplies at a time. That's fine because it never + // increases the total number of matrix multiplies. + // + // TODO: Packing the matrixes less densely may let us use more broadcast + // loads instead of general permutations, though. That replaces a load of + // the permutation with a load of the matrix, but is probably still slightly + // better. + var sgSize, nSuperGroups int + oneMatVec := f <= groupsPerSuperGroup + if oneMatVec { + // We can use the same matrix in each multiply by doing sgSize + // multiplies at a time. + sgSize = groupsPerSuperGroup / len(allMats) * len(allMats) + nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize + } else { + // We can't use the same matrix for each multiply. Just do as many at a + // time as we can. + // + // TODO: This is going to produce several distinct matrixes, when we + // probably only need two. Be smarter about how we create super-groups + // in this case. Maybe we build up an array of super-groups and then the + // loop below just turns them into ops? + sgSize = 8 + nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup + } + + // Construct each super-group. + var matGroup [8]mat8x8 + var matMuls []string + var perm [128]int + for sgi := range nSuperGroups { + var iperm [64]uint8 + for i := range iperm { + iperm[i] = 0xff // "Don't care" + } + // Pick off sgSize term groups. + superGroup := termGroups[:min(len(termGroups), sgSize)] + termGroups = termGroups[len(superGroup):] + // Build the matrix and permutations for this super-group. + var thisMatGroup [8]mat8x8 + for i, termGroup := range superGroup { + // All terms in this group have the same matrix. Pick one. + thisMatGroup[i] = termGroup[0].mat + for j, term := range termGroup { + // Build the input permutation. + iperm[i*termsPerGroup+j] = uint8(term.iByte) + // Build the output permutation. + perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j + } + } + log.Printf("input permutation %d: %v", sgi, iperm) + + // Check that we're not making more distinct matrixes than expected. 
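+		// When oneMatVec is true, every super-group must reuse the exact same
+		// vector of eight matrixes; a mismatch means the grouping above went wrong.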
+		if oneMatVec {
+			if sgi == 0 {
+				matGroup = thisMatGroup
+			} else if matGroup != thisMatGroup {
+				log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
+				return false
+			}
+		}
+
+		// Emit matrix op.
+		matConst :=
+			fn.loadGlobalUint64x8(fmt.Sprintf("mat%d", sgi),
+				matGroupToVec(&thisMatGroup))
+		inShufConst :=
+			fn.loadGlobalUint8x64(fmt.Sprintf("inShuf%d", sgi),
+				iperm)
+		inOp := fn.permuteUint8x64(objBits, inShufConst)
+		matMul := fn.galoisFieldAffineTransformUint8x64(inOp, matConst)
+		matMuls = append(matMuls, matMul)
+	}
+
+	log.Printf("output permutation: %v", perm)
+
+	outLo, ok := genShuffle(fn, "outShufLo", (*[64]int)(perm[:64]), matMuls...)
+	if !ok {
+		log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+		return false
+	}
+	outHi, ok := genShuffle(fn, "outShufHi", (*[64]int)(perm[64:]), matMuls...)
+	if !ok {
+		log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
+		return false
+	}
+	fn.returns(outLo, outHi)
+
+	return true
+}
+
+func genShuffle(fn *expander, name string, perm *[64]int, args ...string) (string, bool) {
+	// Construct flattened permutation.
+	var vperm [64]byte
+
+	// Get the inputs used by this permutation.
+	var inputs []int
+	for i, src := range perm {
+		inputIdx := slices.Index(inputs, src/64)
+		if inputIdx == -1 {
+			inputIdx = len(inputs)
+			inputs = append(inputs, src/64)
+		}
+		vperm[i] = byte(src%64 | (inputIdx << 6))
+	}
+
+	// Emit instructions for easy cases.
+	switch len(inputs) {
+	case 1:
+		constOp := fn.loadGlobalUint8x64(name, vperm)
+		return fn.permuteUint8x64(args[inputs[0]], constOp), true
+	case 2:
+		constOp := fn.loadGlobalUint8x64(name, vperm)
+		return fn.permute2Uint8x64(args[inputs[0]], args[inputs[1]], constOp), true
+	}
+
+	// Harder case, we need to shuffle in from up to 2 more tables.
+	//
+	// Perform two shuffles. One shuffle will get its data from the first
+	// two inputs, the other shuffle will get its data from the other one
+	// or two inputs. All values they each don't care about will
+	// be zeroed.
+	var vperms [2][64]byte
+	var masks [2]uint64
+	for j, idx := range vperm {
+		for i := range vperms {
+			vperms[i][j] = 0xff // "Don't care"
+		}
+		if idx == 0xff {
+			continue
+		}
+		vperms[idx/128][j] = idx % 128
+		masks[idx/128] |= uint64(1) << j
+	}
+
+	// Validate that the masks are fully disjoint.
+	if masks[0]^masks[1] != ^uint64(0) {
+		panic("bad shuffle!")
+	}
+
+	// Generate constants.
+	constOps := make([]string, len(vperms))
+	for i, v := range vperms {
+		constOps[i] = fn.loadGlobalUint8x64(name+strconv.Itoa(i), v)
+	}
+
+	// Generate shuffles.
+	switch len(inputs) {
+	case 3:
+		r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+		r1 := fn.permuteMaskedUint8x64(args[inputs[2]], constOps[1], fn.mask8x64FromBits(masks[1]))
+		return fn.orUint8x64(r0, r1), true
+	case 4:
+		r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
+		r1 := fn.permute2MaskedUint8x64(args[inputs[2]], args[inputs[3]], constOps[1], fn.mask8x64FromBits(masks[1]))
+		return fn.orUint8x64(r0, r1), true
+	}
+
+	// Too many inputs. To support more, we'd need to separate tables much earlier.
+	// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
+ return args[0], false +} diff --git a/src/internal/runtime/gc/scan/scan_amd64.go b/src/internal/runtime/gc/scan/scan_amd64.go index 2ac181f97e..4af5a81f31 100644 --- a/src/internal/runtime/gc/scan/scan_amd64.go +++ b/src/internal/runtime/gc/scan/scan_amd64.go @@ -6,13 +6,25 @@ package scan import ( "internal/cpu" + "internal/goexperiment" "internal/runtime/gc" "unsafe" ) func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { if CanAVX512() { - return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask) + if goexperiment.SIMD { + return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask) + } else { + return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask) + } + } + panic("not implemented") +} + +func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + if CanAVX512() { + return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask) } panic("not implemented") } @@ -27,12 +39,12 @@ func CanAVX512() bool { return avx512ScanPackedReqsMet } -func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { - return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)) +func ScanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + return FilterNil(bufp, scanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)) } //go:noescape -func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) +func scanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL && cpu.X86.HasAVX512BW && diff --git a/src/internal/runtime/gc/scan/scan_amd64.s b/src/internal/runtime/gc/scan/scan_amd64.s index 9b4950a767..7430a86294 100644 --- a/src/internal/runtime/gc/scan/scan_amd64.s +++ b/src/internal/runtime/gc/scan/scan_amd64.s @@ -6,12 +6,12 @@ #include "textflag.h" // Test-only. 
-TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24 +TEXT ·ExpandAVX512Asm(SB), NOSPLIT, $0-24 MOVQ sizeClass+0(FP), CX MOVQ packed+8(FP), AX // Call the expander for this size class - LEAQ ·gcExpandersAVX512(SB), BX + LEAQ ·gcExpandersAVX512Asm(SB), BX CALL (BX)(CX*8) MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer @@ -20,11 +20,11 @@ TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24 VZEROUPPER RET -TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44 +TEXT ·scanSpanPackedAVX512Asm(SB), NOSPLIT, $256-44 // Z1+Z2 = Expand the grey object mask into a grey word mask MOVQ objMarks+16(FP), AX MOVQ sizeClass+24(FP), CX - LEAQ ·gcExpandersAVX512(SB), BX + LEAQ ·gcExpandersAVX512Asm(SB), BX CALL (BX)(CX*8) // Z3+Z4 = Load the pointer mask diff --git a/src/internal/runtime/gc/scan/scan_amd64_test.go b/src/internal/runtime/gc/scan/scan_amd64_test.go index a914b4f4d7..b628db9cdc 100644 --- a/src/internal/runtime/gc/scan/scan_amd64_test.go +++ b/src/internal/runtime/gc/scan/scan_amd64_test.go @@ -11,6 +11,13 @@ import ( "testing" ) +func TestScanSpanPackedAVX512Asm(t *testing.T) { + if !scan.CanAVX512() { + t.Skip("no AVX512") + } + testScanSpanPacked(t, scan.ScanSpanPackedAVX512Asm) +} + func TestScanSpanPackedAVX512(t *testing.T) { if !scan.CanAVX512() { t.Skip("no AVX512") diff --git a/src/internal/runtime/gc/scan/scan_generic.go b/src/internal/runtime/gc/scan/scan_generic.go index a4d51827cc..68c72182ec 100644 --- a/src/internal/runtime/gc/scan/scan_generic.go +++ b/src/internal/runtime/gc/scan/scan_generic.go @@ -21,3 +21,6 @@ func HasFastScanSpanPacked() bool { func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask) } +func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + panic("not implemented") +} diff --git a/src/internal/runtime/gc/scan/scan_nosimd_amd64.go b/src/internal/runtime/gc/scan/scan_nosimd_amd64.go new file mode 100644 index 0000000000..4d523d5bcd --- /dev/null +++ b/src/internal/runtime/gc/scan/scan_nosimd_amd64.go @@ -0,0 +1,16 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !goexperiment.simd + +package scan + +import ( + "internal/runtime/gc" + "unsafe" +) + +func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { + panic("not implemented") +} diff --git a/src/internal/runtime/gc/scan/scan_simd_amd64.go b/src/internal/runtime/gc/scan/scan_simd_amd64.go new file mode 100644 index 0000000000..101358c60b --- /dev/null +++ b/src/internal/runtime/gc/scan/scan_simd_amd64.go @@ -0,0 +1,92 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+//go:build goexperiment.simd && amd64
+
+package scan
+
+import (
+	"internal/abi"
+	"internal/runtime/gc"
+	"math/bits"
+	"simd"
+	"unsafe"
+)
+
+func FilterNilAVX512(bufp *uintptr, n int32) (cnt int32) {
+	scanned := 0
+	buf := unsafe.Slice((*uint64)(unsafe.Pointer(bufp)), int(n))
+	// Use the widest vector
+	var zeros simd.Uint64x8
+	for ; scanned+8 <= int(n); scanned += 8 {
+		v := simd.LoadUint64x8Slice(buf[scanned:])
+		m := v.NotEqual(zeros)
+		v.Compress(m).StoreSlice(buf[cnt:])
+		// Count the mask bits
+		mbits := uint64(m.ToBits())
+		mbits &= 0xFF // Only the lower 8 bits are meaningful.
+		nonNilCnt := bits.OnesCount64(mbits)
+		cnt += int32(nonNilCnt)
+	}
+	// Scalar code to clean up tails.
+	for i := scanned; i < int(n); i++ {
+		if buf[i] != 0 {
+			buf[cnt] = buf[i]
+			cnt++
+		}
+	}
+	return
+}
+
+func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+	return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
+}
+
+func scanSpanPackedAVX512(mem unsafe.Pointer, buf *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
+	// Expand the grey object mask into a grey word mask
+	m1, m2 := gcExpandersAVX512[sizeClass](abi.NoEscape(unsafe.Pointer(objMarks)))
+	// Load the pointer mask
+	ptrm := unsafe.Pointer(ptrMask)
+	m3 := simd.LoadUint64x8((*[8]uint64)(ptrm))
+	m4 := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(ptrm) + 64)))
+
+	masks := [128]uint8{}
+	counts := [128]uint8{}
+	// Combine the grey word mask with the pointer mask to get the scan mask
+	m1m3 := m1.And(m3).AsUint8x64()
+	m2m4 := m2.And(m4).AsUint8x64()
+	m1m3.Store((*[64]uint8)(unsafe.Pointer(&masks[0])))
+	m2m4.Store((*[64]uint8)(unsafe.Pointer(&masks[64])))
+	// Now each bit of m1m3 and m2m4 represents one word of the span.
+	// Thus, each byte covers 64 bytes of memory, which is also how
+	// much we can fit in a ZMM register.
+	//
+	// We do a load/compress for each 64 byte frame.
+	//
+	// counts = Number of memory words to scan in each 64 byte frame
+	// TODO: Right now the type casting is done via memory, is it possible to
+	// work around these stores and loads and keep them in register?
+	m1m3.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[0])))
+	m2m4.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[64])))
+
+	// Loop over the 64 byte frames in this span.
+	// TODO: is there a way to PCALIGN this loop?
+	for i := range 128 {
+		mv := masks[i]
+		// Skip empty frames.
+		if mv == 0 {
+			continue
+		}
+		// Load the 64 byte frame.
+		m := simd.Mask64x8FromBits(mv)
+		ptrs := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(mem) + uintptr(i*64))))
+		// Collect just the pointers from the greyed objects into the scan buffer,
+		// i.e., compact the words selected by the mask into contiguous memory.
+		ptrs.Compress(m).Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(count*8))))
+		// Advance the scan buffer position by the number of pointers.
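+		// (counts[i] is the popcount of masks[i], computed above with OnesCount.)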
+ count += int32(counts[i]) + } + simd.ClearAVXUpperBits() + return +} diff --git a/src/internal/runtime/gc/scan/scan_test.go b/src/internal/runtime/gc/scan/scan_test.go index 1208783b6f..7cadb609bf 100644 --- a/src/internal/runtime/gc/scan/scan_test.go +++ b/src/internal/runtime/gc/scan/scan_test.go @@ -204,6 +204,13 @@ func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) { scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page]) } }) + b.Run("impl=PlatformAsm", func(b *testing.B) { + b.SetBytes(avgBytes) + for i := range b.N { + page := pageOrder[i%len(pageOrder)] + scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page]) + } + }) } }) } -- 2.52.0
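For readers reasoning about the kernels without an AVX-512 machine, here is a small standalone sketch (not part of this CL) of what they compute. It models gc.ObjMask and gc.PtrMask as plain []uint64 and uses illustrative names (expandRef, scanRef); the real kernels do the same work 512 bits at a time, the expansion via byte permutes feeding GaloisFieldAffineTransform and the scan/filter via a NotEqual mask feeding Compress.

package main

import "fmt"

// expandRef mirrors the generated expandAVX512_<f> kernels: each bit i of
// packed is expanded into output bits i*f .. i*f+f-1.
func expandRef(packed []uint64, f, outBits int) []uint64 {
	out := make([]uint64, (outBits+63)/64)
	for o := 0; o < outBits; o++ {
		i := o / f
		if packed[i/64]>>(i%64)&1 != 0 {
			out[o/64] |= 1 << (o % 64)
		}
	}
	return out
}

// scanRef mirrors scanSpanPackedAVX512 followed by FilterNilAVX512: keep the
// words of mem that are both grey (per the expanded mask) and pointer-typed,
// dropping nil pointers.
func scanRef(mem, greyWords, ptrMask []uint64) []uint64 {
	var buf []uint64
	for w, p := range mem {
		if greyWords[w/64]>>(w%64)&1 != 0 && ptrMask[w/64]>>(w%64)&1 != 0 && p != 0 {
			buf = append(buf, p)
		}
	}
	return buf
}

func main() {
	packed := []uint64{0b101}        // objects 0 and 2 are marked
	grey := expandRef(packed, 4, 12) // size class with 4-word objects
	fmt.Printf("%#x\n", grey[0])     // 0xf0f: words 0-3 and 8-11 are grey

	mem := []uint64{1, 0, 2, 3, 4, 5, 6, 7, 8, 0, 10, 11}
	ptrMask := []uint64{0xfff} // every word holds a pointer slot
	fmt.Println(scanRef(mem, grey, ptrMask)) // [1 2 3 8 10 11]
}

Here f is the object size in words (size class bytes / 8), matching the xf computed by the generators.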