From ea3b2ecd2878a694f9f42011eccb1312feb82bca Mon Sep 17 00:00:00 2001 From: David Chase Date: Sat, 20 Sep 2025 16:52:07 -0400 Subject: [PATCH] [dev.simd] cmd/compile, simd: add 64-bit select-from-pair methods these are in the same style as the 32-bit select-from-pair, including the grouped variant. This does not quite capture the full awesome power of VSHUFPD where it can select differently in each group; that will be some other method, that is more complex. Change-Id: I807ddd7c1256103b5b0d7c5d60bd70b185e3aaf0 Reviewed-on: https://go-review.googlesource.com/c/go/+/705695 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao --- src/cmd/compile/internal/ssagen/intrinsics.go | 140 ++- src/simd/internal/simd_test/simd_test.go | 120 ++- src/simd/pkginternal_test.go | 112 +-- src/simd/shuffles_amd64.go | 798 ++++++++++++------ 4 files changed, 819 insertions(+), 351 deletions(-) diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 4c5cd9ef2c..6561cbe9a2 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1632,12 +1632,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) - sfp := func(method string, hwop ssa.Op, vectype *types.Type) { + sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) { addF("simd", method, func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5] if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 { - return selectFromPair(x, a, b, c, d, y, s, hwop, vectype) + return select4FromPair(x, a, b, c, d, y, s, hwop, vectype) } else { return s.callResult(n, callNormal) } @@ -1645,25 +1645,64 @@ func 
initIntrinsics(cfg *intrinsicBuildConfig) { sys.AMD64) } - sfp("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128) - sfp("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128) - sfp("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128) + sfp4("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128) + sfp4("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128) + sfp4("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128) - sfp("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256) - sfp("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256) - sfp("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256) + sfp4("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256) + sfp4("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256) + sfp4("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256) - sfp("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512) - sfp("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512) - sfp("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512) + sfp4("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512) + sfp4("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512) + sfp4("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512) + + sfp2 := func(method string, hwop ssa.Op, vectype *types.Type, cscimm func(i, j uint8) int64) { + addF("simd", method, + func(s *state, n 
*ir.CallExpr, args []*ssa.Value) *ssa.Value { + x, a, b, y := args[0], args[1], args[2], args[3] + if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 { + return select2FromPair(x, a, b, y, s, hwop, vectype, cscimm) + } else { + return s.callResult(n, callNormal) + } + }, + sys.AMD64) + } + + sfp2("Uint64x2.SelectFromPair", ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, cscimm2) + sfp2("Int64x2.SelectFromPair", ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, cscimm2) + sfp2("Float64x2.SelectFromPair", ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, cscimm2) + + sfp2("Uint64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, cscimm2g2) + sfp2("Int64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, cscimm2g2) + sfp2("Float64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, cscimm2g2) + + sfp2("Uint64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, cscimm2g4) + sfp2("Int64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, cscimm2g4) + sfp2("Float64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, cscimm2g4) } } -func cscimm(a, b, c, d uint8) int64 { +func cscimm4(a, b, c, d uint8) int64 { return se(a + b<<2 + c<<4 + d<<6) } +func cscimm2(a, b uint8) int64 { + return se(a + b<<1) +} + +func cscimm2g2(a, b uint8) int64 { + g := cscimm2(a, b) + return int64(int8(g + g<<2)) +} + +func cscimm2g4(a, b uint8) int64 { + g := cscimm2g2(a, b) + return int64(int8(g + g<<4)) +} + const ( _LLLL = iota _HLLL @@ -1683,7 +1722,32 @@ const ( _HHHH ) -func selectFromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value { +const ( + _LL = iota + _HL + _LH + _HH +) + +func select2FromPair(x, _a, _b, y *ssa.Value, s *state, op ssa.Op, t *types.Type, csc func(a, b uint8) int64) *ssa.Value { + a, b := 
uint8(_a.AuxInt8()), uint8(_b.AuxInt8()) + pattern := (a&2)>>1 + (b & 2) + a, b = a&1, b&1 + + switch pattern { + case _LL: + return s.newValue2I(op, t, csc(a, b), x, x) + case _HH: + return s.newValue2I(op, t, csc(a, b), y, y) + case _LH: + return s.newValue2I(op, t, csc(a, b), x, y) + case _HL: + return s.newValue2I(op, t, csc(a, b), y, x) + } + panic("The preceding switch should have been exhaustive") +} + +func select4FromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value { a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8()) pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 @@ -1692,54 +1756,54 @@ func selectFromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *typ switch pattern { case _LLLL: // TODO DETECT 0,1,2,3, 0,0,0,0 - return s.newValue2I(op, t, cscimm(a, b, c, d), x, x) + return s.newValue2I(op, t, cscimm4(a, b, c, d), x, x) case _HHHH: // TODO DETECT 0,1,2,3, 0,0,0,0 - return s.newValue2I(op, t, cscimm(a, b, c, d), y, y) + return s.newValue2I(op, t, cscimm4(a, b, c, d), y, y) case _LLHH: - return s.newValue2I(op, t, cscimm(a, b, c, d), x, y) + return s.newValue2I(op, t, cscimm4(a, b, c, d), x, y) case _HHLL: - return s.newValue2I(op, t, cscimm(a, b, c, d), y, x) + return s.newValue2I(op, t, cscimm4(a, b, c, d), y, x) case _HLLL: - z := s.newValue2I(op, t, cscimm(a, a, b, b), y, x) - return s.newValue2I(op, t, cscimm(0, 2, c, d), z, x) + z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x) + return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x) case _LHLL: - z := s.newValue2I(op, t, cscimm(a, a, b, b), x, y) - return s.newValue2I(op, t, cscimm(0, 2, c, d), z, x) + z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y) + return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x) case _HLHH: - z := s.newValue2I(op, t, cscimm(a, a, b, b), y, x) - return s.newValue2I(op, t, cscimm(0, 2, c, d), z, y) + z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x) + return s.newValue2I(op, 
t, cscimm4(0, 2, c, d), z, y) case _LHHH: - z := s.newValue2I(op, t, cscimm(a, a, b, b), x, y) - return s.newValue2I(op, t, cscimm(0, 2, c, d), z, y) + z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y) + return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y) case _LLLH: - z := s.newValue2I(op, t, cscimm(c, c, d, d), x, y) - return s.newValue2I(op, t, cscimm(a, b, 0, 2), x, z) + z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y) + return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z) case _LLHL: - z := s.newValue2I(op, t, cscimm(c, c, d, d), y, x) - return s.newValue2I(op, t, cscimm(a, b, 0, 2), x, z) + z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x) + return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z) case _HHLH: - z := s.newValue2I(op, t, cscimm(c, c, d, d), x, y) - return s.newValue2I(op, t, cscimm(a, b, 0, 2), y, z) + z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y) + return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z) case _HHHL: - z := s.newValue2I(op, t, cscimm(c, c, d, d), y, x) - return s.newValue2I(op, t, cscimm(a, b, 0, 2), y, z) + z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x) + return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z) case _LHLH: - z := s.newValue2I(op, t, cscimm(a, c, b, d), x, y) + z := s.newValue2I(op, t, cscimm4(a, c, b, d), x, y) return s.newValue2I(op, t, se(0b11_01_10_00), z, z) case _HLHL: - z := s.newValue2I(op, t, cscimm(b, d, a, c), x, y) + z := s.newValue2I(op, t, cscimm4(b, d, a, c), x, y) return s.newValue2I(op, t, se(0b01_11_00_10), z, z) case _HLLH: - z := s.newValue2I(op, t, cscimm(b, c, a, d), x, y) + z := s.newValue2I(op, t, cscimm4(b, c, a, d), x, y) return s.newValue2I(op, t, se(0b11_01_00_10), z, z) case _LHHL: - z := s.newValue2I(op, t, cscimm(a, d, b, c), x, y) + z := s.newValue2I(op, t, cscimm4(a, d, b, c), x, y) return s.newValue2I(op, t, se(0b01_11_10_00), z, z) } panic("The preceding switch should have been exhaustive") @@ -1906,7 +1970,7 @@ func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) 
func(s *state, n *ir.CallExp return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 { i1, i2 := args[1].AuxInt, args[2].AuxInt - return s.newValue2I(op, t, i1+i2<<4, args[0], args[3]) + return s.newValue2I(op, t, int64(int8(i1+i2<<4)), args[0], args[3]) } four := s.constInt64(types.Types[types.TUINT8], 4) shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four) diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index e38f7eea01..d00fcf5dd3 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -595,7 +595,7 @@ func TestIsZero(t *testing.T) { } } -func TestSelectFromPairConst(t *testing.T) { +func TestSelect4FromPairConst(t *testing.T) { x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) @@ -652,7 +652,7 @@ func selectFromPairInt32x4(x simd.Int32x4, a, b, c, d uint8, y simd.Int32x4) sim return x.SelectFromPair(a, b, c, d, y) } -func TestSelectFromPairVar(t *testing.T) { +func TestSelect4FromPairVar(t *testing.T) { x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) @@ -704,7 +704,7 @@ func TestSelectFromPairVar(t *testing.T) { foo(hllh, 4, 0, 1, 5) } -func TestSelectFromPairConstGroupedFloat32x8(t *testing.T) { +func TestSelect4FromPairConstGrouped(t *testing.T) { x := simd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13}) y := simd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17}) @@ -887,5 +887,119 @@ func TestSelect128FromPairVar(t *testing.T) { foo(cd, 2, 3) foo(da, 3, 0) foo(dc, 3, 2) +} + +func TestSelect2FromPairConst(t *testing.T) { + x := simd.LoadUint64x2Slice([]uint64{0, 1}) + y := simd.LoadUint64x2Slice([]uint64{2, 3}) + + ll := x.SelectFromPair(0, 1, y) + hh := x.SelectFromPair(3, 2, y) + lh := 
x.SelectFromPair(0, 3, y) + hl := x.SelectFromPair(2, 1, y) + + r := make([]uint64, 2, 2) + + foo := func(v simd.Uint64x2, a, b uint64) { + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, b}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedUint(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 10, 11}) + y := simd.LoadUint64x4Slice([]uint64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedFloat(t *testing.T) { + x := simd.LoadFloat64x4Slice([]float64{0, 1, 10, 11}) + y := simd.LoadFloat64x4Slice([]float64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]float64, 4, 4) + + foo := func(v simd.Float64x4, a, b float64) { + v.StoreSlice(r) + checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func TestSelect2FromPairConstGroupedInt(t *testing.T) { + x := simd.LoadInt64x4Slice([]int64{0, 1, 10, 11}) + y := simd.LoadInt64x4Slice([]int64{2, 3, 12, 13}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]int64, 4, 4) + + foo := func(v simd.Int64x4, a, b int64) { + v.StoreSlice(r) + checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10}) + } + + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) +} + +func 
TestSelect2FromPairConstGroupedInt512(t *testing.T) { + if !simd.HasAVX512() { + t.Skip("Test requires HasAVX512, not available on this hardware") + return + } + + x := simd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31}) + y := simd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33}) + + ll := x.SelectFromPairGrouped(0, 1, y) + hh := x.SelectFromPairGrouped(3, 2, y) + lh := x.SelectFromPairGrouped(0, 3, y) + hl := x.SelectFromPairGrouped(2, 1, y) + + r := make([]int64, 8, 8) + + foo := func(v simd.Int64x8, a, b int64) { + v.StoreSlice(r) + checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30}) + } + foo(ll, 0, 1) + foo(hh, 3, 2) + foo(lh, 0, 3) + foo(hl, 2, 1) } diff --git a/src/simd/pkginternal_test.go b/src/simd/pkginternal_test.go index 557a0537b4..632e24d9d9 100644 --- a/src/simd/pkginternal_test.go +++ b/src/simd/pkginternal_test.go @@ -99,53 +99,53 @@ func select2x4x32(x Int32x4, a, b, c, d uint8, y Int32x4) Int32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - 
z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := 
x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -180,53 +180,53 @@ func select2x8x32Grouped(x Int32x8, a, b, c, d uint8, y Int32x8) Int32x8 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, 
b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* 
=cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go index 68c840730b..c46a2d06fe 100644 --- a/src/simd/shuffles_amd64.go +++ b/src/simd/shuffles_amd64.go @@ -44,6 +44,16 @@ const ( _HHHH // a:y, b:y, c:y, d:y ) +// These constants represent the source pattern for the two parameters +// (a, b) passed to SelectFromPair and SelectFromPairGrouped for +// two-element vectors. 
+const ( + _LL = iota + _HL + _LH + _HH +) + // SelectFromPair returns the selection of four elements from the two // vectors x and y, where selector values in the range 0-3 specify // elements from x and values in the range 4-7 specify the 0-3 elements @@ -72,53 +82,53 @@ func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), 
y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -144,53 +154,53 @@ func (x Uint32x4) 
SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 
0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -216,53 +226,53 @@ func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 { switch pattern { case _LLLL: - return x.concatSelectedConstant(cscimm(a, b, c, d), x) + return x.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstant(cscimm(a, b, c, d), y) + return 
y.concatSelectedConstant(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstant(cscimm(a, b, c, d), y) + return x.concatSelectedConstant(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstant(cscimm(a, b, c, d), x) + return y.concatSelectedConstant(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstant(cscimm(a, a, b, b), x) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstant(cscimm4(a, a, b, b), x) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstant(cscimm(a, a, b, b), y) - return z.concatSelectedConstant(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstant(cscimm4(a, a, b, b), y) + return z.concatSelectedConstant(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstant(cscimm(c, c, d, d), x) - return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstant(cscimm(c, c, d, d), y) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstant(cscimm4(c, c, d, d), y) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _HHHL: - z := 
y.concatSelectedConstant(cscimm(c, c, d, d), x) - return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstant(cscimm4(c, c, d, d), x) + return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstant(cscimm(a, c, b, d), y) - return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(a, c, b, d), y) + return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstant(cscimm(b, d, a, c), y) - return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(b, d, a, c), y) + return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstant(cscimm(b, c, a, d), y) - return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstant(cscimm4(b, c, a, d), y) + return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstant(cscimm(a, d, b, c), y) - return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstant(cscimm4(a, d, b, c), y) + return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -291,53 +301,53 @@ func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, 
c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, 
c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -366,53 +376,53 @@ func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return 
x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + 
return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -441,53 +451,53 @@ func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return 
y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, 
d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -511,53 +521,53 @@ func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 { switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return 
x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, 
d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } @@ -581,53 +591,53 @@ func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x1 
switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return 
x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) + return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") 
} @@ -651,59 +661,339 @@ func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float3 switch pattern { case _LLLL: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HHHH: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _LLHH: - return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) + return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y) case _HHLL: - return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x) case _HLLL: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _LHLL: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x) case _HLHH: - z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LHHH: - z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) - return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) + z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y) + return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y) case _LLLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return 
x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LLHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHLH: - z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _HHHL: - z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) - return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) + z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x) + return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z) case _LHLH: - z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) - return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y) + return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z) case _HLHL: - z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) - return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y) + return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z) case _HLLH: - z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) - return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y) + return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z) case _LHHL: - z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) - return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) + z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y) 
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z) } panic("missing case, switch should be exhaustive") } -// cscimm converts the 4 vector element indices into a single +// cscimm4 converts the 4 vector element indices into a single // uint8 for use as an immediate. -func cscimm(a, b, c, d uint8) uint8 { +func cscimm4(a, b, c, d uint8) uint8 { return uint8(a + b<<2 + c<<4 + d<<6) } + +// cscimm2 converts the 2 vector element indices into a single +// uint8 for use as an immediate. +func cscimm2(a, b uint8) uint8 { + return uint8(a + b<<1) +} + +// cscimm2g2 converts the 2 vector element indices into a single +// uint8 for use as an immediate, but duplicated for VSHUFPD +// to emulate grouped behavior of VSHUFPS +func cscimm2g2(a, b uint8) uint8 { + g := cscimm2(a, b) + return g + g<<2 +} + +// cscimm2g4 converts the 2 vector element indices into a single +// uint8 for use as an immediate, but with four copies for VSHUFPD +// to emulate grouped behavior of VSHUFPS +func cscimm2g4(a, b uint8) uint8 { + g := cscimm2g2(a, b) + return g + g<<4 +} + +// SelectFromPair returns the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. 
+// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstant(cscimm2(a, b), x) + case _HH: + return y.concatSelectedConstant(cscimm2(a, b), y) + case _LH: + return x.concatSelectedConstant(cscimm2(a, b), y) + case _HL: + return y.concatSelectedConstant(cscimm2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the two 128-bit halves of +// the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 { + pattern := (a&2)>>1 + (b & 2) + + a, b = a&1, b&1 + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + } + panic("missing case, switch should be exhaustive") +} + +// SelectFromPairGrouped returns, for each of the four 128-bit subvectors +// of the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant this will translate to a function +// call. 
+// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} + +// SelectFromPair returns the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants, the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant, this will translate to a function +// call. The body below examines only bits 0 and 1 of a and b; higher bits are masked off. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstant(cscimm2(a, b), x) + case _HH: + return y.concatSelectedConstant(cscimm2(a, b), y) + case _LH: + return x.concatSelectedConstant(cscimm2(a, b), y) + case _HL: + return y.concatSelectedConstant(cscimm2(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} + +// SelectFromPairGrouped returns, for each of the two 128-bit halves of +// the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants, the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant, this will translate to a function +// call. The body below examines only bits 0 and 1 of a and b; higher bits are masked off.
+// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} + +// SelectFromPairGrouped returns, for each of the four 128-bit subvectors +// of the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants, the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant, this will translate to a function +// call. The body below examines only bits 0 and 1 of a and b; higher bits are masked off. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} + +// SelectFromPair returns the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants, the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant, this will translate to a function +// call. The body below examines only bits 0 and 1 of a and b; higher bits are masked off.
+// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstant(cscimm2(a, b), x) + case _HH: + return y.concatSelectedConstant(cscimm2(a, b), y) + case _LH: + return x.concatSelectedConstant(cscimm2(a, b), y) + case _HL: + return y.concatSelectedConstant(cscimm2(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} + +// SelectFromPairGrouped returns, for each of the two 128-bit halves of +// the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants, the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant, this will translate to a function +// call. The body below examines only bits 0 and 1 of a and b; higher bits are masked off. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} + +// SelectFromPairGrouped returns, for each of the four 128-bit subvectors +// of the vectors x and y, the selection of two elements from the two +// vectors x and y, where selector values in the range 0-1 specify +// elements from x and values in the range 2-3 specify the 0-1 elements +// of y. When the selectors are constants, the selection can be +// implemented in a single instruction. +// +// If the selectors are not constant, this will translate to a function +// call. The body below examines only bits 0 and 1 of a and b; higher bits are masked off.
+// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 { + pattern := (a&2)>>1 + (b & 2) // pattern bit 0 is bit 1 of a and pattern bit 1 is bit 1 of b, the bit that separates selectors 0-1 from 2-3 + + a, b = a&1, b&1 // keep only bit 0 of each selector, the 0-1 element index + + switch pattern { + case _LL: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + case _HH: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _LH: + return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y) + case _HL: + return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x) + } + panic("missing case, switch should be exhaustive") // pattern is always in 0-3; reached only if the four cases do not cover that range +} -- 2.52.0