From bf77323efa55a4fbe86a3e19c84d12533f5f10af Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Tue, 4 Nov 2025 20:27:04 +0000 Subject: [PATCH] [dev.simd] simd: put unexported methods to another file This CL is just a cleanup. Change-Id: I429f2d211828e17faca03a02f40e9f544b94844d Reviewed-on: https://go-review.googlesource.com/c/go/+/717820 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/simd/_gen/simdgen/gen_simdTypes.go | 27 +- src/simd/_gen/simdgen/godefs.go | 4 +- src/simd/ops_amd64.go | 502 ------------------------ src/simd/ops_internal_amd64.go | 507 +++++++++++++++++++++++++ 4 files changed, 528 insertions(+), 512 deletions(-) create mode 100644 src/simd/ops_internal_amd64.go diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index a8998ec252..7765327b32 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -12,6 +12,7 @@ import ( "slices" "sort" "strings" + "unicode" ) type simdType struct { @@ -586,10 +587,12 @@ func writeSIMDFeatures(ops []Operation) *bytes.Buffer { // writeSIMDStubs generates the simd vector intrinsic stubs and writes it to ops_amd64.go and ops_internal_amd64.go // within the specified directory. 
-func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) *bytes.Buffer { +func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer) { t := templateOf(simdStubsTmpl, "simdStubs") - buffer := new(bytes.Buffer) - buffer.WriteString(simdPackageHeader) + f = new(bytes.Buffer) + fI = new(bytes.Buffer) + f.WriteString(simdPackageHeader) + fI.WriteString(simdPackageHeader) slices.SortFunc(ops, compareOperations) @@ -610,10 +613,16 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) *bytes.Buffer { } } if i == 0 || op.Go != ops[i-1].Go { - fmt.Fprintf(buffer, "\n/* %s */\n", op.Go) + fmt.Fprintf(f, "\n/* %s */\n", op.Go) } - if err := t.ExecuteTemplate(buffer, s, op); err != nil { - panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err)) + if unicode.IsUpper([]rune(op.Go)[0]) { + if err := t.ExecuteTemplate(f, s, op); err != nil { + panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err)) + } + } else { + if err := t.ExecuteTemplate(fI, s, op); err != nil { + panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err)) + } } } else { panic(fmt.Errorf("failed to classify op %v: %w", op.Go, err)) @@ -622,17 +631,17 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) *bytes.Buffer { vectorConversions := vConvertFromTypeMap(typeMap) for _, conv := range vectorConversions { - if err := t.ExecuteTemplate(buffer, "vectorConversion", conv); err != nil { + if err := t.ExecuteTemplate(f, "vectorConversion", conv); err != nil { panic(fmt.Errorf("failed to execute vectorConversion template: %w", err)) } } masks := masksFromTypeMap(typeMap) for _, mask := range masks { - if err := t.ExecuteTemplate(buffer, "mask", mask); err != nil { + if err := t.ExecuteTemplate(f, "mask", mask); err != nil { panic(fmt.Errorf("failed to execute mask template for mask %s: %w", mask.Name, err)) } } - return buffer + return } diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go 
index 244f67fe9d..f42251c5c3 100644 --- a/src/simd/_gen/simdgen/godefs.go +++ b/src/simd/_gen/simdgen/godefs.go @@ -382,7 +382,9 @@ func writeGoDefs(path string, cl unify.Closure) error { formatWriteAndClose(writeSIMDTypes(typeMap), path, "src/"+simdPackage+"/types_amd64.go") formatWriteAndClose(writeSIMDFeatures(deduped), path, "src/"+simdPackage+"/cpu.go") - formatWriteAndClose(writeSIMDStubs(deduped, typeMap), path, "src/"+simdPackage+"/ops_amd64.go") + f, fI := writeSIMDStubs(deduped, typeMap) + formatWriteAndClose(f, path, "src/"+simdPackage+"/ops_amd64.go") + formatWriteAndClose(fI, path, "src/"+simdPackage+"/ops_internal_amd64.go") formatWriteAndClose(writeSIMDIntrinsics(deduped, typeMap), path, "src/cmd/compile/internal/ssagen/simdintrinsics.go") formatWriteAndClose(writeSIMDGenericOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdgenericOps.go") formatWriteAndClose(writeSIMDMachineOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go") diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index e0c76099ba..ace2f7aec8 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -7608,518 +7608,16 @@ func (x Uint64x8) Xor(y Uint64x8) Uint64x8 /* blend */ -// blend blends two vectors based on mask values, choosing either -// the first or the second based on whether the third is false or true -// -// Asm: VPBLENDVB, CPU Feature: AVX -func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16 - -// blend blends two vectors based on mask values, choosing either -// the first or the second based on whether the third is false or true -// -// Asm: VPBLENDVB, CPU Feature: AVX2 -func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32 - /* blendMasked */ -// blendMasked blends two vectors based on mask values, choosing either -// the first or the second based on whether the third is false or true -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VPBLENDMB, CPU Feature: AVX512 -func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64 - -// blendMasked blends two vectors based on mask values, choosing either -// the first or the second based on whether the third is false or true -// -// This operation is applied selectively under a write mask. -// -// Asm: VPBLENDMW, CPU Feature: AVX512 -func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32 - -// blendMasked blends two vectors based on mask values, choosing either -// the first or the second based on whether the third is false or true -// -// This operation is applied selectively under a write mask. -// -// Asm: VPBLENDMD, CPU Feature: AVX512 -func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 - -// blendMasked blends two vectors based on mask values, choosing either -// the first or the second based on whether the third is false or true -// -// This operation is applied selectively under a write mask. -// -// Asm: VPBLENDMQ, CPU Feature: AVX512 -func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 - /* concatSelectedConstant */ -// concatSelectedConstant concatenates selected elements from x and y into the lower and upper -// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specify which element from y or x to select. -// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns -// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX -func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4 - -// concatSelectedConstant concatenates selected elements from x and y into the lower and upper -// halves of the output. 
The selection is chosen by the constant parameter hilo -// where hi and lo are each one bit specifying which 64-bit element to select -// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) -// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, -// selecting from y, is 1, and selects 7. -// -// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX -func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2 - -// concatSelectedConstant concatenates selected elements from x and y into the lower and upper -// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specify which element from y or x to select. -// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns -// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX -func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4 - -// concatSelectedConstant concatenates selected elements from x and y into the lower and upper -// halves of the output. The selection is chosen by the constant parameter hilo -// where hi and lo are each one bit specifying which 64-bit element to select -// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) -// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, -// selecting from y, is 1, and selects 7. -// -// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VSHUFPD, CPU Feature: AVX -func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2 - -// concatSelectedConstant concatenates selected elements from x and y into the lower and upper -// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specify which element from y or x to select. -// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns -// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX -func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4 - -// concatSelectedConstant concatenates selected elements from x and y into the lower and upper -// halves of the output. The selection is chosen by the constant parameter hilo -// where hi and lo are each one bit specifying which 64-bit element to select -// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) -// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, -// selecting from y, is 1, and selects 7. -// -// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX -func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2 - /* concatSelectedConstantGrouped */ -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specifying which element from y or x to select. 
-// For example, -// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) -// returns {2,0,5,7,10,8,13,15} -// (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX -func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specifying which element from y or x to select. -// For example, -// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( -// -// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) -// -// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} -// (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX512 -func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selections are specified by the constant parameter hilos where each -// hi and lo pair select 64-bit elements from the corresponding 128-bit -// subvectors of x and y. 
-// -// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) -// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least -// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), -// then 1, selecting element 1 from x's upper 128 bits (9), then 1, -// selecting element 1 from y's upper 128 bits (11). -// This differs from the same method applied to a 32x8 vector, where -// the 8-bit constant performs the same selection on both subvectors. -// -// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX -func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selections are specified by the constant parameter hilos where each -// hi and lo pair select 64-bit elements from the corresponding 128-bit -// subvectors of x and y. -// -// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) -// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's -// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), -// then 1, selecting element 1 from x's next 128 bits (9), then 1, -// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select -// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two -// 1 bits select the upper elements from x and y's last 128 bits (17, 19). -// This differs from the same method applied to a 32x8 or 32x16 vector, where -// the 8-bit constant performs the same selection on all the subvectors. -// -// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VSHUFPD, CPU Feature: AVX512 -func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specifying which element from y or x to select. -// For example, -// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) -// returns {2,0,5,7,10,8,13,15} -// (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX -func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specifying which element from y or x to select. -// For example, -// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( -// -// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) -// -// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} -// (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX512 -func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. 
-// The selections are specified by the constant parameter hilos where each -// hi and lo pair select 64-bit elements from the corresponding 128-bit -// subvectors of x and y. -// -// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) -// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least -// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), -// then 1, selecting element 1 from x's upper 128 bits (9), then 1, -// selecting element 1 from y's upper 128 bits (11). -// This differs from the same method applied to a 32x8 vector, where -// the 8-bit constant performs the same selection on both subvectors. -// -// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX -func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selections are specified by the constant parameter hilos where each -// hi and lo pair select 64-bit elements from the corresponding 128-bit -// subvectors of x and y. -// -// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) -// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's -// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), -// then 1, selecting element 1 from x's next 128 bits (9), then 1, -// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select -// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two -// 1 bits select the upper elements from x and y's last 128 bits (17, 19). 
-// This differs from the same method applied to a 32x8 or 32x16 vector, where -// the 8-bit constant performs the same selection on all the subvectors. -// -// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX512 -func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specifying which element from y or x to select. -// For example, -// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) -// returns {2,0,5,7,10,8,13,15} -// (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPS, CPU Feature: AVX -func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selection is chosen by the constant parameter h1h0l1l0 -// where each {h,l}{1,0} is two bits specifying which element from y or x to select. -// For example, -// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( -// -// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) -// -// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} -// (don't forget that the binary constant is written big-endian). -// -// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VSHUFPS, CPU Feature: AVX512 -func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selections are specified by the constant parameter hilos where each -// hi and lo pair select 64-bit elements from the corresponding 128-bit -// subvectors of x and y. -// -// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) -// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least -// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), -// then 1, selecting element 1 from x's upper 128 bits (9), then 1, -// selecting element 1 from y's upper 128 bits (11). -// This differs from the same method applied to a 32x8 vector, where -// the 8-bit constant performs the same selection on both subvectors. -// -// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX -func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4 - -// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y -// into the lower and upper halves of corresponding subvectors of the output. -// The selections are specified by the constant parameter hilos where each -// hi and lo pair select 64-bit elements from the corresponding 128-bit -// subvectors of x and y. -// -// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) -// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's -// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), -// then 1, selecting element 1 from x's next 128 bits (9), then 1, -// selecting element 1 from y's upper 128 bits (11). 
The next two 0 bits select -// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two -// 1 bits select the upper elements from x and y's last 128 bits (17, 19). -// This differs from the same method applied to a 32x8 or 32x16 vector, where -// the 8-bit constant performs the same selection on all the subvectors. -// -// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VSHUFPD, CPU Feature: AVX512 -func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 - /* moveMasked */ -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVUPS, CPU Feature: AVX512 -func (x Float32x16) moveMasked(mask Mask32x16) Float32x16 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVUPD, CPU Feature: AVX512 -func (x Float64x8) moveMasked(mask Mask64x8) Float64x8 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU8, CPU Feature: AVX512 -func (x Int8x64) moveMasked(mask Mask8x64) Int8x64 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU16, CPU Feature: AVX512 -func (x Int16x32) moveMasked(mask Mask16x32) Int16x32 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VMOVDQU32, CPU Feature: AVX512 -func (x Int32x16) moveMasked(mask Mask32x16) Int32x16 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU64, CPU Feature: AVX512 -func (x Int64x8) moveMasked(mask Mask64x8) Int64x8 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU8, CPU Feature: AVX512 -func (x Uint8x64) moveMasked(mask Mask8x64) Uint8x64 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU16, CPU Feature: AVX512 -func (x Uint16x32) moveMasked(mask Mask16x32) Uint16x32 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU32, CPU Feature: AVX512 -func (x Uint32x16) moveMasked(mask Mask32x16) Uint32x16 - -// moveMasked blends a vector with zero, with the original value where the mask is true -// and zero where the mask is false. -// -// This operation is applied selectively under a write mask. -// -// Asm: VMOVDQU64, CPU Feature: AVX512 -func (x Uint64x8) moveMasked(mask Mask64x8) Uint64x8 - /* tern */ -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPTERNLOGD, CPU Feature: AVX512 -func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGD, CPU Feature: AVX512 -func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGD, CPU Feature: AVX512 -func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGQ, CPU Feature: AVX512 -func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGQ, CPU Feature: AVX512 -func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPTERNLOGQ, CPU Feature: AVX512 -func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGD, CPU Feature: AVX512 -func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGD, CPU Feature: AVX512 -func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGD, CPU Feature: AVX512 -func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGQ, CPU Feature: AVX512 -func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPTERNLOGQ, CPU Feature: AVX512 -func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 - -// tern performs a logical operation on three vectors based on the 8-bit truth table. -// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) -// -// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPTERNLOGQ, CPU Feature: AVX512 -func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 - // Float64x2 converts from Float32x4 to Float64x2 func (from Float32x4) AsFloat64x2() (to Float64x2) diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go new file mode 100644 index 0000000000..cb18c90e29 --- /dev/null +++ b/src/simd/ops_internal_amd64.go @@ -0,0 +1,507 @@ +// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. + +//go:build goexperiment.simd + +package simd + +// blend blends two vectors based on mask values, choosing either +// the first or the second based on whether the third is false or true +// +// Asm: VPBLENDVB, CPU Feature: AVX +func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16 + +// blend blends two vectors based on mask values, choosing either +// the first or the second based on whether the third is false or true +// +// Asm: VPBLENDVB, CPU Feature: AVX2 +func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32 + +// blendMasked blends two vectors based on mask values, choosing either +// the first or the second based on whether the third is false or true +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VPBLENDMB, CPU Feature: AVX512 +func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64 + +// blendMasked blends two vectors based on mask values, choosing either +// the first or the second based on whether the third is false or true +// +// This operation is applied selectively under a write mask. +// +// Asm: VPBLENDMW, CPU Feature: AVX512 +func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32 + +// blendMasked blends two vectors based on mask values, choosing either +// the first or the second based on whether the third is false or true +// +// This operation is applied selectively under a write mask. +// +// Asm: VPBLENDMD, CPU Feature: AVX512 +func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 + +// blendMasked blends two vectors based on mask values, choosing either +// the first or the second based on whether the third is false or true +// +// This operation is applied selectively under a write mask. +// +// Asm: VPBLENDMQ, CPU Feature: AVX512 +func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns +// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. 
The selection is chosen by the constant parameter hilo +// where hi and lo are each one bit specifying which 64-bit element to select +// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) +// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, +// selecting from y, is 1, and selects 7. +// +// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns +// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter hilo +// where hi and lo are each one bit specifying which 64-bit element to select +// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) +// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, +// selecting from y, is 1, and selects 7. +// +// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns +// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4 + +// concatSelectedConstant concatenates selected elements from x and y into the lower and upper +// halves of the output. The selection is chosen by the constant parameter hilo +// where hi and lo are each one bit specifying which 64-bit element to select +// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) +// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, +// selecting from y, is 1, and selects 7. +// +// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( +// +// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) +// +// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX512 +func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. 
+// +// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) +// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's +// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's next 128 bits (9), then 1, +// selecting element 1 from y's next 128 bits (11). The next two 0 bits select +// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two +// 1 bits select the upper elements from x and y's last 128 bits (17, 19). +// This differs from the same method applied to a 32x8 or 32x16 vector, where +// the 8-bit constant performs the same selection on all the subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( +// +// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) +// +// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX512 +func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. 
+// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) +// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's +// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's next 128 bits (9), then 1, +// selecting element 1 from y's next 128 bits (11). The next two 0 bits select +// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two +// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where +// the 8-bit constant performs the same selection on all the subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) +// returns {2,0,5,7,10,8,13,15} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPS, CPU Feature: AVX +func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selection is chosen by the constant parameter h1h0l1l0 +// where each {h,l}{1,0} is two bits specifying which element from y or x to select. +// For example, +// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( +// +// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) +// +// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} +// (don't forget that the binary constant is written big-endian). +// +// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VSHUFPS, CPU Feature: AVX512 +func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) +// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least +// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's upper 128 bits (9), then 1, +// selecting element 1 from y's upper 128 bits (11). +// This differs from the same method applied to a 32x8 vector, where +// the 8-bit constant performs the same selection on both subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX +func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4 + +// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y +// into the lower and upper halves of corresponding subvectors of the output. +// The selections are specified by the constant parameter hilos where each +// hi and lo pair select 64-bit elements from the corresponding 128-bit +// subvectors of x and y. +// +// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) +// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's +// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), +// then 1, selecting element 1 from x's next 128 bits (9), then 1, +// selecting element 1 from y's next 128 bits (11).
The next two 0 bits select +// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two +// 1 bits select the upper elements from x and y's last 128 bits (17, 19). +// This differs from the same method applied to a 32x8 or 32x16 vector, where +// the 8-bit constant performs the same selection on all the subvectors. +// +// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VSHUFPD, CPU Feature: AVX512 +func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVUPS, CPU Feature: AVX512 +func (x Float32x16) moveMasked(mask Mask32x16) Float32x16 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVUPD, CPU Feature: AVX512 +func (x Float64x8) moveMasked(mask Mask64x8) Float64x8 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU8, CPU Feature: AVX512 +func (x Int8x64) moveMasked(mask Mask8x64) Int8x64 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU16, CPU Feature: AVX512 +func (x Int16x32) moveMasked(mask Mask16x32) Int16x32 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VMOVDQU32, CPU Feature: AVX512 +func (x Int32x16) moveMasked(mask Mask32x16) Int32x16 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU64, CPU Feature: AVX512 +func (x Int64x8) moveMasked(mask Mask64x8) Int64x8 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU8, CPU Feature: AVX512 +func (x Uint8x64) moveMasked(mask Mask8x64) Uint8x64 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU16, CPU Feature: AVX512 +func (x Uint16x32) moveMasked(mask Mask16x32) Uint16x32 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU32, CPU Feature: AVX512 +func (x Uint32x16) moveMasked(mask Mask32x16) Uint32x16 + +// moveMasked blends a vector with zero, with the original value where the mask is true +// and zero where the mask is false. +// +// This operation is applied selectively under a write mask. +// +// Asm: VMOVDQU64, CPU Feature: AVX512 +func (x Uint64x8) moveMasked(mask Mask64x8) Uint64x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 -- 2.52.0