[dev.simd] simd: put unexported methods to another file

author Junyang Shao <shaojunyang@google.com>

Tue, 4 Nov 2025 20:27:04 +0000 (20:27 +0000)

committer Junyang Shao <shaojunyang@google.com>

Mon, 10 Nov 2025 17:53:16 +0000 (09:53 -0800)
author Junyang Shao <shaojunyang@google.com>
Tue, 4 Nov 2025 20:27:04 +0000 (20:27 +0000)
committer Junyang Shao <shaojunyang@google.com>
Mon, 10 Nov 2025 17:53:16 +0000 (09:53 -0800)
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go

index a8998ec25295dbd229d05b8866a0fead7888fd01..7765327b32e586d2ec2297415e8bcde00d1de751 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -12,6 +12,7 @@ import (
         "slices"
         "sort"
         "strings"
+       "unicode"
  )
  
  type simdType struct {
@@ -586,10 +587,12 @@ func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
  
  // writeSIMDStubs generates the simd vector intrinsic stubs and returns them in two buffers, which the caller
  // writes out: f (exported methods, for ops_amd64.go) and fI (unexported methods, for ops_internal_amd64.go).
-func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
+func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer) {
         t := templateOf(simdStubsTmpl, "simdStubs")
-       buffer := new(bytes.Buffer)
-       buffer.WriteString(simdPackageHeader)
+       f = new(bytes.Buffer)
+       fI = new(bytes.Buffer)
+       f.WriteString(simdPackageHeader)
+       fI.WriteString(simdPackageHeader)
  
         slices.SortFunc(ops, compareOperations)
  
@@ -610,10 +613,16 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
                                 }
                         }
                         if i == 0 || op.Go != ops[i-1].Go {
-                               fmt.Fprintf(buffer, "\n/* %s */\n", op.Go)
+                               fmt.Fprintf(f, "\n/* %s */\n", op.Go)
                         }
-                       if err := t.ExecuteTemplate(buffer, s, op); err != nil {
-                               panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
+                       if unicode.IsUpper([]rune(op.Go)[0]) {
+                               if err := t.ExecuteTemplate(f, s, op); err != nil {
+                                       panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
+                               }
+                       } else {
+                               if err := t.ExecuteTemplate(fI, s, op); err != nil {
+                                       panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
+                               }
                         }
                 } else {
                         panic(fmt.Errorf("failed to classify op %v: %w", op.Go, err))
@@ -622,17 +631,17 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
  
         vectorConversions := vConvertFromTypeMap(typeMap)
         for _, conv := range vectorConversions {
-               if err := t.ExecuteTemplate(buffer, "vectorConversion", conv); err != nil {
+               if err := t.ExecuteTemplate(f, "vectorConversion", conv); err != nil {
                         panic(fmt.Errorf("failed to execute vectorConversion template: %w", err))
                 }
         }
  
         masks := masksFromTypeMap(typeMap)
         for _, mask := range masks {
-               if err := t.ExecuteTemplate(buffer, "mask", mask); err != nil {
+               if err := t.ExecuteTemplate(f, "mask", mask); err != nil {
                         panic(fmt.Errorf("failed to execute mask template for mask %s: %w", mask.Name, err))
                 }
         }
  
-       return buffer
+       return
  }
diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go

index 244f67fe9d91164a2127e18a9aaff08b8bdfd864..f42251c5c31bdaa5bdf5372d892ce8374ad7fd09 100644 (file)
--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@@ -382,7 +382,9 @@ func writeGoDefs(path string, cl unify.Closure) error {
  
         formatWriteAndClose(writeSIMDTypes(typeMap), path, "src/"+simdPackage+"/types_amd64.go")
         formatWriteAndClose(writeSIMDFeatures(deduped), path, "src/"+simdPackage+"/cpu.go")
-       formatWriteAndClose(writeSIMDStubs(deduped, typeMap), path, "src/"+simdPackage+"/ops_amd64.go")
+       f, fI := writeSIMDStubs(deduped, typeMap)
+       formatWriteAndClose(f, path, "src/"+simdPackage+"/ops_amd64.go")
+       formatWriteAndClose(fI, path, "src/"+simdPackage+"/ops_internal_amd64.go")
         formatWriteAndClose(writeSIMDIntrinsics(deduped, typeMap), path, "src/cmd/compile/internal/ssagen/simdintrinsics.go")
         formatWriteAndClose(writeSIMDGenericOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdgenericOps.go")
         formatWriteAndClose(writeSIMDMachineOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go")
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index e0c76099ba4efa25bbdf0d455ef1487ef976c672..ace2f7aec8340fd0688166ceb9fca8dc8096bb40 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -7608,518 +7608,16 @@ func (x Uint64x8) Xor(y Uint64x8) Uint64x8
  
  /* blend */
  
-// blend blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// Asm: VPBLENDVB, CPU Feature: AVX
-func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16
-
-// blend blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// Asm: VPBLENDVB, CPU Feature: AVX2
-func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32
-
  /* blendMasked */
  
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMB, CPU Feature: AVX512
-func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMW, CPU Feature: AVX512
-func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMD, CPU Feature: AVX512
-func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMQ, CPU Feature: AVX512
-func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
-
  /* concatSelectedConstant */
  
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specify which element from y or x to select.
-// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
-// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output.  The selection is chosen by the constant parameter hilo
-// where hi and lo are each one bit specifying which 64-bit element to select
-// from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
-// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
-// selecting from y, is 1, and selects 7.
-//
-// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specify which element from y or x to select.
-// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
-// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output.  The selection is chosen by the constant parameter hilo
-// where hi and lo are each one bit specifying which 64-bit element to select
-// from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
-// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
-// selecting from y, is 1, and selects 7.
-//
-// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specify which element from y or x to select.
-// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
-// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output.  The selection is chosen by the constant parameter hilo
-// where hi and lo are each one bit specifying which 64-bit element to select
-// from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
-// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
-// selecting from y, is 1, and selects 7.
-//
-// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2
-
  /* concatSelectedConstantGrouped */
  
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
-// returns {2,0,5,7,10,8,13,15}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
-//
-//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
-//
-// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
-// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
-// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).
-// This differs from the same method applied to a 32x8 vector, where
-// the 8-bit constant performs the same selection on both subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
-// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's next 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).  The next two 0 bits select
-// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
-// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
-// This differs from the same method applied to a 32x8 or 32x16 vector, where
-// the 8-bit constant performs the same selection on all the subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
-// returns {2,0,5,7,10,8,13,15}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
-//
-//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
-//
-// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
-// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
-// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).
-// This differs from the same method applied to a 32x8 vector, where
-// the 8-bit constant performs the same selection on both subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
-// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's next 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).  The next two 0 bits select
-// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
-// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
-// This differs from the same method applied to a 32x8 or 32x16 vector, where
-// the 8-bit constant performs the same selection on all the subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
-// returns {2,0,5,7,10,8,13,15}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
-//
-//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
-//
-// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
-// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
-// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).
-// This differs from the same method applied to a 32x8 vector, where
-// the 8-bit constant performs the same selection on both subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
-// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's next 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).  The next two 0 bits select
-// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
-// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
-// This differs from the same method applied to a 32x8 or 32x16 vector, where
-// the 8-bit constant performs the same selection on all the subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
-
  /* moveMasked */
  
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVUPS, CPU Feature: AVX512
-func (x Float32x16) moveMasked(mask Mask32x16) Float32x16
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVUPD, CPU Feature: AVX512
-func (x Float64x8) moveMasked(mask Mask64x8) Float64x8
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU8, CPU Feature: AVX512
-func (x Int8x64) moveMasked(mask Mask8x64) Int8x64
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU16, CPU Feature: AVX512
-func (x Int16x32) moveMasked(mask Mask16x32) Int16x32
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU32, CPU Feature: AVX512
-func (x Int32x16) moveMasked(mask Mask32x16) Int32x16
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU64, CPU Feature: AVX512
-func (x Int64x8) moveMasked(mask Mask64x8) Int64x8
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU8, CPU Feature: AVX512
-func (x Uint8x64) moveMasked(mask Mask8x64) Uint8x64
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU16, CPU Feature: AVX512
-func (x Uint16x32) moveMasked(mask Mask16x32) Uint16x32
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU32, CPU Feature: AVX512
-func (x Uint32x16) moveMasked(mask Mask32x16) Uint32x16
-
-// moveMasked blends a vector with zero, with the original value where the mask is true
-// and zero where the mask is false.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VMOVDQU64, CPU Feature: AVX512
-func (x Uint64x8) moveMasked(mask Mask64x8) Uint64x8
-
  /* tern */
  
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8
-
  // Float64x2 converts from Float32x4 to Float64x2
  func (from Float32x4) AsFloat64x2() (to Float64x2)
  
diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go

new file mode 100644 (file)

index 0000000..cb18c90
--- /dev/null
+++ b/src/simd/ops_internal_amd64.go
@@ -0,0 +1,507 @@
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package simd
+
+// blend blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// Asm: VPBLENDVB, CPU Feature: AVX
+func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16
+
+// blend blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// Asm: VPBLENDVB, CPU Feature: AVX2
+func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMB, CPU Feature: AVX512
+func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMW, CPU Feature: AVX512
+func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMD, CPU Feature: AVX512
+func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMQ, CPU Feature: AVX512
+func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output.  The selection is chosen by the constant parameter hilo
+// where hi and lo are each one bit specifying which 64-bit element to select
+// from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
+// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
+// selecting from y, is 1, and selects 7.
+//
+// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output.  The selection is chosen by the constant parameter hilo
+// where hi and lo are each one bit specifying which 64-bit element to select
+// from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
+// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
+// selecting from y, is 1, and selects 7.
+//
+// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output.  The selection is chosen by the constant parameter hilo
+// where hi and lo are each one bit specifying which 64-bit element to select
+// from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
+// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
+// selecting from y, is 1, and selects 7.
+//
+// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+// returns {2,0,5,7,10,8,13,15}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+//
+//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//
+// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
+// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11).
+// This differs from the same method applied to a 32x8 vector, where
+// the 8-bit constant performs the same selection on both subvectors.
+//
+// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
+// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's next 128 bits (9), then 1,
+// selecting element 1 from y's next 128 bits (11).  The next two 0 bits select
+// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
+// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where
+// the 8-bit constant performs the same selection on all the subvectors.
+//
+// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+// returns {2,0,5,7,10,8,13,15}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+//
+//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//
+// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
+// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11).
+// This differs from the same method applied to a 32x8 vector, where
+// the 8-bit constant performs the same selection on both subvectors.
+//
+// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
+// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's next 128 bits (9), then 1,
+// selecting element 1 from y's next 128 bits (11).  The next two 0 bits select
+// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
+// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where
+// the 8-bit constant performs the same selection on all the subvectors.
+//
+// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+// returns {2,0,5,7,10,8,13,15}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+//
+//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//
+// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
+// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11).
+// This differs from the same method applied to a 32x8 vector, where
+// the 8-bit constant performs the same selection on both subvectors.
+//
+// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
+// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's next 128 bits (9), then 1,
+// selecting element 1 from y's next 128 bits (11).  The next two 0 bits select
+// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
+// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where
+// the 8-bit constant performs the same selection on all the subvectors.
+//
+// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVUPS, CPU Feature: AVX512
+func (x Float32x16) moveMasked(mask Mask32x16) Float32x16
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVUPD, CPU Feature: AVX512
+func (x Float64x8) moveMasked(mask Mask64x8) Float64x8
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+func (x Int8x64) moveMasked(mask Mask8x64) Int8x64
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+func (x Int16x32) moveMasked(mask Mask16x32) Int16x32
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+func (x Int32x16) moveMasked(mask Mask32x16) Int32x16
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+func (x Int64x8) moveMasked(mask Mask64x8) Int64x8
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+func (x Uint8x64) moveMasked(mask Mask8x64) Uint8x64
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+func (x Uint16x32) moveMasked(mask Mask16x32) Uint16x32
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+func (x Uint32x16) moveMasked(mask Mask32x16) Uint32x16
+
+// moveMasked blends a vector with zero, with the original value where the mask is true
+// and zero where the mask is false.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+func (x Uint64x8) moveMasked(mask Mask64x8) Uint64x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8
author	Junyang Shao <shaojunyang@google.com>
	Tue, 4 Nov 2025 20:27:04 +0000 (20:27 +0000)
committer	Junyang Shao <shaojunyang@google.com>
	Mon, 10 Nov 2025 17:53:16 +0000 (09:53 -0800)
src/simd/_gen/simdgen/gen_simdTypes.go		patch \| blob \| history
src/simd/_gen/simdgen/godefs.go		patch \| blob \| history
src/simd/ops_amd64.go		patch \| blob \| history
src/simd/ops_internal_amd64.go	[new file with mode: 0644]	patch \| blob