[dev.simd] simd, cmd/compile: add more element types for Select128FromPair

author David Chase <drchase@google.com>

Wed, 19 Nov 2025 22:17:54 +0000 (17:17 -0500)

committer David Chase <drchase@google.com>

Fri, 21 Nov 2025 01:48:29 +0000 (17:48 -0800)
author David Chase <drchase@google.com>
Wed, 19 Nov 2025 22:17:54 +0000 (17:17 -0500)
committer David Chase <drchase@google.com>
Fri, 21 Nov 2025 01:48:29 +0000 (17:48 -0800)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index 283a2e53cd9ff10a2952e67a8072db7288427e07..db426f6615f68936659795f739ddbbfe1e71cfd7 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -941,8 +941,12 @@
  (ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
  (Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...)
  (Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...)
+(Select128FromPairInt8x32 ...) => (VPERM2I128256 ...)
+(Select128FromPairInt16x16 ...) => (VPERM2I128256 ...)
  (Select128FromPairInt32x8 ...) => (VPERM2I128256 ...)
  (Select128FromPairInt64x4 ...) => (VPERM2I128256 ...)
+(Select128FromPairUint8x32 ...) => (VPERM2I128256 ...)
+(Select128FromPairUint16x16 ...) => (VPERM2I128256 ...)
  (Select128FromPairUint32x8 ...) => (VPERM2I128256 ...)
  (Select128FromPairUint64x4 ...) => (VPERM2I128256 ...)
  (SetElemFloat32x4 ...) => (VPINSRD128 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index 3fae158c0ae911bb4bf657a89bb0cea1dc786555..5683fcef0df63518e1699f57eecda4ea519902e4 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1192,8 +1192,12 @@ func simdGenericOps() []opData {
                 {name: "SHA1FourRoundsUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairInt8x32", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairInt16x16", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "Select128FromPairInt64x4", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairUint8x32", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairUint16x16", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "Select128FromPairUint32x8", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "Select128FromPairUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index fa94dfbbd59b8eb9fba7695174551b20bfede610..bb40ff411786e1721bca0a3b99b5d2bb5523d0c5 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -7151,8 +7151,12 @@ const (
         OpSHA1FourRoundsUint32x4
         OpSelect128FromPairFloat32x8
         OpSelect128FromPairFloat64x4
+       OpSelect128FromPairInt8x32
+       OpSelect128FromPairInt16x16
         OpSelect128FromPairInt32x8
         OpSelect128FromPairInt64x4
+       OpSelect128FromPairUint8x32
+       OpSelect128FromPairUint16x16
         OpSelect128FromPairUint32x8
         OpSelect128FromPairUint64x4
         OpSetElemFloat32x4
@@ -92250,6 +92254,18 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "Select128FromPairInt8x32",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairInt16x16",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "Select128FromPairInt32x8",
                 auxType: auxUInt8,
@@ -92262,6 +92278,18 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "Select128FromPairUint8x32",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairUint16x16",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "Select128FromPairUint32x8",
                 auxType: auxUInt8,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index 5ad2ed3f96bac4189cdbba79f937d2eb10007adf..c7995c5c9e5eeabf21551877cbc0bd21f713b97f 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -5017,18 +5017,30 @@ func rewriteValueAMD64(v *Value) bool {
         case OpSelect128FromPairFloat64x4:
                 v.Op = OpAMD64VPERM2F128256
                 return true
+       case OpSelect128FromPairInt16x16:
+               v.Op = OpAMD64VPERM2I128256
+               return true
         case OpSelect128FromPairInt32x8:
                 v.Op = OpAMD64VPERM2I128256
                 return true
         case OpSelect128FromPairInt64x4:
                 v.Op = OpAMD64VPERM2I128256
                 return true
+       case OpSelect128FromPairInt8x32:
+               v.Op = OpAMD64VPERM2I128256
+               return true
+       case OpSelect128FromPairUint16x16:
+               v.Op = OpAMD64VPERM2I128256
+               return true
         case OpSelect128FromPairUint32x8:
                 v.Op = OpAMD64VPERM2I128256
                 return true
         case OpSelect128FromPairUint64x4:
                 v.Op = OpAMD64VPERM2I128256
                 return true
+       case OpSelect128FromPairUint8x32:
+               v.Op = OpAMD64VPERM2I128256
+               return true
         case OpSelectN:
                 return rewriteValueAMD64_OpSelectN(v)
         case OpSetElemFloat32x4:
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 34e491371eae7724f5ca602dd17c234f70c16e71..413cf92c88c7e70534c79fb580aec93d98ad0ceb 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -953,8 +953,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt8x32, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt16x16, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint8x32, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint16x16, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go

index dc5f77adaab0cbf5e5159970be029acf509f1acd..f98795e1b0b7c2066abd4ec92fcd86796eb24fa7 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -351,7 +351,7 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uin
  {{if .Documentation}}{{.Documentation}}
  //{{end}}
  // {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table.
-// {{.ImmName}} should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// {{.ImmName}} should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
  func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go

index 0b8fbd7e3de85a2fa0beccb95a10cb63b747650b..c127eb1b6de65a6ef81c25e9d5ce7be9b83d1c37 100644 (file)
--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@@ -98,6 +98,8 @@ func (o *Operation) SkipMaskedMethod() bool {
         return false
  }
  
+var reForName = regexp.MustCompile(`\bNAME\b`)
+
  func (o *Operation) DecodeUnified(v *unify.Value) error {
         if err := v.Decode(&o.rawOperation); err != nil {
                 return err
@@ -117,7 +119,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
         } else {
                 o.Documentation = "// UNDOCUMENTED"
         }
-       o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
+       o.Documentation = reForName.ReplaceAllString(o.Documentation, o.Go)
         if isMasked {
                 o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
                 // Suppress generic op and method declaration for exported methods, if a mask is present.
@@ -128,7 +130,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
                 }
         }
         if o.rawOperation.AddDoc != nil {
-               o.Documentation += "\n" + *o.rawOperation.AddDoc
+               o.Documentation += "\n" + reForName.ReplaceAllString(*o.rawOperation.AddDoc, o.Go)
         }
  
         o.In = append(o.rawOperation.In, o.rawOperation.InVariant...)
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml

index 44bd8efb7fdbe00b671b39c70dd514593a0d0dc1..3c86974e8a2a3a4e126f2ce0adf41d7fd647ab6e 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
@@ -135,7 +135,7 @@
      // NAME concatenates selected elements from x and y into the lower and upper
      // halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
      // where each {h,l}{1,0} is two bits specify which element from y or x to select.
-    // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+    // For example, {0,1,2,3}.NAME(0b_11_01_00_10, {4,5,6,7}) returns
      // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
  
  - go: concatSelectedConstant
@@ -196,9 +196,12 @@
      // The selection is chosen by the constant parameter h1h0l1l0
      // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
      // For example,
-    // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.NAME(
-    //             0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+    //
+    //   {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.NAME(
+    //    0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+    //
      // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+    //
      // (don't forget that the binary constant is written big-endian).
  
  - go: concatSelectedConstantGrouped
@@ -214,7 +217,7 @@
      // subvectors of x and y.
      //
      // For example {4,5,8,9,12,13,16,17}.NAME(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-    // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 
+    // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
      // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
      // then 1, selecting element 1 from x's next 128 bits (9), then 1,
      // selecting element 1 from y's upper 128 bits (11).  The next two 0 bits select
@@ -227,9 +230,8 @@
    commutative: false
    documentation: !string |-
      // NAME treats the 256-bit vectors x and y as a single vector of four
-    // 128-bit elements, and returns a 256-bit result formed by 
+    // 128-bit elements, and returns a 256-bit result formed by
      // concatenating the two elements specified by lo and hi.
-    // For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
  
  - go: ConcatShiftBytesRight
    commutative: false
diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml

index 697d6a8bced40b5637b0848d21dd7df6b596e452..bbea29bcb0af3ffb86c2ee130646f1a325d18ae1 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml
@@ -837,6 +837,12 @@
  - go: Select128FromPair
    asm: VPERM2F128
    operandOrder: II
+  addDoc: !string |-
+    // For example,
+    //
+    //   {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71})
+    //
+    // returns {70, 71, 40, 41}.
    in:
    - &v
      go: $t
@@ -854,6 +860,12 @@
  - go: Select128FromPair
    asm: VPERM2F128
    operandOrder: II
+  addDoc: !string |-
+    // For example,
+    //
+    //   {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+    //
+    // returns {70, 71, 72, 73, 40, 41, 42, 43}.
    in:
    - &v
      go: $t
@@ -872,6 +884,12 @@
  - go: Select128FromPair
    asm: VPERM2I128
    operandOrder: II
+  addDoc: !string |-
+    // For example,
+    //
+    //   {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71})
+    //
+    // returns {70, 71, 40, 41}.
    in:
    - &v
      go: $t
@@ -890,6 +908,12 @@
  - go: Select128FromPair
    asm: VPERM2I128
    operandOrder: II
+  addDoc: !string |-
+    // For example,
+    //
+    //   {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+    //
+    // returns {70, 71, 72, 73, 40, 41, 42, 43}.
    in:
    - &v
      go: $t
@@ -905,6 +929,56 @@
    out:
    - *v
  
+- go: Select128FromPair
+  asm: VPERM2I128
+  operandOrder: II
+  addDoc: !string |-
+    // For example,
+    //
+    //   {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.NAME(3, 0,
+    //    {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+    //
+    // returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+  in:
+  - &v
+    go: $t
+    class: vreg
+    base: int|uint
+    bits: 256
+    OverwriteElementBits: 16
+  - *v
+  - class: immediate
+    immOffset: 0
+    name: "lo, hi"
+  inVariant: []
+  out:
+  - *v
+
+- go: Select128FromPair
+  asm: VPERM2I128
+  operandOrder: II
+  addDoc: !string |-
+    // For example,
+    //
+    //   {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.NAME(3, 0,
+    //        {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+    //
+    // returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+  in:
+  - &v
+    go: $t
+    class: vreg
+    base: int|uint
+    bits: 256
+    OverwriteElementBits: 8
+  - *v
+  - class: immediate
+    immOffset: 0
+    name: "lo, hi"
+  inVariant: []
+  out:
+  - *v
+
  - go: ConcatShiftBytesRight
    asm: VPALIGNR
    in:
@@ -930,4 +1004,3 @@
      immOffset: 0
    out:
    - *uint256512
- 
-\ No newline at end of file
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index e9ddb463be7dde9bb32a143d82147e9b280b6152..8acf3e897c2616422df1e5c6cd9f2a16d19f8546 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5604,10 +5604,14 @@ func (x Float64x8) Scale(y Float64x8) Float64x8
  // Select128FromPair treats the 256-bit vectors x and y as a single vector of four
  // 128-bit elements, and returns a 256-bit result formed by
  // concatenating the two elements specified by lo and hi.
-// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
+// For example,
+//
+//     {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: VPERM2F128, CPU Feature: AVX
  func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
@@ -5615,10 +5619,14 @@ func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
  // Select128FromPair treats the 256-bit vectors x and y as a single vector of four
  // 128-bit elements, and returns a 256-bit result formed by
  // concatenating the two elements specified by lo and hi.
-// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
+// For example,
+//
+//     {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: VPERM2F128, CPU Feature: AVX
  func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
@@ -5626,10 +5634,46 @@ func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
  // Select128FromPair treats the 256-bit vectors x and y as a single vector of four
  // 128-bit elements, and returns a 256-bit result formed by
  // concatenating the two elements specified by lo and hi.
-// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
+// For example,
+//
+//     {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
+//          {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int8x32) Select128FromPair(lo, hi uint8, y Int8x32) Int8x32
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//     {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
+//      {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int16x16) Select128FromPair(lo, hi uint8, y Int16x16) Int16x16
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//     {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
@@ -5637,10 +5681,14 @@ func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
  // Select128FromPair treats the 256-bit vectors x and y as a single vector of four
  // 128-bit elements, and returns a 256-bit result formed by
  // concatenating the two elements specified by lo and hi.
-// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
+// For example,
+//
+//     {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
@@ -5648,10 +5696,46 @@ func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
  // Select128FromPair treats the 256-bit vectors x and y as a single vector of four
  // 128-bit elements, and returns a 256-bit result formed by
  // concatenating the two elements specified by lo and hi.
-// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
+// For example,
+//
+//     {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
+//          {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint8x32) Select128FromPair(lo, hi uint8, y Uint8x32) Uint8x32
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//     {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
+//      {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint16x16) Select128FromPair(lo, hi uint8, y Uint16x16) Uint16x16
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+//     {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
@@ -5659,10 +5743,14 @@ func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
  // Select128FromPair treats the 256-bit vectors x and y as a single vector of four
  // 128-bit elements, and returns a 256-bit result formed by
  // concatenating the two elements specified by lo and hi.
-// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
+// For example,
+//
+//     {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
  //
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4
diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go

index 63ee6416a66cf7070a26ec8fabd8a7bb8a0888b9..e54c3b200637acae8c0c7957986b2502eb09a9f5 100644 (file)
--- a/src/simd/ops_internal_amd64.go
+++ b/src/simd/ops_internal_amd64.go
@@ -144,11 +144,12 @@ func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Fl
  // The selection is chosen by the constant parameter h1h0l1l0
  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
  // For example,
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
  //
-//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//     {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+//      0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
  //
  // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+//
  // (don't forget that the binary constant is written big-endian).
  //
  // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
@@ -215,11 +216,12 @@ func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x
  // The selection is chosen by the constant parameter h1h0l1l0
  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
  // For example,
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
  //
-//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//     {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+//      0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
  //
  // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+//
  // (don't forget that the binary constant is written big-endian).
  //
  // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
@@ -286,11 +288,12 @@ func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint
  // The selection is chosen by the constant parameter h1h0l1l0
  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
  // For example,
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
  //
-//     0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//     {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+//      0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
  //
  // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+//
  // (don't forget that the binary constant is written big-endian).
  //
  // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
author	David Chase <drchase@google.com>
	Wed, 19 Nov 2025 22:17:54 +0000 (17:17 -0500)
committer	David Chase <drchase@google.com>
	Fri, 21 Nov 2025 01:48:29 +0000 (17:48 -0800)
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch | blob | history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch | blob | history
src/cmd/compile/internal/ssa/opGen.go		patch | blob | history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch | blob | history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch | blob | history
src/simd/_gen/simdgen/gen_simdTypes.go		patch | blob | history
src/simd/_gen/simdgen/godefs.go		patch | blob | history
src/simd/_gen/simdgen/ops/Moves/categories.yaml		patch | blob | history
src/simd/_gen/simdgen/ops/Moves/go.yaml		patch | blob | history
src/simd/ops_amd64.go		patch | blob | history
src/simd/ops_internal_amd64.go		patch | blob | history