[dev.simd] simd, cmd/compile: add 128 bit select-from-pair

author David Chase <drchase@google.com>

Fri, 19 Sep 2025 17:07:59 +0000 (13:07 -0400)

committer David Chase <drchase@google.com>

Fri, 26 Sep 2025 20:11:10 +0000 (13:11 -0700)
author David Chase <drchase@google.com>
Fri, 19 Sep 2025 17:07:59 +0000 (13:07 -0400)
committer David Chase <drchase@google.com>
Fri, 26 Sep 2025 20:11:10 +0000 (13:11 -0700)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index d69740cd96d3c4cda300c34650942175cc3a204d..a4d24524357f391fe44140e555924e9203b43298 100644 (file)
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1053,6 +1053,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VGF2P8AFFINEINVQB128,
                 ssa.OpAMD64VGF2P8AFFINEINVQB256,
                 ssa.OpAMD64VGF2P8AFFINEINVQB512,
+               ssa.OpAMD64VPERM2F128256,
+               ssa.OpAMD64VPERM2I128256,
                 ssa.OpAMD64VINSERTF128256,
                 ssa.OpAMD64VINSERTF64X4512,
                 ssa.OpAMD64VINSERTI128256,
diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go

index 2222a5444aa12f314e16b15dbeea1c39250b3221..0cfa2a2262f070f379245bf2ee9ee29f92053b5e 100644 (file)
--- a/src/cmd/compile/internal/ir/symtab.go
+++ b/src/cmd/compile/internal/ir/symtab.go
@@ -45,6 +45,7 @@ type symsStruct struct {
         PanicdottypeI     *obj.LSym
         Panicnildottype   *obj.LSym
         Panicoverflow     *obj.LSym
+       PanicSimdImm      *obj.LSym
         Racefuncenter     *obj.LSym
         Racefuncexit      *obj.LSym
         Raceread          *obj.LSym
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index 9db223c04f40ec7c80cc2a36d5fb29d8358b012c..1eab8b5e6d6b146af9dc9909e46321338ab95623 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -938,6 +938,12 @@
  (ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
  (ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
  (ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
+(Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...)
+(Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...)
+(Select128FromPairInt32x8 ...) => (VPERM2I128256 ...)
+(Select128FromPairInt64x4 ...) => (VPERM2I128256 ...)
+(Select128FromPairUint32x8 ...) => (VPERM2I128256 ...)
+(Select128FromPairUint64x4 ...) => (VPERM2I128256 ...)
  (SetElemFloat32x4 ...) => (VPINSRD128 ...)
  (SetElemFloat64x2 ...) => (VPINSRQ128 ...)
  (SetElemInt8x16 ...) => (VPINSRB128 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go

index ba91fb3fc95d2bd3c1dfae09c598f8c558f6ab6a..5e1da3249fed10f15eaa26869f0a94565a524e21 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -1212,6 +1212,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERM2F128256", argLength: 2, reg: v21, asm: "VPERM2F128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERM2I128256", argLength: 2, reg: v21, asm: "VPERM2I128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPINSRQ128", argLength: 2, reg: vgpv, asm: "VPINSRQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index 81a1dff1378bf77563eee31da0e07a42fa6e6450..aa088dbf0bf63b02629c96c816591b77a4a4f675 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1199,6 +1199,12 @@ func simdGenericOps() []opData {
                 {name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairInt64x4", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairUint32x8", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "Select128FromPairUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "SetElemFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "UInt8"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index 792a1ca08f194342998336bacbf519524ca13788..105d1a803c6c3f4aa1158c63a6eb31566e24993e 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2444,6 +2444,8 @@ const (
         OpAMD64VPRORQMasked128
         OpAMD64VPRORQMasked256
         OpAMD64VPRORQMasked512
+       OpAMD64VPERM2F128256
+       OpAMD64VPERM2I128256
         OpAMD64VPINSRD128
         OpAMD64VPINSRQ128
         OpAMD64VPINSRB128
@@ -6594,6 +6596,12 @@ const (
         OpRoundToEvenScaledResidueFloat64x2
         OpRoundToEvenScaledResidueFloat64x4
         OpRoundToEvenScaledResidueFloat64x8
+       OpSelect128FromPairFloat32x8
+       OpSelect128FromPairFloat64x4
+       OpSelect128FromPairInt32x8
+       OpSelect128FromPairInt64x4
+       OpSelect128FromPairUint32x8
+       OpSelect128FromPairUint64x4
         OpSetElemFloat32x4
         OpSetElemFloat64x2
         OpSetElemInt8x16
@@ -37656,6 +37664,36 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:    "VPERM2F128256",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPERM2F128,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                               {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPERM2I128256",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPERM2I128,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                               {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:    "VPINSRD128",
                 auxType: auxUInt8,
@@ -82360,6 +82398,42 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Select128FromPairFloat32x8",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairFloat64x4",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairInt32x8",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairInt64x4",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairUint32x8",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Select128FromPairUint64x4",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "SetElemFloat32x4",
                 auxType: auxUInt8,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index ca9f9ae17be9c178e2b9e06b0f5613779bc55a5a..bc611fc44c4293d3421ed6fa80ffee2b655be6dc 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -4991,6 +4991,24 @@ func rewriteValueAMD64(v *Value) bool {
                 return rewriteValueAMD64_OpSelect0(v)
         case OpSelect1:
                 return rewriteValueAMD64_OpSelect1(v)
+       case OpSelect128FromPairFloat32x8:
+               v.Op = OpAMD64VPERM2F128256
+               return true
+       case OpSelect128FromPairFloat64x4:
+               v.Op = OpAMD64VPERM2F128256
+               return true
+       case OpSelect128FromPairInt32x8:
+               v.Op = OpAMD64VPERM2I128256
+               return true
+       case OpSelect128FromPairInt64x4:
+               v.Op = OpAMD64VPERM2I128256
+               return true
+       case OpSelect128FromPairUint32x8:
+               v.Op = OpAMD64VPERM2I128256
+               return true
+       case OpSelect128FromPairUint64x4:
+               v.Op = OpAMD64VPERM2I128256
+               return true
         case OpSelectN:
                 return rewriteValueAMD64_OpSelectN(v)
         case OpSetElemFloat32x4:
diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go

index 985d899a71e99a0d6871efd090d36844149e8022..4c5cd9ef2cf989672c3a8b16b27cfbabf3f47b76 100644 (file)
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1842,7 +1842,9 @@ func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp fu
         for i, t := range targets {
                 s.startBlock(t)
                 genOp(s, i)
-               t.AddEdgeTo(bEnd)
+               if t.Kind != ssa.BlockExit {
+                       t.AddEdgeTo(bEnd)
+               }
                 s.endBlock()
         }
  
@@ -1899,6 +1901,28 @@ func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.Ca
         }
  }
  
+// opLen2Imm8_II returns an intrinsic builder for an operation that takes two
+// immediates instead of just 1.  Offset is ignored, so it is a _ parameter instead.
+//
+// The returned builder uses args[0] and args[3] as the two value operands of op
+// and args[1] and args[2] as the two immediates, which are packed into a single
+// aux value as args[1] + args[2]<<4.
+//
+// NOTE(review): in the non-constant path the immediates are combined with 8-bit
+// shift/add before any range check, so an out-of-range value appears able to
+// alias a valid index (e.g. args[1] == 16 with args[2] == 0 combines to 0x10,
+// and args[2] == 16 shifts to 0) and would then not reach the panic branch.
+// Confirm whether such inputs are expected to panic.
+func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+       return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               // Fast path: both immediates are 8-bit constants with no bits set
+               // outside the low two, so the packed immediate is known now.
+               if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 {
+                       i1, i2 := args[1].AuxInt, args[2].AuxInt
+                       return s.newValue2I(op, t, i1+i2<<4, args[0], args[3])
+               }
+               // Otherwise compute args[1] + args[2]<<4 as a uint8 value and hand
+               // it to immJumpTable, which invokes the callback once per index.
+               four := s.constInt64(types.Types[types.TUINT8], 4)
+               shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four)
+               combined := s.newValue2(ssa.OpAdd8, types.Types[types.TUINT8], args[1], shifted)
+               return immJumpTable(s, combined, n, func(sNew *state, idx int) {
+                       // Encode as int8 due to requirement of AuxInt, check its comment for details.
+                       // TODO for "zeroing" values, panic instead.
+                       // NOTE(review): the else branch below already calls the panic
+                       // routine for every index with a bit outside 3+3<<4 (0x33), so
+                       // the TODO above may be stale -- confirm.
+                       if idx & ^(3+3<<4) == 0 {
+                               s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3])
+                       } else {
+                               sNew.rtcall(ir.Syms.PanicSimdImm, false, nil)
+                       }
+               })
+       }
+}
+
  func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
         return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                 if args[2].Op == ssa.OpConst8 {
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 41858a77454b8d55f1841c53081b04aa5e81c6ac..a62b3882c38aee20dce92c3995fcada80c3f42c0 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -950,6 +950,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64)
         addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64)
         addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64)
diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go

index 57129817f6c878a5cd2c49343d9aca67177061fb..37aad360f2a075463fc49d57ce1e8cb8a48016ce 100644 (file)
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@@ -141,6 +141,7 @@ func InitConfig() {
         ir.Syms.Panicnildottype = typecheck.LookupRuntimeFunc("panicnildottype")
         ir.Syms.Panicoverflow = typecheck.LookupRuntimeFunc("panicoverflow")
         ir.Syms.Panicshift = typecheck.LookupRuntimeFunc("panicshift")
+       ir.Syms.PanicSimdImm = typecheck.LookupRuntimeFunc("panicSimdImm")
         ir.Syms.Racefuncenter = typecheck.LookupRuntimeFunc("racefuncenter")
         ir.Syms.Racefuncexit = typecheck.LookupRuntimeFunc("racefuncexit")
         ir.Syms.Raceread = typecheck.LookupRuntimeFunc("raceread")
diff --git a/src/runtime/panic.go b/src/runtime/panic.go

index 8c91c9435abd18d81a644289480e3ec2c83e48bc..d7bce70fe5a948885be4c1743647e59e9cca636b 100644 (file)
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -341,6 +341,13 @@ func panicmemAddr(addr uintptr) {
         panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr})
  }
  
+// simdImmError is the error value that panicSimdImm panics with.
+var simdImmError = error(errorString("out-of-range immediate for simd intrinsic"))
+
+// panicSimdImm panics with simdImmError after calling panicCheck2.
+// The compiler resolves it by name (ir.Syms.PanicSimdImm) and emits a call to
+// it from the SIMD intrinsic jump table for immediates it rejects.
+func panicSimdImm() {
+       panicCheck2("simd immediate error")
+       panic(simdImmError)
+}
+
  // Create a new deferred function fn, which has no arguments and results.
  // The compiler turns a defer statement into a call to this.
  func deferproc(fn func()) {
diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go

index 353bc46b317503dc6c8df020a65b85ce009c7c05..4b27f7ce5f73b84af1ec8426b233f79a8150e501 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go
+++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
@@ -56,6 +56,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
  {{end}}
  {{define "op2Imm8_2I"}}        addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
  {{end}}
+{{define "op2Imm8_II"}}        addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
  {{define "op3Imm8"}}   addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
  {{end}}
  {{define "op3Imm8_2I"}}        addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go

index 0d5d08b7edd08d2f7251ba2f8e2eb4ef1e5025b2..8944c35cad715ad174f8c55a5eaacd6537400990 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -354,6 +354,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"
  func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8) {{.GoType}}
  {{end}}
  
+{{define "op2Imm8_II"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table.
+// {{.ImmName}} should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
  
  {{define "op3Imm8"}}
  {{if .Documentation}}{{.Documentation}}
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml

index e9a7fef2023221f59f9c383f0a35e3f553146349..0c733e12ee19e69511da0a2919a07a242081e4b2 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
@@ -174,4 +174,10 @@
      // then 1, selecting element 1 from x's upper 128 bits (9), then 1,
      // selecting element 1 from y's upper 128 bits (11).
      // This differs from the same method applied to a 32x8 vector, where
-    // the 8-bit constant performs the same selection on both subvectors.
-\ No newline at end of file
+    // the 8-bit constant performs the same selection on both subvectors.
+
+- go: Select128FromPair
+  commutative: false
+  documentation: !string |-
+    // NAME selects the low and high 128-bit halves from the 128-bit halves
+    // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml

index 46599b7bd7ef0259d9dbd9a861f097be20c0196c..495b9ed6fa1371526959bd76bc61551b891de29f 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml
@@ -721,7 +721,6 @@
    out:
    - *v
  
-
  - go: concatSelectedConstantGrouped
    asm: VSHUFPD
    in:
@@ -771,3 +770,74 @@
    inVariant: []
    out:
    - *v
+
+- go: Select128FromPair
+  asm: VPERM2F128
+  operandOrder: II
+  in:
+  - &v
+    go: $t
+    class: vreg
+    base: float
+    bits: 256
+  - *v
+  - class: immediate
+    immOffset: 0
+    name: "lo, hi"
+  inVariant: []
+  out:
+  - *v
+
+- go: Select128FromPair
+  asm: VPERM2F128
+  operandOrder: II
+  in:
+  - &v
+    go: $t
+    class: vreg
+    base: float
+    bits: 256
+    OverwriteElementBits: 32
+  - *v
+  - class: immediate
+    immOffset: 0
+    name: "lo, hi"
+  inVariant: []
+  out:
+  - *v
+
+- go: Select128FromPair
+  asm: VPERM2I128
+  operandOrder: II
+  in:
+  - &v
+    go: $t
+    class: vreg
+    base: int|uint
+    bits: 256
+    OverwriteElementBits: 64
+  - *v
+  - class: immediate
+    immOffset: 0
+    name: "lo, hi"
+  inVariant: []
+  out:
+  - *v
+
+- go: Select128FromPair
+  asm: VPERM2I128
+  operandOrder: II
+  in:
+  - &v
+    go: $t
+    class: vreg
+    base: int|uint
+    bits: 256
+    OverwriteElementBits: 32
+  - *v
+  - class: immediate
+    immOffset: 0
+    name: "lo, hi"
+  inVariant: []
+  out:
+  - *v
diff --git a/src/simd/_gen/unify/domain.go b/src/simd/_gen/unify/domain.go

index 1e0f2be63d739ab3ba95079389fe3c3b348ce0aa..8eb5deab2ba8604238b546b97cb3be7ada2a9452 100644 (file)
--- a/src/simd/_gen/unify/domain.go
+++ b/src/simd/_gen/unify/domain.go
@@ -106,8 +106,8 @@ func (b *DefBuilder) Add(name string, v *Value) {
         if b.fields == nil {
                 b.fields = make(map[string]*Value)
         }
-       if _, ok := b.fields[name]; ok {
-               panic(fmt.Sprintf("duplicate field %q", name))
+       if old, ok := b.fields[name]; ok {
+               panic(fmt.Sprintf("duplicate field %q, added value is %v, old value is %v", name, v, old))
         }
         b.fields[name] = v
  }
diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go

index 6deadde45e65ceb5999ac568bbde9d3d899acb37..e38f7eea01cb9554dd64f3ed95d4846fa9e23fa1 100644 (file)
--- a/src/simd/internal/simd_test/simd_test.go
+++ b/src/simd/internal/simd_test/simd_test.go
@@ -815,3 +815,77 @@ func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
         foo(lhhl, 0, 4, 5, 1)
         foo(hllh, 4, 0, 1, 5)
  }
+
+// TestSelect128FromPair checks Uint64x4.Select128FromPair when both selectors
+// are written as literal constants at the call site.
+func TestSelect128FromPair(t *testing.T) {
+       // x holds 128-bit halves 0 and 1, y holds halves 2 and 3; half k
+       // contains the elements {2k, 2k+1}.
+       x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
+       y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
+
+       aa := x.Select128FromPair(0, 0, y)
+       ab := x.Select128FromPair(0, 1, y)
+       bc := x.Select128FromPair(1, 2, y)
+       cd := x.Select128FromPair(2, 3, y)
+       da := x.Select128FromPair(3, 0, y)
+       dc := x.Select128FromPair(3, 2, y)
+
+       r := make([]uint64, 4, 4)
+
+       // foo stores v into r and checks that its low half is half a and its
+       // high half is half b of the x/y pair.
+       foo := func(v simd.Uint64x4, a, b uint64) {
+               a, b = 2*a, 2*b
+               v.StoreSlice(r)
+               checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
+       }
+
+       foo(aa, 0, 0)
+       foo(ab, 0, 1)
+       foo(bc, 1, 2)
+       foo(cd, 2, 3)
+       foo(da, 3, 0)
+       foo(dc, 3, 2)
+}
+
+// TestSelect128FromPairError checks that Select128FromPair panics when given
+// the selector value 4.
+func TestSelect128FromPairError(t *testing.T) {
+       x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
+       y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
+
+       // A panic from the call below is recovered and logged here, which skips
+       // the t.Errorf at the end of the function.
+       defer func() {
+               if r := recover(); r != nil {
+                       t.Logf("Saw expected panic %v", r)
+               }
+       }()
+       _ = x.Select128FromPair(0, 4, y)
+
+       // Only reached if the call above returned normally.
+       t.Errorf("Should have panicked")
+}
+
+// select128FromPair forwards to Uint64x4.Select128FromPair with lo and hi
+// received as parameters; it is marked noinline, presumably so that the
+// selectors are not constants where the intrinsic is called.
+//
+//go:noinline
+func select128FromPair(x simd.Uint64x4, lo, hi uint8, y simd.Uint64x4) simd.Uint64x4 {
+       return x.Select128FromPair(lo, hi, y)
+}
+
+// TestSelect128FromPairVar repeats the checks of TestSelect128FromPair, but
+// routes every call through the noinline helper select128FromPair.
+func TestSelect128FromPairVar(t *testing.T) {
+       // x holds 128-bit halves 0 and 1, y holds halves 2 and 3; half k
+       // contains the elements {2k, 2k+1}.
+       x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
+       y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
+
+       aa := select128FromPair(x, 0, 0, y)
+       ab := select128FromPair(x, 0, 1, y)
+       bc := select128FromPair(x, 1, 2, y)
+       cd := select128FromPair(x, 2, 3, y)
+       da := select128FromPair(x, 3, 0, y)
+       dc := select128FromPair(x, 3, 2, y)
+
+       r := make([]uint64, 4, 4)
+
+       // foo stores v into r and checks that its low half is half a and its
+       // high half is half b of the x/y pair.
+       foo := func(v simd.Uint64x4, a, b uint64) {
+               a, b = 2*a, 2*b
+               v.StoreSlice(r)
+               checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
+       }
+
+       foo(aa, 0, 0)
+       foo(ab, 0, 1)
+       foo(bc, 1, 2)
+       foo(cd, 2, 3)
+       foo(da, 3, 0)
+       foo(dc, 3, 2)
+
+}
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index a104601ed75a82b69e6e43e8b1dd087e2e3993ce..91e7d91842a8d9ea72001feecc56b445e99d6b29 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5576,6 +5576,62 @@ func (x Float64x4) Scale(y Float64x4) Float64x4
  // Asm: VSCALEFPD, CPU Feature: AVX512
  func (x Float64x8) Scale(y Float64x8) Float64x8
  
+/* Select128FromPair */
+
+// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
+// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
+
+// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
+// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
+
+// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
+// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
+
+// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
+// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
+
+// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
+// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
+
+// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
+// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4
+
  /* SetElem */
  
  // SetElem sets a single constant-indexed element's value.
author	David Chase <drchase@google.com>
	Fri, 19 Sep 2025 17:07:59 +0000 (13:07 -0400)
committer	David Chase <drchase@google.com>
	Fri, 26 Sep 2025 20:11:10 +0000 (13:11 -0700)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ir/symtab.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/intrinsics.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/ssa.go		patch \| blob \| history
src/runtime/panic.go		patch \| blob \| history
src/simd/_gen/simdgen/gen_simdIntrinsics.go		patch \| blob \| history
src/simd/_gen/simdgen/gen_simdTypes.go		patch \| blob \| history
src/simd/_gen/simdgen/ops/Moves/categories.yaml		patch \| blob \| history
src/simd/_gen/simdgen/ops/Moves/go.yaml		patch \| blob \| history
src/simd/_gen/unify/domain.go		patch \| blob \| history
src/simd/internal/simd_test/simd_test.go		patch \| blob \| history
src/simd/ops_amd64.go		patch \| blob \| history