[dev.simd] cmd/compile: add INSERT[IF]128 instructions

author David Chase <drchase@google.com>

Wed, 25 Jun 2025 20:06:00 +0000 (16:06 -0400)

committer David Chase <drchase@google.com>

Mon, 7 Jul 2025 23:52:36 +0000 (16:52 -0700)
author David Chase <drchase@google.com>
Wed, 25 Jun 2025 20:06:00 +0000 (16:06 -0400)
committer David Chase <drchase@google.com>
Mon, 7 Jul 2025 23:52:36 +0000 (16:52 -0700)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index 999f3c200ce798a4763be6d882d81f7548dbfc9b..ac2848d1bafa694e43e8a4e60a9f9a603d34eae0 100644 (file)
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -706,6 +706,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VGF2P8AFFINEINVQB128,
                 ssa.OpAMD64VGF2P8AFFINEINVQB256,
                 ssa.OpAMD64VGF2P8AFFINEINVQB512,
+               ssa.OpAMD64VINSERTF128256,
+               ssa.OpAMD64VINSERTI128256,
                 ssa.OpAMD64VPSHLDW128,
                 ssa.OpAMD64VPSHLDW256,
                 ssa.OpAMD64VPSHLDW512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index 3768c5aaadc338462d2e80a13fc75318508dd939..6b1078e74127985956e3a4b3e19708bda8a4932b 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -1452,6 +1452,16 @@
  (SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSDS128 ...)
  (SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSDS256 ...)
  (SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSDS512 ...)
+(Set128Float32x8 [a] x y) => (VINSERTF128256 [a] x y)
+(Set128Float64x4 [a] x y) => (VINSERTF128256 [a] x y)
+(Set128Int8x32 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Int16x16 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Int32x8 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Int64x4 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint8x32 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint16x16 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint32x8 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint64x4 [a] x y) => (VINSERTI128256 [a] x y)
  (SetElemInt8x16 [a] x y) => (VPINSRB128 [a] x y)
  (SetElemInt16x8 [a] x y) => (VPINSRW128 [a] x y)
  (SetElemInt32x4 [a] x y) => (VPINSRD128 [a] x y)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go

index 5e627e696e96f015fa7ceeaaf170b0e67b00d785..787d3c5fcbf50c9ff0d8acaddb1bcc89da9bae80 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -768,6 +768,7 @@ func simdAMD64Ops(fp11, fp21, fp2k, fpkfp, fp2kfp, fp2kk, fp31, fp3kfp, fpgpfp,
                 {name: "VRNDSCALEPSMasked256", argLength: 2, reg: fpkfp, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VREDUCEPSMasked256", argLength: 2, reg: fpkfp, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VCMPPSMasked256", argLength: 3, reg: fp2kk, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
+               {name: "VINSERTF128256", argLength: 2, reg: fp21, asm: "VINSERTF128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VROUNDPD128", argLength: 1, reg: fp11, asm: "VROUNDPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VRNDSCALEPD128", argLength: 1, reg: fp11, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VREDUCEPD128", argLength: 1, reg: fp11, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -879,6 +880,7 @@ func simdAMD64Ops(fp11, fp21, fp2k, fpkfp, fp2kfp, fp2kk, fp31, fp3kfp, fpgpfp,
                 {name: "VPINSRB128", argLength: 2, reg: fpgpfp, asm: "VPINSRB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPCMPB256", argLength: 2, reg: fp2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
                 {name: "VPCMPBMasked256", argLength: 3, reg: fp2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
+               {name: "VINSERTI128256", argLength: 2, reg: fp21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPCMPB512", argLength: 2, reg: fp2k, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
                 {name: "VPCMPBMasked512", argLength: 3, reg: fp2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
                 {name: "VPCMPUW256", argLength: 2, reg: fp2k, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index b68b237c31254889d5b75cfa6d7d8da9dee8d9aa..076a16ebda611024b3fafd993816aa824d16cf8f 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1511,6 +1511,7 @@ func simdGenericOps() []opData {
                 {name: "MaskedRoundWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedTruncWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "Set128Float32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
@@ -1543,6 +1544,7 @@ func simdGenericOps() []opData {
                 {name: "MaskedRoundWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedTruncWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "Set128Float64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
@@ -1562,6 +1564,7 @@ func simdGenericOps() []opData {
                 {name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "MaskedShiftAllLeftAndFillUpperFromInt16x16", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "MaskedShiftAllRightAndFillUpperFromInt16x16", argLength: 3, commutative: false, aux: "Int8"},
+               {name: "Set128Int16x16", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllLeftAndFillUpperFromInt16x16", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllRightAndFillUpperFromInt16x16", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedShiftAllLeftAndFillUpperFromInt16x32", argLength: 3, commutative: false, aux: "Int8"},
@@ -1598,6 +1601,7 @@ func simdGenericOps() []opData {
                 {name: "MaskedShiftAllRightAndFillUpperFromInt32x8", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "RotateAllRightInt32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "Set128Int32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllLeftAndFillUpperFromInt32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllRightAndFillUpperFromInt32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "GetElemInt64x2", argLength: 1, commutative: false, aux: "Int8"},
@@ -1616,6 +1620,7 @@ func simdGenericOps() []opData {
                 {name: "MaskedShiftAllRightAndFillUpperFromInt64x4", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "RotateAllLeftInt64x4", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "RotateAllRightInt64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "Set128Int64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllLeftAndFillUpperFromInt64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllRightAndFillUpperFromInt64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedRotateAllLeftInt64x8", argLength: 2, commutative: false, aux: "Int8"},
@@ -1628,8 +1633,10 @@ func simdGenericOps() []opData {
                 {name: "ShiftAllRightAndFillUpperFromInt64x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "Int8"},
+               {name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedShiftAllLeftAndFillUpperFromUint16x16", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "MaskedShiftAllRightAndFillUpperFromUint16x16", argLength: 3, commutative: false, aux: "Int8"},
+               {name: "Set128Uint16x16", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllLeftAndFillUpperFromUint16x16", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllRightAndFillUpperFromUint16x16", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedShiftAllLeftAndFillUpperFromUint16x32", argLength: 3, commutative: false, aux: "Int8"},
@@ -1666,6 +1673,7 @@ func simdGenericOps() []opData {
                 {name: "MaskedShiftAllRightAndFillUpperFromUint32x8", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "RotateAllLeftUint32x8", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "RotateAllRightUint32x8", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "Set128Uint32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllLeftAndFillUpperFromUint32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllRightAndFillUpperFromUint32x8", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "Int8"},
@@ -1684,6 +1692,7 @@ func simdGenericOps() []opData {
                 {name: "MaskedShiftAllRightAndFillUpperFromUint64x4", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "RotateAllLeftUint64x4", argLength: 1, commutative: false, aux: "Int8"},
                 {name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"},
+               {name: "Set128Uint64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllLeftAndFillUpperFromUint64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "ShiftAllRightAndFillUpperFromUint64x4", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedRotateAllLeftUint64x8", argLength: 2, commutative: false, aux: "Int8"},
@@ -1704,6 +1713,7 @@ func simdGenericOps() []opData {
                 {name: "GaloisFieldAffineTransformInversedUint8x32", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedGaloisFieldAffineTransformUint8x32", argLength: 3, commutative: false, aux: "Int8"},
                 {name: "MaskedGaloisFieldAffineTransformInversedUint8x32", argLength: 3, commutative: false, aux: "Int8"},
+               {name: "Set128Uint8x32", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "GaloisFieldAffineTransformInversedUint8x64", argLength: 2, commutative: false, aux: "Int8"},
                 {name: "MaskedGaloisFieldAffineTransformUint8x64", argLength: 3, commutative: false, aux: "Int8"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index fec727ea12ec78010b8a491f16f30902ce969ed7..ece791ca6cea71e10921ce0a27544684a6a38d99 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1961,6 +1961,7 @@ const (
         OpAMD64VRNDSCALEPSMasked256
         OpAMD64VREDUCEPSMasked256
         OpAMD64VCMPPSMasked256
+       OpAMD64VINSERTF128256
         OpAMD64VROUNDPD128
         OpAMD64VRNDSCALEPD128
         OpAMD64VREDUCEPD128
@@ -2072,6 +2073,7 @@ const (
         OpAMD64VPINSRB128
         OpAMD64VPCMPB256
         OpAMD64VPCMPBMasked256
+       OpAMD64VINSERTI128256
         OpAMD64VPCMPB512
         OpAMD64VPCMPBMasked512
         OpAMD64VPCMPUW256
@@ -5844,6 +5846,7 @@ const (
         OpMaskedRoundWithPrecisionFloat32x8
         OpMaskedTruncWithPrecisionFloat32x8
         OpRoundWithPrecisionFloat32x8
+       OpSet128Float32x8
         OpTruncWithPrecisionFloat32x8
         OpCeilWithPrecisionFloat64x2
         OpDiffWithCeilWithPrecisionFloat64x2
@@ -5876,6 +5879,7 @@ const (
         OpMaskedRoundWithPrecisionFloat64x4
         OpMaskedTruncWithPrecisionFloat64x4
         OpRoundWithPrecisionFloat64x4
+       OpSet128Float64x4
         OpTruncWithPrecisionFloat64x4
         OpCeilWithPrecisionFloat64x8
         OpDiffWithCeilWithPrecisionFloat64x8
@@ -5895,6 +5899,7 @@ const (
         OpTruncWithPrecisionFloat64x8
         OpMaskedShiftAllLeftAndFillUpperFromInt16x16
         OpMaskedShiftAllRightAndFillUpperFromInt16x16
+       OpSet128Int16x16
         OpShiftAllLeftAndFillUpperFromInt16x16
         OpShiftAllRightAndFillUpperFromInt16x16
         OpMaskedShiftAllLeftAndFillUpperFromInt16x32
@@ -5931,6 +5936,7 @@ const (
         OpMaskedShiftAllRightAndFillUpperFromInt32x8
         OpRotateAllLeftInt32x8
         OpRotateAllRightInt32x8
+       OpSet128Int32x8
         OpShiftAllLeftAndFillUpperFromInt32x8
         OpShiftAllRightAndFillUpperFromInt32x8
         OpGetElemInt64x2
@@ -5949,6 +5955,7 @@ const (
         OpMaskedShiftAllRightAndFillUpperFromInt64x4
         OpRotateAllLeftInt64x4
         OpRotateAllRightInt64x4
+       OpSet128Int64x4
         OpShiftAllLeftAndFillUpperFromInt64x4
         OpShiftAllRightAndFillUpperFromInt64x4
         OpMaskedRotateAllLeftInt64x8
@@ -5961,8 +5968,10 @@ const (
         OpShiftAllRightAndFillUpperFromInt64x8
         OpGetElemInt8x16
         OpSetElemInt8x16
+       OpSet128Int8x32
         OpMaskedShiftAllLeftAndFillUpperFromUint16x16
         OpMaskedShiftAllRightAndFillUpperFromUint16x16
+       OpSet128Uint16x16
         OpShiftAllLeftAndFillUpperFromUint16x16
         OpShiftAllRightAndFillUpperFromUint16x16
         OpMaskedShiftAllLeftAndFillUpperFromUint16x32
@@ -5999,6 +6008,7 @@ const (
         OpMaskedShiftAllRightAndFillUpperFromUint32x8
         OpRotateAllLeftUint32x8
         OpRotateAllRightUint32x8
+       OpSet128Uint32x8
         OpShiftAllLeftAndFillUpperFromUint32x8
         OpShiftAllRightAndFillUpperFromUint32x8
         OpGetElemUint64x2
@@ -6017,6 +6027,7 @@ const (
         OpMaskedShiftAllRightAndFillUpperFromUint64x4
         OpRotateAllLeftUint64x4
         OpRotateAllRightUint64x4
+       OpSet128Uint64x4
         OpShiftAllLeftAndFillUpperFromUint64x4
         OpShiftAllRightAndFillUpperFromUint64x4
         OpMaskedRotateAllLeftUint64x8
@@ -6037,6 +6048,7 @@ const (
         OpGaloisFieldAffineTransformInversedUint8x32
         OpMaskedGaloisFieldAffineTransformUint8x32
         OpMaskedGaloisFieldAffineTransformInversedUint8x32
+       OpSet128Uint8x32
         OpGaloisFieldAffineTransformUint8x64
         OpGaloisFieldAffineTransformInversedUint8x64
         OpMaskedGaloisFieldAffineTransformUint8x64
@@ -30131,6 +30143,21 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:    "VINSERTF128256",
+               auxType: auxInt8,
+               argLen:  2,
+               asm:     x86.AVINSERTF128,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:    "VROUNDPD128",
                 auxType: auxInt8,
@@ -31825,6 +31852,21 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:    "VINSERTI128256",
+               auxType: auxInt8,
+               argLen:  2,
+               asm:     x86.AVINSERTI128,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:        "VPCMPB512",
                 auxType:     auxInt8,
@@ -67718,6 +67760,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Set128Float32x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "TruncWithPrecisionFloat32x8",
                 auxType: auxInt8,
@@ -67910,6 +67958,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Set128Float64x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "TruncWithPrecisionFloat64x4",
                 auxType: auxInt8,
@@ -68024,6 +68078,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  3,
                 generic: true,
         },
+       {
+               name:    "Set128Int16x16",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "ShiftAllLeftAndFillUpperFromInt16x16",
                 auxType: auxInt8,
@@ -68240,6 +68300,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Set128Int32x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "ShiftAllLeftAndFillUpperFromInt32x8",
                 auxType: auxInt8,
@@ -68348,6 +68414,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Set128Int64x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "ShiftAllLeftAndFillUpperFromInt64x4",
                 auxType: auxInt8,
@@ -68420,6 +68492,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "Set128Int8x32",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "MaskedShiftAllLeftAndFillUpperFromUint16x16",
                 auxType: auxInt8,
@@ -68432,6 +68510,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  3,
                 generic: true,
         },
+       {
+               name:    "Set128Uint16x16",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "ShiftAllLeftAndFillUpperFromUint16x16",
                 auxType: auxInt8,
@@ -68648,6 +68732,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Set128Uint32x8",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "ShiftAllLeftAndFillUpperFromUint32x8",
                 auxType: auxInt8,
@@ -68756,6 +68846,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "Set128Uint64x4",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "ShiftAllLeftAndFillUpperFromUint64x4",
                 auxType: auxInt8,
@@ -68876,6 +68972,12 @@ var opcodeTable = [...]opInfo{
                 argLen:  3,
                 generic: true,
         },
+       {
+               name:    "Set128Uint8x32",
+               auxType: auxInt8,
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "GaloisFieldAffineTransformUint8x64",
                 auxType: auxInt8,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index 15ca2fcc5b4dfe34aec1f28f74fee765d2266a4c..5c1872dcdfd313216a3dbf1b175c890f76be4768 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -4411,6 +4411,26 @@ func rewriteValueAMD64(v *Value) bool {
                 return rewriteValueAMD64_OpSelect1(v)
         case OpSelectN:
                 return rewriteValueAMD64_OpSelectN(v)
+       case OpSet128Float32x8:
+               return rewriteValueAMD64_OpSet128Float32x8(v)
+       case OpSet128Float64x4:
+               return rewriteValueAMD64_OpSet128Float64x4(v)
+       case OpSet128Int16x16:
+               return rewriteValueAMD64_OpSet128Int16x16(v)
+       case OpSet128Int32x8:
+               return rewriteValueAMD64_OpSet128Int32x8(v)
+       case OpSet128Int64x4:
+               return rewriteValueAMD64_OpSet128Int64x4(v)
+       case OpSet128Int8x32:
+               return rewriteValueAMD64_OpSet128Int8x32(v)
+       case OpSet128Uint16x16:
+               return rewriteValueAMD64_OpSet128Uint16x16(v)
+       case OpSet128Uint32x8:
+               return rewriteValueAMD64_OpSet128Uint32x8(v)
+       case OpSet128Uint64x4:
+               return rewriteValueAMD64_OpSet128Uint64x4(v)
+       case OpSet128Uint8x32:
+               return rewriteValueAMD64_OpSet128Uint8x32(v)
         case OpSetElemInt16x8:
                 return rewriteValueAMD64_OpSetElemInt16x8(v)
         case OpSetElemInt32x4:
@@ -53102,6 +53122,156 @@ func rewriteValueAMD64_OpSelectN(v *Value) bool {
         }
         return false
  }
+func rewriteValueAMD64_OpSet128Float32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Float32x8 [a] x y)
+       // result: (VINSERTF128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTF128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Float64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Float64x4 [a] x y)
+       // result: (VINSERTF128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTF128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Int16x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Int16x16 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Int32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Int32x8 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Int64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Int64x4 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Int8x32(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Int8x32 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Uint16x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Uint16x16 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Uint32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Uint32x8 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Uint64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Uint64x4 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
+func rewriteValueAMD64_OpSet128Uint8x32(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (Set128Uint8x32 [a] x y)
+       // result: (VINSERTI128256 [a] x y)
+       for {
+               a := auxIntToInt8(v.AuxInt)
+               x := v_0
+               y := v_1
+               v.reset(OpAMD64VINSERTI128256)
+               v.AuxInt = int8ToAuxInt(a)
+               v.AddArg2(x, y)
+               return true
+       }
+}
  func rewriteValueAMD64_OpSetElemInt16x8(v *Value) bool {
         v_1 := v.Args[1]
         v_0 := v.Args[0]
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 9837f07fc47dd7213626333ff9e3736125fb76f6..3d0e6fbd4aa75f7ebee1801596bda04eaa5abb22 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1463,6 +1463,16 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Uint32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int16x16.Set128", opLen2Imm8(ssa.OpSet128Int16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int32x8.Set128", opLen2Imm8(ssa.OpSet128Int32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int64x4.Set128", opLen2Imm8(ssa.OpSet128Int64x4, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint8x32.Set128", opLen2Imm8(ssa.OpSet128Uint8x32, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x16.Set128", opLen2Imm8(ssa.OpSet128Uint16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x8.Set128", opLen2Imm8(ssa.OpSet128Uint32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint64x4.Set128", opLen2Imm8(ssa.OpSet128Uint64x4, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64)
         addF(simdPackage, "Int16x8.SetElem", opLen2Imm8(ssa.OpSetElemInt16x8, types.TypeVec128, 0), sys.AMD64)
         addF(simdPackage, "Int32x4.SetElem", opLen2Imm8(ssa.OpSetElemInt32x4, types.TypeVec128, 0), sys.AMD64)
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go

index 59908d60c520ae566d164d14f93efb46299e018e..f99938bb9d29e96a0e008e2a57c0bc603499dac4 100644 (file)
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -193,6 +193,22 @@ func TestSlicesInt8GetElem(t *testing.T) {
         }
  
  }
+
+func TestSlicesInt8Set128(t *testing.T) {
+       a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+               17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+       v := simd.LoadInt8x16Slice(a) // 1-16
+       u := simd.LoadInt8x32Slice(a) // 1-32
+
+       w := u.Set128(1, v) // 1-16:1-16
+
+       b := make([]int8, 32, 32)
+       w.StoreSlice(b)
+
+       checkInt8Slices(t, a, b[:16])
+       checkInt8Slices(t, a, b[16:])
+}
+
  func TestSlicesInt8TooShortLoad(t *testing.T) {
         defer func() {
                 if r := recover(); r != nil {
diff --git a/src/simd/simd_wrapped_test.go b/src/simd/simd_wrapped_test.go

index 321d3bb80a4fe18acfd817b8d00a2d21e23d3cf9..4a8c0957e5b37e9f753dcadf39f3066de996e9b9 100644 (file)
--- a/src/simd/simd_wrapped_test.go
+++ b/src/simd/simd_wrapped_test.go
@@ -7975,6 +7975,7 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6
  // RotateAllLeft
  // RotateAllRight
  // RoundWithPrecision
+// Set128
  // SetElem
  // ShiftAllLeft
  // ShiftAllLeftAndFillUpperFrom
diff --git a/src/simd/stubs_amd64.go b/src/simd/stubs_amd64.go

index f53242cd738daa2f4d569ded089fde113041c1a7..de54a9ada48cb377fe1be855e66b5ecaa05cf31f 100644 (file)
--- a/src/simd/stubs_amd64.go
+++ b/src/simd/stubs_amd64.go
@@ -7682,6 +7682,58 @@ func (x Uint32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int
  // Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
  func (x Uint32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int8x64) Uint32x16
  
+/* Set128 */
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float32x8) Set128(imm uint8, y Float32x4) Float32x8
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float64x4) Set128(imm uint8, y Float64x2) Float64x4
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int8x32) Set128(imm uint8, y Int8x16) Int8x32
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int16x16) Set128(imm uint8, y Int16x8) Int16x16
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int32x8) Set128(imm uint8, y Int32x4) Int32x8
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int64x4) Set128(imm uint8, y Int64x2) Int64x4
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint8x32) Set128(imm uint8, y Uint8x16) Uint8x32
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint16x16) Set128(imm uint8, y Uint16x8) Uint16x16
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint32x8) Set128(imm uint8, y Uint32x4) Uint32x8
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint64x4) Set128(imm uint8, y Uint64x2) Uint64x4
+
  /* SetElem */
  
  // SetElem sets a single constant-indexed element's value.
author	David Chase <drchase@google.com>
	Wed, 25 Jun 2025 20:06:00 +0000 (16:06 -0400)
committer	David Chase <drchase@google.com>
	Mon, 7 Jul 2025 23:52:36 +0000 (16:52 -0700)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/simd/simd_test.go		patch \| blob \| history
src/simd/simd_wrapped_test.go		patch \| blob \| history
src/simd/stubs_amd64.go		patch \| blob \| history