Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile, simd: added methods for "float" GetElem
author David Chase <drchase@google.com>
Thu, 14 Aug 2025 21:26:15 +0000 (17:26 -0400)
committer David Chase <drchase@google.com>
Mon, 18 Aug 2025 20:12:27 +0000 (13:12 -0700)
This also required an "always use operation with least
OverwriteBase" filter in choosing the machine instructions.

The order of generated HW operations is slightly
modified because the Float version of GetElem
appears earlier in the sorted operations list,
though it is not chosen to generate the HW Op.

Change-Id: I95fa67afca9c8b6f4f18941fdcaf69afdad8055b
Reviewed-on: https://go-review.googlesource.com/c/go/+/696375
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/_gen/simdgen/gen_simdMachineOps.go
src/simd/_gen/simdgen/godefs.go
src/simd/_gen/simdgen/ops/Moves/go.yaml
src/simd/ops_amd64.go

index 3ec8b484fb8a6dd01ea50fb3693676d9ef3dbdcf..466e6c9cc74ff0b35088f4076326496c726e4b7d 100644 (file)
@@ -1128,10 +1128,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPINSRW128:
                p = simdVgpvImm8(s, v)
 
-       case ssa.OpAMD64VPEXTRB128,
-               ssa.OpAMD64VPEXTRW128,
-               ssa.OpAMD64VPEXTRD128,
-               ssa.OpAMD64VPEXTRQ128:
+       case ssa.OpAMD64VPEXTRD128,
+               ssa.OpAMD64VPEXTRQ128,
+               ssa.OpAMD64VPEXTRB128,
+               ssa.OpAMD64VPEXTRW128:
                p = simdVgpImm8(s, v)
 
        case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,
index 9670f035ba880ba73eb946420cacbc6cda3c8fc7..d64f36cf74e9c2c112da04dfddce65fd0083759e 100644 (file)
 (GaloisFieldMulMaskedUint8x16 x y mask) => (VGF2P8MULBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
 (GaloisFieldMulMaskedUint8x32 x y mask) => (VGF2P8MULBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
 (GaloisFieldMulMaskedUint8x64 x y mask) => (VGF2P8MULBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(GetElemFloat32x4 ...) => (VPEXTRD128 ...)
+(GetElemFloat64x2 ...) => (VPEXTRQ128 ...)
 (GetElemInt8x16 ...) => (VPEXTRB128 ...)
 (GetElemInt16x8 ...) => (VPEXTRW128 ...)
 (GetElemInt32x4 ...) => (VPEXTRD128 ...)
index 61abaa5e9781dc68fd2d788e4dea37647bf16864..ba73453ffe1298b847bf0fb7765ee0fb3fb90ecb 100644 (file)
@@ -978,10 +978,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VGF2P8AFFINEQBMasked128", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
-               {name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
-               {name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false},
                {name: "VPEXTRD128", argLength: 1, reg: vgp, asm: "VPEXTRD", aux: "UInt8", commutative: false, typ: "int32", resultInArg0: false},
                {name: "VPEXTRQ128", argLength: 1, reg: vgp, asm: "VPEXTRQ", aux: "UInt8", commutative: false, typ: "int64", resultInArg0: false},
+               {name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
+               {name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false},
                {name: "VEXTRACTF128128", argLength: 1, reg: v11, asm: "VEXTRACTF128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VEXTRACTF64X4256", argLength: 1, reg: w11, asm: "VEXTRACTF64X4", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
index 4f2b1a912158911c3a5b653f5700f734e0388b5b..d98c0d8152acef16dba65527de3344f6e159e35b 100644 (file)
@@ -1720,6 +1720,8 @@ func simdGenericOps() []opData {
                {name: "GaloisFieldAffineTransformUint8x16", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "GetElemFloat32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "GetElemFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
                {name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "UInt8"},
                {name: "GetElemInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
                {name: "GetElemInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
index 60ef38535247335daa78c6f84f4b3ef5d024eb38..b45cccd96bbb22238a6f75cf1da6ee9e23eadf47 100644 (file)
@@ -2201,10 +2201,10 @@ const (
        OpAMD64VGF2P8AFFINEQBMasked128
        OpAMD64VGF2P8AFFINEQBMasked256
        OpAMD64VGF2P8AFFINEQBMasked512
-       OpAMD64VPEXTRB128
-       OpAMD64VPEXTRW128
        OpAMD64VPEXTRD128
        OpAMD64VPEXTRQ128
+       OpAMD64VPEXTRB128
+       OpAMD64VPEXTRW128
        OpAMD64VEXTRACTF128128
        OpAMD64VEXTRACTF64X4256
        OpAMD64VEXTRACTI128128
@@ -6352,6 +6352,8 @@ const (
        OpGaloisFieldAffineTransformUint8x16
        OpGaloisFieldAffineTransformUint8x32
        OpGaloisFieldAffineTransformUint8x64
+       OpGetElemFloat32x4
+       OpGetElemFloat64x2
        OpGetElemInt8x16
        OpGetElemInt16x8
        OpGetElemInt32x4
@@ -34154,13 +34156,13 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPEXTRB128",
+               name:    "VPEXTRD128",
                auxType: auxUInt8,
                argLen:  1,
-               asm:     x86.AVPEXTRB,
+               asm:     x86.AVPEXTRD,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
                        },
                        outputs: []outputInfo{
                                {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -34168,13 +34170,13 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPEXTRW128",
+               name:    "VPEXTRQ128",
                auxType: auxUInt8,
                argLen:  1,
-               asm:     x86.AVPEXTRW,
+               asm:     x86.AVPEXTRQ,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
                        },
                        outputs: []outputInfo{
                                {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -34182,13 +34184,13 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPEXTRD128",
+               name:    "VPEXTRB128",
                auxType: auxUInt8,
                argLen:  1,
-               asm:     x86.AVPEXTRD,
+               asm:     x86.AVPEXTRB,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
                        },
                        outputs: []outputInfo{
                                {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -34196,13 +34198,13 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPEXTRQ128",
+               name:    "VPEXTRW128",
                auxType: auxUInt8,
                argLen:  1,
-               asm:     x86.AVPEXTRQ,
+               asm:     x86.AVPEXTRW,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
                        },
                        outputs: []outputInfo{
                                {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -72920,6 +72922,18 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "GetElemFloat32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "GetElemFloat64x2",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
        {
                name:    "GetElemInt8x16",
                auxType: auxUInt8,
index 6e5e212fbeb00b4bee08c713721eb394966c82b7..69393014c78a85f21e282c812beccd5c36906996 100644 (file)
@@ -2186,6 +2186,12 @@ func rewriteValueAMD64(v *Value) bool {
        case OpGetClosurePtr:
                v.Op = OpAMD64LoweredGetClosurePtr
                return true
+       case OpGetElemFloat32x4:
+               v.Op = OpAMD64VPEXTRD128
+               return true
+       case OpGetElemFloat64x2:
+               v.Op = OpAMD64VPEXTRQ128
+               return true
        case OpGetElemInt16x8:
                v.Op = OpAMD64VPEXTRW128
                return true
index 682a37e91ba274173f22db653f7ae4610629145c..be3d917f8ff704e613384346f8bb3737f19d9d60 100644 (file)
@@ -536,6 +536,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint8x16.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x16, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint8x32.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x32, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint8x64.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.GetElem", opLen1Imm8(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0), sys.AMD64)
+       addF(simdPackage, "Float64x2.GetElem", opLen1Imm8(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0), sys.AMD64)
        addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64)
        addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64)
        addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64)
index 64918e5543a3f31917aa33289a453f371ff01c2f..f4d91a0c8ec2e0096c26723f105e28773f47737a 100644 (file)
@@ -46,22 +46,47 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
                OpsData    []opData
                OpsDataImm []opData
        }
-       seen := map[string]struct{}{}
+
        regInfoSet := map[string]bool{
                "v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
                "w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true}
        opsData := make([]opData, 0)
        opsDataImm := make([]opData, 0)
+
+       // Determine the "best" version of an instruction to use
+       best := make(map[string]Operation)
+       var mOpOrder []string
+       countOverrides := func(s []Operand) int {
+               a := 0
+               for _, o := range s {
+                       if o.OverwriteBase != nil {
+                               a++
+                       }
+               }
+               return a
+       }
        for _, op := range ops {
-               shapeIn, shapeOut, maskType, _, gOp := op.shape()
+               _, _, maskType, _, gOp := op.shape()
                asm := machineOpName(maskType, gOp)
+               other, ok := best[asm]
+               if !ok {
+                       best[asm] = op
+                       mOpOrder = append(mOpOrder, asm)
+                       continue
+               }
+               // see if "op" is better than "other"
+               if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) {
+                       best[asm] = op
+               }
+       }
+
+       for _, asm := range mOpOrder {
+               op := best[asm]
+               shapeIn, shapeOut, _, _, gOp := op.shape()
 
                // TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy
                // one here with a name suffix "Merging". The rewrite rules will need them.
-               if _, ok := seen[asm]; ok {
-                       continue
-               }
-               seen[asm] = struct{}{}
+
                regInfo, err := op.regShape()
                if err != nil {
                        panic(err)
index 0022140aaab177ff9021ce5b58a7f98053874745..22decb9d7e69c9dbaaebdbd5d47b838ae273aafc 100644 (file)
@@ -67,7 +67,7 @@ type rawOperation struct {
        NoTypes *string
        // If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped.
        NoGenericOps *string
-       // If non-nil, this string will be attached to the machine ssa op name.
+       // If non-nil, this string will be attached to the machine ssa op name.  E.g. "const"
        SSAVariant *string
 }
 
index 71981c12af7d125362304408f5e8f703d04428b1..0e5997deebbc354bd1034495384654051edf59f4 100644 (file)
     base: $b
     bits: $e
 
+- go: GetElem
+  asm: "VPEXTR[DQ]"
+  in:
+  - class: vreg
+    base: int
+    elemBits: $e
+    OverwriteBase: float
+  - *imm
+  out:
+  - class: greg
+    base: int
+    bits: $e
+    OverwriteBase: float
+
 - go: "SetHi|SetLo"
   asm: "VINSERTI128|VINSERTI64X4"
   inVariant: []
index d78bb699eaac235bd7e64309d0acac08bd524f52..8da3cd18175abad84109eef83480c2a31b5f2056 100644 (file)
@@ -3470,6 +3470,20 @@ func (x Uint8x64) GaloisFieldMulMasked(y Uint8x64, mask Mask8x64) Uint8x64
 
 /* GetElem */
 
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRD, CPU Feature: AVX
+func (x Float32x4) GetElem(index uint8) float32
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRQ, CPU Feature: AVX
+func (x Float64x2) GetElem(index uint8) float64
+
 // GetElem retrieves a single constant-indexed element's value.
 //
 // index results in better performance when it's a constant, a non-constant value will be translated into a jump table.