Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: add floating point min/max intrinsics on s390x
author Michael Munday <mndygolang+git@gmail.com>
Fri, 27 Jun 2025 20:05:38 +0000 (21:05 +0100)
committer Michael Munday <mndygolang+git@gmail.com>
Wed, 30 Jul 2025 19:29:15 +0000 (12:29 -0700)
Add the VECTOR FP (MINIMUM|MAXIMUM) instructions to the assembler and
use them in the compiler to implement min and max.

Note: I've allowed floating point registers to be used with the single
element instructions (those with the W instead of V prefix) to allow
easier integration into the compiler.

Change-Id: I5f80a510bd248cf483cce95f1979bf63fbae7de6
Reviewed-on: https://go-review.googlesource.com/c/go/+/684715
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <mark@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
12 files changed:
src/cmd/asm/internal/asm/testdata/s390x.s
src/cmd/compile/internal/s390x/ssa.go
src/cmd/compile/internal/ssa/_gen/S390X.rules
src/cmd/compile/internal/ssa/_gen/S390XOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteS390X.go
src/cmd/compile/internal/ssagen/ssa.go
src/cmd/internal/obj/s390x/a.out.go
src/cmd/internal/obj/s390x/anames.go
src/cmd/internal/obj/s390x/asmz.go
src/cmd/internal/obj/s390x/vector.go
test/codegen/floats.go

index a19292b263640190ccb7972ddd33dcb249194070..93c3ec9ea7f22ed9e97e23310987a872592ad016 100644 (file)
@@ -540,6 +540,18 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
        VSTRCZBS V18, V20, V22, V24     // e78240306f8a
        VSTRCZHS V18, V20, V22, V24     // e78241306f8a
        VSTRCZFS V18, V20, V22, V24     // e78242306f8a
+       VFMAXSB $1, V2, V3, V4          // e742301020ef
+       WFMAXSB $2, V5, V6, V7          // e775602820ef
+       WFMAXSB $2, F5, F6, F7          // e775602820ef
+       VFMAXDB $3, V8, V9, V10         // e7a8903030ef
+       WFMAXDB $4, V11, V12, V13       // e7dbc04830ef
+       WFMAXDB $4, F11, F12, F13       // e7dbc04830ef
+       VFMINSB $7, V14, V15, V16       // e70ef07028ee
+       WFMINSB $8, V17, V18, V19       // e73120882eee
+       WFMINSB $8, F1, F2, F3          // e731208820ee
+       VFMINDB $9, V20, V21, V22       // e76450903eee
+       WFMINDB $10, V23, V24, V25      // e79780a83eee
+       WFMINDB $10, F7, F8, F9         // e79780a830ee
 
        RET
        RET     foo(SB)
index 4d24881dbaf5d59911b795211f51b2708c3f39be..ad66bfb5d8533423c6692488bfb6a0c3fd021bcf 100644 (file)
@@ -281,6 +281,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        case ssa.OpS390XCPSDR:
                p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
                p.Reg = v.Args[0].Reg()
+       case ssa.OpS390XWFMAXDB, ssa.OpS390XWFMAXSB,
+               ssa.OpS390XWFMINDB, ssa.OpS390XWFMINSB:
+               p := opregregimm(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg(), 1 /* Java Math.Max() */)
+               p.AddRestSource(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()})
        case ssa.OpS390XDIVD, ssa.OpS390XDIVW,
                ssa.OpS390XDIVDU, ssa.OpS390XDIVWU,
                ssa.OpS390XMODD, ssa.OpS390XMODW,
index 231ad0615dd96e891b141214b9a6a31d69900cce..80e12f8e29d6d2ee21739a1fb4ff23d882dbefcf 100644 (file)
 
 (Sqrt32    ...) => (FSQRTS ...)
 
+(Max(64|32)F ...) => (WFMAX(D|S)B ...)
+(Min(64|32)F ...) => (WFMIN(D|S)B ...)
+
 // Atomic loads and stores.
 // The SYNC instruction (fast-BCR-serialization) prevents store-load
 // reordering. Other sequences of memory operations (load-load,
index 2f57d12630d5d42154625377a15007f481e492ba..38fb3cb0748932c7c1181e7f90bc5a8ee3ce7adc 100644 (file)
@@ -222,6 +222,12 @@ func init() {
                {name: "LNDFR", argLength: 1, reg: fp11, asm: "LNDFR"},                                                                       // fp64/fp32 clear sign bit
                {name: "CPSDR", argLength: 2, reg: fp21, asm: "CPSDR"},                                                                       // fp64/fp32 copy arg1 sign bit to arg0
 
+               // Single element vector floating point min / max instructions
+               {name: "WFMAXDB", argLength: 2, reg: fp21, asm: "WFMAXDB", typ: "Float64"}, // max[float64](arg0, arg1)
+               {name: "WFMAXSB", argLength: 2, reg: fp21, asm: "WFMAXSB", typ: "Float32"}, // max[float32](arg0, arg1)
+               {name: "WFMINDB", argLength: 2, reg: fp21, asm: "WFMINDB", typ: "Float64"}, // min[float64](arg0, arg1)
+               {name: "WFMINSB", argLength: 2, reg: fp21, asm: "WFMINSB", typ: "Float32"}, // min[float32](arg0, arg1)
+
                // Round to integer, float64 only.
                //
                // aux | rounding mode
index e88af66f5fdbd97ea3117e6b71e6376fdc9f1074..36c1815ea2ea7dcf100472b320020044eef96873 100644 (file)
@@ -2655,6 +2655,10 @@ const (
        OpS390XLPDFR
        OpS390XLNDFR
        OpS390XCPSDR
+       OpS390XWFMAXDB
+       OpS390XWFMAXSB
+       OpS390XWFMINDB
+       OpS390XWFMINSB
        OpS390XFIDBR
        OpS390XFMOVSload
        OpS390XFMOVDload
@@ -35775,6 +35779,62 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "WFMAXDB",
+               argLen: 2,
+               asm:    s390x.AWFMAXDB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                               {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+                       outputs: []outputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+               },
+       },
+       {
+               name:   "WFMAXSB",
+               argLen: 2,
+               asm:    s390x.AWFMAXSB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                               {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+                       outputs: []outputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+               },
+       },
+       {
+               name:   "WFMINDB",
+               argLen: 2,
+               asm:    s390x.AWFMINDB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                               {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+                       outputs: []outputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+               },
+       },
+       {
+               name:   "WFMINSB",
+               argLen: 2,
+               asm:    s390x.AWFMINSB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                               {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+                       outputs: []outputInfo{
+                               {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+                       },
+               },
+       },
        {
                name:    "FIDBR",
                auxType: auxInt8,
index 2e7492501a803adef4ca0ffba5be535a5869daf1..a7fde81c4789b2f2749adeb6db3a79ea9dcddae8 100644 (file)
@@ -368,6 +368,18 @@ func rewriteValueS390X(v *Value) bool {
                return rewriteValueS390X_OpLsh8x64(v)
        case OpLsh8x8:
                return rewriteValueS390X_OpLsh8x8(v)
+       case OpMax32F:
+               v.Op = OpS390XWFMAXSB
+               return true
+       case OpMax64F:
+               v.Op = OpS390XWFMAXDB
+               return true
+       case OpMin32F:
+               v.Op = OpS390XWFMINSB
+               return true
+       case OpMin64F:
+               v.Op = OpS390XWFMINDB
+               return true
        case OpMod16:
                return rewriteValueS390X_OpMod16(v)
        case OpMod16u:
index 3deb0ecf2336a1ff8444a141c842115c7262e50c..bce94d35f94841f4a09b32c148e3331030651e5b 100644 (file)
@@ -3986,7 +3986,7 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value {
                if typ.IsFloat() {
                        hasIntrinsic := false
                        switch Arch.LinkArch.Family {
-                       case sys.AMD64, sys.ARM64, sys.Loong64, sys.RISCV64:
+                       case sys.AMD64, sys.ARM64, sys.Loong64, sys.RISCV64, sys.S390X:
                                hasIntrinsic = true
                        case sys.PPC64:
                                hasIntrinsic = buildcfg.GOPPC64 >= 9
index 1a64370efa87673d6d508e4d520d00a89e11e78e..dc715182f5cf81b86e164c6172c7db3970c634f0 100644 (file)
@@ -715,6 +715,14 @@ const (
        AWFLNDB
        AVFLPDB
        AWFLPDB
+       AVFMAXDB
+       AWFMAXDB
+       AVFMAXSB
+       AWFMAXSB
+       AVFMINDB
+       AWFMINDB
+       AVFMINSB
+       AWFMINSB
        AVFSQ
        AVFSQDB
        AWFSQDB
index c0a0c401fa09a061008def835f0d848cd14d472d..a6f2820f85d5244b4bbc971e8fa27c7e7953a92f 100644 (file)
@@ -438,6 +438,14 @@ var Anames = []string{
        "WFLNDB",
        "VFLPDB",
        "WFLPDB",
+       "VFMAXDB",
+       "WFMAXDB",
+       "VFMAXSB",
+       "WFMAXSB",
+       "VFMINDB",
+       "WFMINDB",
+       "VFMINSB",
+       "WFMINSB",
        "VFSQ",
        "VFSQDB",
        "WFSQDB",
index 72d92abbaf2a26807bfe2e34ce8327014984139a..957222a1559388540db306c37a4ca04e51bfe90f 100644 (file)
@@ -441,6 +441,11 @@ var optab = []Optab{
        {i: 119, as: AVERLLVG, a1: C_VREG, a2: C_VREG, a6: C_VREG},
        {i: 119, as: AVERLLVG, a1: C_VREG, a6: C_VREG},
 
+       // VRR-c floating point min/max
+       {i: 128, as: AVFMAXDB, a1: C_SCON, a2: C_VREG, a3: C_VREG, a6: C_VREG},
+       {i: 128, as: AWFMAXDB, a1: C_SCON, a2: C_VREG, a3: C_VREG, a6: C_VREG},
+       {i: 128, as: AWFMAXDB, a1: C_SCON, a2: C_FREG, a3: C_FREG, a6: C_FREG},
+
        // VRR-d
        {i: 120, as: AVACQ, a1: C_VREG, a2: C_VREG, a3: C_VREG, a6: C_VREG},
 
@@ -1480,6 +1485,14 @@ func buildop(ctxt *obj.Link) {
                        opset(AVFMSDB, r)
                        opset(AWFMSDB, r)
                        opset(AVPERM, r)
+               case AVFMAXDB:
+                       opset(AVFMAXSB, r)
+                       opset(AVFMINDB, r)
+                       opset(AVFMINSB, r)
+               case AWFMAXDB:
+                       opset(AWFMAXSB, r)
+                       opset(AWFMINDB, r)
+                       opset(AWFMINSB, r)
                case AKM:
                        opset(AKMC, r)
                        opset(AKLMD, r)
@@ -2636,6 +2649,8 @@ const (
        op_VUPLL  uint32 = 0xE7D4 //    VRR-a   VECTOR UNPACK LOGICAL LOW
        op_VUPL   uint32 = 0xE7D6 //    VRR-a   VECTOR UNPACK LOW
        op_VMSL   uint32 = 0xE7B8 //    VRR-d   VECTOR MULTIPLY SUM LOGICAL
+       op_VFMAX  uint32 = 0xE7EF //    VRR-c   VECTOR FP MAXIMUM
+       op_VFMIN  uint32 = 0xE7EE //    VRR-c   VECTOR FP MINIMUM
 
        // added in z15
        op_KDSA uint32 = 0xB93A // FORMAT_RRE        COMPUTE DIGITAL SIGNATURE AUTHENTICATION (KDSA)
@@ -4475,6 +4490,12 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
                        c.ctxt.Diag("padding byte register cannot be same as input or output register %v", p)
                }
                zRS(op_MVCLE, uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), uint32(d2), asm)
+
+       case 128: // VRR-c floating point max/min
+               op, m4, _ := vop(p.As)
+               m5 := singleElementMask(p.As)
+               m6 := uint32(c.vregoff(&p.From))
+               zVRRc(op, uint32(p.To.Reg), uint32(p.Reg), uint32(p.GetFrom3().Reg), m6, m5, m4, asm)
        }
 }
 
index e7e36eaf15abd61ad64161d9fa86f48d58c616d2..966cd04c277c387f99462a08f16f50f4fde75a9c 100644 (file)
@@ -1027,6 +1027,22 @@ func vop(as obj.As) (opcode, es, cs uint32) {
                return op_VUPL, 1, 0
        case AVUPLF:
                return op_VUPL, 2, 0
+       case AVFMAXDB:
+               return op_VFMAX, 3, 0
+       case AWFMAXDB:
+               return op_VFMAX, 3, 0
+       case AVFMAXSB:
+               return op_VFMAX, 2, 0
+       case AWFMAXSB:
+               return op_VFMAX, 2, 0
+       case AVFMINDB:
+               return op_VFMIN, 3, 0
+       case AWFMINDB:
+               return op_VFMIN, 3, 0
+       case AVFMINSB:
+               return op_VFMIN, 2, 0
+       case AWFMINSB:
+               return op_VFMIN, 2, 0
        }
 }
 
@@ -1062,7 +1078,11 @@ func singleElementMask(as obj.As) uint32 {
                AWFSQDB,
                AWFSDB,
                AWFTCIDB,
-               AWFIDB:
+               AWFIDB,
+               AWFMAXDB,
+               AWFMAXSB,
+               AWFMINDB,
+               AWFMINSB:
                return 8
        }
        return 0
index d04202d39423d14115d1fb071b1081d6bd36bc2e..0cee49727958e5a21847b7115f44a594c8d79d2c 100644 (file)
@@ -172,6 +172,7 @@ func Float64Min(a, b float64) float64 {
        // riscv64:"FMIN"
        // ppc64/power9:"XSMINJDP"
        // ppc64/power10:"XSMINJDP"
+       // s390x: "WFMINDB"
        return min(a, b)
 }
 
@@ -182,6 +183,7 @@ func Float64Max(a, b float64) float64 {
        // riscv64:"FMAX"
        // ppc64/power9:"XSMAXJDP"
        // ppc64/power10:"XSMAXJDP"
+       // s390x: "WFMAXDB"
        return max(a, b)
 }
 
@@ -192,6 +194,7 @@ func Float32Min(a, b float32) float32 {
        // riscv64:"FMINS"
        // ppc64/power9:"XSMINJDP"
        // ppc64/power10:"XSMINJDP"
+       // s390x: "WFMINSB"
        return min(a, b)
 }
 
@@ -202,6 +205,7 @@ func Float32Max(a, b float32) float32 {
        // riscv64:"FMAXS"
        // ppc64/power9:"XSMAXJDP"
        // ppc64/power10:"XSMAXJDP"
+       // s390x: "WFMAXSB"
        return max(a, b)
 }