From: Michael Munday Date: Fri, 27 Jun 2025 20:05:38 +0000 (+0100) Subject: cmd/compile: add floating point min/max intrinsics on s390x X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=cedf63616a00c8a6a69f6bbe4bc93b6865cec842;p=gostls13.git cmd/compile: add floating point min/max intrinsics on s390x Add the VECTOR FP (MINIMUM|MAXIMUM) instructions to the assembler and use them in the compiler to implement min and max. Note: I've allowed floating point registers to be used with the single element instructions (those with the W instead of V prefix) to allow easier integration into the compiler. Change-Id: I5f80a510bd248cf483cce95f1979bf63fbae7de6 Reviewed-on: https://go-review.googlesource.com/c/go/+/684715 Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Reviewed-by: Mark Freeman Reviewed-by: Keith Randall --- diff --git a/src/cmd/asm/internal/asm/testdata/s390x.s b/src/cmd/asm/internal/asm/testdata/s390x.s index a19292b263..93c3ec9ea7 100644 --- a/src/cmd/asm/internal/asm/testdata/s390x.s +++ b/src/cmd/asm/internal/asm/testdata/s390x.s @@ -540,6 +540,18 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16- VSTRCZBS V18, V20, V22, V24 // e78240306f8a VSTRCZHS V18, V20, V22, V24 // e78241306f8a VSTRCZFS V18, V20, V22, V24 // e78242306f8a + VFMAXSB $1, V2, V3, V4 // e742301020ef + WFMAXSB $2, V5, V6, V7 // e775602820ef + WFMAXSB $2, F5, F6, F7 // e775602820ef + VFMAXDB $3, V8, V9, V10 // e7a8903030ef + WFMAXDB $4, V11, V12, V13 // e7dbc04830ef + WFMAXDB $4, F11, F12, F13 // e7dbc04830ef + VFMINSB $7, V14, V15, V16 // e70ef07028ee + WFMINSB $8, V17, V18, V19 // e73120882eee + WFMINSB $8, F1, F2, F3 // e731208820ee + VFMINDB $9, V20, V21, V22 // e76450903eee + WFMINDB $10, V23, V24, V25 // e79780a83eee + WFMINDB $10, F7, F8, F9 // e79780a830ee RET RET foo(SB) diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go index 4d24881dba..ad66bfb5d8 100644 --- 
a/src/cmd/compile/internal/s390x/ssa.go +++ b/src/cmd/compile/internal/s390x/ssa.go @@ -281,6 +281,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpS390XCPSDR: p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg()) p.Reg = v.Args[0].Reg() + case ssa.OpS390XWFMAXDB, ssa.OpS390XWFMAXSB, + ssa.OpS390XWFMINDB, ssa.OpS390XWFMINSB: + p := opregregimm(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg(), 1 /* Java Math.Max() / Math.Min() semantics */) + p.AddRestSource(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()}) case ssa.OpS390XDIVD, ssa.OpS390XDIVW, ssa.OpS390XDIVDU, ssa.OpS390XDIVWU, ssa.OpS390XMODD, ssa.OpS390XMODW, diff --git a/src/cmd/compile/internal/ssa/_gen/S390X.rules b/src/cmd/compile/internal/ssa/_gen/S390X.rules index 231ad0615d..80e12f8e29 100644 --- a/src/cmd/compile/internal/ssa/_gen/S390X.rules +++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules @@ -145,6 +145,9 @@ (Sqrt32 ...) => (FSQRTS ...) +(Max(64|32)F ...) => (WFMAX(D|S)B ...) +(Min(64|32)F ...) => (WFMIN(D|S)B ...) + // Atomic loads and stores. // The SYNC instruction (fast-BCR-serialization) prevents store-load // reordering. 
Other sequences of memory operations (load-load, diff --git a/src/cmd/compile/internal/ssa/_gen/S390XOps.go b/src/cmd/compile/internal/ssa/_gen/S390XOps.go index 2f57d12630..38fb3cb074 100644 --- a/src/cmd/compile/internal/ssa/_gen/S390XOps.go +++ b/src/cmd/compile/internal/ssa/_gen/S390XOps.go @@ -222,6 +222,12 @@ func init() { {name: "LNDFR", argLength: 1, reg: fp11, asm: "LNDFR"}, // fp64/fp32 clear sign bit {name: "CPSDR", argLength: 2, reg: fp21, asm: "CPSDR"}, // fp64/fp32 copy arg1 sign bit to arg0 + // Single element vector floating point min / max instructions + {name: "WFMAXDB", argLength: 2, reg: fp21, asm: "WFMAXDB", typ: "Float64"}, // max[float64](arg0, arg1) + {name: "WFMAXSB", argLength: 2, reg: fp21, asm: "WFMAXSB", typ: "Float32"}, // max[float32](arg0, arg1) + {name: "WFMINDB", argLength: 2, reg: fp21, asm: "WFMINDB", typ: "Float64"}, // min[float64](arg0, arg1) + {name: "WFMINSB", argLength: 2, reg: fp21, asm: "WFMINSB", typ: "Float32"}, // min[float32](arg0, arg1) + // Round to integer, float64 only. 
// // aux | rounding mode diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index e88af66f5f..36c1815ea2 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2655,6 +2655,10 @@ const ( OpS390XLPDFR OpS390XLNDFR OpS390XCPSDR + OpS390XWFMAXDB + OpS390XWFMAXSB + OpS390XWFMINDB + OpS390XWFMINSB OpS390XFIDBR OpS390XFMOVSload OpS390XFMOVDload @@ -35775,6 +35779,62 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "WFMAXDB", + argLen: 2, + asm: s390x.AWFMAXDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "WFMAXSB", + argLen: 2, + asm: s390x.AWFMAXSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "WFMINDB", + argLen: 2, + asm: s390x.AWFMINDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "WFMINSB", + argLen: 2, + asm: s390x.AWFMINSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, { name: "FIDBR", 
auxType: auxInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go index 2e7492501a..a7fde81c47 100644 --- a/src/cmd/compile/internal/ssa/rewriteS390X.go +++ b/src/cmd/compile/internal/ssa/rewriteS390X.go @@ -368,6 +368,18 @@ func rewriteValueS390X(v *Value) bool { return rewriteValueS390X_OpLsh8x64(v) case OpLsh8x8: return rewriteValueS390X_OpLsh8x8(v) + case OpMax32F: + v.Op = OpS390XWFMAXSB + return true + case OpMax64F: + v.Op = OpS390XWFMAXDB + return true + case OpMin32F: + v.Op = OpS390XWFMINSB + return true + case OpMin64F: + v.Op = OpS390XWFMINDB + return true case OpMod16: return rewriteValueS390X_OpMod16(v) case OpMod16u: diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 3deb0ecf23..bce94d35f9 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -3986,7 +3986,7 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value { if typ.IsFloat() { hasIntrinsic := false switch Arch.LinkArch.Family { - case sys.AMD64, sys.ARM64, sys.Loong64, sys.RISCV64: + case sys.AMD64, sys.ARM64, sys.Loong64, sys.RISCV64, sys.S390X: hasIntrinsic = true case sys.PPC64: hasIntrinsic = buildcfg.GOPPC64 >= 9 diff --git a/src/cmd/internal/obj/s390x/a.out.go b/src/cmd/internal/obj/s390x/a.out.go index 1a64370efa..dc715182f5 100644 --- a/src/cmd/internal/obj/s390x/a.out.go +++ b/src/cmd/internal/obj/s390x/a.out.go @@ -715,6 +715,14 @@ const ( AWFLNDB AVFLPDB AWFLPDB + AVFMAXDB + AWFMAXDB + AVFMAXSB + AWFMAXSB + AVFMINDB + AWFMINDB + AVFMINSB + AWFMINSB AVFSQ AVFSQDB AWFSQDB diff --git a/src/cmd/internal/obj/s390x/anames.go b/src/cmd/internal/obj/s390x/anames.go index c0a0c401fa..a6f2820f85 100644 --- a/src/cmd/internal/obj/s390x/anames.go +++ b/src/cmd/internal/obj/s390x/anames.go @@ -438,6 +438,14 @@ var Anames = []string{ "WFLNDB", "VFLPDB", "WFLPDB", + "VFMAXDB", + "WFMAXDB", + "VFMAXSB", + "WFMAXSB", + "VFMINDB", + "WFMINDB", + 
"VFMINSB", + "WFMINSB", "VFSQ", "VFSQDB", "WFSQDB", diff --git a/src/cmd/internal/obj/s390x/asmz.go b/src/cmd/internal/obj/s390x/asmz.go index 72d92abbaf..957222a155 100644 --- a/src/cmd/internal/obj/s390x/asmz.go +++ b/src/cmd/internal/obj/s390x/asmz.go @@ -441,6 +441,11 @@ var optab = []Optab{ {i: 119, as: AVERLLVG, a1: C_VREG, a2: C_VREG, a6: C_VREG}, {i: 119, as: AVERLLVG, a1: C_VREG, a6: C_VREG}, + // VRR-c floating point min/max + {i: 128, as: AVFMAXDB, a1: C_SCON, a2: C_VREG, a3: C_VREG, a6: C_VREG}, + {i: 128, as: AWFMAXDB, a1: C_SCON, a2: C_VREG, a3: C_VREG, a6: C_VREG}, + {i: 128, as: AWFMAXDB, a1: C_SCON, a2: C_FREG, a3: C_FREG, a6: C_FREG}, + // VRR-d {i: 120, as: AVACQ, a1: C_VREG, a2: C_VREG, a3: C_VREG, a6: C_VREG}, @@ -1480,6 +1485,14 @@ func buildop(ctxt *obj.Link) { opset(AVFMSDB, r) opset(AWFMSDB, r) opset(AVPERM, r) + case AVFMAXDB: + opset(AVFMAXSB, r) + opset(AVFMINDB, r) + opset(AVFMINSB, r) + case AWFMAXDB: + opset(AWFMAXSB, r) + opset(AWFMINDB, r) + opset(AWFMINSB, r) case AKM: opset(AKMC, r) opset(AKLMD, r) @@ -2636,6 +2649,8 @@ const ( op_VUPLL uint32 = 0xE7D4 // VRR-a VECTOR UNPACK LOGICAL LOW op_VUPL uint32 = 0xE7D6 // VRR-a VECTOR UNPACK LOW op_VMSL uint32 = 0xE7B8 // VRR-d VECTOR MULTIPLY SUM LOGICAL + op_VFMAX uint32 = 0xE7EF // VRR-c VECTOR FP MAXIMUM + op_VFMIN uint32 = 0xE7EE // VRR-c VECTOR FP MINIMUM // added in z15 op_KDSA uint32 = 0xB93A // FORMAT_RRE COMPUTE DIGITAL SIGNATURE AUTHENTICATION (KDSA) @@ -4475,6 +4490,12 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) { c.ctxt.Diag("padding byte register cannot be same as input or output register %v", p) } zRS(op_MVCLE, uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), uint32(d2), asm) + + case 128: // VRR-c floating point max/min + op, m4, _ := vop(p.As) + m5 := singleElementMask(p.As) + m6 := uint32(c.vregoff(&p.From)) + zVRRc(op, uint32(p.To.Reg), uint32(p.Reg), uint32(p.GetFrom3().Reg), m6, m5, m4, asm) } } diff --git a/src/cmd/internal/obj/s390x/vector.go 
b/src/cmd/internal/obj/s390x/vector.go index e7e36eaf15..966cd04c27 100644 --- a/src/cmd/internal/obj/s390x/vector.go +++ b/src/cmd/internal/obj/s390x/vector.go @@ -1027,6 +1027,22 @@ func vop(as obj.As) (opcode, es, cs uint32) { return op_VUPL, 1, 0 case AVUPLF: return op_VUPL, 2, 0 + case AVFMAXDB: + return op_VFMAX, 3, 0 + case AWFMAXDB: + return op_VFMAX, 3, 0 + case AVFMAXSB: + return op_VFMAX, 2, 0 + case AWFMAXSB: + return op_VFMAX, 2, 0 + case AVFMINDB: + return op_VFMIN, 3, 0 + case AWFMINDB: + return op_VFMIN, 3, 0 + case AVFMINSB: + return op_VFMIN, 2, 0 + case AWFMINSB: + return op_VFMIN, 2, 0 } } @@ -1062,7 +1078,11 @@ func singleElementMask(as obj.As) uint32 { AWFSQDB, AWFSDB, AWFTCIDB, - AWFIDB: + AWFIDB, + AWFMAXDB, + AWFMAXSB, + AWFMINDB, + AWFMINSB: return 8 } return 0 diff --git a/test/codegen/floats.go b/test/codegen/floats.go index d04202d394..0cee497279 100644 --- a/test/codegen/floats.go +++ b/test/codegen/floats.go @@ -172,6 +172,7 @@ func Float64Min(a, b float64) float64 { // riscv64:"FMIN" // ppc64/power9:"XSMINJDP" // ppc64/power10:"XSMINJDP" + // s390x: "WFMINDB" return min(a, b) } @@ -182,6 +183,7 @@ func Float64Max(a, b float64) float64 { // riscv64:"FMAX" // ppc64/power9:"XSMAXJDP" // ppc64/power10:"XSMAXJDP" + // s390x: "WFMAXDB" return max(a, b) } @@ -192,6 +194,7 @@ func Float32Min(a, b float32) float32 { // riscv64:"FMINS" // ppc64/power9:"XSMINJDP" // ppc64/power10:"XSMINJDP" + // s390x: "WFMINSB" return min(a, b) } @@ -202,6 +205,7 @@ func Float32Max(a, b float32) float32 { // riscv64:"FMAXS" // ppc64/power9:"XSMAXJDP" // ppc64/power10:"XSMAXJDP" + // s390x: "WFMAXSB" return max(a, b) }