From: David Chase
Date: Thu, 21 Aug 2025 21:07:13 +0000 (-0400)
Subject: [dev.simd] cmd/compile: add instructions and rewrites for scalar->vector moves
X-Git-Tag: go1.26rc1~147^2~91
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6890aa2e20;p=gostls13.git

[dev.simd] cmd/compile: add instructions and rewrites for scalar->vector moves

This required changes to the assembler so that VMOVSS and VMOVSD
could handle FP constants.

Change-Id: Iaa2f8df71867a3283bc058b7ec691b56a3e73621
Reviewed-on: https://go-review.googlesource.com/c/go/+/698240
Reviewed-by: Junyang Shao
LUCI-TryBot-Result: Go LUCI
---

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 58a0f9cc81..817f6dbc1d 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1723,6 +1723,24 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = simdReg(v)
 
+	case ssa.OpAMD64VMOVQload, ssa.OpAMD64VMOVDload,
+		ssa.OpAMD64VMOVSSload, ssa.OpAMD64VMOVSDload:
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = v.Args[0].Reg()
+		ssagen.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = simdReg(v)
+
+	case ssa.OpAMD64VMOVSSconst, ssa.OpAMD64VMOVSDconst:
+		// for loading constants directly into SIMD registers
+		x := simdReg(v)
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_FCONST
+		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x
+
 	case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ:
 		// These are for initializing the least 32/64 bits of a SIMD register from an "int".
 		p := s.Prog(v.Op.Asm())
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 0c7c7ced43..2300cc3757 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1782,3 +1782,12 @@
 (VPBROADCASTW(128|256|512) x:(VPINSRW128 [0] (Zero128 ) y)) && x.Uses == 1 =>
 	(VPBROADCASTW(128|256|512) (VMOVQ y))
 
+(VMOVQ x:(MOVQload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (VMOVQload [off] {sym} ptr mem)
+(VMOVD x:(MOVLload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (VMOVDload [off] {sym} ptr mem)
+
+(VMOVSDf2v x:(MOVSDload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (VMOVSDload [off] {sym} ptr mem)
+(VMOVSSf2v x:(MOVSSload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (VMOVSSload [off] {sym} ptr mem)
+
+(VMOVSDf2v x:(MOVSDconst [c] )) => (VMOVSDconst [c] )
+(VMOVSSf2v x:(MOVSSconst [c] )) => (VMOVSSconst [c] )
+
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index 03f38db640..96001e203f 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -1389,6 +1389,14 @@ func init() {
 		{name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"},
 		{name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"},
 
+		{name: "VMOVQload", argLength: 2, reg: fpload, asm: "VMOVQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"},
+		{name: "VMOVDload", argLength: 2, reg: fpload, asm: "VMOVD", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"},
+		{name: "VMOVSSload", argLength: 2, reg: fpload, asm: "VMOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+		{name: "VMOVSDload", argLength: 2, reg: fpload, asm: "VMOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
"VMOVSSconst", reg: fp01, asm: "VMOVSS", aux: "Float32", rematerializeable: true}, + {name: "VMOVSDconst", reg: fp01, asm: "VMOVSD", aux: "Float64", rematerializeable: true}, + {name: "VZEROUPPER", argLength: 0, asm: "VZEROUPPER"}, {name: "VZEROALL", argLength: 0, asm: "VZEROALL"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7f6e9a0282..f0c18d0816 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1218,6 +1218,12 @@ const ( OpAMD64VMOVSSf2v OpAMD64VMOVQ OpAMD64VMOVD + OpAMD64VMOVQload + OpAMD64VMOVDload + OpAMD64VMOVSSload + OpAMD64VMOVSDload + OpAMD64VMOVSSconst + OpAMD64VMOVSDconst OpAMD64VZEROUPPER OpAMD64VZEROALL OpAMD64KMOVQload @@ -18925,6 +18931,94 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VMOVQload", + auxType: auxSymOff, + argLen: 2, + faultOnNilArg0: true, + symEffect: SymRead, + asm: x86.AVMOVQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVDload", + auxType: auxSymOff, + argLen: 2, + faultOnNilArg0: true, + symEffect: SymRead, + asm: x86.AVMOVD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVSSload", + auxType: auxSymOff, + argLen: 2, + faultOnNilArg0: true, + symEffect: SymRead, + asm: x86.AVMOVSS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVSDload", + auxType: auxSymOff, + argLen: 2, + faultOnNilArg0: true, + symEffect: SymRead, + asm: x86.AVMOVSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVSSconst", + auxType: auxFloat32, + argLen: 0, + rematerializeable: true, + asm: x86.AVMOVSS, + reg: regInfo{ + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VMOVSDconst", + auxType: auxFloat64, + argLen: 0, + rematerializeable: true, + asm: x86.AVMOVSD, + reg: regInfo{ + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VZEROUPPER", argLen: 0, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 469417536f..8fec5d5b9a 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -507,6 +507,8 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64TESTW(v) case OpAMD64TESTWconst: return rewriteValueAMD64_OpAMD64TESTWconst(v) + case OpAMD64VMOVD: + return rewriteValueAMD64_OpAMD64VMOVD(v) case OpAMD64VMOVDQU16Masked512: return rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v) case OpAMD64VMOVDQU32Masked512: @@ -515,6 +517,12 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v) case OpAMD64VMOVDQU8Masked512: return 
 		return rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v)
+	case OpAMD64VMOVQ:
+		return rewriteValueAMD64_OpAMD64VMOVQ(v)
+	case OpAMD64VMOVSDf2v:
+		return rewriteValueAMD64_OpAMD64VMOVSDf2v(v)
+	case OpAMD64VMOVSSf2v:
+		return rewriteValueAMD64_OpAMD64VMOVSSf2v(v)
 	case OpAMD64VPANDQ512:
 		return rewriteValueAMD64_OpAMD64VPANDQ512(v)
 	case OpAMD64VPBROADCASTB128:
@@ -26442,6 +26450,34 @@ func rewriteValueAMD64_OpAMD64TESTWconst(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64VMOVD(v *Value) bool {
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (VMOVD x:(MOVLload [off] {sym} ptr mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: @x.Block (VMOVDload [off] {sym} ptr mem)
+	for {
+		x := v_0
+		if x.Op != OpAMD64MOVLload {
+			break
+		}
+		off := auxIntToInt32(x.AuxInt)
+		sym := auxToSym(x.Aux)
+		mem := x.Args[1]
+		ptr := x.Args[0]
+		if !(x.Uses == 1 && clobber(x)) {
+			break
+		}
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64VMOVDload, v.Type)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(off)
+		v0.Aux = symToAux(sym)
+		v0.AddArg2(ptr, mem)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
@@ -28799,6 +28835,114 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64VMOVQ(v *Value) bool {
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (VMOVQ x:(MOVQload [off] {sym} ptr mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: @x.Block (VMOVQload [off] {sym} ptr mem)
+	for {
+		x := v_0
+		if x.Op != OpAMD64MOVQload {
+			break
+		}
+		off := auxIntToInt32(x.AuxInt)
+		sym := auxToSym(x.Aux)
+		mem := x.Args[1]
+		ptr := x.Args[0]
+		if !(x.Uses == 1 && clobber(x)) {
+			break
+		}
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64VMOVQload, v.Type)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(off)
+		v0.Aux = symToAux(sym)
+		v0.AddArg2(ptr, mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64VMOVSDf2v(v *Value) bool {
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (VMOVSDf2v x:(MOVSDload [off] {sym} ptr mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: @x.Block (VMOVSDload [off] {sym} ptr mem)
+	for {
+		x := v_0
+		if x.Op != OpAMD64MOVSDload {
+			break
+		}
+		off := auxIntToInt32(x.AuxInt)
+		sym := auxToSym(x.Aux)
+		mem := x.Args[1]
+		ptr := x.Args[0]
+		if !(x.Uses == 1 && clobber(x)) {
+			break
+		}
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64VMOVSDload, v.Type)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(off)
+		v0.Aux = symToAux(sym)
+		v0.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (VMOVSDf2v x:(MOVSDconst [c] ))
+	// result: (VMOVSDconst [c] )
+	for {
+		x := v_0
+		if x.Op != OpAMD64MOVSDconst {
+			break
+		}
+		c := auxIntToFloat64(x.AuxInt)
+		v.reset(OpAMD64VMOVSDconst)
+		v.AuxInt = float64ToAuxInt(c)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64VMOVSSf2v(v *Value) bool {
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (VMOVSSf2v x:(MOVSSload [off] {sym} ptr mem))
+	// cond: x.Uses == 1 && clobber(x)
+	// result: @x.Block (VMOVSSload [off] {sym} ptr mem)
+	for {
+		x := v_0
+		if x.Op != OpAMD64MOVSSload {
+			break
+		}
+		off := auxIntToInt32(x.AuxInt)
+		sym := auxToSym(x.Aux)
+		mem := x.Args[1]
+		ptr := x.Args[0]
+		if !(x.Uses == 1 && clobber(x)) {
+			break
+		}
+		b = x.Block
+		v0 := b.NewValue0(x.Pos, OpAMD64VMOVSSload, v.Type)
+		v.copyOf(v0)
+		v0.AuxInt = int32ToAuxInt(off)
+		v0.Aux = symToAux(sym)
+		v0.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (VMOVSSf2v x:(MOVSSconst [c] ))
+	// result: (VMOVSSconst [c] )
+	for {
+		x := v_0
+		if x.Op != OpAMD64MOVSSconst {
+			break
+		}
+		c := auxIntToFloat32(x.AuxInt)
+		v.reset(OpAMD64VMOVSSconst)
+		v.AuxInt = float32ToAuxInt(c)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64VPANDQ512(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
diff --git a/src/cmd/internal/obj/x86/obj6.go b/src/cmd/internal/obj/x86/obj6.go
index 48287546b3..9c8e5e96f8 100644
--- a/src/cmd/internal/obj/x86/obj6.go
+++ b/src/cmd/internal/obj/x86/obj6.go
@@ -236,7 +236,7 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
 	// Rewrite float constants to values stored in memory.
 	switch p.As {
 	// Convert AMOVSS $(0), Xx to AXORPS Xx, Xx
-	case AMOVSS:
+	case AMOVSS, AVMOVSS:
 		if p.From.Type == obj.TYPE_FCONST {
 			// f == 0 can't be used here due to -0, so use Float64bits
 			if f := p.From.Val.(float64); math.Float64bits(f) == 0 {
@@ -272,7 +272,7 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
 			p.From.Offset = 0
 		}
 
-	case AMOVSD:
+	case AMOVSD, AVMOVSD:
 		// Convert AMOVSD $(0), Xx to AXORPS Xx, Xx
 		if p.From.Type == obj.TYPE_FCONST {
 			// f == 0 can't be used here due to -0, so use Float64bits
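
Background note (not part of the commit): the constant path above relies on two conventions. First, SSA float aux values, whether the op's aux type is Float32 or Float64, are carried in a Value's AuxInt as the IEEE-754 bit pattern of a float64, which is why the ssaGenValue case decodes both VMOVSSconst and VMOVSDconst with math.Float64frombits(uint64(v.AuxInt)). Second, progedit then either turns a zero constant into an XORPS or rewrites the FCONST operand into a load from a constant stored in memory, which is what the new AVMOVSS/AVMOVSD cases in obj6.go enable. The standalone sketch below demonstrates only the first convention; encodeFloatAux and decodeFloatAux are illustrative helpers, not compiler APIs.

package main

import (
	"fmt"
	"math"
)

// encodeFloatAux mimics how a float constant is stored in a Value's
// AuxInt: as the bit pattern of a float64. (Illustrative only.)
func encodeFloatAux(f float64) int64 {
	return int64(math.Float64bits(f))
}

// decodeFloatAux is the inverse, matching the expression
// math.Float64frombits(uint64(v.AuxInt)) in the ssaGenValue hunk.
func decodeFloatAux(auxInt int64) float64 {
	return math.Float64frombits(uint64(auxInt))
}

func main() {
	for _, c := range []float64{1.5, -0.0, math.Pi} {
		aux := encodeFloatAux(c)
		got := decodeFloatAux(aux)
		// Compare bit patterns rather than values so that -0.0 is seen
		// to round-trip; this is the same reason obj6.go tests
		// math.Float64bits(f) == 0 instead of f == 0.
		fmt.Printf("%g -> %#016x -> %g (bits equal: %t)\n",
			c, uint64(aux), got,
			math.Float64bits(c) == math.Float64bits(got))
	}
}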