This CL use MFC1/MTC1 instructions to move data between GPR and FPR instead of stores and loads to move float/int values.
goos: linux
goarch: mipsle
pkg: math
│ oldmathf │ newmathf │
│ sec/op │ sec/op vs base │
Acos-4 282.7n ± 0% 282.1n ± 0% -0.18% (p=0.010 n=8)
Acosh-4 450.8n ± 0% 450.9n ± 0% ~ (p=0.699 n=8)
Asin-4 272.6n ± 0% 272.1n ± 0% ~ (p=0.050 n=8)
Asinh-4 476.8n ± 0% 475.1n ± 0% -0.35% (p=0.018 n=8)
Atan-4 208.1n ± 0% 207.7n ± 0% -0.17% (p=0.009 n=8)
Atanh-4 448.8n ± 0% 448.7n ± 0% -0.03% (p=0.014 n=8)
Atan2-4 310.2n ± 0% 310.1n ± 0% ~ (p=0.133 n=8)
Cbrt-4 357.9n ± 0% 358.4n ± 0% +0.11% (p=0.014 n=8)
Ceil-4 203.8n ± 0% 204.7n ± 0% +0.42% (p=0.008 n=8)
Compare-4 21.12n ± 0% 22.09n ± 0% +4.59% (p=0.000 n=8)
Compare32-4 19.105n ± 0% 6.022n ± 0% -68.48% (p=0.000 n=8)
Copysign-4 33.17n ± 0% 33.15n ± 0% ~ (p=0.795 n=8)
Cos-4 385.2n ± 0% 384.8n ± 1% ~ (p=0.112 n=8)
Cosh-4 546.0n ± 0% 545.0n ± 0% -0.17% (p=0.012 n=8)
Erf-4 192.4n ± 0% 195.4n ± 1% +1.59% (p=0.000 n=8)
Erfc-4 187.8n ± 0% 192.7n ± 0% +2.64% (p=0.000 n=8)
Erfinv-4 221.8n ± 1% 219.8n ± 0% -0.88% (p=0.000 n=8)
Erfcinv-4 224.1n ± 1% 219.9n ± 0% -1.87% (p=0.000 n=8)
Exp-4 434.7n ± 0% 435.0n ± 0% ~ (p=0.339 n=8)
ExpGo-4 433.7n ± 0% 434.2n ± 0% +0.13% (p=0.005 n=8)
Expm1-4 243.0n ± 0% 242.9n ± 0% ~ (p=0.103 n=8)
Exp2-4 426.6n ± 0% 426.6n ± 0% ~ (p=0.822 n=8)
Exp2Go-4 425.6n ± 0% 425.5n ± 0% ~ (p=0.377 n=8)
Abs-4 8.033n ± 0% 8.029n ± 0% ~ (p=0.065 n=8)
Dim-4 18.07n ± 0% 18.07n ± 0% ~ (p=0.051 n=8)
Floor-4 151.6n ± 0% 151.6n ± 0% ~ (p=0.450 n=8)
Max-4 100.9n ± 8% 103.2n ± 2% ~ (p=0.099 n=8)
Min-4 116.4n ± 0% 116.4n ± 0% ~ (p=0.467 n=8)
Mod-4 959.6n ± 1% 950.9n ± 0% -0.91% (p=0.006 n=8)
Frexp-4 147.6n ± 0% 147.5n ± 0% -0.07% (p=0.026 n=8)
Gamma-4 482.7n ± 0% 478.2n ± 2% -0.92% (p=0.000 n=8)
Hypot-4 139.8n ± 1% 127.1n ± 8% -9.12% (p=0.000 n=8)
HypotGo-4 137.2n ± 7% 117.5n ± 2% -14.39% (p=0.001 n=8)
Ilogb-4 109.5n ± 0% 108.4n ± 1% -1.05% (p=0.001 n=8)
J0-4 1.304µ ± 0% 1.304µ ± 0% ~ (p=0.853 n=8)
J1-4 1.349µ ± 0% 1.331µ ± 0% -1.33% (p=0.000 n=8)
Jn-4 2.774µ ± 0% 2.750µ ± 0% -0.87% (p=0.000 n=8)
Ldexp-4 151.6n ± 0% 151.5n ± 0% ~ (p=0.695 n=8)
Lgamma-4 226.9n ± 0% 233.9n ± 0% +3.09% (p=0.000 n=8)
Log-4 407.6n ± 0% 407.4n ± 0% ~ (p=0.340 n=8)
Logb-4 121.5n ± 0% 121.5n ± 0% -0.08% (p=0.042 n=8)
Log1p-4 315.5n ± 0% 315.6n ± 0% ~ (p=0.930 n=8)
Log10-4 417.8n ± 0% 417.5n ± 0% ~ (p=0.053 n=8)
Log2-4 208.8n ± 0% 208.8n ± 0% ~ (p=0.582 n=8)
Modf-4 126.5n ± 0% 126.4n ± 0% ~ (p=0.128 n=8)
Nextafter32-4 112.45n ± 0% 82.27n ± 0% -26.84% (p=0.000 n=8)
Nextafter64-4 141.5n ± 0% 141.5n ± 0% ~ (p=0.569 n=8)
PowInt-4 754.0n ± 1% 754.6n ± 0% ~ (p=0.279 n=8)
PowFrac-4 1.608µ ± 1% 1.596µ ± 1% ~ (p=0.661 n=8)
Pow10Pos-4 18.07n ± 0% 18.07n ± 0% ~ (p=0.413 n=8)
Pow10Neg-4 17.08n ± 0% 18.07n ± 0% +5.80% (p=0.000 n=8)
Round-4 68.30n ± 0% 69.29n ± 0% +1.45% (p=0.000 n=8)
RoundToEven-4 78.33n ± 0% 78.34n ± 0% ~ (p=0.975 n=8)
Remainder-4 740.6n ± 1% 736.7n ± 0% ~ (p=0.098 n=8)
Signbit-4 18.08n ± 0% 18.07n ± 0% ~ (p=0.546 n=8)
Sin-4 389.4n ± 0% 389.5n ± 0% ~ (p=0.451 n=8)
Sincos-4 415.6n ± 0% 415.6n ± 0% ~ (p=0.450 n=8)
Sinh-4 607.0n ± 0% 590.8n ± 1% -2.68% (p=0.000 n=8)
SqrtIndirect-4 8.034n ± 0% 8.030n ± 0% ~ (p=0.487 n=8)
SqrtLatency-4 8.031n ± 0% 8.034n ± 0% ~ (p=0.152 n=8)
SqrtIndirectLatency-4 8.032n ± 0% 8.032n ± 0% ~ (p=0.818 n=8)
SqrtGoLatency-4 895.8n ± 0% 895.3n ± 0% ~ (p=0.553 n=8)
SqrtPrime-4 5.405µ ± 0% 5.379µ ± 0% -0.48% (p=0.000 n=8)
Tan-4 405.6n ± 0% 405.7n ± 0% ~ (p=0.980 n=8)
Tanh-4 545.1n ± 0% 545.1n ± 0% ~ (p=0.806 n=8)
Trunc-4 146.5n ± 0% 146.6n ± 0% ~ (p=0.380 n=8)
Y0-4 1.308µ ± 0% 1.306µ ± 0% ~ (p=0.071 n=8)
Y1-4 1.311µ ± 0% 1.315µ ± 0% +0.31% (p=0.000 n=8)
Yn-4 2.737µ ± 0% 2.745µ ± 0% +0.27% (p=0.000 n=8)
Float64bits-4 14.56n ± 0% 14.56n ± 0% ~ (p=0.689 n=8)
Float64frombits-4 19.08n ± 0% 19.08n ± 0% ~ (p=0.580 n=8)
Float32bits-4 13.050n ± 0% 5.019n ± 0% -61.54% (p=0.000 n=8)
Float32frombits-4 13.060n ± 0% 4.016n ± 0% -69.25% (p=0.000 n=8)
FMA-4 608.5n ± 0% 586.1n ± 0% -3.67% (p=0.000 n=8)
geomean 185.5n 176.2n -5.02%
Change-Id: Ibf91092ffe70104e6c5ec03bc76d51259818b9b3
Reviewed-on: https://go-review.googlesource.com/c/go/+/494535
Run-TryBot: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
ssa.OpMIPSTRUNCDW,
ssa.OpMIPSMOVFD,
ssa.OpMIPSMOVDF,
+ ssa.OpMIPSMOVWfpgp,
+ ssa.OpMIPSMOVWgpfp,
ssa.OpMIPSNEGF,
ssa.OpMIPSNEGD,
ssa.OpMIPSABSD,
(Store {t} ptr val mem) && t.Size() == 4 && t.IsFloat() => (MOVFstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && t.IsFloat() => (MOVDstore ptr val mem)
+// float <=> int register moves, with no conversion.
+// These come up when compiling math.{Float32bits, Float32frombits}.
+(MOVWload [off] {sym} ptr (MOVFstore [off] {sym} ptr val _)) => (MOVWfpgp val)
+(MOVFload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (MOVWgpfp val)
+
+// Similarly for stores, if we see a store after FPR <=> GPR move, then redirect store to use the other register set.
+(MOVWstore [off] {sym} ptr (MOVWfpgp val) mem) => (MOVFstore [off] {sym} ptr val mem)
+(MOVFstore [off] {sym} ptr (MOVWgpfp val) mem) => (MOVWstore [off] {sym} ptr val mem)
+
// zero instructions
(Zero [0] _ mem) => mem
(Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem)
gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
+ fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+ gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
{name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
+ // moves (no conversion)
+ {name: "MOVWfpgp", argLength: 1, reg: fpgp, asm: "MOVW"}, // move float32 to int32 (no conversion)
+ {name: "MOVWgpfp", argLength: 1, reg: gpfp, asm: "MOVW"}, // move int32 to float32 (no conversion)
+
// conversions
{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
OpMIPSMOVBstorezero
OpMIPSMOVHstorezero
OpMIPSMOVWstorezero
+ OpMIPSMOVWfpgp
+ OpMIPSMOVWgpfp
OpMIPSMOVBreg
OpMIPSMOVBUreg
OpMIPSMOVHreg
},
},
},
+ {
+ name: "MOVWfpgp",
+ argLen: 1,
+ asm: mips.AMOVW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 35183835217920}, // F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30
+ },
+ outputs: []outputInfo{
+ {0, 335544318}, // R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31
+ },
+ },
+ },
+ {
+ name: "MOVWgpfp",
+ argLen: 1,
+ asm: mips.AMOVW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 335544318}, // R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31
+ },
+ outputs: []outputInfo{
+ {0, 35183835217920}, // F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30
+ },
+ },
+ },
{
name: "MOVBreg",
argLen: 1,
func rewriteValueMIPS_OpMIPSMOVFload(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
+ // match: (MOVFload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _))
+ // result: (MOVWgpfp val)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpMIPSMOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym {
+ break
+ }
+ val := v_1.Args[1]
+ if ptr != v_1.Args[0] {
+ break
+ }
+ v.reset(OpMIPSMOVWgpfp)
+ v.AddArg(val)
+ return true
+ }
// match: (MOVFload [off1] {sym} x:(ADDconst [off2] ptr) mem)
// cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
// result: (MOVFload [off1+off2] {sym} ptr mem)
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
+ // match: (MOVFstore [off] {sym} ptr (MOVWgpfp val) mem)
+ // result: (MOVWstore [off] {sym} ptr val mem)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpMIPSMOVWgpfp {
+ break
+ }
+ val := v_1.Args[0]
+ mem := v_2
+ v.reset(OpMIPSMOVWstore)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg3(ptr, val, mem)
+ return true
+ }
// match: (MOVFstore [off1] {sym} x:(ADDconst [off2] ptr) val mem)
// cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
// result: (MOVFstore [off1+off2] {sym} ptr val mem)
func rewriteValueMIPS_OpMIPSMOVWload(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
+ // match: (MOVWload [off] {sym} ptr (MOVFstore [off] {sym} ptr val _))
+ // result: (MOVWfpgp val)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpMIPSMOVFstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym {
+ break
+ }
+ val := v_1.Args[1]
+ if ptr != v_1.Args[0] {
+ break
+ }
+ v.reset(OpMIPSMOVWfpgp)
+ v.AddArg(val)
+ return true
+ }
// match: (MOVWload [off1] {sym} x:(ADDconst [off2] ptr) mem)
// cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
// result: (MOVWload [off1+off2] {sym} ptr mem)
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
+ // match: (MOVWstore [off] {sym} ptr (MOVWfpgp val) mem)
+ // result: (MOVFstore [off] {sym} ptr val mem)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpMIPSMOVWfpgp {
+ break
+ }
+ val := v_1.Args[0]
+ mem := v_2
+ v.reset(OpMIPSMOVFstore)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg3(ptr, val, mem)
+ return true
+ }
// match: (MOVWstore [off1] {sym} x:(ADDconst [off2] ptr) val mem)
// cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
// result: (MOVWstore [off1+off2] {sym} ptr val mem)