Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimize math.Float32bits and math.Float32frombits on mipsx
author: Junxian Zhu <zhujunxian@oss.cipunited.com>
Fri, 12 May 2023 04:28:51 +0000 (12:28 +0800)
committer: Keith Randall <khr@golang.org>
Wed, 24 May 2023 14:43:03 +0000 (14:43 +0000)
This CL uses MFC1/MTC1 instructions to move data between GPRs and FPRs, instead of a store followed by a load, to move float/int values.

goos: linux
goarch: mipsle
pkg: math
                      │   oldmathf   │              newmathf              │
                      │    sec/op    │   sec/op     vs base               │
Acos-4                   282.7n ± 0%   282.1n ± 0%   -0.18% (p=0.010 n=8)
Acosh-4                  450.8n ± 0%   450.9n ± 0%        ~ (p=0.699 n=8)
Asin-4                   272.6n ± 0%   272.1n ± 0%        ~ (p=0.050 n=8)
Asinh-4                  476.8n ± 0%   475.1n ± 0%   -0.35% (p=0.018 n=8)
Atan-4                   208.1n ± 0%   207.7n ± 0%   -0.17% (p=0.009 n=8)
Atanh-4                  448.8n ± 0%   448.7n ± 0%   -0.03% (p=0.014 n=8)
Atan2-4                  310.2n ± 0%   310.1n ± 0%        ~ (p=0.133 n=8)
Cbrt-4                   357.9n ± 0%   358.4n ± 0%   +0.11% (p=0.014 n=8)
Ceil-4                   203.8n ± 0%   204.7n ± 0%   +0.42% (p=0.008 n=8)
Compare-4                21.12n ± 0%   22.09n ± 0%   +4.59% (p=0.000 n=8)
Compare32-4             19.105n ± 0%   6.022n ± 0%  -68.48% (p=0.000 n=8)
Copysign-4               33.17n ± 0%   33.15n ± 0%        ~ (p=0.795 n=8)
Cos-4                    385.2n ± 0%   384.8n ± 1%        ~ (p=0.112 n=8)
Cosh-4                   546.0n ± 0%   545.0n ± 0%   -0.17% (p=0.012 n=8)
Erf-4                    192.4n ± 0%   195.4n ± 1%   +1.59% (p=0.000 n=8)
Erfc-4                   187.8n ± 0%   192.7n ± 0%   +2.64% (p=0.000 n=8)
Erfinv-4                 221.8n ± 1%   219.8n ± 0%   -0.88% (p=0.000 n=8)
Erfcinv-4                224.1n ± 1%   219.9n ± 0%   -1.87% (p=0.000 n=8)
Exp-4                    434.7n ± 0%   435.0n ± 0%        ~ (p=0.339 n=8)
ExpGo-4                  433.7n ± 0%   434.2n ± 0%   +0.13% (p=0.005 n=8)
Expm1-4                  243.0n ± 0%   242.9n ± 0%        ~ (p=0.103 n=8)
Exp2-4                   426.6n ± 0%   426.6n ± 0%        ~ (p=0.822 n=8)
Exp2Go-4                 425.6n ± 0%   425.5n ± 0%        ~ (p=0.377 n=8)
Abs-4                    8.033n ± 0%   8.029n ± 0%        ~ (p=0.065 n=8)
Dim-4                    18.07n ± 0%   18.07n ± 0%        ~ (p=0.051 n=8)
Floor-4                  151.6n ± 0%   151.6n ± 0%        ~ (p=0.450 n=8)
Max-4                    100.9n ± 8%   103.2n ± 2%        ~ (p=0.099 n=8)
Min-4                    116.4n ± 0%   116.4n ± 0%        ~ (p=0.467 n=8)
Mod-4                    959.6n ± 1%   950.9n ± 0%   -0.91% (p=0.006 n=8)
Frexp-4                  147.6n ± 0%   147.5n ± 0%   -0.07% (p=0.026 n=8)
Gamma-4                  482.7n ± 0%   478.2n ± 2%   -0.92% (p=0.000 n=8)
Hypot-4                  139.8n ± 1%   127.1n ± 8%   -9.12% (p=0.000 n=8)
HypotGo-4                137.2n ± 7%   117.5n ± 2%  -14.39% (p=0.001 n=8)
Ilogb-4                  109.5n ± 0%   108.4n ± 1%   -1.05% (p=0.001 n=8)
J0-4                     1.304µ ± 0%   1.304µ ± 0%        ~ (p=0.853 n=8)
J1-4                     1.349µ ± 0%   1.331µ ± 0%   -1.33% (p=0.000 n=8)
Jn-4                     2.774µ ± 0%   2.750µ ± 0%   -0.87% (p=0.000 n=8)
Ldexp-4                  151.6n ± 0%   151.5n ± 0%        ~ (p=0.695 n=8)
Lgamma-4                 226.9n ± 0%   233.9n ± 0%   +3.09% (p=0.000 n=8)
Log-4                    407.6n ± 0%   407.4n ± 0%        ~ (p=0.340 n=8)
Logb-4                   121.5n ± 0%   121.5n ± 0%   -0.08% (p=0.042 n=8)
Log1p-4                  315.5n ± 0%   315.6n ± 0%        ~ (p=0.930 n=8)
Log10-4                  417.8n ± 0%   417.5n ± 0%        ~ (p=0.053 n=8)
Log2-4                   208.8n ± 0%   208.8n ± 0%        ~ (p=0.582 n=8)
Modf-4                   126.5n ± 0%   126.4n ± 0%        ~ (p=0.128 n=8)
Nextafter32-4           112.45n ± 0%   82.27n ± 0%  -26.84% (p=0.000 n=8)
Nextafter64-4            141.5n ± 0%   141.5n ± 0%        ~ (p=0.569 n=8)
PowInt-4                 754.0n ± 1%   754.6n ± 0%        ~ (p=0.279 n=8)
PowFrac-4                1.608µ ± 1%   1.596µ ± 1%        ~ (p=0.661 n=8)
Pow10Pos-4               18.07n ± 0%   18.07n ± 0%        ~ (p=0.413 n=8)
Pow10Neg-4               17.08n ± 0%   18.07n ± 0%   +5.80% (p=0.000 n=8)
Round-4                  68.30n ± 0%   69.29n ± 0%   +1.45% (p=0.000 n=8)
RoundToEven-4            78.33n ± 0%   78.34n ± 0%        ~ (p=0.975 n=8)
Remainder-4              740.6n ± 1%   736.7n ± 0%        ~ (p=0.098 n=8)
Signbit-4                18.08n ± 0%   18.07n ± 0%        ~ (p=0.546 n=8)
Sin-4                    389.4n ± 0%   389.5n ± 0%        ~ (p=0.451 n=8)
Sincos-4                 415.6n ± 0%   415.6n ± 0%        ~ (p=0.450 n=8)
Sinh-4                   607.0n ± 0%   590.8n ± 1%   -2.68% (p=0.000 n=8)
SqrtIndirect-4           8.034n ± 0%   8.030n ± 0%        ~ (p=0.487 n=8)
SqrtLatency-4            8.031n ± 0%   8.034n ± 0%        ~ (p=0.152 n=8)
SqrtIndirectLatency-4    8.032n ± 0%   8.032n ± 0%        ~ (p=0.818 n=8)
SqrtGoLatency-4          895.8n ± 0%   895.3n ± 0%        ~ (p=0.553 n=8)
SqrtPrime-4              5.405µ ± 0%   5.379µ ± 0%   -0.48% (p=0.000 n=8)
Tan-4                    405.6n ± 0%   405.7n ± 0%        ~ (p=0.980 n=8)
Tanh-4                   545.1n ± 0%   545.1n ± 0%        ~ (p=0.806 n=8)
Trunc-4                  146.5n ± 0%   146.6n ± 0%        ~ (p=0.380 n=8)
Y0-4                     1.308µ ± 0%   1.306µ ± 0%        ~ (p=0.071 n=8)
Y1-4                     1.311µ ± 0%   1.315µ ± 0%   +0.31% (p=0.000 n=8)
Yn-4                     2.737µ ± 0%   2.745µ ± 0%   +0.27% (p=0.000 n=8)
Float64bits-4            14.56n ± 0%   14.56n ± 0%        ~ (p=0.689 n=8)
Float64frombits-4        19.08n ± 0%   19.08n ± 0%        ~ (p=0.580 n=8)
Float32bits-4           13.050n ± 0%   5.019n ± 0%  -61.54% (p=0.000 n=8)
Float32frombits-4       13.060n ± 0%   4.016n ± 0%  -69.25% (p=0.000 n=8)
FMA-4                    608.5n ± 0%   586.1n ± 0%   -3.67% (p=0.000 n=8)
geomean                  185.5n        176.2n        -5.02%

Change-Id: Ibf91092ffe70104e6c5ec03bc76d51259818b9b3
Reviewed-on: https://go-review.googlesource.com/c/go/+/494535
Run-TryBot: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/mips/ssa.go
src/cmd/compile/internal/ssa/_gen/MIPS.rules
src/cmd/compile/internal/ssa/_gen/MIPSOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteMIPS.go

index 2cfe57f7f4643a50e1c1db56cfd0ffc037e86a0e..bfccafd8e5a52ee57ac8a398cd6a57eadde4ad04 100644 (file)
@@ -361,6 +361,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                ssa.OpMIPSTRUNCDW,
                ssa.OpMIPSMOVFD,
                ssa.OpMIPSMOVDF,
+               ssa.OpMIPSMOVWfpgp,
+               ssa.OpMIPSMOVWgpfp,
                ssa.OpMIPSNEGF,
                ssa.OpMIPSNEGD,
                ssa.OpMIPSABSD,
index b36402dd0a9f590494b8b77e670048445517c3b6..d6ae0101cbd4b013b395df043145f8bc6c6706c6 100644 (file)
 (Store {t} ptr val mem) && t.Size() == 4 &&  t.IsFloat() => (MOVFstore ptr val mem)
 (Store {t} ptr val mem) && t.Size() == 8 &&  t.IsFloat() => (MOVDstore ptr val mem)
 
+// float <=> int register moves, with no conversion.
+// These come up when compiling math.{Float32bits, Float32frombits}.
+(MOVWload [off] {sym} ptr (MOVFstore [off] {sym} ptr val _)) => (MOVWfpgp val)
+(MOVFload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (MOVWgpfp val)
+
+// Similarly for stores, if we see a store after FPR <=> GPR move, then redirect store to use the other register set.
+(MOVWstore [off] {sym} ptr (MOVWfpgp val) mem) => (MOVFstore [off] {sym} ptr val mem)
+(MOVFstore [off] {sym} ptr (MOVWgpfp val) mem) => (MOVWstore [off] {sym} ptr val mem)
+
 // zero instructions
 (Zero [0] _ mem) => mem
 (Zero [1] ptr mem) => (MOVBstore ptr (MOVWconst [0]) mem)
index b5d9d2547591eb5cc2aa4111b9305eb683b7592c..5964bb7a333742861dc39d77f7482da254a65146 100644 (file)
@@ -139,6 +139,8 @@ func init() {
                gpxchg    = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
                gpcas     = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
                gpstore0  = regInfo{inputs: []regMask{gpspsbg}}
+               fpgp      = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
+               gpfp      = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
                fp01      = regInfo{inputs: nil, outputs: []regMask{fp}}
                fp11      = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
                fp21      = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
@@ -233,6 +235,10 @@ func init() {
                {name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
                {name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux.  arg1=mem.
 
+               // moves (no conversion)
+               {name: "MOVWfpgp", argLength: 1, reg: fpgp, asm: "MOVW"}, // move float32 to int32 (no conversion)
+               {name: "MOVWgpfp", argLength: 1, reg: gpfp, asm: "MOVW"}, // move int32 to float32 (no conversion)
+
                // conversions
                {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"},   // move from arg0, sign-extended from byte
                {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
index 6d8bef7ed97ec710c15918f9aa2054ff15aa7b1b..1480fcf45bfd762cde5f5a67dd7e895ccb5ec4e2 100644 (file)
@@ -1913,6 +1913,8 @@ const (
        OpMIPSMOVBstorezero
        OpMIPSMOVHstorezero
        OpMIPSMOVWstorezero
+       OpMIPSMOVWfpgp
+       OpMIPSMOVWgpfp
        OpMIPSMOVBreg
        OpMIPSMOVBUreg
        OpMIPSMOVHreg
@@ -25618,6 +25620,32 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "MOVWfpgp",
+               argLen: 1,
+               asm:    mips.AMOVW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 35183835217920}, // F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30
+                       },
+                       outputs: []outputInfo{
+                               {0, 335544318}, // R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31
+                       },
+               },
+       },
+       {
+               name:   "MOVWgpfp",
+               argLen: 1,
+               asm:    mips.AMOVW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 335544318}, // R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R22 R24 R25 R28 R31
+                       },
+                       outputs: []outputInfo{
+                               {0, 35183835217920}, // F0 F2 F4 F6 F8 F10 F12 F14 F16 F18 F20 F22 F24 F26 F28 F30
+                       },
+               },
+       },
        {
                name:   "MOVBreg",
                argLen: 1,
index 1f44346b7f6eaaa24c0a5315a52d388210c7e1de..6a259f5a475172334fbbb7b94f6130f607f8ee46 100644 (file)
@@ -2974,6 +2974,23 @@ func rewriteValueMIPS_OpMIPSMOVDstore(v *Value) bool {
 func rewriteValueMIPS_OpMIPSMOVFload(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
+       // match: (MOVFload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _))
+       // result: (MOVWgpfp val)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpMIPSMOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym {
+                       break
+               }
+               val := v_1.Args[1]
+               if ptr != v_1.Args[0] {
+                       break
+               }
+               v.reset(OpMIPSMOVWgpfp)
+               v.AddArg(val)
+               return true
+       }
        // match: (MOVFload [off1] {sym} x:(ADDconst [off2] ptr) mem)
        // cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
        // result: (MOVFload [off1+off2] {sym} ptr mem)
@@ -3044,6 +3061,23 @@ func rewriteValueMIPS_OpMIPSMOVFstore(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
+       // match: (MOVFstore [off] {sym} ptr (MOVWgpfp val) mem)
+       // result: (MOVWstore [off] {sym} ptr val mem)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpMIPSMOVWgpfp {
+                       break
+               }
+               val := v_1.Args[0]
+               mem := v_2
+               v.reset(OpMIPSMOVWstore)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, val, mem)
+               return true
+       }
        // match: (MOVFstore [off1] {sym} x:(ADDconst [off2] ptr) val mem)
        // cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
        // result: (MOVFstore [off1+off2] {sym} ptr val mem)
@@ -3623,6 +3657,23 @@ func rewriteValueMIPS_OpMIPSMOVHstorezero(v *Value) bool {
 func rewriteValueMIPS_OpMIPSMOVWload(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
+       // match: (MOVWload [off] {sym} ptr (MOVFstore [off] {sym} ptr val _))
+       // result: (MOVWfpgp val)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpMIPSMOVFstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym {
+                       break
+               }
+               val := v_1.Args[1]
+               if ptr != v_1.Args[0] {
+                       break
+               }
+               v.reset(OpMIPSMOVWfpgp)
+               v.AddArg(val)
+               return true
+       }
        // match: (MOVWload [off1] {sym} x:(ADDconst [off2] ptr) mem)
        // cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
        // result: (MOVWload [off1+off2] {sym} ptr mem)
@@ -3735,6 +3786,23 @@ func rewriteValueMIPS_OpMIPSMOVWstore(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
+       // match: (MOVWstore [off] {sym} ptr (MOVWfpgp val) mem)
+       // result: (MOVFstore [off] {sym} ptr val mem)
+       for {
+               off := auxIntToInt32(v.AuxInt)
+               sym := auxToSym(v.Aux)
+               ptr := v_0
+               if v_1.Op != OpMIPSMOVWfpgp {
+                       break
+               }
+               val := v_1.Args[0]
+               mem := v_2
+               v.reset(OpMIPSMOVFstore)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, val, mem)
+               return true
+       }
        // match: (MOVWstore [off1] {sym} x:(ADDconst [off2] ptr) val mem)
        // cond: (is16Bit(int64(off1+off2)) || x.Uses == 1)
        // result: (MOVWstore [off1+off2] {sym} ptr val mem)