From 97324858513735093a958a585cca6d5fc2780533 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Wed, 20 Sep 2017 08:48:34 +0000 Subject: [PATCH] cmd/compile: optimized ARM code with BFX/BFXU MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit BFX&BFXU were introduced in ARMv6T2. A single BFX or BFXU is more efficiently than a pair of left-shift/right-shift in bit field extraction. This patch implements this optimization. And the benchmark tests show big improvement in special cases and little change in total. 1. There is big improvement in a special test case. name old time/op new time/op delta BFX-4 665µs ± 1% 595µs ± 0% -10.61% (p=0.000 n=20+20) (The test case: https://github.com/benshi001/ugo1/blob/master/bfx_test.go) 2. The compilecmp benchmark shows no regression. name old time/op new time/op delta Template 2.33s ± 2% 2.34s ± 2% ~ (p=0.356 n=9+10) Unicode 1.32s ± 2% 1.30s ± 2% ~ (p=0.139 n=9+8) GoTypes 7.77s ± 1% 7.76s ± 1% ~ (p=0.780 n=10+9) Compiler 37.3s ± 1% 37.1s ± 1% ~ (p=0.211 n=10+9) SSA 84.3s ± 2% 84.3s ± 2% ~ (p=0.842 n=10+9) Flate 1.45s ± 1% 1.45s ± 3% ~ (p=0.853 n=10+10) GoParser 1.83s ± 2% 1.83s ± 2% ~ (p=0.739 n=10+10) Reflect 5.08s ± 2% 5.09s ± 2% ~ (p=0.720 n=9+10) Tar 2.44s ± 1% 2.44s ± 2% ~ (p=0.684 n=10+10) XML 2.62s ± 2% 2.62s ± 2% ~ (p=0.529 n=10+10) [Geo mean] 4.80s 4.79s -0.06% name old user-time/op new user-time/op delta Template 2.76s ± 2% 2.75s ± 3% ~ (p=0.893 n=10+10) Unicode 1.63s ± 1% 1.60s ± 1% -2.07% (p=0.000 n=8+9) GoTypes 9.54s ± 1% 9.52s ± 1% ~ (p=0.215 n=10+10) Compiler 46.0s ± 1% 46.0s ± 1% ~ (p=0.853 n=10+10) SSA 110s ± 1% 110s ± 1% ~ (p=0.838 n=10+10) Flate 1.69s ± 3% 1.69s ± 5% ~ (p=0.957 n=10+10) GoParser 2.15s ± 2% 2.15s ± 2% ~ (p=0.749 n=10+10) Reflect 6.03s ± 1% 5.99s ± 2% ~ (p=0.060 n=9+10) Tar 3.02s ± 2% 2.99s ± 2% ~ (p=0.214 n=10+10) XML 3.10s ± 2% 3.08s ± 2% ~ (p=0.732 n=9+10) [Geo mean] 5.82s 5.79s -0.41% name old text-bytes new text-bytes delta HelloSize 589kB ± 0% 589kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 5.46kB ± 0% 5.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 76.9kB ± 0% 76.9kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.03MB ± 0% 1.03MB ± 0% ~ (all equal) 3. The go1 benchmark shows little change in total. (excluding noise) name old time/op new time/op delta BinaryTree17-4 41.5s ± 1% 41.6s ± 1% ~ (p=0.373 n=30+26) Fannkuch11-4 23.6s ± 1% 23.6s ± 1% +0.28% (p=0.003 n=29+30) FmtFprintfEmpty-4 826ns ± 1% 827ns ± 1% ~ (p=0.155 n=30+30) FmtFprintfString-4 1.35µs ± 1% 1.35µs ± 1% ~ (p=0.499 n=30+30) FmtFprintfInt-4 1.43µs ± 1% 1.41µs ± 1% -1.19% (p=0.000 n=30+30) FmtFprintfIntInt-4 2.15µs ± 1% 2.11µs ± 1% -1.78% (p=0.000 n=30+30) FmtFprintfPrefixedInt-4 2.21µs ± 1% 2.21µs ± 1% ~ (p=0.881 n=30+30) FmtFprintfFloat-4 4.41µs ± 1% 4.44µs ± 0% +0.64% (p=0.000 n=30+30) FmtManyArgs-4 8.06µs ± 1% 8.06µs ± 0% ~ (p=0.871 n=30+30) GobDecode-4 103ms ± 1% 104ms ± 2% +0.54% (p=0.013 n=28+29) GobEncode-4 92.4ms ± 1% 92.6ms ± 1% ~ (p=0.447 n=30+29) Gzip-4 4.17s ± 1% 4.06s ± 1% -2.56% (p=0.000 n=29+30) Gunzip-4 603ms ± 1% 602ms ± 1% ~ (p=0.423 n=30+30) HTTPClientServer-4 688µs ± 2% 674µs ± 3% -2.09% (p=0.000 n=29+30) JSONEncode-4 237ms ± 1% 237ms ± 1% ~ (p=0.061 n=29+30) JSONDecode-4 907ms ± 1% 910ms ± 1% ~ (p=0.061 n=30+30) Mandelbrot200-4 41.7ms ± 0% 41.7ms ± 0% +0.19% (p=0.000 n=24+20) GoParse-4 45.7ms ± 2% 45.5ms ± 2% -0.29% (p=0.005 n=30+30) RegexpMatchEasy0_32-4 1.27µs ± 0% 1.27µs ± 0% +0.12% (p=0.031 n=30+30) RegexpMatchEasy0_1K-4 7.77µs ± 4% 7.73µs ± 3% ~ (p=0.169 n=30+30) RegexpMatchEasy1_32-4 1.29µs ± 1% 1.29µs ± 1% ~ (p=0.126 n=30+30) RegexpMatchEasy1_1K-4 10.4µs ± 3% 10.3µs ± 2% -1.32% (p=0.004 n=30+29) RegexpMatchMedium_32-4 2.06µs ± 0% 2.06µs ± 0% ~ (p=0.071 n=30+30) RegexpMatchMedium_1K-4 531µs ± 1% 530µs ± 0% ~ (p=0.121 n=30+23) RegexpMatchHard_32-4 28.7µs ± 1% 28.6µs ± 1% -0.21% (p=0.001 n=30+27) RegexpMatchHard_1K-4 860µs ± 1% 857µs ± 1% ~ (p=0.105 n=30+27) Revcomp-4 67.3ms ± 2% 67.3ms ± 2% ~ (p=0.805 n=29+29) Template-4 1.08s ± 1% 1.08s ± 1% ~ (p=0.260 n=30+30) TimeParse-4 7.04µs ± 0% 7.04µs ± 0% ~ (p=0.315 n=30+30) TimeFormat-4 13.2µs ± 1% 13.2µs ± 1% ~ (p=0.077 n=30+30) [Geo mean] 715µs 713µs -0.30% name old speed new speed delta GobDecode-4 7.42MB/s ± 1% 7.38MB/s ± 2% -0.54% (p=0.011 n=28+29) GobEncode-4 8.30MB/s ± 1% 8.29MB/s ± 1% ~ (p=0.484 n=30+29) Gzip-4 4.65MB/s ± 2% 4.78MB/s ± 1% +2.73% (p=0.000 n=30+30) Gunzip-4 32.2MB/s ± 1% 32.2MB/s ± 1% ~ (p=0.357 n=30+30) JSONEncode-4 8.18MB/s ± 1% 8.19MB/s ± 1% ~ (p=0.052 n=29+30) JSONDecode-4 2.14MB/s ± 1% 2.13MB/s ± 1% ~ (p=0.074 n=30+29) GoParse-4 1.27MB/s ± 1% 1.27MB/s ± 2% ~ (p=0.618 n=24+30) RegexpMatchEasy0_32-4 25.2MB/s ± 0% 25.2MB/s ± 0% -0.12% (p=0.031 n=30+30) RegexpMatchEasy0_1K-4 132MB/s ± 5% 132MB/s ± 2% ~ (p=0.171 n=30+30) RegexpMatchEasy1_32-4 24.8MB/s ± 1% 24.9MB/s ± 1% ~ (p=0.106 n=30+30) RegexpMatchEasy1_1K-4 98.4MB/s ± 3% 99.6MB/s ± 4% +1.19% (p=0.011 n=30+30) RegexpMatchMedium_32-4 483kB/s ± 1% 484kB/s ± 1% ~ (p=0.426 n=30+30) RegexpMatchMedium_1K-4 1.93MB/s ± 1% 1.93MB/s ± 0% ~ (p=0.157 n=30+17) RegexpMatchHard_32-4 1.12MB/s ± 1% 1.12MB/s ± 0% +0.33% (p=0.001 n=30+24) RegexpMatchHard_1K-4 1.19MB/s ± 1% 1.19MB/s ± 1% ~ (p=0.290 n=30+30) Revcomp-4 37.8MB/s ± 2% 37.8MB/s ± 1% ~ (p=0.815 n=29+29) Template-4 1.80MB/s ± 1% 1.80MB/s ± 1% ~ (p=0.586 n=30+30) [Geo mean] 6.80MB/s 6.81MB/s +0.25% fixes #20966 Change-Id: Idb5567bbe988c875315b8c98c128957cd474ccc5 Reviewed-on: https://go-review.googlesource.com/64950 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang --- src/cmd/compile/internal/arm/ssa.go | 8 +++ src/cmd/compile/internal/ssa/gen/ARM.rules | 7 ++ src/cmd/compile/internal/ssa/gen/ARMOps.go | 4 ++ src/cmd/compile/internal/ssa/opGen.go | 30 +++++++++ src/cmd/compile/internal/ssa/rewriteARM.go | 76 ++++++++++++++++++++++ 5 files changed, 125 insertions(+) diff --git a/src/cmd/compile/internal/arm/ssa.go b/src/cmd/compile/internal/arm/ssa.go index 4655513fa5..140b9d10ac 100644 --- a/src/cmd/compile/internal/arm/ssa.go +++ b/src/cmd/compile/internal/arm/ssa.go @@ -258,6 +258,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.Reg = r1 p.To.Type = obj.TYPE_REG p.To.Reg = r + case ssa.OpARMBFX, ssa.OpARMBFXU: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_CONST + p.From.Offset = v.AuxInt >> 8 + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: v.AuxInt & 0xff}) + p.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() case ssa.OpARMADDconst, ssa.OpARMADCconst, ssa.OpARMSUBconst, diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules index b21cd6f9f3..0c53aa3b77 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM.rules @@ -867,6 +867,9 @@ (MOVHreg (MOVWconst [c])) -> (MOVWconst [int64(int16(c))]) (MOVHUreg (MOVWconst [c])) -> (MOVWconst [int64(uint16(c))]) (MOVWreg (MOVWconst [c])) -> (MOVWconst [c]) +// BFX: Width = c >> 8, LSB = c & 0xff, result = d << (32 - Width - LSB) >> (32 - Width) +(BFX [c] (MOVWconst [d])) -> (MOVWconst [int64(int32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))]) +(BFXU [c] (MOVWconst [d])) -> (MOVWconst [int64(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))]) // absorb shifts into ops (ADD x (SLLconst [c] y)) -> (ADDshiftLL x y [c]) @@ -1286,3 +1289,7 @@ // floating point optimizations (CMPF x (MOVFconst [0])) -> (CMPF0 x) (CMPD x (MOVDconst [0])) -> (CMPD0 x) + +// bit extraction +(SRAconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 -> (BFX [(d-c)|(32-d)<<8] x) +(SRLconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 -> (BFXU [(d-c)|(32-d)<<8] x) diff --git a/src/cmd/compile/internal/ssa/gen/ARMOps.go b/src/cmd/compile/internal/ssa/gen/ARMOps.go index 93b50135d4..668ee46a54 100644 --- a/src/cmd/compile/internal/ssa/gen/ARMOps.go +++ b/src/cmd/compile/internal/ssa/gen/ARMOps.go @@ -196,6 +196,10 @@ func init() { {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1 {name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt + // bit extraction, AuxInt = Width<<8 | LSB + {name: "BFX", argLength: 1, reg: gp11, asm: "BFX", aux: "Int32"}, // extract W bits from bit L in arg0, then signed extend + {name: "BFXU", argLength: 1, reg: gp11, asm: "BFXU", aux: "Int32"}, // extract W bits from bit L in arg0, then unsigned extend + // unary ops {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0 diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index d85ab0aab5..1f0138b610 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -719,6 +719,8 @@ const ( OpARMXORconst OpARMBIC OpARMBICconst + OpARMBFX + OpARMBFXU OpARMMVN OpARMNEGF OpARMNEGD @@ -8838,6 +8840,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "BFX", + auxType: auxInt32, + argLen: 1, + asm: arm.ABFX, + reg: regInfo{ + inputs: []inputInfo{ + {0, 22527}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 g R12 R14 + }, + outputs: []outputInfo{ + {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14 + }, + }, + }, + { + name: "BFXU", + auxType: auxInt32, + argLen: 1, + asm: arm.ABFXU, + reg: regInfo{ + inputs: []inputInfo{ + {0, 22527}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 g R12 R14 + }, + outputs: []outputInfo{ + {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14 + }, + }, + }, { name: "MVN", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go index 38695c503d..73aeb81ed7 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM.go +++ b/src/cmd/compile/internal/ssa/rewriteARM.go @@ -81,6 +81,10 @@ func rewriteValueARM(v *Value) bool { return rewriteValueARM_OpARMANDshiftRL_0(v) case OpARMANDshiftRLreg: return rewriteValueARM_OpARMANDshiftRLreg_0(v) + case OpARMBFX: + return rewriteValueARM_OpARMBFX_0(v) + case OpARMBFXU: + return rewriteValueARM_OpARMBFXU_0(v) case OpARMBIC: return rewriteValueARM_OpARMBIC_0(v) case OpARMBICconst: @@ -3966,6 +3970,40 @@ func rewriteValueARM_OpARMANDshiftRLreg_0(v *Value) bool { } return false } +func rewriteValueARM_OpARMBFX_0(v *Value) bool { + // match: (BFX [c] (MOVWconst [d])) + // cond: + // result: (MOVWconst [int64(int32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))]) + for { + c := v.AuxInt + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + d := v_0.AuxInt + v.reset(OpARMMOVWconst) + v.AuxInt = int64(int32(d) << (32 - uint32(c&0xff) - uint32(c>>8)) >> (32 - uint32(c>>8))) + return true + } + return false +} +func rewriteValueARM_OpARMBFXU_0(v *Value) bool { + // match: (BFXU [c] (MOVWconst [d])) + // cond: + // result: (MOVWconst [int64(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))]) + for { + c := v.AuxInt + v_0 := v.Args[0] + if v_0.Op != OpARMMOVWconst { + break + } + d := v_0.AuxInt + v.reset(OpARMMOVWconst) + v.AuxInt = int64(uint32(d) << (32 - uint32(c&0xff) - uint32(c>>8)) >> (32 - uint32(c>>8))) + return true + } + return false +} func rewriteValueARM_OpARMBIC_0(v *Value) bool { // match: (BIC x (MOVWconst [c])) // cond: @@ -13484,6 +13522,25 @@ func rewriteValueARM_OpARMSRAconst_0(v *Value) bool { v.AuxInt = int64(int32(d) >> uint64(c)) return true } + // match: (SRAconst (SLLconst x [c]) [d]) + // cond: objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 + // result: (BFX [(d-c)|(32-d)<<8] x) + for { + d := v.AuxInt + v_0 := v.Args[0] + if v_0.Op != OpARMSLLconst { + break + } + c := v_0.AuxInt + x := v_0.Args[0] + if !(objabi.GOARM == 7 && uint64(d) >= uint64(c) && uint64(d) <= 31) { + break + } + v.reset(OpARMBFX) + v.AuxInt = (d - c) | (32-d)<<8 + v.AddArg(x) + return true + } return false } func rewriteValueARM_OpARMSRL_0(v *Value) bool { @@ -13520,6 +13577,25 @@ func rewriteValueARM_OpARMSRLconst_0(v *Value) bool { v.AuxInt = int64(uint32(d) >> uint64(c)) return true } + // match: (SRLconst (SLLconst x [c]) [d]) + // cond: objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 + // result: (BFXU [(d-c)|(32-d)<<8] x) + for { + d := v.AuxInt + v_0 := v.Args[0] + if v_0.Op != OpARMSLLconst { + break + } + c := v_0.AuxInt + x := v_0.Args[0] + if !(objabi.GOARM == 7 && uint64(d) >= uint64(c) && uint64(d) <= 31) { + break + } + v.reset(OpARMBFXU) + v.AuxInt = (d - c) | (32-d)<<8 + v.AddArg(x) + return true + } return false } func rewriteValueARM_OpARMSUB_0(v *Value) bool { -- 2.50.0