]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimized ARM code with BFX/BFXU
authorBen Shi <powerman1st@163.com>
Wed, 20 Sep 2017 08:48:34 +0000 (08:48 +0000)
committerCherry Zhang <cherryyz@google.com>
Thu, 21 Sep 2017 12:41:04 +0000 (12:41 +0000)
BFX&BFXU were introduced in ARMv6T2. A single BFX or BFXU is
more efficiently than a pair of left-shift/right-shift in bit
field extraction.

This patch implements this optimization. And the benchmark tests
show big improvement in special cases and little change in total.

1. There is big improvement in a special test case.
name                     old time/op    new time/op    delta
BFX-4                       665µs ± 1%     595µs ± 0%  -10.61%  (p=0.000 n=20+20)
(The test case: https://github.com/benshi001/ugo1/blob/master/bfx_test.go)

2. The compilecmp benchmark shows no regression.
name        old time/op       new time/op       delta
Template          2.33s ± 2%        2.34s ± 2%    ~     (p=0.356 n=9+10)
Unicode           1.32s ± 2%        1.30s ± 2%    ~     (p=0.139 n=9+8)
GoTypes           7.77s ± 1%        7.76s ± 1%    ~     (p=0.780 n=10+9)
Compiler          37.3s ± 1%        37.1s ± 1%    ~     (p=0.211 n=10+9)
SSA               84.3s ± 2%        84.3s ± 2%    ~     (p=0.842 n=10+9)
Flate             1.45s ± 1%        1.45s ± 3%    ~     (p=0.853 n=10+10)
GoParser          1.83s ± 2%        1.83s ± 2%    ~     (p=0.739 n=10+10)
Reflect           5.08s ± 2%        5.09s ± 2%    ~     (p=0.720 n=9+10)
Tar               2.44s ± 1%        2.44s ± 2%    ~     (p=0.684 n=10+10)
XML               2.62s ± 2%        2.62s ± 2%    ~     (p=0.529 n=10+10)
[Geo mean]        4.80s             4.79s       -0.06%

name        old user-time/op  new user-time/op  delta
Template          2.76s ± 2%        2.75s ± 3%    ~     (p=0.893 n=10+10)
Unicode           1.63s ± 1%        1.60s ± 1%  -2.07%  (p=0.000 n=8+9)
GoTypes           9.54s ± 1%        9.52s ± 1%    ~     (p=0.215 n=10+10)
Compiler          46.0s ± 1%        46.0s ± 1%    ~     (p=0.853 n=10+10)
SSA                110s ± 1%         110s ± 1%    ~     (p=0.838 n=10+10)
Flate             1.69s ± 3%        1.69s ± 5%    ~     (p=0.957 n=10+10)
GoParser          2.15s ± 2%        2.15s ± 2%    ~     (p=0.749 n=10+10)
Reflect           6.03s ± 1%        5.99s ± 2%    ~     (p=0.060 n=9+10)
Tar               3.02s ± 2%        2.99s ± 2%    ~     (p=0.214 n=10+10)
XML               3.10s ± 2%        3.08s ± 2%    ~     (p=0.732 n=9+10)
[Geo mean]        5.82s             5.79s       -0.41%

name        old text-bytes    new text-bytes    delta
HelloSize         589kB ± 0%        589kB ± 0%    ~     (all equal)

name        old data-bytes    new data-bytes    delta
HelloSize        5.46kB ± 0%       5.46kB ± 0%    ~     (all equal)

name        old bss-bytes     new bss-bytes     delta
HelloSize        76.9kB ± 0%       76.9kB ± 0%    ~     (all equal)

name        old exe-bytes     new exe-bytes     delta
HelloSize        1.03MB ± 0%       1.03MB ± 0%    ~     (all equal)

3. The go1 benchmark shows little change in total. (excluding noise)
name                     old time/op    new time/op    delta
BinaryTree17-4              41.5s ± 1%     41.6s ± 1%    ~     (p=0.373 n=30+26)
Fannkuch11-4                23.6s ± 1%     23.6s ± 1%  +0.28%  (p=0.003 n=29+30)
FmtFprintfEmpty-4           826ns ± 1%     827ns ± 1%    ~     (p=0.155 n=30+30)
FmtFprintfString-4         1.35µs ± 1%    1.35µs ± 1%    ~     (p=0.499 n=30+30)
FmtFprintfInt-4            1.43µs ± 1%    1.41µs ± 1%  -1.19%  (p=0.000 n=30+30)
FmtFprintfIntInt-4         2.15µs ± 1%    2.11µs ± 1%  -1.78%  (p=0.000 n=30+30)
FmtFprintfPrefixedInt-4    2.21µs ± 1%    2.21µs ± 1%    ~     (p=0.881 n=30+30)
FmtFprintfFloat-4          4.41µs ± 1%    4.44µs ± 0%  +0.64%  (p=0.000 n=30+30)
FmtManyArgs-4              8.06µs ± 1%    8.06µs ± 0%    ~     (p=0.871 n=30+30)
GobDecode-4                 103ms ± 1%     104ms ± 2%  +0.54%  (p=0.013 n=28+29)
GobEncode-4                92.4ms ± 1%    92.6ms ± 1%    ~     (p=0.447 n=30+29)
Gzip-4                      4.17s ± 1%     4.06s ± 1%  -2.56%  (p=0.000 n=29+30)
Gunzip-4                    603ms ± 1%     602ms ± 1%    ~     (p=0.423 n=30+30)
HTTPClientServer-4          688µs ± 2%     674µs ± 3%  -2.09%  (p=0.000 n=29+30)
JSONEncode-4                237ms ± 1%     237ms ± 1%    ~     (p=0.061 n=29+30)
JSONDecode-4                907ms ± 1%     910ms ± 1%    ~     (p=0.061 n=30+30)
Mandelbrot200-4            41.7ms ± 0%    41.7ms ± 0%  +0.19%  (p=0.000 n=24+20)
GoParse-4                  45.7ms ± 2%    45.5ms ± 2%  -0.29%  (p=0.005 n=30+30)
RegexpMatchEasy0_32-4      1.27µs ± 0%    1.27µs ± 0%  +0.12%  (p=0.031 n=30+30)
RegexpMatchEasy0_1K-4      7.77µs ± 4%    7.73µs ± 3%    ~     (p=0.169 n=30+30)
RegexpMatchEasy1_32-4      1.29µs ± 1%    1.29µs ± 1%    ~     (p=0.126 n=30+30)
RegexpMatchEasy1_1K-4      10.4µs ± 3%    10.3µs ± 2%  -1.32%  (p=0.004 n=30+29)
RegexpMatchMedium_32-4     2.06µs ± 0%    2.06µs ± 0%    ~     (p=0.071 n=30+30)
RegexpMatchMedium_1K-4      531µs ± 1%     530µs ± 0%    ~     (p=0.121 n=30+23)
RegexpMatchHard_32-4       28.7µs ± 1%    28.6µs ± 1%  -0.21%  (p=0.001 n=30+27)
RegexpMatchHard_1K-4        860µs ± 1%     857µs ± 1%    ~     (p=0.105 n=30+27)
Revcomp-4                  67.3ms ± 2%    67.3ms ± 2%    ~     (p=0.805 n=29+29)
Template-4                  1.08s ± 1%     1.08s ± 1%    ~     (p=0.260 n=30+30)
TimeParse-4                7.04µs ± 0%    7.04µs ± 0%    ~     (p=0.315 n=30+30)
TimeFormat-4               13.2µs ± 1%    13.2µs ± 1%    ~     (p=0.077 n=30+30)
[Geo mean]                  715µs          713µs       -0.30%

name                     old speed      new speed      delta
GobDecode-4              7.42MB/s ± 1%  7.38MB/s ± 2%  -0.54%  (p=0.011 n=28+29)
GobEncode-4              8.30MB/s ± 1%  8.29MB/s ± 1%    ~     (p=0.484 n=30+29)
Gzip-4                   4.65MB/s ± 2%  4.78MB/s ± 1%  +2.73%  (p=0.000 n=30+30)
Gunzip-4                 32.2MB/s ± 1%  32.2MB/s ± 1%    ~     (p=0.357 n=30+30)
JSONEncode-4             8.18MB/s ± 1%  8.19MB/s ± 1%    ~     (p=0.052 n=29+30)
JSONDecode-4             2.14MB/s ± 1%  2.13MB/s ± 1%    ~     (p=0.074 n=30+29)
GoParse-4                1.27MB/s ± 1%  1.27MB/s ± 2%    ~     (p=0.618 n=24+30)
RegexpMatchEasy0_32-4    25.2MB/s ± 0%  25.2MB/s ± 0%  -0.12%  (p=0.031 n=30+30)
RegexpMatchEasy0_1K-4     132MB/s ± 5%   132MB/s ± 2%    ~     (p=0.171 n=30+30)
RegexpMatchEasy1_32-4    24.8MB/s ± 1%  24.9MB/s ± 1%    ~     (p=0.106 n=30+30)
RegexpMatchEasy1_1K-4    98.4MB/s ± 3%  99.6MB/s ± 4%  +1.19%  (p=0.011 n=30+30)
RegexpMatchMedium_32-4    483kB/s ± 1%   484kB/s ± 1%    ~     (p=0.426 n=30+30)
RegexpMatchMedium_1K-4   1.93MB/s ± 1%  1.93MB/s ± 0%    ~     (p=0.157 n=30+17)
RegexpMatchHard_32-4     1.12MB/s ± 1%  1.12MB/s ± 0%  +0.33%  (p=0.001 n=30+24)
RegexpMatchHard_1K-4     1.19MB/s ± 1%  1.19MB/s ± 1%    ~     (p=0.290 n=30+30)
Revcomp-4                37.8MB/s ± 2%  37.8MB/s ± 1%    ~     (p=0.815 n=29+29)
Template-4               1.80MB/s ± 1%  1.80MB/s ± 1%    ~     (p=0.586 n=30+30)
[Geo mean]               6.80MB/s       6.81MB/s       +0.25%

fixes #20966

Change-Id: Idb5567bbe988c875315b8c98c128957cd474ccc5
Reviewed-on: https://go-review.googlesource.com/64950
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>

src/cmd/compile/internal/arm/ssa.go
src/cmd/compile/internal/ssa/gen/ARM.rules
src/cmd/compile/internal/ssa/gen/ARMOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteARM.go

index 4655513fa5a63620dd1230fd5e90ac8ec8ed5f95..140b9d10acb8b99e423f4a3febdb8907db798ebc 100644 (file)
@@ -258,6 +258,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.Reg = r1
                p.To.Type = obj.TYPE_REG
                p.To.Reg = r
+       case ssa.OpARMBFX, ssa.OpARMBFXU:
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_CONST
+               p.From.Offset = v.AuxInt >> 8
+               p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: v.AuxInt & 0xff})
+               p.Reg = v.Args[0].Reg()
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = v.Reg()
        case ssa.OpARMADDconst,
                ssa.OpARMADCconst,
                ssa.OpARMSUBconst,
index b21cd6f9f3d6f8d53c6e0f78cc9c98ada6284096..0c53aa3b77936f612e29d61a8ab8fa38635c8b12 100644 (file)
 (MOVHreg (MOVWconst [c])) -> (MOVWconst [int64(int16(c))])
 (MOVHUreg (MOVWconst [c])) -> (MOVWconst [int64(uint16(c))])
 (MOVWreg (MOVWconst [c])) -> (MOVWconst [c])
+// BFX: Width = c >> 8, LSB = c & 0xff, result = d << (32 - Width - LSB) >> (32 - Width)
+(BFX [c] (MOVWconst [d])) -> (MOVWconst [int64(int32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
+(BFXU [c] (MOVWconst [d])) -> (MOVWconst [int64(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
 
 // absorb shifts into ops
 (ADD x (SLLconst [c] y)) -> (ADDshiftLL x y [c])
 // floating point optimizations
 (CMPF x (MOVFconst [0])) -> (CMPF0 x)
 (CMPD x (MOVDconst [0])) -> (CMPD0 x)
+
+// bit extraction
+(SRAconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 -> (BFX [(d-c)|(32-d)<<8] x)
+(SRLconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 -> (BFXU [(d-c)|(32-d)<<8] x)
index 93b50135d437d7694140da1b0e97c07d2a60076f..668ee46a549a652d481eba5c419d2f33aacfb847 100644 (file)
@@ -196,6 +196,10 @@ func init() {
                {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"},                    // arg0 &^ arg1
                {name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt
 
+               // bit extraction, AuxInt = Width<<8 | LSB
+               {name: "BFX", argLength: 1, reg: gp11, asm: "BFX", aux: "Int32"},   // extract W bits from bit L in arg0, then signed extend
+               {name: "BFXU", argLength: 1, reg: gp11, asm: "BFXU", aux: "Int32"}, // extract W bits from bit L in arg0, then unsigned extend
+
                // unary ops
                {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
 
index d85ab0aab5d139e0c23655e989a52b058e281ca7..1f0138b610adda9007191d69d0fd91401a6bf273 100644 (file)
@@ -719,6 +719,8 @@ const (
        OpARMXORconst
        OpARMBIC
        OpARMBICconst
+       OpARMBFX
+       OpARMBFXU
        OpARMMVN
        OpARMNEGF
        OpARMNEGD
@@ -8838,6 +8840,34 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:    "BFX",
+               auxType: auxInt32,
+               argLen:  1,
+               asm:     arm.ABFX,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 22527}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 g R12 R14
+                       },
+                       outputs: []outputInfo{
+                               {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
+                       },
+               },
+       },
+       {
+               name:    "BFXU",
+               auxType: auxInt32,
+               argLen:  1,
+               asm:     arm.ABFXU,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 22527}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 g R12 R14
+                       },
+                       outputs: []outputInfo{
+                               {0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
+                       },
+               },
+       },
        {
                name:   "MVN",
                argLen: 1,
index 38695c503dbdf7f173582e81e93e9bb5a7bd5a3f..73aeb81ed7720e8f64c00693f4b8942db34d611f 100644 (file)
@@ -81,6 +81,10 @@ func rewriteValueARM(v *Value) bool {
                return rewriteValueARM_OpARMANDshiftRL_0(v)
        case OpARMANDshiftRLreg:
                return rewriteValueARM_OpARMANDshiftRLreg_0(v)
+       case OpARMBFX:
+               return rewriteValueARM_OpARMBFX_0(v)
+       case OpARMBFXU:
+               return rewriteValueARM_OpARMBFXU_0(v)
        case OpARMBIC:
                return rewriteValueARM_OpARMBIC_0(v)
        case OpARMBICconst:
@@ -3966,6 +3970,40 @@ func rewriteValueARM_OpARMANDshiftRLreg_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueARM_OpARMBFX_0(v *Value) bool {
+       // match: (BFX [c] (MOVWconst [d]))
+       // cond:
+       // result: (MOVWconst [int64(int32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
+       for {
+               c := v.AuxInt
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               d := v_0.AuxInt
+               v.reset(OpARMMOVWconst)
+               v.AuxInt = int64(int32(d) << (32 - uint32(c&0xff) - uint32(c>>8)) >> (32 - uint32(c>>8)))
+               return true
+       }
+       return false
+}
+func rewriteValueARM_OpARMBFXU_0(v *Value) bool {
+       // match: (BFXU [c] (MOVWconst [d]))
+       // cond:
+       // result: (MOVWconst [int64(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
+       for {
+               c := v.AuxInt
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMMOVWconst {
+                       break
+               }
+               d := v_0.AuxInt
+               v.reset(OpARMMOVWconst)
+               v.AuxInt = int64(uint32(d) << (32 - uint32(c&0xff) - uint32(c>>8)) >> (32 - uint32(c>>8)))
+               return true
+       }
+       return false
+}
 func rewriteValueARM_OpARMBIC_0(v *Value) bool {
        // match: (BIC x (MOVWconst [c]))
        // cond:
@@ -13484,6 +13522,25 @@ func rewriteValueARM_OpARMSRAconst_0(v *Value) bool {
                v.AuxInt = int64(int32(d) >> uint64(c))
                return true
        }
+       // match: (SRAconst (SLLconst x [c]) [d])
+       // cond: objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31
+       // result: (BFX [(d-c)|(32-d)<<8] x)
+       for {
+               d := v.AuxInt
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMSLLconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v_0.Args[0]
+               if !(objabi.GOARM == 7 && uint64(d) >= uint64(c) && uint64(d) <= 31) {
+                       break
+               }
+               v.reset(OpARMBFX)
+               v.AuxInt = (d - c) | (32-d)<<8
+               v.AddArg(x)
+               return true
+       }
        return false
 }
 func rewriteValueARM_OpARMSRL_0(v *Value) bool {
@@ -13520,6 +13577,25 @@ func rewriteValueARM_OpARMSRLconst_0(v *Value) bool {
                v.AuxInt = int64(uint32(d) >> uint64(c))
                return true
        }
+       // match: (SRLconst (SLLconst x [c]) [d])
+       // cond: objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31
+       // result: (BFXU [(d-c)|(32-d)<<8] x)
+       for {
+               d := v.AuxInt
+               v_0 := v.Args[0]
+               if v_0.Op != OpARMSLLconst {
+                       break
+               }
+               c := v_0.AuxInt
+               x := v_0.Args[0]
+               if !(objabi.GOARM == 7 && uint64(d) >= uint64(c) && uint64(d) <= 31) {
+                       break
+               }
+               v.reset(OpARMBFXU)
+               v.AuxInt = (d - c) | (32-d)<<8
+               v.AddArg(x)
+               return true
+       }
        return false
 }
 func rewriteValueARM_OpARMSUB_0(v *Value) bool {