Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimize multi-register shifts on amd64
author Josh Bleecher Snyder <josharian@gmail.com>
Fri, 8 Jan 2021 03:25:05 +0000 (19:25 -0800)
committer Josh Bleecher Snyder <josharian@gmail.com>
Thu, 11 Mar 2021 19:11:46 +0000 (19:11 +0000)
amd64 can shift in bits from another register instead of filling with 0/1.
This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts.
In the standard library, it shows up in pure Go math/big.

Benchmark results on amd64 with -tags=math_big_pure_go.

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.45ns ± 3%  4.39ns ± 1%   -1.28%  (p=0.000 n=30+27)
NonZeroShifts/1/shlVU-8       4.13ns ± 4%  4.10ns ± 2%     ~     (p=0.254 n=29+28)
NonZeroShifts/2/shrVU-8       5.55ns ± 1%  5.63ns ± 2%   +1.42%  (p=0.000 n=28+29)
NonZeroShifts/2/shlVU-8       5.70ns ± 2%  5.14ns ± 1%   -9.82%  (p=0.000 n=29+28)
NonZeroShifts/3/shrVU-8       6.79ns ± 2%  6.35ns ± 2%   -6.46%  (p=0.000 n=28+29)
NonZeroShifts/3/shlVU-8       6.69ns ± 1%  6.25ns ± 1%   -6.60%  (p=0.000 n=28+27)
NonZeroShifts/4/shrVU-8       7.79ns ± 2%  7.06ns ± 2%   -9.48%  (p=0.000 n=30+30)
NonZeroShifts/4/shlVU-8       7.82ns ± 1%  7.24ns ± 1%   -7.37%  (p=0.000 n=28+29)
NonZeroShifts/5/shrVU-8       8.90ns ± 3%  7.93ns ± 1%  -10.84%  (p=0.000 n=29+26)
NonZeroShifts/5/shlVU-8       8.68ns ± 1%  7.92ns ± 1%   -8.76%  (p=0.000 n=29+29)
NonZeroShifts/10/shrVU-8      14.4ns ± 1%  12.3ns ± 2%  -14.79%  (p=0.000 n=28+29)
NonZeroShifts/10/shlVU-8      14.1ns ± 1%  11.9ns ± 2%  -15.55%  (p=0.000 n=28+27)
NonZeroShifts/100/shrVU-8      118ns ± 1%    96ns ± 3%  -18.82%  (p=0.000 n=30+29)
NonZeroShifts/100/shlVU-8      120ns ± 2%    98ns ± 2%  -18.46%  (p=0.000 n=29+28)
NonZeroShifts/1000/shrVU-8    1.10µs ± 1%  0.88µs ± 2%  -19.63%  (p=0.000 n=29+30)
NonZeroShifts/1000/shlVU-8    1.10µs ± 2%  0.88µs ± 2%  -20.28%  (p=0.000 n=29+28)
NonZeroShifts/10000/shrVU-8   10.9µs ± 1%   8.7µs ± 1%  -19.78%  (p=0.000 n=28+27)
NonZeroShifts/10000/shlVU-8   10.9µs ± 2%   8.7µs ± 1%  -19.64%  (p=0.000 n=29+27)
NonZeroShifts/100000/shrVU-8   111µs ± 2%    90µs ± 2%  -19.39%  (p=0.000 n=28+29)
NonZeroShifts/100000/shlVU-8   113µs ± 2%    90µs ± 2%  -20.43%  (p=0.000 n=30+27)

The assembly version is still faster, unfortunately, but the gap is narrowing.
Speedup from pure Go to assembly:

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.39ns ± 1%  3.45ns ± 2%  -21.36%  (p=0.000 n=27+29)
NonZeroShifts/1/shlVU-8       4.10ns ± 2%  3.47ns ± 3%  -15.42%  (p=0.000 n=28+30)
NonZeroShifts/2/shrVU-8       5.63ns ± 2%  3.97ns ± 0%  -29.40%  (p=0.000 n=29+25)
NonZeroShifts/2/shlVU-8       5.14ns ± 1%  3.77ns ± 2%  -26.65%  (p=0.000 n=28+26)
NonZeroShifts/3/shrVU-8       6.35ns ± 2%  4.79ns ± 2%  -24.52%  (p=0.000 n=29+29)
NonZeroShifts/3/shlVU-8       6.25ns ± 1%  4.42ns ± 1%  -29.29%  (p=0.000 n=27+26)
NonZeroShifts/4/shrVU-8       7.06ns ± 2%  5.64ns ± 1%  -20.05%  (p=0.000 n=30+29)
NonZeroShifts/4/shlVU-8       7.24ns ± 1%  5.34ns ± 2%  -26.23%  (p=0.000 n=29+29)
NonZeroShifts/5/shrVU-8       7.93ns ± 1%  6.56ns ± 2%  -17.26%  (p=0.000 n=26+30)
NonZeroShifts/5/shlVU-8       7.92ns ± 1%  6.27ns ± 1%  -20.79%  (p=0.000 n=29+25)
NonZeroShifts/10/shrVU-8      12.3ns ± 2%  10.2ns ± 2%  -17.21%  (p=0.000 n=29+29)
NonZeroShifts/10/shlVU-8      11.9ns ± 2%  10.5ns ± 2%  -12.45%  (p=0.000 n=27+29)
NonZeroShifts/100/shrVU-8     95.9ns ± 3%  77.7ns ± 1%  -19.00%  (p=0.000 n=29+30)
NonZeroShifts/100/shlVU-8     97.5ns ± 2%  66.8ns ± 2%  -31.47%  (p=0.000 n=28+30)
NonZeroShifts/1000/shrVU-8     884ns ± 2%   705ns ± 1%  -20.17%  (p=0.000 n=30+28)
NonZeroShifts/1000/shlVU-8     880ns ± 2%   590ns ± 1%  -32.96%  (p=0.000 n=28+25)
NonZeroShifts/10000/shrVU-8   8.74µs ± 1%  7.34µs ± 3%  -15.94%  (p=0.000 n=27+30)
NonZeroShifts/10000/shlVU-8   8.73µs ± 1%  6.00µs ± 1%  -31.25%  (p=0.000 n=27+28)
NonZeroShifts/100000/shrVU-8  89.6µs ± 2%  75.5µs ± 2%  -15.80%  (p=0.000 n=29+29)
NonZeroShifts/100000/shlVU-8  89.6µs ± 2%  68.0µs ± 3%  -24.09%  (p=0.000 n=27+30)

Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958
Reviewed-on: https://go-review.googlesource.com/c/go/+/297050
Trust: Josh Bleecher Snyder <josharian@gmail.com>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
test/codegen/shift.go

index af398c814abf90f006818188a2fbd370a3cf7872..3798c37b34f88fd6999357ad8ce3a74261ca39cc 100644 (file)
@@ -253,6 +253,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
                opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
 
+       case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
+               p := s.Prog(v.Op.Asm())
+               lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = bits
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = lo
+               p.SetFrom3Reg(hi)
+
        case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
                // Arg[0] (the dividend) is in AX.
                // Arg[1] (the divisor) can be in any other register.
index c61b460a5658b0556ca265b7301c87ce07c82389..bece886c0dac1a4af9a9862b2eae6986b1de00ff 100644 (file)
 ((SHRB|SARB)const           x [0]) => x
 ((ROLQ|ROLL|ROLW|ROLB)const x [0]) => x
 
+// Multi-register shifts
+(ORQ (SH(R|L)Q lo bits) (SH(L|R)Q hi (NEGQ bits))) => (SH(R|L)DQ lo hi bits)
+
 // Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
 // because the x86 instructions are defined to use all 5 bits of the shift even
 // for the small shifts. I don't think we'll ever generate a weird shift (e.g.
index 5f5ebaaa3552217945eb318292216ae7465567e4..6bf5be9e47f7f3c3831a4fb5e0f29539fd6cbf50 100644 (file)
@@ -122,6 +122,7 @@ func init() {
                gp21sp         = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
                gp21sb         = regInfo{inputs: []regMask{gpspsbg, gpsp}, outputs: gponly}
                gp21shift      = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
+               gp31shift      = regInfo{inputs: []regMask{gp, gp, cx}, outputs: []regMask{gp}}
                gp11div        = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
                gp21hmul       = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
                gp21flags      = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
@@ -408,6 +409,9 @@ func init() {
                {name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int16(arg0) >> auxint, shift amount 0-15
                {name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed int8(arg0) >> auxint, shift amount 0-7
 
+               {name: "SHRDQ", argLength: 3, reg: gp31shift, asm: "SHRQ", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg2, shifting in bits from arg1 (==(arg1<<64+arg0)>>arg2, keeping low 64 bits), shift amount is mod 64
+               {name: "SHLDQ", argLength: 3, reg: gp31shift, asm: "SHLQ", resultInArg0: true, clobberFlags: true}, // unsigned arg0 << arg2, shifting in bits from arg1 (==(arg0<<64+arg1)<<arg2, keeping high 64 bits), shift amount is mod 64
+
                {name: "ROLQ", argLength: 2, reg: gp21shift, asm: "ROLQ", resultInArg0: true, clobberFlags: true},              // arg0 rotate left arg1 bits.
                {name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true},              // arg0 rotate left arg1 bits.
                {name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true},              // arg0 rotate left arg1 bits.
index 34445cfbf19ed57257e496c49fab3eb36c984459..e65c4c4a18ea88ae38572291e15e870e86d7756d 100644 (file)
@@ -732,6 +732,8 @@ const (
        OpAMD64SARLconst
        OpAMD64SARWconst
        OpAMD64SARBconst
+       OpAMD64SHRDQ
+       OpAMD64SHLDQ
        OpAMD64ROLQ
        OpAMD64ROLL
        OpAMD64ROLW
@@ -9101,6 +9103,40 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "SHRDQ",
+               argLen:       3,
+               resultInArg0: true,
+               clobberFlags: true,
+               asm:          x86.ASHRQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 2},     // CX
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                               {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+                       outputs: []outputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+               },
+       },
+       {
+               name:         "SHLDQ",
+               argLen:       3,
+               resultInArg0: true,
+               clobberFlags: true,
+               asm:          x86.ASHLQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 2},     // CX
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                               {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+                       outputs: []outputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+               },
+       },
        {
                name:         "ROLQ",
                argLen:       2,
index d208624d0e41e90660bbfcfbee41eb01fab59931..ac5bf9d92809e77ddf81ffff61543d234f52efd3 100644 (file)
@@ -18743,6 +18743,54 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
                }
                break
        }
+       // match: (ORQ (SHRQ lo bits) (SHLQ hi (NEGQ bits)))
+       // result: (SHRDQ lo hi bits)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       if v_0.Op != OpAMD64SHRQ {
+                               continue
+                       }
+                       bits := v_0.Args[1]
+                       lo := v_0.Args[0]
+                       if v_1.Op != OpAMD64SHLQ {
+                               continue
+                       }
+                       _ = v_1.Args[1]
+                       hi := v_1.Args[0]
+                       v_1_1 := v_1.Args[1]
+                       if v_1_1.Op != OpAMD64NEGQ || bits != v_1_1.Args[0] {
+                               continue
+                       }
+                       v.reset(OpAMD64SHRDQ)
+                       v.AddArg3(lo, hi, bits)
+                       return true
+               }
+               break
+       }
+       // match: (ORQ (SHLQ lo bits) (SHRQ hi (NEGQ bits)))
+       // result: (SHLDQ lo hi bits)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       if v_0.Op != OpAMD64SHLQ {
+                               continue
+                       }
+                       bits := v_0.Args[1]
+                       lo := v_0.Args[0]
+                       if v_1.Op != OpAMD64SHRQ {
+                               continue
+                       }
+                       _ = v_1.Args[1]
+                       hi := v_1.Args[0]
+                       v_1_1 := v_1.Args[1]
+                       if v_1_1.Op != OpAMD64NEGQ || bits != v_1_1.Args[0] {
+                               continue
+                       }
+                       v.reset(OpAMD64SHLDQ)
+                       v.AddArg3(lo, hi, bits)
+                       return true
+               }
+               break
+       }
        // match: (ORQ (MOVQconst [c]) (MOVQconst [d]))
        // result: (MOVQconst [c|d])
        for {
index ab0ffc2e1302d19a52d800354665f183517f035c..06f6f1247399b8710bc9da0247f275bc277b8abb 100644 (file)
@@ -288,3 +288,16 @@ func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) {
        //ppc64: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
        b[2] = b[v>>25]
 }
+
+// 128 bit shifts
+
+func check128bitShifts(x, y uint64, bits uint) (uint64, uint64) {
+       s := bits & 63
+       ŝ := (64 - bits) & 63 // complementary shift count: equal to -bits mod 64
+       // each OR of two opposing shifts is expected to be merged into one double-register shift; check that the shift operation has two commas (three operands)
+       // amd64:"SHRQ.*,.*,"
+       shr := x>>s | y<<ŝ
+       // amd64:"SHLQ.*,.*,"
+       shl := x<<s | y>>ŝ
+       return shr, shl
+}