]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile/internal/amd64: break dependency for CVTS[LQ]2S[DS]
authorIlya Tocar <ilya.tocar@intel.com>
Wed, 19 Oct 2016 17:21:42 +0000 (20:21 +0300)
committerIlya Tocar <ilya.tocar@intel.com>
Thu, 20 Oct 2016 14:07:23 +0000 (14:07 +0000)
CVTSL2SS, CVTSQ2SS, CVTSL2SD, CVTSQ2SD preserve upper part of xmm register,
introducing false dependency on a previous value.
Break it by xoring destination with itself.
Increases size of go executable by 320 bytes, but shows nice improvement on go1.
Also fixes performance degradation introduced by 1.7.

name                     old time/op    new time/op    delta
BinaryTree17-4              2.20s ± 1%     2.19s ± 0%  -0.36%        (p=0.000 n=18+16)
Fannkuch11-4                2.44s ± 1%     2.45s ± 2%  +0.47%        (p=0.030 n=20+20)
FmtFprintfEmpty-4          40.9ns ± 7%    40.5ns ± 1%    ~           (p=0.531 n=20+16)
FmtFprintfString-4          111ns ± 2%     111ns ± 1%    ~           (p=0.510 n=18+19)
FmtFprintfInt-4            98.3ns ± 3%    99.3ns ± 1%  +1.01%        (p=0.003 n=20+18)
FmtFprintfIntInt-4          148ns ± 3%     147ns ± 1%    ~           (p=0.919 n=20+17)
FmtFprintfPrefixedInt-4     149ns ± 1%     152ns ± 0%  +1.73%        (p=0.000 n=19+17)
FmtFprintfFloat-4           231ns ± 0%     231ns ± 1%    ~           (p=0.678 n=18+19)
FmtManyArgs-4               667ns ± 1%     672ns ± 1%  +0.73%        (p=0.005 n=20+20)
GobDecode-4                5.60ms ± 0%    5.61ms ± 0%  +0.24%        (p=0.000 n=20+20)
GobEncode-4                4.74ms ± 0%    4.73ms ± 1%  -0.20%        (p=0.002 n=20+20)
Gzip-4                      199ms ± 0%     199ms ± 1%  +0.35%        (p=0.000 n=19+20)
Gunzip-4                   31.8ms ± 1%    31.5ms ± 1%  -0.89%        (p=0.000 n=20+20)
HTTPClientServer-4         38.1µs ± 1%    38.0µs ± 1%    ~           (p=0.117 n=19+18)
JSONEncode-4               14.2ms ± 1%    13.4ms ± 0%  -5.73%        (p=0.000 n=20+20)
JSONDecode-4               42.7ms ± 0%    42.7ms ± 1%  +0.18%        (p=0.019 n=18+19)
Mandelbrot200-4            3.26ms ± 0%    2.99ms ± 0%  -8.38%        (p=0.000 n=19+19)
GoParse-4                  2.76ms ± 1%    2.76ms ± 1%    ~           (p=0.583 n=20+20)
RegexpMatchEasy0_32-4      69.5ns ± 0%    69.6ns ± 0%  +0.10%        (p=0.017 n=16+17)
RegexpMatchEasy0_1K-4       703ns ± 0%     708ns ± 3%  +0.65%        (p=0.000 n=17+18)
RegexpMatchEasy1_32-4      68.2ns ± 1%    68.2ns ± 2%    ~           (p=0.094 n=18+20)
RegexpMatchEasy1_1K-4       288ns ± 1%     288ns ± 0%    ~           (p=0.403 n=17+18)
RegexpMatchMedium_32-4      104ns ± 2%     103ns ± 1%    ~           (p=0.110 n=20+16)
RegexpMatchMedium_1K-4     31.7µs ± 3%    31.7µs ± 3%    ~           (p=0.091 n=19+20)
RegexpMatchHard_32-4       1.59µs ± 2%    1.58µs ± 2%    ~           (p=0.083 n=20+20)
RegexpMatchHard_1K-4       48.1µs ± 3%    47.9µs ± 2%    ~           (p=0.461 n=20+19)
Revcomp-4                   344ms ± 0%     345ms ± 0%  +0.08%        (p=0.009 n=18+17)
Template-4                 44.8ms ± 1%    44.7ms ± 1%    ~           (p=0.277 n=20+20)
TimeParse-4                 258ns ± 0%     258ns ± 0%    ~     (all samples are equal)
TimeFormat-4                275ns ± 0%     273ns ± 0%  -0.64%        (p=0.000 n=20+18)

name                     old speed      new speed      delta
GobDecode-4               137MB/s ± 0%   137MB/s ± 0%  -0.24%        (p=0.000 n=20+20)
GobEncode-4               162MB/s ± 0%   162MB/s ± 0%  +0.20%        (p=0.002 n=20+20)
Gzip-4                   97.6MB/s ± 0%  97.3MB/s ± 1%  -0.35%        (p=0.000 n=19+20)
Gunzip-4                  610MB/s ± 1%   615MB/s ± 1%  +0.89%        (p=0.000 n=20+20)
JSONEncode-4              136MB/s ± 1%   145MB/s ± 0%  +6.08%        (p=0.000 n=20+20)
JSONDecode-4             45.5MB/s ± 0%  45.4MB/s ± 1%  -0.17%        (p=0.017 n=18+19)
GoParse-4                21.0MB/s ± 1%  21.0MB/s ± 1%    ~           (p=0.578 n=20+20)
RegexpMatchEasy0_32-4     460MB/s ± 0%   460MB/s ± 0%  -0.09%        (p=0.031 n=16+17)
RegexpMatchEasy0_1K-4    1.46GB/s ± 0%  1.45GB/s ± 3%  -0.64%        (p=0.000 n=17+18)
RegexpMatchEasy1_32-4     469MB/s ± 0%   469MB/s ± 2%  +0.06%        (p=0.043 n=18+20)
RegexpMatchEasy1_1K-4    3.55GB/s ± 1%  3.55GB/s ± 0%    ~           (p=0.057 n=17+18)
RegexpMatchMedium_32-4   9.61MB/s ± 2%  9.64MB/s ± 2%    ~           (p=0.856 n=20+20)
RegexpMatchMedium_1K-4   32.3MB/s ± 3%  32.3MB/s ± 3%    ~           (p=0.085 n=19+20)
RegexpMatchHard_32-4     20.1MB/s ± 2%  20.2MB/s ± 2%    ~           (p=0.086 n=20+20)
RegexpMatchHard_1K-4     21.3MB/s ± 3%  21.4MB/s ± 2%    ~           (p=0.578 n=20+20)
Revcomp-4                 738MB/s ± 0%   737MB/s ± 0%  -0.08%        (p=0.009 n=18+17)
Template-4               43.3MB/s ± 1%  43.4MB/s ± 1%    ~           (p=0.274 n=20+20)

Fixes #16982

Change-Id: If574d66f39f4183a9b1d5ffff0339909cc73f59d
Reviewed-on: https://go-review.googlesource.com/31490
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/amd64/ssa.go

index 54c0c0fb5a89cfe3f300d24448bc52d2ec6648dd..3c997f20ae83bfd4659face71b417935fdf967be 100644 (file)
@@ -637,10 +637,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Index = i
                gc.AddAux2(&p.To, v, sc.Off())
        case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
-               ssa.OpAMD64CVTSL2SS, ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSQ2SD,
                ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
                ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
                opregreg(v.Op.Asm(), v.Reg(), v.Args[0].Reg())
+       case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
+               r := v.Reg()
+               // Break false dependency on destination register.
+               opregreg(x86.AXORPS, r, r)
+               opregreg(v.Op.Asm(), r, v.Args[0].Reg())
        case ssa.OpAMD64DUFFZERO:
                off := duffStart(v.AuxInt)
                adj := duffAdj(v.AuxInt)