From: Jakub Ciolek Date: Sun, 27 Nov 2022 15:36:35 +0000 (+0100) Subject: cmd/compile: Use XORL and X15 for zeroing in ggen's zerorange on AMD64 X-Git-Tag: go1.21rc1~256 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=326df693d700fa42c2740dcc89a14c821d019f52;p=gostls13.git cmd/compile: Use XORL and X15 for zeroing in ggen's zerorange on AMD64 Prefer a SSE store from X15 over XORL REG REG -> MOVQ. Use XORL REG, REG to setup 0 for REP STOS. Remove the nacl related block. Intel Alder Lake 12600k (Linux): name old time/op new time/op delta BinaryTree17-16 1.52s ± 1% 1.52s ± 1% ~ (p=0.932 n=12+12) Fannkuch11-16 1.40s ± 0% 1.46s ± 0% +4.12% (p=0.000 n=12+11) FmtFprintfEmpty-16 13.0ns ± 1% 13.2ns ± 1% +1.37% (p=0.000 n=11+12) FmtFprintfString-16 24.0ns ± 1% 23.2ns ± 1% -3.53% (p=0.000 n=12+11) FmtFprintfInt-16 27.6ns ± 1% 25.5ns ± 1% -7.53% (p=0.000 n=12+12) FmtFprintfIntInt-16 43.1ns ± 1% 41.4ns ± 1% -4.00% (p=0.000 n=12+12) FmtFprintfPrefixedInt-16 56.8ns ± 0% 54.8ns ± 1% -3.49% (p=0.000 n=11+12) FmtFprintfFloat-16 59.0ns ± 0% 59.7ns ± 1% +1.11% (p=0.000 n=12+12) FmtManyArgs-16 159ns ± 1% 160ns ± 1% ~ (p=0.070 n=12+12) GobDecode-16 2.37ms ± 2% 2.39ms ± 1% ~ (p=0.059 n=12+11) GobEncode-16 1.99ms ± 2% 2.00ms ± 2% ~ (p=0.291 n=12+12) Gzip-16 98.7ms ± 1% 99.2ms ± 0% +0.51% (p=0.024 n=12+12) Gunzip-16 13.4ms ± 0% 13.5ms ± 0% +0.20% (p=0.001 n=11+12) HTTPClientServer-16 27.0µs ± 6% 26.5µs ± 4% ~ (p=0.266 n=12+12) JSONEncode-16 3.41ms ± 5% 3.44ms ± 2% ~ (p=0.291 n=12+12) JSONDecode-16 16.6ms ± 1% 16.6ms ± 2% ~ (p=0.872 n=10+12) Mandelbrot200-16 1.78ms ± 0% 1.78ms ± 0% ~ (p=0.514 n=12+12) GoParse-16 1.59ms ± 2% 1.57ms ± 2% -0.82% (p=0.016 n=12+11) RegexpMatchEasy0_32-16 21.5ns ± 1% 21.3ns ± 0% -1.10% (p=0.000 n=12+12) RegexpMatchEasy0_1K-16 71.0ns ± 0% 69.9ns ± 0% -1.58% (p=0.000 n=11+11) RegexpMatchEasy1_32-16 18.0ns ± 1% 17.3ns ± 0% -3.38% (p=0.000 n=12+12) RegexpMatchEasy1_1K-16 97.8ns ± 0% 97.2ns ± 1% -0.56% (p=0.001 n=10+10) RegexpMatchMedium_32-16 269ns ± 0% 270ns ± 2% ~ (p=0.241 n=11+12) RegexpMatchMedium_1K-16 11.4µs ± 0% 11.3µs ± 1% -0.69% (p=0.000 n=11+12) RegexpMatchHard_32-16 522ns ± 0% 522ns ± 1% ~ (p=0.811 n=12+12) RegexpMatchHard_1K-16 15.1µs ± 2% 14.8µs ± 0% -2.17% (p=0.000 n=12+12) Revcomp-16 194ms ± 1% 195ms ± 2% ~ (p=0.059 n=11+12) Template-16 22.0ms ± 2% 21.5ms ± 2% -2.11% (p=0.001 n=12+12) TimeParse-16 97.3ns ± 1% 97.2ns ± 0% ~ (p=0.217 n=11+12) TimeFormat-16 98.2ns ± 2% 97.1ns ± 2% ~ (p=0.101 n=12+12) [Geo mean] 17.7µs 17.6µs -0.77% name old speed new speed delta GobDecode-16 324MB/s ± 2% 322MB/s ± 1% ~ (p=0.058 n=12+11) GobEncode-16 385MB/s ± 2% 383MB/s ± 2% ~ (p=0.291 n=12+12) Gzip-16 197MB/s ± 1% 196MB/s ± 0% -0.51% (p=0.023 n=12+12) Gunzip-16 1.44GB/s ± 0% 1.44GB/s ± 0% -0.20% (p=0.001 n=11+12) JSONEncode-16 570MB/s ± 5% 565MB/s ± 2% ~ (p=0.291 n=12+12) JSONDecode-16 117MB/s ± 1% 117MB/s ± 2% ~ (p=0.885 n=10+12) GoParse-16 36.5MB/s ± 2% 36.8MB/s ± 2% +0.83% (p=0.018 n=12+11) RegexpMatchEasy0_32-16 1.49GB/s ± 1% 1.50GB/s ± 0% +1.12% (p=0.000 n=12+12) RegexpMatchEasy0_1K-16 14.4GB/s ± 0% 14.6GB/s ± 0% +1.61% (p=0.000 n=11+11) RegexpMatchEasy1_32-16 1.78GB/s ± 1% 1.84GB/s ± 0% +3.50% (p=0.000 n=12+12) RegexpMatchEasy1_1K-16 10.5GB/s ± 0% 10.5GB/s ± 1% +0.57% (p=0.001 n=10+10) RegexpMatchMedium_32-16 119MB/s ± 0% 119MB/s ± 2% ~ (p=0.235 n=11+12) RegexpMatchMedium_1K-16 90.1MB/s ± 0% 90.8MB/s ± 1% +0.69% (p=0.000 n=11+12) RegexpMatchHard_32-16 61.3MB/s ± 0% 61.3MB/s ± 1% ~ (p=0.765 n=12+12) RegexpMatchHard_1K-16 67.6MB/s ± 2% 69.1MB/s ± 0% +2.20% (p=0.000 n=12+12) Revcomp-16 1.31GB/s ± 1% 1.30GB/s ± 2% ~ (p=0.059 n=11+12) Template-16 88.3MB/s ± 2% 90.2MB/s ± 2% +2.16% (p=0.001 n=12+12) [Geo mean] 401MB/s 403MB/s +0.49% file before after Δ % runtime.s 512467 512447 -20 -0.004% sync.s 16219 16197 -22 -0.136% internal/singleflight.s 2617 2616 -1 -0.038% internal/testlog.s 2157 2152 -5 -0.232% io.s 18992 18980 -12 -0.063% text/tabwriter.s 8952 8913 -39 -0.436% syscall.s 85241 85220 -21 -0.025% go/build/constraint.s 12763 12741 -22 -0.172% time.s 100682 100672 -10 -0.010% context.s 12316 12305 -11 -0.089% internal/poll.s 45297 45114 -183 -0.404% io/fs.s 16767 16763 -4 -0.024% crypto/hmac.s 2546 2537 -9 -0.353% os.s 53983 53964 -19 -0.035% os/exec.s 25723 25710 -13 -0.051% os/user.s 12166 12133 -33 -0.271% debug/gosym.s 36980 36948 -32 -0.087% database/sql.s 90990 90863 -127 -0.140% archive/zip.s 52485 52481 -4 -0.008% debug/dwarf.s 117251 117219 -32 -0.027% encoding/json.s 95637 95579 -58 -0.061% net.s 278084 278056 -28 -0.010% log.s 12153 12121 -32 -0.263% vendor/golang.org/x/net/http2/hpack.s 22562 22552 -10 -0.044% mime.s 32872 32851 -21 -0.064% vendor/golang.org/x/crypto/cryptobyte.s 32035 32024 -11 -0.034% go/token.s 13689 13645 -44 -0.321% image/gif.s 22700 22668 -32 -0.141% text/template/parse.s 81696 81683 -13 -0.016% image/png.s 37704 37692 -12 -0.032% go/ast.s 63753 63751 -2 -0.003% internal/dag.s 13123 13122 -1 -0.008% crypto/x509.s 137641 137635 -6 -0.004% text/template.s 106615 106592 -23 -0.022% os/signal.s 7658 7651 -7 -0.091% go/printer.s 90393 90384 -9 -0.010% runtime/trace.s 2844 2819 -25 -0.879% go/parser.s 111432 111144 -288 -0.258% html/template.s 91633 91619 -14 -0.015% log/syslog.s 6612 6593 -19 -0.287% net/internal/socktest.s 15715 15684 -31 -0.197% runtime/pprof.s 70273 70177 -96 -0.137% crypto/tls.s 288762 288684 -78 -0.027% testing.s 112376 112300 -76 -0.068% internal/fuzz.s 89544 89535 -9 -0.010% net/smtp.s 11357 11325 -32 -0.282% vendor/golang.org/x/net/nettest.s 27449 27361 -88 -0.321% testing/internal/testdeps.s 6384 6369 -15 -0.235% go/types.s 484464 484324 -140 -0.029% cmd/internal/buildid.s 17646 17625 -21 -0.119% go/internal/gccgoimporter.s 44931 44920 -11 -0.024% go/internal/srcimporter.s 9989 9983 -6 -0.060% cmd/api.s 35098 35075 -23 -0.066% net/http.s 551134 550680 -454 -0.082% cmd/internal/obj.s 121795 121750 -45 -0.037% cmd/compile/internal/syntax.s 170092 170075 -17 -0.010% expvar.s 8959 8945 -14 -0.156% net/http/httptest.s 16201 16198 -3 -0.019% net/http/httputil.s 44379 44339 -40 -0.090% net/http/fcgi.s 18105 18099 -6 -0.033% cmd/compile/internal/logopt.s 9916 9905 -11 -0.111% cmd/vendor/golang.org/x/tools/cover.s 9722 9720 -2 -0.021% cmd/compile/internal/base.s 38986 38982 -4 -0.010% cmd/covdata.s 39190 39182 -8 -0.020% cmd/go/internal/fsys.s 17948 17938 -10 -0.056% cmd/dist.s 169725 169616 -109 -0.064% cmd/compile/internal/types.s 74666 74639 -27 -0.036% cmd/go/internal/par.s 4517 4516 -1 -0.022% cmd/go/internal/lockedfile.s 22412 22410 -2 -0.009% cmd/compile/internal/types2.s 476715 476596 -119 -0.025% cmd/go/internal/trace.s 4888 4877 -11 -0.225% cmd/vendor/golang.org/x/mod/module.s 20445 20415 -30 -0.147% cmd/vendor/golang.org/x/mod/sumdb/dirhash.s 3596 3587 -9 -0.250% cmd/go/internal/cache.s 23871 23839 -32 -0.134% cmd/vendor/golang.org/x/mod/zip.s 36602 36567 -35 -0.096% cmd/vendor/golang.org/x/mod/sumdb.s 27669 27552 -117 -0.423% cmd/compile/internal/ir.s 253597 253590 -7 -0.003% cmd/go/internal/vcs.s 45027 45025 -2 -0.004% cmd/go/internal/modfetch/codehost.s 80672 80583 -89 -0.110% cmd/go/internal/modindex.s 79273 79271 -2 -0.003% cmd/go/internal/modconv.s 14793 14792 -1 -0.007% cmd/gofmt.s 29600 29587 -13 -0.044% cmd/go/internal/modfetch.s 111067 111017 -50 -0.045% cmd/go/internal/vcweb.s 37882 37872 -10 -0.026% cmd/compile/internal/typecheck.s 319852 319834 -18 -0.006% cmd/vendor/golang.org/x/term.s 24398 24338 -60 -0.246% cmd/vendor/github.com/google/pprof/profile.s 149107 149103 -4 -0.003% cmd/go/internal/modload.s 275078 275020 -58 -0.021% cmd/vendor/github.com/ianlancetaylor/demangle.s 264051 264013 -38 -0.014% cmd/compile/internal/staticdata.s 14019 14011 -8 -0.057% cmd/go/internal/modcmd.s 47591 47582 -9 -0.019% cmd/vendor/github.com/google/pprof/internal/binutils.s 37978 37926 -52 -0.137% cmd/vendor/golang.org/x/tools/go/analysis/passes/buildtag.s 6880 6864 -16 -0.233% cmd/go/internal/work.s 287700 287637 -63 -0.022% cmd/vendor/github.com/google/pprof/internal/symbolizer.s 10171 10138 -33 -0.324% cmd/go/internal/modget.s 58314 58250 -64 -0.110% cmd/go/internal/test.s 52538 52534 -4 -0.008% cmd/trace.s 100245 100242 -3 -0.003% cmd/vendor/golang.org/x/tools/go/ast/astutil.s 50731 50724 -7 -0.014% cmd/vendor/golang.org/x/tools/internal/facts.s 13018 13011 -7 -0.054% cmd/link/internal/ld.s 562438 562377 -61 -0.011% cmd/vendor/github.com/google/pprof/internal/driver.s 148389 148338 -51 -0.034% cmd/compile/internal/ssa.s 3639799 3639727 -72 -0.002% cmd/compile/internal/ssagen.s 359076 359028 -48 -0.013% cmd/compile/internal/amd64.s 31084 30582 -502 -1.615% cmd/compile/internal/noder.s 407972 407949 -23 -0.006% cmd/compile/internal/gc.s 20101 20069 -32 -0.159% total 20771294 20766881 -4413 -0.021% Change-Id: I2a4b01449e94906fa1ed3fb96a790977466368d9 Reviewed-on: https://go-review.googlesource.com/c/go/+/453536 Reviewed-by: Michael Knyszek TryBot-Result: Gopher Robot Auto-Submit: Dmitri Shuralyov Run-TryBot: Dmitri Shuralyov Reviewed-by: Keith Randall Reviewed-by: Keith Randall --- diff --git a/src/cmd/compile/internal/amd64/ggen.go b/src/cmd/compile/internal/amd64/ggen.go index b8dce81a92..db98a22a1e 100644 --- a/src/cmd/compile/internal/amd64/ggen.go +++ b/src/cmd/compile/internal/amd64/ggen.go @@ -5,7 +5,6 @@ package amd64 import ( - "cmd/compile/internal/base" "cmd/compile/internal/ir" "cmd/compile/internal/objw" "cmd/compile/internal/types" @@ -63,26 +62,8 @@ func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj. return p } - if cnt%int64(types.RegSize) != 0 { - // should only happen with nacl - if cnt%int64(types.PtrSize) != 0 { - base.Fatalf("zerorange count not a multiple of widthptr %d", cnt) - } - if *state&r13 == 0 { - p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_R13, 0) - *state |= r13 - } - p = pp.Append(p, x86.AMOVL, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_MEM, x86.REG_SP, off) - off += int64(types.PtrSize) - cnt -= int64(types.PtrSize) - } - if cnt == 8 { - if *state&r13 == 0 { - p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_R13, 0) - *state |= r13 - } - p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_MEM, x86.REG_SP, off) + p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off) } else if !isPlan9 && cnt <= int64(8*types.RegSize) { for i := int64(0); i < cnt/16; i++ { p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16) @@ -120,7 +101,7 @@ func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj. p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0) // Set up the REPSTOSQ and kick it off. - p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) + p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0) p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0) p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0) p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)