From: Ilya Tocar Date: Thu, 29 Oct 2015 15:52:22 +0000 (+0300) Subject: runtime: optimize indexbytebody on amd64 X-Git-Tag: go1.6beta1~602 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=321a40721bbfcb9bcf6113d4e8afd1bc030f1d8f;p=gostls13.git runtime: optimize indexbytebody on amd64 Use avx2 to compare 32 bytes per iteration. Results (haswell): name old time/op new time/op delta IndexByte32-6 15.5ns ± 0% 14.7ns ± 5% -4.87% (p=0.000 n=16+20) IndexByte4K-6 360ns ± 0% 183ns ± 0% -49.17% (p=0.000 n=19+20) IndexByte4M-6 384µs ± 0% 256µs ± 1% -33.41% (p=0.000 n=20+20) IndexByte64M-6 6.20ms ± 0% 4.18ms ± 1% -32.52% (p=0.000 n=19+20) IndexBytePortable32-6 73.4ns ± 5% 75.8ns ± 3% +3.35% (p=0.000 n=20+19) IndexBytePortable4K-6 5.15µs ± 0% 5.15µs ± 0% ~ (all samples are equal) IndexBytePortable4M-6 5.26ms ± 0% 5.25ms ± 0% -0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 84.1ms ± 0% 84.1ms ± 0% -0.08% (p=0.012 n=18+20) Index32-6 352ns ± 0% 352ns ± 0% ~ (all samples are equal) Index4K-6 53.8µs ± 0% 53.8µs ± 0% -0.03% (p=0.000 n=16+18) Index4M-6 55.4ms ± 0% 55.4ms ± 0% ~ (p=0.149 n=20+19) Index64M-6 886ms ± 0% 886ms ± 0% ~ (p=0.108 n=20+20) IndexEasy32-6 80.3ns ± 0% 80.1ns ± 0% -0.21% (p=0.000 n=20+20) IndexEasy4K-6 426ns ± 0% 215ns ± 0% -49.53% (p=0.000 n=20+20) IndexEasy4M-6 388µs ± 0% 262µs ± 1% -32.42% (p=0.000 n=18+20) IndexEasy64M-6 6.20ms ± 0% 4.19ms ± 1% -32.47% (p=0.000 n=18+20) name old speed new speed delta IndexByte32-6 2.06GB/s ± 1% 2.17GB/s ± 5% +5.19% (p=0.000 n=18+20) IndexByte4K-6 11.4GB/s ± 0% 22.3GB/s ± 0% +96.45% (p=0.000 n=17+20) IndexByte4M-6 10.9GB/s ± 0% 16.4GB/s ± 1% +50.17% (p=0.000 n=20+20) IndexByte64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.19% (p=0.000 n=19+20) IndexBytePortable32-6 436MB/s ± 5% 422MB/s ± 3% -3.27% (p=0.000 n=20+19) IndexBytePortable4K-6 795MB/s ± 0% 795MB/s ± 0% ~ (p=0.940 n=17+18) IndexBytePortable4M-6 798MB/s ± 0% 799MB/s ± 0% +0.12% (p=0.000 n=20+18) IndexBytePortable64M-6 798MB/s ± 0% 798MB/s ± 0% +0.08% (p=0.011 n=18+20) Index32-6 90.9MB/s ± 0% 90.9MB/s ± 0% -0.00% (p=0.025 n=20+20) Index4K-6 76.1MB/s ± 0% 76.1MB/s ± 0% +0.03% (p=0.000 n=14+15) Index4M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.076 n=20+19) Index64M-6 75.7MB/s ± 0% 75.7MB/s ± 0% ~ (p=0.456 n=20+17) IndexEasy32-6 399MB/s ± 0% 399MB/s ± 0% +0.20% (p=0.000 n=20+19) IndexEasy4K-6 9.60GB/s ± 0% 19.02GB/s ± 0% +98.19% (p=0.000 n=20+20) IndexEasy4M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +47.98% (p=0.000 n=18+20) IndexEasy64M-6 10.8GB/s ± 0% 16.0GB/s ± 1% +48.08% (p=0.000 n=18+20) Change-Id: I46075921dde9f3580a89544c0b3a2d8c9181ebc4 Reviewed-on: https://go-review.googlesource.com/16484 Reviewed-by: Keith Randall Run-TryBot: Ilya Tocar Reviewed-by: Klaus Post TryBot-Result: Gobot Gobot --- diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index 345135ceec..0b5d8eb976 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -749,6 +749,8 @@ const ( AVPCMPEQB AVPMOVMSKB AVPAND + AVPTEST + AVPBROADCASTB // from 386 AJCXZW diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 2f1374ada0..f545baf994 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -690,6 +690,8 @@ var Anames = []string{ "VPCMPEQB", "VPMOVMSKB", "VPAND", + "VPTEST", + "VPBROADCASTB", "JCXZW", "FCMOVCC", "FCMOVCS", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 6e3093819b..919e00b6e4 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -219,8 +219,9 @@ const ( Pf2 = 0xf2 /* xmm escape 1: f2 0f */ Pf3 = 0xf3 /* xmm escape 2: f3 0f */ Pq3 = 0x67 /* xmm escape 3: 66 48 0f */ - Pvex1 = 0xc5 /* 66 escape, vex encoding */ - Pvex2 = 0xc6 /* f3 escape, vex encoding */ + Pvex1 = 0xc5 /* 66.0f escape, vex encoding */ + Pvex2 = 0xc6 /* f3.0f escape, vex encoding */ + Pvex3 = 0xc7 /* 66.0f38 escape, vex encoding */ Pw = 0x48 /* Rex.w */ Pw8 = 0x90 // symbolic; exact value doesn't matter Py = 0x80 /* defaults to 64-bit mode */ @@ -631,6 +632,11 @@ var yxr_ml_vex = []ytab{ {Yxr, Ynone, Yml, Zr_m_xm_vex, 1}, } +var yml_xr_vex = []ytab{ + {Yml, Ynone, Yxr, Zm_r_xm_vex, 1}, + {Yxr, Ynone, Yxr, Zm_r_xm_vex, 1}, +} + var yxm_xm_xm = []ytab{ {Yxr, Yxr, Yxr, Zr_r_r_vex, 1}, {Yxm, Yxr, Yxr, Zr_r_r_vex, 1}, @@ -1510,6 +1516,8 @@ var optab = {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}}, {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}}, {AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}}, + {AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}}, + {AVPTEST, yml_xr_vex, Pvex3, [23]uint8{0x17, 0x17}}, {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}}, {obj.ATYPE, nil, 0, [23]uint8{}}, {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}}, @@ -2965,13 +2973,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr rexX := regrex[from.Index] var prefBit uint8 // This will go into VEX.PP field. - if pref == Pvex1 { + if pref == Pvex1 || pref == Pvex3 { prefBit = 1 } else if pref == Pvex2 { prefBit = 2 - } // TODO add Pvex0,Pvex3 + } // TODO add Pvex0 - if rexX == 0 && rexB == 0 { // 2-byte vex prefix + if rexX == 0 && rexB == 0 && pref != Pvex3 { // 2-byte vex prefix // In 2-byte case, first byte is always C5 ctxt.Andptr[0] = 0xc5 ctxt.Andptr = ctxt.Andptr[1:] @@ -2998,9 +3006,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr ctxt.Andptr[0] = 0xc4 ctxt.Andptr = ctxt.Andptr[1:] - // Encode VEX.mmmmm with prefix value, for now assume 0F 38, - // which encodes as 1. - ctxt.Andptr[0] = 0x1 // TODO handle different prefix + // Encode VEX.mmmmm with prefix value, assume 0F, + // which encodes as 1, unless 0F38 was specified with pvex3. + ctxt.Andptr[0] = 0x1 // TODO handle 0F3A + if pref == Pvex3 { + ctxt.Andptr[0] = 0x2 + } + // REX.[RXB] are inverted and encoded in 3 upper bits if rexR == 0 { ctxt.Andptr[0] |= 0x80 diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 8401accbcd..68b342d4db 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1940,6 +1940,9 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0 CMPQ BX, $16 JLT small + CMPQ BX, $32 + JA avx2 +no_avx2: // round up to first 16-byte boundary TESTQ $15, SI JZ aligned @@ -2003,6 +2006,38 @@ small: MOVQ $-1, (R8) RET +avx2: + CMPB runtime·support_avx2(SB), $1 + JNE no_avx2 + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, X1 +avx2_loop: + MOVHDU (DI), X2 + VPCMPEQB X1, X2, X3 + VPTEST X3, X3 + JNZ avx2success + ADDQ $32, DI + CMPQ DI, R11 + JLT avx2_loop + MOVQ R11, DI + MOVHDU (DI), X2 + VPCMPEQB X1, X2, X3 + VPTEST X3, X3 + JNZ avx2success + VZEROUPPER + MOVQ $-1, (R8) + RET + +avx2success: + VPMOVMSKB X3, DX + BSFL DX, DX + SUBQ SI, DI + ADDQ DI, DX + MOVQ DX, (R8) + VZEROUPPER + RET + // we've found the chunk containing the byte // now just figure out which specific byte it is ssesuccess: