]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: optimize indexbytebody on amd64
authorIlya Tocar <ilya.tocar@intel.com>
Thu, 29 Oct 2015 15:52:22 +0000 (18:52 +0300)
committerKeith Randall <khr@golang.org>
Fri, 6 Nov 2015 15:16:28 +0000 (15:16 +0000)
Use avx2 to compare 32 bytes per iteration.
Results (haswell):

name                    old time/op    new time/op     delta
IndexByte32-6             15.5ns ± 0%     14.7ns ± 5%   -4.87%        (p=0.000 n=16+20)
IndexByte4K-6              360ns ± 0%      183ns ± 0%  -49.17%        (p=0.000 n=19+20)
IndexByte4M-6              384µs ± 0%      256µs ± 1%  -33.41%        (p=0.000 n=20+20)
IndexByte64M-6            6.20ms ± 0%     4.18ms ± 1%  -32.52%        (p=0.000 n=19+20)
IndexBytePortable32-6     73.4ns ± 5%     75.8ns ± 3%   +3.35%        (p=0.000 n=20+19)
IndexBytePortable4K-6     5.15µs ± 0%     5.15µs ± 0%     ~     (all samples are equal)
IndexBytePortable4M-6     5.26ms ± 0%     5.25ms ± 0%   -0.12%        (p=0.000 n=20+18)
IndexBytePortable64M-6    84.1ms ± 0%     84.1ms ± 0%   -0.08%        (p=0.012 n=18+20)
Index32-6                  352ns ± 0%      352ns ± 0%     ~     (all samples are equal)
Index4K-6                 53.8µs ± 0%     53.8µs ± 0%   -0.03%        (p=0.000 n=16+18)
Index4M-6                 55.4ms ± 0%     55.4ms ± 0%     ~           (p=0.149 n=20+19)
Index64M-6                 886ms ± 0%      886ms ± 0%     ~           (p=0.108 n=20+20)
IndexEasy32-6             80.3ns ± 0%     80.1ns ± 0%   -0.21%        (p=0.000 n=20+20)
IndexEasy4K-6              426ns ± 0%      215ns ± 0%  -49.53%        (p=0.000 n=20+20)
IndexEasy4M-6              388µs ± 0%      262µs ± 1%  -32.42%        (p=0.000 n=18+20)
IndexEasy64M-6            6.20ms ± 0%     4.19ms ± 1%  -32.47%        (p=0.000 n=18+20)

name                    old speed      new speed       delta
IndexByte32-6           2.06GB/s ± 1%   2.17GB/s ± 5%   +5.19%        (p=0.000 n=18+20)
IndexByte4K-6           11.4GB/s ± 0%   22.3GB/s ± 0%  +96.45%        (p=0.000 n=17+20)
IndexByte4M-6           10.9GB/s ± 0%   16.4GB/s ± 1%  +50.17%        (p=0.000 n=20+20)
IndexByte64M-6          10.8GB/s ± 0%   16.0GB/s ± 1%  +48.19%        (p=0.000 n=19+20)
IndexBytePortable32-6    436MB/s ± 5%    422MB/s ± 3%   -3.27%        (p=0.000 n=20+19)
IndexBytePortable4K-6    795MB/s ± 0%    795MB/s ± 0%     ~           (p=0.940 n=17+18)
IndexBytePortable4M-6    798MB/s ± 0%    799MB/s ± 0%   +0.12%        (p=0.000 n=20+18)
IndexBytePortable64M-6   798MB/s ± 0%    798MB/s ± 0%   +0.08%        (p=0.011 n=18+20)
Index32-6               90.9MB/s ± 0%   90.9MB/s ± 0%   -0.00%        (p=0.025 n=20+20)
Index4K-6               76.1MB/s ± 0%   76.1MB/s ± 0%   +0.03%        (p=0.000 n=14+15)
Index4M-6               75.7MB/s ± 0%   75.7MB/s ± 0%     ~           (p=0.076 n=20+19)
Index64M-6              75.7MB/s ± 0%   75.7MB/s ± 0%     ~           (p=0.456 n=20+17)
IndexEasy32-6            399MB/s ± 0%    399MB/s ± 0%   +0.20%        (p=0.000 n=20+19)
IndexEasy4K-6           9.60GB/s ± 0%  19.02GB/s ± 0%  +98.19%        (p=0.000 n=20+20)
IndexEasy4M-6           10.8GB/s ± 0%   16.0GB/s ± 1%  +47.98%        (p=0.000 n=18+20)
IndexEasy64M-6          10.8GB/s ± 0%   16.0GB/s ± 1%  +48.08%        (p=0.000 n=18+20)

Change-Id: I46075921dde9f3580a89544c0b3a2d8c9181ebc4
Reviewed-on: https://go-review.googlesource.com/16484
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Klaus Post <klauspost@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/cmd/internal/obj/x86/a.out.go
src/cmd/internal/obj/x86/anames.go
src/cmd/internal/obj/x86/asm6.go
src/runtime/asm_amd64.s

index 345135ceecb63cdb6241df57c0b101462bc879a1..0b5d8eb97674426db34124140dc11c651b857fab 100644 (file)
@@ -749,6 +749,8 @@ const (
        AVPCMPEQB
        AVPMOVMSKB
        AVPAND
+       AVPTEST
+       AVPBROADCASTB
 
        // from 386
        AJCXZW
index 2f1374ada0e13db172d097cf138a88af27a7fa7c..f545baf9940e0c20d284baf0aaa232b82585266d 100644 (file)
@@ -690,6 +690,8 @@ var Anames = []string{
        "VPCMPEQB",
        "VPMOVMSKB",
        "VPAND",
+       "VPTEST",
+       "VPBROADCASTB",
        "JCXZW",
        "FCMOVCC",
        "FCMOVCS",
index 6e3093819b36b621f5b2dea1ca9d3eeb3d1ca3a7..919e00b6e4bd708d54ad46169fa7afb286b222ed 100644 (file)
@@ -219,8 +219,9 @@ const (
        Pf2   = 0xf2 /* xmm escape 1: f2 0f */
        Pf3   = 0xf3 /* xmm escape 2: f3 0f */
        Pq3   = 0x67 /* xmm escape 3: 66 48 0f */
-       Pvex1 = 0xc5 /* 66 escape, vex encoding */
-       Pvex2 = 0xc6 /* f3 escape, vex encoding */
+       Pvex1 = 0xc5 /* 66.0f escape, vex encoding */
+       Pvex2 = 0xc6 /* f3.0f escape, vex encoding */
+       Pvex3 = 0xc7 /* 66.0f38 escape, vex encoding */
        Pw    = 0x48 /* Rex.w */
        Pw8   = 0x90 // symbolic; exact value doesn't matter
        Py    = 0x80 /* defaults to 64-bit mode */
@@ -631,6 +632,11 @@ var yxr_ml_vex = []ytab{
        {Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
 }
 
+var yml_xr_vex = []ytab{
+       {Yml, Ynone, Yxr, Zm_r_xm_vex, 1},
+       {Yxr, Ynone, Yxr, Zm_r_xm_vex, 1},
+}
+
 var yxm_xm_xm = []ytab{
        {Yxr, Yxr, Yxr, Zr_r_r_vex, 1},
        {Yxm, Yxr, Yxr, Zr_r_r_vex, 1},
@@ -1510,6 +1516,8 @@ var optab =
        {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
        {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
        {AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
+       {AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}},
+       {AVPTEST, yml_xr_vex, Pvex3, [23]uint8{0x17, 0x17}},
        {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
        {obj.ATYPE, nil, 0, [23]uint8{}},
        {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
@@ -2965,13 +2973,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr
        rexX := regrex[from.Index]
        var prefBit uint8
        // This will go into VEX.PP field.
-       if pref == Pvex1 {
+       if pref == Pvex1 || pref == Pvex3 {
                prefBit = 1
        } else if pref == Pvex2 {
                prefBit = 2
-       } // TODO add Pvex0,Pvex3
+       } // TODO add Pvex0
 
-       if rexX == 0 && rexB == 0 { // 2-byte vex prefix
+       if rexX == 0 && rexB == 0 && pref != Pvex3 { // 2-byte vex prefix
                // In 2-byte case, first byte is always C5
                ctxt.Andptr[0] = 0xc5
                ctxt.Andptr = ctxt.Andptr[1:]
@@ -2998,9 +3006,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr
                ctxt.Andptr[0] = 0xc4
                ctxt.Andptr = ctxt.Andptr[1:]
 
-               // Encode VEX.mmmmm with prefix value, for now assume 0F 38,
-               // which encodes as 1.
-               ctxt.Andptr[0] = 0x1 // TODO handle different prefix
+               // Encode VEX.mmmmm with prefix value, assume 0F,
+               // which encodes as 1, unless 0F38 was specified with pvex3.
+               ctxt.Andptr[0] = 0x1 // TODO handle 0F3A
+               if pref == Pvex3 {
+                       ctxt.Andptr[0] = 0x2
+               }
+
                // REX.[RXB] are inverted and encoded in 3 upper bits
                if rexR == 0 {
                        ctxt.Andptr[0] |= 0x80
index 8401accbcdc02bea73fdc5ec63008b463fef39f7..68b342d4dbdb85db987acf114f48bd4693ead795 100644 (file)
@@ -1940,6 +1940,9 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0
        CMPQ BX, $16
        JLT small
 
+       CMPQ BX, $32
+       JA avx2
+no_avx2:
        // round up to first 16-byte boundary
        TESTQ $15, SI
        JZ aligned
@@ -2003,6 +2006,38 @@ small:
        MOVQ $-1, (R8)
        RET
 
+avx2:
+       CMPB   runtime·support_avx2(SB), $1
+       JNE no_avx2
+       MOVD AX, X0
+       LEAQ -32(SI)(BX*1), R11
+       VPBROADCASTB  X0, X1
+avx2_loop:
+       MOVHDU (DI), X2
+       VPCMPEQB X1, X2, X3
+       VPTEST X3, X3
+       JNZ avx2success
+       ADDQ $32, DI
+       CMPQ DI, R11
+       JLT avx2_loop
+       MOVQ R11, DI
+       MOVHDU (DI), X2
+       VPCMPEQB X1, X2, X3
+       VPTEST X3, X3
+       JNZ avx2success
+       VZEROUPPER
+       MOVQ $-1, (R8)
+       RET
+
+avx2success:
+       VPMOVMSKB X3, DX
+       BSFL DX, DX
+       SUBQ SI, DI
+       ADDQ DI, DX
+       MOVQ DX, (R8)
+       VZEROUPPER
+       RET
+
 // we've found the chunk containing the byte
 // now just figure out which specific byte it is
 ssesuccess: