]> Cypherpunks repositories - gostls13.git/commitdiff
bytes: speed up Compare() on amd64
authorIlya Tocar <ilya.tocar@intel.com>
Wed, 28 Oct 2015 20:20:26 +0000 (23:20 +0300)
committerKeith Randall <khr@golang.org>
Mon, 2 Nov 2015 18:39:38 +0000 (18:39 +0000)
Use AVX2 if available.
Results (haswell), below:

name                           old time/op    new time/op     delta
BytesCompare1-6                  11.4ns ± 0%     11.4ns ± 0%     ~     (all samples are equal)
BytesCompare2-6                  11.4ns ± 0%     11.4ns ± 0%     ~     (all samples are equal)
BytesCompare4-6                  11.4ns ± 0%     11.4ns ± 0%     ~     (all samples are equal)
BytesCompare8-6                  9.29ns ± 2%     8.76ns ± 0%   -5.72%        (p=0.000 n=16+17)
BytesCompare16-6                 9.29ns ± 2%     9.20ns ± 0%   -1.02%        (p=0.000 n=20+16)
BytesCompare32-6                 11.4ns ± 1%     11.4ns ± 0%     ~           (p=0.191 n=20+20)
BytesCompare64-6                 14.4ns ± 0%     13.1ns ± 0%   -8.68%        (p=0.000 n=20+20)
BytesCompare128-6                20.2ns ± 0%     18.5ns ± 0%   -8.27%        (p=0.000 n=16+20)
BytesCompare256-6                29.3ns ± 0%     24.5ns ± 0%  -16.38%        (p=0.000 n=16+16)
BytesCompare512-6                46.8ns ± 0%     37.1ns ± 0%  -20.78%        (p=0.000 n=18+16)
BytesCompare1024-6               82.9ns ± 0%     62.3ns ± 0%  -24.86%        (p=0.000 n=20+14)
BytesCompare2048-6                155ns ± 0%      112ns ± 0%  -27.74%        (p=0.000 n=20+20)
CompareBytesEqual-6              10.1ns ± 1%     10.0ns ± 1%     ~           (p=0.527 n=20+20)
CompareBytesToNil-6              10.0ns ± 2%      9.4ns ± 0%   -6.57%        (p=0.000 n=20+17)
CompareBytesEmpty-6              8.76ns ± 0%     8.76ns ± 0%     ~     (all samples are equal)
CompareBytesIdentical-6          8.76ns ± 0%     8.76ns ± 0%     ~     (all samples are equal)
CompareBytesSameLength-6         10.6ns ± 1%     10.6ns ± 1%     ~           (p=0.240 n=20+20)
CompareBytesDifferentLength-6    10.6ns ± 0%     10.6ns ± 1%     ~           (p=1.000 n=20+20)
CompareBytesBigUnaligned-6        132±s ± 1%      105±s ± 1%  -20.61%        (p=0.000 n=20+18)
CompareBytesBig-6                 125±s ± 1%      105±s ± 1%  -16.31%        (p=0.000 n=20+20)
CompareBytesBigIdentical-6       8.13ns ± 0%     8.13ns ± 0%     ~     (all samples are equal)

name                           old speed      new speed       delta
CompareBytesBigUnaligned-6     7.94GB/s ± 1%  10.01GB/s ± 1%  +25.96%        (p=0.000 n=20+18)
CompareBytesBig-6              8.38GB/s ± 1%  10.01GB/s ± 1%  +19.48%        (p=0.000 n=20+20)
CompareBytesBigIdentical-6      129TB/s ± 0%    129TB/s ± 0%   +0.01%        (p=0.003 n=17+19)

Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5
Reviewed-on: https://go-review.googlesource.com/16434
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Klaus Post <klauspost@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/internal/obj/x86/a.out.go
src/cmd/internal/obj/x86/anames.go
src/cmd/internal/obj/x86/asm6.go
src/runtime/asm_amd64.s
src/runtime/runtime2.go

index 108ca6a289eb51013062fc4150af428303d06578..55fc31ddaf1010673641ad7e445117d58ab3279f 100644 (file)
@@ -746,6 +746,8 @@ const (
        AMOVHDU
        AMOVNTHD
        AMOVHDA
+       AVPCMPEQB
+       AVPMOVMSKB
 
        // from 386
        AJCXZW
index c075a15c80bd22aa0bb055fba1b802e25cbce079..729b9d423bc437149f87f6acc4d10660d57db621 100644 (file)
@@ -687,6 +687,8 @@ var Anames = []string{
        "MOVHDU",
        "MOVNTHD",
        "MOVHDA",
+       "VPCMPEQB",
+       "VPMOVMSKB",
        "JCXZW",
        "FCMOVCC",
        "FCMOVCS",
index f03df5bf00d757c9c405b920372de134f9240f2d..739ba671058f11599719a58a564441a2e033b3aa 100644 (file)
@@ -195,6 +195,7 @@ const (
        Zr_m
        Zr_m_xm
        Zr_m_xm_vex
+       Zr_r_r_vex
        Zrp_
        Z_ib
        Z_il
@@ -630,6 +631,11 @@ var yxr_ml_vex = []ytab{
        {Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
 }
 
+var yxm_xm_xm = []ytab{
+       {Yxr, Yxr, Yxr, Zr_r_r_vex, 1},
+       {Yxm, Yxr, Yxr, Zr_r_r_vex, 1},
+}
+
 var ymr = []ytab{
        {Ymr, Ynone, Ymr, Zm_r, 1},
 }
@@ -725,6 +731,10 @@ var ymskb = []ytab{
        {Ymr, Ynone, Yrl, Zm_r_xm, 1},
 }
 
+var ymskb_vex = []ytab{
+       {Yxr, Ynone, Yrl, Zm_r_xm_vex, 2},
+}
+
 var ycrc32l = []ytab{
        {Yml, Ynone, Yrl, Zlitm_r, 0},
 }
@@ -1497,6 +1507,8 @@ var optab =
        {AMOVHDU, yxmov_vex, Pvex2, [23]uint8{0x6f, 0x7f}},
        {AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
        {AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
+       {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
+       {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
        {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
        {obj.ATYPE, nil, 0, [23]uint8{}},
        {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
@@ -2943,11 +2955,15 @@ var bpduff2 = []byte{
        0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
 }
 
-func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
+// Assemble vex prefix, from 3 operands and prefix.
+// For details about vex prefix see:
+// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
+func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pref uint8) {
        rexR := regrex[to.Reg]
        rexB := regrex[from.Reg]
        rexX := regrex[from.Index]
        var prefBit uint8
+       // This will go into VEX.PP field.
        if pref == Pvex1 {
                prefBit = 1
        } else if pref == Pvex2 {
@@ -2955,21 +2971,36 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
        } // TODO add Pvex0,Pvex3
 
        if rexX == 0 && rexB == 0 { // 2-byte vex prefix
+               // In 2-byte case, first byte is always C5
                ctxt.Andptr[0] = 0xc5
                ctxt.Andptr = ctxt.Andptr[1:]
 
-               if rexR != 0 {
+               if from3 == nil {
+                       // If this is a 2-operand instruction fill VEX.VVVV with 1111
+                       // We are also interested only in 256-bit version, so VEX.L=1
                        ctxt.Andptr[0] = 0x7c
                } else {
-                       ctxt.Andptr[0] = 0xfc
+                       // VEX.L=1
+                       ctxt.Andptr[0] = 0x4
+                       // VEX.VVVV (bits 3:6) is a inversed register number
+                       ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
+               }
+
+               // VEX encodes REX.R as inversed upper bit
+               if rexR == 0 {
+                       ctxt.Andptr[0] |= 0x80
                }
                ctxt.Andptr[0] |= prefBit
                ctxt.Andptr = ctxt.Andptr[1:]
-       } else {
+       } else { // 3-byte case
+               // First byte is always C$
                ctxt.Andptr[0] = 0xc4
                ctxt.Andptr = ctxt.Andptr[1:]
 
+               // Encode VEX.mmmmm with prefix value, for now assume 0F 38,
+               // which encodes as 1.
                ctxt.Andptr[0] = 0x1 // TODO handle different prefix
+               // REX.[RXB] are inverted and encoded in 3 upper bits
                if rexR == 0 {
                        ctxt.Andptr[0] |= 0x80
                }
@@ -2981,7 +3012,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
                }
                ctxt.Andptr = ctxt.Andptr[1:]
 
-               ctxt.Andptr[0] = 0x7c
+               // Fill VEX.VVVV, same as 2-operand VEX instruction.
+               if from3 == nil {
+                       ctxt.Andptr[0] = 0x7c
+               } else {
+                       ctxt.Andptr[0] = 0x4
+                       ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
+               }
                ctxt.Andptr[0] |= prefBit
                ctxt.Andptr = ctxt.Andptr[1:]
        }
@@ -3222,7 +3259,7 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
 
                        case Zm_r_xm_vex:
                                ctxt.Vexflag = 1
-                               vexprefix(ctxt, &p.To, &p.From, o.prefix)
+                               vexprefix(ctxt, &p.To, &p.From, nil, o.prefix)
                                ctxt.Andptr[0] = byte(op)
                                ctxt.Andptr = ctxt.Andptr[1:]
                                asmand(ctxt, p, &p.From, &p.To)
@@ -3284,11 +3321,18 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
 
                        case Zr_m_xm_vex:
                                ctxt.Vexflag = 1
-                               vexprefix(ctxt, &p.From, &p.To, o.prefix)
+                               vexprefix(ctxt, &p.From, &p.To, nil, o.prefix)
                                ctxt.Andptr[0] = byte(op)
                                ctxt.Andptr = ctxt.Andptr[1:]
                                asmand(ctxt, p, &p.To, &p.From)
 
+                       case Zr_r_r_vex:
+                               ctxt.Vexflag = 1
+                               vexprefix(ctxt, &p.To, &p.From, p.From3, o.prefix)
+                               ctxt.Andptr[0] = byte(op)
+                               ctxt.Andptr = ctxt.Andptr[1:]
+                               asmand(ctxt, p, &p.From, &p.To)
+
                        case Zr_m_xm:
                                mediaop(ctxt, o, op, int(yt.zoffset), z)
                                asmand(ctxt, p, &p.To, &p.From)
index 454789c5098da9f7f7e1d72e2897458b205506b6..33d641e6122c50d3cd029632226158c330479d77 100644 (file)
@@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
        JNE     notintel
        MOVB    $1, runtime·lfenceBeforeRdtsc(SB)
 notintel:
+       // Do nothing.
 
        MOVQ    $1, AX
        CPUID
        MOVL    CX, runtime·cpuid_ecx(SB)
        MOVL    DX, runtime·cpuid_edx(SB)
+       // Detect AVX and AVX2 as per 14.7.1  Detection of AVX2 chapter of [1]
+       // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+       // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
+       ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
+       CMPL    CX, $0x18000000
+       JNE     noavx
+       MOVL    $0, CX
+       // For XGETBV, OSXSAVE bit is required and sufficient
+       BYTE $0x0F; BYTE $0x01; BYTE $0xD0
+       ANDL    $6, AX
+       CMPL    AX, $6 // Check for OS support of YMM registers
+       JNE     noavx
+       MOVB    $1, runtime·support_avx(SB)
+       MOVL    $7, AX
+       MOVL    $0, CX
+       CPUID
+       ANDL    $0x20, BX // check for AVX2 bit
+       CMPL    BX, $0x20
+       JNE     noavx2
+       MOVB    $1, runtime·support_avx2(SB)
+       JMP     nocpuinfo
+noavx:
+       MOVB    $0, runtime·support_avx(SB)
+noavx2:
+       MOVB    $0, runtime·support_avx2(SB)
 nocpuinfo:     
        
        // if there is an _cgo_init, call it.
@@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
        JB      small
 
        CMPQ    R8, $63
-       JA      big_loop
+       JBE     loop
+       CMPB    runtime·support_avx2(SB), $1
+       JEQ     big_loop_avx2
+       JMP     big_loop
 loop:
        CMPQ    R8, $16
        JBE     _0through16
@@ -1657,6 +1686,45 @@ big_loop:
        JBE     loop
        JMP     big_loop
 
+       // Compare 64-bytes per loop iteration.
+       // Loop is unrolled and uses AVX2.
+big_loop_avx2:
+       MOVHDU  (SI), X2
+       MOVHDU  (DI), X3
+       MOVHDU  32(SI), X4
+       MOVHDU  32(DI), X5
+       VPCMPEQB X2, X3, X0
+       VPMOVMSKB X0, AX
+       XORL    $0xffffffff, AX
+       JNE     diff32_avx2
+       VPCMPEQB X4, X5, X6
+       VPMOVMSKB X6, AX
+       XORL    $0xffffffff, AX
+       JNE     diff64_avx2
+
+       ADDQ    $64, SI
+       ADDQ    $64, DI
+       SUBQ    $64, R8
+       CMPQ    R8, $64
+       JB      big_loop_avx2_exit
+       JMP     big_loop_avx2
+
+       // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+       VZEROUPPER
+       JMP diff16
+
+       // Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+       VZEROUPPER
+       JMP diff48
+
+       // For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+       VZEROUPPER
+       JMP loop
+
+
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
        MOVQ s+0(FP), SI
        MOVQ s_len+8(FP), BX
index 6b61cd62fa1ea799832f10014a3cb86be32cf01b..f1337e570eca5b1bab9e4d3b8fae75be2dc5017f 100644 (file)
@@ -627,6 +627,8 @@ var (
        cpuid_ecx         uint32
        cpuid_edx         uint32
        lfenceBeforeRdtsc bool
+       support_avx       bool
+       support_avx2      bool
 
        goarm uint8 // set by cmd/link on arm systems
 )