]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: optimize string comparison on amd64
authorIlya Tocar <ilya.tocar@intel.com>
Thu, 29 Oct 2015 14:17:05 +0000 (17:17 +0300)
committerKeith Randall <khr@golang.org>
Thu, 5 Nov 2015 15:42:33 +0000 (15:42 +0000)
Use AVX2 if possible.
Results below (haswell):

name                            old time/op    new time/op     delta
CompareStringEqual-6              8.77ns ± 0%     8.63ns ± 1%   -1.58%        (p=0.000 n=20+19)
CompareStringIdentical-6          5.02ns ± 0%     5.02ns ± 0%     ~     (all samples are equal)
CompareStringSameLength-6         7.51ns ± 0%     7.51ns ± 0%     ~     (all samples are equal)
CompareStringDifferentLength-6    1.56ns ± 0%     1.56ns ± 0%     ~     (all samples are equal)
CompareStringBigUnaligned-6        124µs ± 1%      105µs ± 5%  -14.99%        (p=0.000 n=20+18)
CompareStringBig-6                 112µs ± 1%      103µs ± 0%   -7.87%        (p=0.000 n=20+17)

name                            old speed      new speed       delta
CompareStringBigUnaligned-6     8.48GB/s ± 1%   9.98GB/s ± 5%  +17.67%        (p=0.000 n=20+18)
CompareStringBig-6              9.37GB/s ± 1%  10.17GB/s ± 0%   +8.54%        (p=0.000 n=20+17)

Change-Id: I1c949626dd2aaf9f633e3c888a9df71c82eed7e1
Reviewed-on: https://go-review.googlesource.com/16481
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Klaus Post <klauspost@gmail.com>
src/cmd/internal/obj/x86/a.out.go
src/cmd/internal/obj/x86/anames.go
src/cmd/internal/obj/x86/asm6.go
src/runtime/asm_amd64.s

index 55fc31ddaf1010673641ad7e445117d58ab3279f..345135ceecb63cdb6241df57c0b101462bc879a1 100644 (file)
@@ -748,6 +748,7 @@ const (
        AMOVHDA
        AVPCMPEQB
        AVPMOVMSKB
+       AVPAND
 
        // from 386
        AJCXZW
index 729b9d423bc437149f87f6acc4d10660d57db621..2f1374ada0e13db172d097cf138a88af27a7fa7c 100644 (file)
@@ -689,6 +689,7 @@ var Anames = []string{
        "MOVHDA",
        "VPCMPEQB",
        "VPMOVMSKB",
+       "VPAND",
        "JCXZW",
        "FCMOVCC",
        "FCMOVCS",
index 739ba671058f11599719a58a564441a2e033b3aa..6e3093819b36b621f5b2dea1ca9d3eeb3d1ca3a7 100644 (file)
@@ -1509,6 +1509,7 @@ var optab =
        {AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
        {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
        {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
+       {AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
        {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
        {obj.ATYPE, nil, 0, [23]uint8{}},
        {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
index 2ba3d3d106b0dc9801fde0d024c615a4a0871a1c..8401accbcdc02bea73fdc5ec63008b463fef39f7 100644 (file)
@@ -1416,6 +1416,10 @@ eq:
 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
        CMPQ    BX, $8
        JB      small
+       CMPQ    BX, $64
+       JB      bigloop
+       CMPB    runtime·support_avx2(SB), $1
+       JE      hugeloop_avx2
        
        // 64 bytes at a time using xmm registers
 hugeloop:
@@ -1445,6 +1449,30 @@ hugeloop:
        MOVB    $0, (AX)
        RET
 
+       // 64 bytes at a time using ymm registers
+hugeloop_avx2:
+       CMPQ    BX, $64
+       JB      bigloop_avx2
+       MOVHDU  (SI), X0
+       MOVHDU  (DI), X1
+       MOVHDU  32(SI), X2
+       MOVHDU  32(DI), X3
+       VPCMPEQB        X1, X0, X4
+       VPCMPEQB        X2, X3, X5
+       VPAND   X4, X5, X6
+       VPMOVMSKB X6, DX
+       ADDQ    $64, SI
+       ADDQ    $64, DI
+       SUBQ    $64, BX
+       CMPL    DX, $0xffffffff
+       JEQ     hugeloop_avx2
+       VZEROUPPER
+       MOVB    $0, (AX)
+       RET
+
+bigloop_avx2:
+       VZEROUPPER
+
        // 8 bytes at a time using 64-bit register
 bigloop:
        CMPQ    BX, $8