From 967564be7eef0575235e838839c7847da7723378 Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Thu, 29 Oct 2015 17:17:05 +0300 Subject: [PATCH] runtime: optimize string comparison on amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use AVX2 if possible. Results below (haswell): name old time/op new time/op delta CompareStringEqual-6 8.77ns ± 0% 8.63ns ± 1% -1.58% (p=0.000 n=20+19) CompareStringIdentical-6 5.02ns ± 0% 5.02ns ± 0% ~ (all samples are equal) CompareStringSameLength-6 7.51ns ± 0% 7.51ns ± 0% ~ (all samples are equal) CompareStringDifferentLength-6 1.56ns ± 0% 1.56ns ± 0% ~ (all samples are equal) CompareStringBigUnaligned-6 124µs ± 1% 105µs ± 5% -14.99% (p=0.000 n=20+18) CompareStringBig-6 112µs ± 1% 103µs ± 0% -7.87% (p=0.000 n=20+17) name old speed new speed delta CompareStringBigUnaligned-6 8.48GB/s ± 1% 9.98GB/s ± 5% +17.67% (p=0.000 n=20+18) CompareStringBig-6 9.37GB/s ± 1% 10.17GB/s ± 0% +8.54% (p=0.000 n=20+17) Change-Id: I1c949626dd2aaf9f633e3c888a9df71c82eed7e1 Reviewed-on: https://go-review.googlesource.com/16481 Reviewed-by: Keith Randall Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot Reviewed-by: Klaus Post --- src/cmd/internal/obj/x86/a.out.go | 1 + src/cmd/internal/obj/x86/anames.go | 1 + src/cmd/internal/obj/x86/asm6.go | 1 + src/runtime/asm_amd64.s | 28 ++++++++++++++++++++++++++++ 4 files changed, 31 insertions(+) diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index 55fc31ddaf..345135ceec 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -748,6 +748,7 @@ const ( AMOVHDA AVPCMPEQB AVPMOVMSKB + AVPAND // from 386 AJCXZW diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 729b9d423b..2f1374ada0 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -689,6 +689,7 @@ var Anames = []string{ "MOVHDA", "VPCMPEQB", "VPMOVMSKB", + "VPAND", "JCXZW", "FCMOVCC", "FCMOVCS", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 739ba67105..6e3093819b 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -1509,6 +1509,7 @@ var optab = {AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}}, {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}}, {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}}, + {AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}}, {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}}, {obj.ATYPE, nil, 0, [23]uint8{}}, {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}}, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 2ba3d3d106..8401accbcd 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1416,6 +1416,10 @@ eq: TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 CMPQ BX, $8 JB small + CMPQ BX, $64 + JB bigloop + CMPB runtime·support_avx2(SB), $1 + JE hugeloop_avx2 // 64 bytes at a time using xmm registers hugeloop: @@ -1445,6 +1449,30 @@ hugeloop: MOVB $0, (AX) RET + // 64 bytes at a time using ymm registers +hugeloop_avx2: + CMPQ BX, $64 + JB bigloop_avx2 + MOVHDU (SI), X0 + MOVHDU (DI), X1 + MOVHDU 32(SI), X2 + MOVHDU 32(DI), X3 + VPCMPEQB X1, X0, X4 + VPCMPEQB X2, X3, X5 + VPAND X4, X5, X6 + VPMOVMSKB X6, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffffffff + JEQ hugeloop_avx2 + VZEROUPPER + MOVB $0, (AX) + RET + +bigloop_avx2: + VZEROUPPER + // 8 bytes at a time using 64-bit register bigloop: CMPQ BX, $8 -- 2.48.1