From 9a14cd9e7556e68ea98255ae809b5862d574df9e Mon Sep 17 00:00:00 2001 From: Wei Xiao Date: Fri, 16 Jun 2017 06:45:14 +0000 Subject: [PATCH] bytes: add optimized countByte for arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use SIMD instructions when counting a single byte. Inspired from runtime IndexByte implementation. Benchmark results of bytes, where 1 byte in every 8 is the one we are looking: name old time/op new time/op delta CountSingle/10-8 96.1ns ± 1% 38.8ns ± 0% -59.64% (p=0.000 n=9+7) CountSingle/32-8 172ns ± 2% 36ns ± 1% -79.27% (p=0.000 n=10+10) CountSingle/4K-8 18.2µs ± 1% 0.9µs ± 0% -95.17% (p=0.000 n=9+10) CountSingle/4M-8 18.4ms ± 0% 0.9ms ± 0% -95.00% (p=0.000 n=10+9) CountSingle/64M-8 284ms ± 4% 19ms ± 0% -93.40% (p=0.000 n=10+10) name old speed new speed delta CountSingle/10-8 104MB/s ± 1% 258MB/s ± 0% +147.99% (p=0.000 n=9+10) CountSingle/32-8 185MB/s ± 1% 897MB/s ± 1% +385.33% (p=0.000 n=9+10) CountSingle/4K-8 225MB/s ± 1% 4658MB/s ± 0% +1967.40% (p=0.000 n=9+10) CountSingle/4M-8 228MB/s ± 0% 4555MB/s ± 0% +1901.71% (p=0.000 n=10+9) CountSingle/64M-8 236MB/s ± 4% 3575MB/s ± 0% +1414.69% (p=0.000 n=10+10) Change-Id: Ifccb51b3c8658c49773fe05147c3cf3aead361e5 Reviewed-on: https://go-review.googlesource.com/71111 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot --- src/bytes/bytes_arm64.go | 68 +++++++++++++++++++++ src/bytes/bytes_arm64.s | 74 +++++++++++++++++++++++ src/bytes/bytes_generic.go | 2 +- src/cmd/asm/internal/asm/testdata/arm64.s | 5 ++ src/cmd/internal/obj/arm64/a.out.go | 2 + src/cmd/internal/obj/arm64/anames.go | 2 + src/cmd/internal/obj/arm64/asm7.go | 37 +++++++++++- src/cmd/internal/obj/arm64/doc.go | 14 +++++ 8 files changed, 200 insertions(+), 4 deletions(-) create mode 100644 src/bytes/bytes_arm64.go create mode 100644 src/bytes/bytes_arm64.s diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go new file mode 100644 index 0000000000..846eeba486 --- /dev/null +++ b/src/bytes/bytes_arm64.go @@ -0,0 +1,68 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytes + +func countByte(s []byte, c byte) int // bytes_arm64.s + +// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. +func Index(s, sep []byte) int { + n := len(sep) + switch { + case n == 0: + return 0 + case n == 1: + return IndexByte(s, sep[0]) + case n == len(s): + if Equal(sep, s) { + return 0 + } + return -1 + case n > len(s): + return -1 + } + c := sep[0] + i := 0 + fails := 0 + t := s[:len(s)-n+1] + for i < len(t) { + if t[i] != c { + o := IndexByte(t[i:], c) + if o < 0 { + break + } + i += o + } + if Equal(s[i:i+n], sep) { + return i + } + i++ + fails++ + if fails >= 4+i>>4 && i < len(t) { + // Give up on IndexByte, it isn't skipping ahead + // far enough to be better than Rabin-Karp. + // Experiments (using IndexPeriodic) suggest + // the cutover is about 16 byte skips. + // TODO: if large prefixes of sep are matching + // we should cutover at even larger average skips, + // because Equal becomes that much more expensive. + // This code does not take that effect into account. + j := indexRabinKarp(s[i:], sep) + if j < 0 { + return -1 + } + return i + j + } + } + return -1 +} + +// Count counts the number of non-overlapping instances of sep in s. +// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s. +func Count(s, sep []byte) int { + if len(sep) == 1 { + return countByte(s, sep[0]) + } + return countGeneric(s, sep) +} diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s new file mode 100644 index 0000000000..5e229d772b --- /dev/null +++ b/src/bytes/bytes_arm64.s @@ -0,0 +1,74 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// countByte(s []byte, c byte) int +TEXT bytes·countByte(SB),NOSPLIT,$0-40 + MOVD s_base+0(FP), R0 + MOVD s_len+8(FP), R2 + MOVBU c+24(FP), R1 + // R11 = count of byte to search + MOVD $0, R11 + // short path to handle 0-byte case + CBZ R2, done + CMP $0x20, R2 + // jump directly to tail if length < 32 + BLO tail + ANDS $0x1f, R0, R9 + BEQ chunk + // Work with not 32-byte aligned head + BIC $0x1f, R0, R3 + ADD $0x20, R3 +head_loop: + MOVBU.P 1(R0), R5 + CMP R5, R1 + CINC EQ, R11, R11 + SUB $1, R2, R2 + CMP R0, R3 + BNE head_loop + // Work with 32-byte aligned chunks +chunk: + BIC $0x1f, R2, R9 + // The first chunk can also be the last + CBZ R9, tail + // R3 = end of 32-byte chunks + ADD R0, R9, R3 + MOVD $1, R5 + VMOV R5, V5.B16 + // R2 = length of tail + SUB R9, R2, R2 + // Duplicate R1 (byte to search) to 16 1-byte elements of V0 + VMOV R1, V0.B16 + // Clear the low 64-bit element of V7 and V8 + VEOR V7.B8, V7.B8, V7.B8 + VEOR V8.B8, V8.B8, V8.B8 + // Count the target byte in 32-byte chunk +chunk_loop: + VLD1.P (R0), [V1.B16, V2.B16] + CMP R0, R3 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // Clear the higher 7 bits + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + // Count lanes match the requested byte + VADDP V4.B16, V3.B16, V6.B16 // 32B->16B + VUADDLV V6.B16, V7 + // Accumulate the count in low 64-bit element of V8 when inside the loop + VADD V7, V8 + BNE chunk_loop + VMOV V8.D[0], R6 + ADD R6, R11, R11 + CBZ R2, done +tail: + // Work with tail shorter than 32 bytes + MOVBU.P 1(R0), R5 + SUB $1, R2, R2 + CMP R5, R1 + CINC EQ, R11, R11 + CBNZ R2, tail +done: + MOVD R11, ret+32(FP) + RET diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go index b30e53bf2e..0e7d33f09a 100644 --- a/src/bytes/bytes_generic.go +++ b/src/bytes/bytes_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x +// +build !amd64,!s390x,!arm64 package bytes diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s index 269e363f7e..ab6ad5bcb7 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64.s +++ b/src/cmd/asm/internal/asm/testdata/arm64.s @@ -51,6 +51,11 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 SHA1P V11.S4, V10, V9 // 49110b5e VADDV V0.S4, V0 // 00b8b14e VMOVI $82, V0.B16 // 40e6024f + VUADDLV V6.B16, V6 // c638306e + VADD V1, V2, V3 // 4384e15e + VADD V1, V3, V3 // 6384e15e + VSUB V12, V30, V30 // de87ec7e + VSUB V12, V20, V30 // 9e86ec7e // LTYPE1 imsr ',' spreg ',' // { diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go index 2b0b27f6c5..6087b74acf 100644 --- a/src/cmd/internal/obj/arm64/a.out.go +++ b/src/cmd/internal/obj/arm64/a.out.go @@ -745,6 +745,8 @@ const ( AVMOVS AVADDV AVMOVI + AVUADDLV + AVSUB ALAST AB = obj.AJMP ABL = obj.ACALL diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go index 3fe8025e80..4070a43641 100644 --- a/src/cmd/internal/obj/arm64/anames.go +++ b/src/cmd/internal/obj/arm64/anames.go @@ -380,5 +380,7 @@ var Anames = []string{ "VMOVS", "VADDV", "VMOVI", + "VUADDLV", + "VSUB", "LAST", } diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index e15124073a..824fece550 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -579,6 +579,9 @@ var optab = []Optab{ {ASHA1SU0, C_ARNG, C_ARNG, C_ARNG, 1, 4, 0, 0, 0}, {ASHA256H, C_ARNG, C_VREG, C_VREG, 1, 4, 0, 0, 0}, {AVADDP, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0}, + {AVADD, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0}, + {AVADD, C_VREG, C_VREG, C_VREG, 89, 4, 0, 0, 0}, + {AVADD, C_VREG, C_NONE, C_VREG, 89, 4, 0, 0, 0}, {AVLD1, C_ZOREG, C_NONE, C_LIST, 81, 4, 0, 0, 0}, {AVLD1, C_LOREG, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST}, {AVMOV, C_ELEM, C_NONE, C_REG, 73, 4, 0, 0, 0}, @@ -2063,9 +2066,11 @@ func buildop(ctxt *obj.Link) { oprangeset(AVAND, t) oprangeset(AVCMEQ, t) oprangeset(AVORR, t) - oprangeset(AVADD, t) oprangeset(AVEOR, t) + case AVADD: + oprangeset(AVSUB, t) + case AAESD: oprangeset(AAESE, t) oprangeset(AAESMC, t) @@ -2083,6 +2088,9 @@ func buildop(ctxt *obj.Link) { case ASHA1SU0: oprangeset(ASHA256SU1, t) + case AVADDV: + oprangeset(AVUADDLV, t) + case ASHA1H, AVMOV, AVLD1, @@ -2090,7 +2098,6 @@ func buildop(ctxt *obj.Link) { AVST1, AVDUP, AVMOVS, - AVADDV, AVMOVI: break @@ -3612,7 +3619,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 |= uint32(p.From.Offset) o1 |= uint32(r&31) << 5 - case 85: /* vaddv Vn., Vd*/ + case 85: /* vaddv/vuaddlv Vn., Vd*/ af := int((p.From.Reg >> 5) & 15) o1 = c.oprrr(p, p.As) rf := int((p.From.Reg) & 31) @@ -3681,6 +3688,27 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rel.Type = objabi.R_ADDRARM64 o3 |= 2<<30 | 5<<27 | 2<<23 | 1<<22 | uint32(p.To.Offset&31)<<10 | (REGTMP&31)<<5 | uint32(p.To.Reg&31) + case 89: /* vadd/vsub Vm, Vn, Vd */ + switch p.As { + case AVADD: + o1 = 5<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10 + + case AVSUB: + o1 = 7<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10 + + default: + c.ctxt.Diag("bad opcode: %v\n", p) + break + } + + rf := int(p.From.Reg) + rt := int(p.To.Reg) + r := int(p.Reg) + if r == 0 { + r = rt + } + o1 |= (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31) + // This is supposed to be something that stops execution. // It's not supposed to be reached, ever, but if it is, we'd // like to be able to tell how we got there. Assemble as @@ -4245,6 +4273,9 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 { case AVADDV: return 7<<25 | 3<<20 | 3<<15 | 7<<11 + + case AVUADDLV: + return 1<<29 | 7<<25 | 3<<20 | 7<<11 } c.ctxt.Diag("%v: bad rrr %d %v", p, a, a) diff --git a/src/cmd/internal/obj/arm64/doc.go b/src/cmd/internal/obj/arm64/doc.go index 9f8606a5ec..f75f49fb9c 100644 --- a/src/cmd/internal/obj/arm64/doc.go +++ b/src/cmd/internal/obj/arm64/doc.go @@ -15,6 +15,10 @@ Go Assembly for ARM64 Reference Manual // TODO 3. Alphabetical list of SIMD instructions + VADD: Add (scalar) + VADD , , + Add corresponding low 64-bit elements in and , + place the result into low 64-bit element of . VADD: Add (vector). VADD .T, ., . @@ -115,6 +119,16 @@ Go Assembly for ARM64 Reference Manual Is an arrangement specifier and can have the following values: B8, B16, H4, H8, S2, S4, D1, D2 + VSUB: Sub (scalar) + VSUB , , + Subtract low 64-bit element in from the correponding element in , + place the result into low 64-bit element of . + + VUADDLV: Unsigned sum Long across Vector. + VUADDLV ., Vd + Is an arrangement specifier and can have the following values: + 8B, 16B, H4, H8, S4 + 4. Alphabetical list of cryptographic extension instructions SHA1C, SHA1M, SHA1P: SHA1 hash update. -- 2.50.0