From 9a14cd9e7556e68ea98255ae809b5862d574df9e Mon Sep 17 00:00:00 2001
From: Wei Xiao <wei.xiao@arm.com>
Date: Fri, 16 Jun 2017 06:45:14 +0000
Subject: [PATCH] bytes: add optimized countByte for arm64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Use SIMD instructions when counting a single byte.
Inspired from runtime IndexByte implementation.

Benchmark results of bytes, where 1 byte in every 8 is the one we are looking:

name               old time/op   new time/op    delta
CountSingle/10-8    96.1ns Â± 1%    38.8ns Â± 0%    -59.64%  (p=0.000 n=9+7)
CountSingle/32-8     172ns Â± 2%      36ns Â± 1%    -79.27%  (p=0.000 n=10+10)
CountSingle/4K-8    18.2Âµs Â± 1%     0.9Âµs Â± 0%    -95.17%  (p=0.000 n=9+10)
CountSingle/4M-8    18.4ms Â± 0%     0.9ms Â± 0%    -95.00%  (p=0.000 n=10+9)
CountSingle/64M-8    284ms Â± 4%      19ms Â± 0%    -93.40%  (p=0.000 n=10+10)

name               old speed     new speed      delta
CountSingle/10-8   104MB/s Â± 1%   258MB/s Â± 0%   +147.99%  (p=0.000 n=9+10)
CountSingle/32-8   185MB/s Â± 1%   897MB/s Â± 1%   +385.33%  (p=0.000 n=9+10)
CountSingle/4K-8   225MB/s Â± 1%  4658MB/s Â± 0%  +1967.40%  (p=0.000 n=9+10)
CountSingle/4M-8   228MB/s Â± 0%  4555MB/s Â± 0%  +1901.71%  (p=0.000 n=10+9)
CountSingle/64M-8  236MB/s Â± 4%  3575MB/s Â± 0%  +1414.69%  (p=0.000 n=10+10)

Change-Id: Ifccb51b3c8658c49773fe05147c3cf3aead361e5
Reviewed-on: https://go-review.googlesource.com/71111
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/bytes/bytes_arm64.go                  | 68 +++++++++++++++++++++
 src/bytes/bytes_arm64.s                   | 74 +++++++++++++++++++++++
 src/bytes/bytes_generic.go                |  2 +-
 src/cmd/asm/internal/asm/testdata/arm64.s |  5 ++
 src/cmd/internal/obj/arm64/a.out.go       |  2 +
 src/cmd/internal/obj/arm64/anames.go      |  2 +
 src/cmd/internal/obj/arm64/asm7.go        | 37 +++++++++++-
 src/cmd/internal/obj/arm64/doc.go         | 14 +++++
 8 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 src/bytes/bytes_arm64.go
 create mode 100644 src/bytes/bytes_arm64.s

diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go
new file mode 100644
index 0000000000..846eeba486
--- /dev/null
+++ b/src/bytes/bytes_arm64.go
@@ -0,0 +1,68 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytes
+
+func countByte(s []byte, c byte) int // bytes_arm64.s
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+	n := len(sep)
+	switch {
+	case n == 0:
+		return 0
+	case n == 1:
+		return IndexByte(s, sep[0])
+	case n == len(s):
+		if Equal(sep, s) {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	}
+	c := sep[0]
+	i := 0
+	fails := 0
+	t := s[:len(s)-n+1]
+	for i < len(t) {
+		if t[i] != c {
+			o := IndexByte(t[i:], c)
+			if o < 0 {
+				break
+			}
+			i += o
+		}
+		if Equal(s[i:i+n], sep) {
+			return i
+		}
+		i++
+		fails++
+		if fails >= 4+i>>4 && i < len(t) {
+			// Give up on IndexByte, it isn't skipping ahead
+			// far enough to be better than Rabin-Karp.
+			// Experiments (using IndexPeriodic) suggest
+			// the cutover is about 16 byte skips.
+			// TODO: if large prefixes of sep are matching
+			// we should cutover at even larger average skips,
+			// because Equal becomes that much more expensive.
+			// This code does not take that effect into account.
+			j := indexRabinKarp(s[i:], sep)
+			if j < 0 {
+				return -1
+			}
+			return i + j
+		}
+	}
+	return -1
+}
+
+// Count counts the number of non-overlapping instances of sep in s.
+// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
+func Count(s, sep []byte) int {
+	if len(sep) == 1 {
+		return countByte(s, sep[0])
+	}
+	return countGeneric(s, sep)
+}
diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s
new file mode 100644
index 0000000000..5e229d772b
--- /dev/null
+++ b/src/bytes/bytes_arm64.s
@@ -0,0 +1,74 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// countByte(s []byte, c byte) int
+TEXT bytesÂ·countByte(SB),NOSPLIT,$0-40
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count lanes match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, ret+32(FP)
+	RET
diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go
index b30e53bf2e..0e7d33f09a 100644
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!s390x
+// +build !amd64,!s390x,!arm64
 
 package bytes
 
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s
index 269e363f7e..ab6ad5bcb7 100644
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -51,6 +51,11 @@ TEXT	foo(SB), DUPOK|NOSPLIT, $-8
 	SHA1P	V11.S4, V10, V9                 // 49110b5e
 	VADDV	V0.S4, V0                       // 00b8b14e
 	VMOVI	$82, V0.B16                     // 40e6024f
+	VUADDLV	V6.B16, V6                      // c638306e
+	VADD	V1, V2, V3                      // 4384e15e
+	VADD	V1, V3, V3                      // 6384e15e
+	VSUB	V12, V30, V30                   // de87ec7e
+	VSUB	V12, V20, V30                   // 9e86ec7e
 
 //	LTYPE1 imsr ',' spreg ','
 //	{
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go
index 2b0b27f6c5..6087b74acf 100644
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -745,6 +745,8 @@ const (
 	AVMOVS
 	AVADDV
 	AVMOVI
+	AVUADDLV
+	AVSUB
 	ALAST
 	AB  = obj.AJMP
 	ABL = obj.ACALL
diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go
index 3fe8025e80..4070a43641 100644
--- a/src/cmd/internal/obj/arm64/anames.go
+++ b/src/cmd/internal/obj/arm64/anames.go
@@ -380,5 +380,7 @@ var Anames = []string{
 	"VMOVS",
 	"VADDV",
 	"VMOVI",
+	"VUADDLV",
+	"VSUB",
 	"LAST",
 }
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go
index e15124073a..824fece550 100644
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -579,6 +579,9 @@ var optab = []Optab{
 	{ASHA1SU0, C_ARNG, C_ARNG, C_ARNG, 1, 4, 0, 0, 0},
 	{ASHA256H, C_ARNG, C_VREG, C_VREG, 1, 4, 0, 0, 0},
 	{AVADDP, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0},
+	{AVADD, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0},
+	{AVADD, C_VREG, C_VREG, C_VREG, 89, 4, 0, 0, 0},
+	{AVADD, C_VREG, C_NONE, C_VREG, 89, 4, 0, 0, 0},
 	{AVLD1, C_ZOREG, C_NONE, C_LIST, 81, 4, 0, 0, 0},
 	{AVLD1, C_LOREG, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
 	{AVMOV, C_ELEM, C_NONE, C_REG, 73, 4, 0, 0, 0},
@@ -2063,9 +2066,11 @@ func buildop(ctxt *obj.Link) {
 			oprangeset(AVAND, t)
 			oprangeset(AVCMEQ, t)
 			oprangeset(AVORR, t)
-			oprangeset(AVADD, t)
 			oprangeset(AVEOR, t)
 
+		case AVADD:
+			oprangeset(AVSUB, t)
+
 		case AAESD:
 			oprangeset(AAESE, t)
 			oprangeset(AAESMC, t)
@@ -2083,6 +2088,9 @@ func buildop(ctxt *obj.Link) {
 		case ASHA1SU0:
 			oprangeset(ASHA256SU1, t)
 
+		case AVADDV:
+			oprangeset(AVUADDLV, t)
+
 		case ASHA1H,
 			AVMOV,
 			AVLD1,
@@ -2090,7 +2098,6 @@ func buildop(ctxt *obj.Link) {
 			AVST1,
 			AVDUP,
 			AVMOVS,
-			AVADDV,
 			AVMOVI:
 			break
 
@@ -3612,7 +3619,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 |= uint32(p.From.Offset)
 		o1 |= uint32(r&31) << 5
 
-	case 85: /* vaddv Vn.<T>, Vd*/
+	case 85: /* vaddv/vuaddlv Vn.<T>, Vd*/
 		af := int((p.From.Reg >> 5) & 15)
 		o1 = c.oprrr(p, p.As)
 		rf := int((p.From.Reg) & 31)
@@ -3681,6 +3688,27 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		rel.Type = objabi.R_ADDRARM64
 		o3 |= 2<<30 | 5<<27 | 2<<23 | 1<<22 | uint32(p.To.Offset&31)<<10 | (REGTMP&31)<<5 | uint32(p.To.Reg&31)
 
+	case 89: /* vadd/vsub Vm, Vn, Vd */
+		switch p.As {
+		case AVADD:
+			o1 = 5<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10
+
+		case AVSUB:
+			o1 = 7<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10
+
+		default:
+			c.ctxt.Diag("bad opcode: %v\n", p)
+			break
+		}
+
+		rf := int(p.From.Reg)
+		rt := int(p.To.Reg)
+		r := int(p.Reg)
+		if r == 0 {
+			r = rt
+		}
+		o1 |= (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31)
+
 	// This is supposed to be something that stops execution.
 	// It's not supposed to be reached, ever, but if it is, we'd
 	// like to be able to tell how we got there. Assemble as
@@ -4245,6 +4273,9 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 {
 
 	case AVADDV:
 		return 7<<25 | 3<<20 | 3<<15 | 7<<11
+
+	case AVUADDLV:
+		return 1<<29 | 7<<25 | 3<<20 | 7<<11
 	}
 
 	c.ctxt.Diag("%v: bad rrr %d %v", p, a, a)
diff --git a/src/cmd/internal/obj/arm64/doc.go b/src/cmd/internal/obj/arm64/doc.go
index 9f8606a5ec..f75f49fb9c 100644
--- a/src/cmd/internal/obj/arm64/doc.go
+++ b/src/cmd/internal/obj/arm64/doc.go
@@ -15,6 +15,10 @@ Go Assembly for ARM64 Reference Manual
     // TODO
 
 3. Alphabetical list of SIMD instructions
+    VADD: Add (scalar)
+      VADD	<Vm>, <Vn>, <Vd>
+        Add corresponding low 64-bit elements in <Vm> and <Vn>,
+        place the result into low 64-bit element of <Vd>.
 
     VADD: Add (vector).
       VADD	<Vm>.T, <Vn>.<T>, <Vd>.<T>
@@ -115,6 +119,16 @@ Go Assembly for ARM64 Reference Manual
         <T> Is an arrangement specifier and can have the following values:
         B8, B16, H4, H8, S2, S4, D1, D2
 
+    VSUB: Sub (scalar)
+      VSUB	<Vm>, <Vn>, <Vd>
+        Subtract low 64-bit element in <Vm> from the correponding element in <Vn>,
+        place the result into low 64-bit element of <Vd>.
+
+    VUADDLV: Unsigned sum Long across Vector.
+      VUADDLV	<Vn>.<T>, Vd
+        <T> Is an arrangement specifier and can have the following values:
+        8B, 16B, H4, H8, S4
+
 4. Alphabetical list of cryptographic extension instructions
 
     SHA1C, SHA1M, SHA1P: SHA1 hash update.
-- 
2.50.0