bytes: add optimized countByte for arm64

author Wei Xiao <wei.xiao@arm.com>

Fri, 16 Jun 2017 06:45:14 +0000 (06:45 +0000)

committer Brad Fitzpatrick <bradfitz@golang.org>

Tue, 21 Nov 2017 19:07:38 +0000 (19:07 +0000)
author Wei Xiao <wei.xiao@arm.com>
Fri, 16 Jun 2017 06:45:14 +0000 (06:45 +0000)
committer Brad Fitzpatrick <bradfitz@golang.org>
Tue, 21 Nov 2017 19:07:38 +0000 (19:07 +0000)
diff --git a/src/bytes/bytes_arm64.go b/src/bytes/bytes_arm64.go

new file mode 100644 (file)

index 0000000..846eeba
--- /dev/null
+++ b/src/bytes/bytes_arm64.go
@@ -0,0 +1,68 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytes
+
+func countByte(s []byte, c byte) int // no Go body: implemented in assembly in bytes_arm64.s
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+       n := len(sep)
+       switch {
+       case n == 0: // an empty separator matches at the very start of s
+               return 0
+       case n == 1: // single byte: delegate to IndexByte
+               return IndexByte(s, sep[0])
+       case n == len(s): // same length: s either equals sep or cannot contain it
+               if Equal(sep, s) {
+                       return 0
+               }
+               return -1
+       case n > len(s): // sep is longer than s, so it cannot occur
+               return -1
+       }
+       c := sep[0]
+       i := 0
+       fails := 0
+       t := s[:len(s)-n+1] // t covers every index at which a full match of sep can still start
+       for i < len(t) {
+               if t[i] != c {
+                       o := IndexByte(t[i:], c) // jump to the next occurrence of sep's first byte
+                       if o < 0 {
+                               break
+                       }
+                       i += o
+               }
+               if Equal(s[i:i+n], sep) {
+                       return i
+               }
+               i++
+               fails++
+               if fails >= 4+i>>4 && i < len(t) { // threshold is 4 + (i>>4): >> binds tighter than +
+                       // Give up on IndexByte: it isn't skipping ahead
+                       // far enough to be better than Rabin-Karp.
+                       // Experiments (using IndexPeriodic) suggest
+                       // the cutover is at about 16-byte skips.
+                       // TODO: if large prefixes of sep are matching
+                       // we should cut over at even larger average skips,
+                       // because Equal becomes that much more expensive.
+                       // This code does not take that effect into account.
+                       j := indexRabinKarp(s[i:], sep)
+                       if j < 0 {
+                               return -1
+                       }
+                       return i + j // j is relative to s[i:], so rebase it onto s
+               }
+       }
+       return -1
+}
+
+// Count counts the number of non-overlapping instances of sep in s.
+// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
+func Count(s, sep []byte) int {
+       if len(sep) == 1 { // single-byte separator: use the arm64 assembly routine
+               return countByte(s, sep[0])
+       }
+       return countGeneric(s, sep) // every other length, including an empty sep
+}
diff --git a/src/bytes/bytes_arm64.s b/src/bytes/bytes_arm64.s

new file mode 100644 (file)

index 0000000..5e229d7
--- /dev/null
+++ b/src/bytes/bytes_arm64.s
@@ -0,0 +1,74 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// countByte(s []byte, c byte) int returns the number of bytes in s that are equal to c.
+TEXT bytes·countByte(SB),NOSPLIT,$0-40
+       MOVD    s_base+0(FP), R0 // R0 = pointer to the next byte to examine
+       MOVD    s_len+8(FP), R2 // R2 = number of bytes still to examine
+       MOVBU   c+24(FP), R1 // R1 = byte to search for
+       // R11 = running count of matching bytes (the result)
+       MOVD    $0, R11
+       // Short path: an empty slice has a count of 0
+       CBZ     R2, done
+       CMP     $0x20, R2
+       // Jump directly to the byte-wise tail if length < 32
+       BLO     tail
+       ANDS    $0x1f, R0, R9 // R9 = offset of R0 within its 32-byte block
+       BEQ     chunk // already 32-byte aligned: no head to process
+       // Process the unaligned head byte by byte; R3 = next 32-byte boundary above R0
+       BIC     $0x1f, R0, R3
+       ADD     $0x20, R3
+head_loop:
+       MOVBU.P 1(R0), R5 // load one byte and advance R0
+       CMP     R5, R1
+       CINC    EQ, R11, R11 // count++ when the byte matches
+       SUB     $1, R2, R2
+       CMP     R0, R3
+       BNE     head_loop // continue until R0 reaches the aligned boundary
+       // Process 32-byte aligned chunks; R9 = remaining length rounded down to a multiple of 32
+chunk:
+       BIC     $0x1f, R2, R9
+       // No full chunk left after the head: go to the tail (R2 >= 1 here, as length >= 32 and head <= 31)
+       CBZ     R9, tail
+       // R3 = end address of the 32-byte chunks
+       ADD     R0, R9, R3
+       MOVD    $1, R5
+       VMOV    R5, V5.B16 // V5 = 1 in each of its 16 byte lanes, used as a mask below
+       // R2 = length of the tail left after the chunks
+       SUB     R9, R2, R2
+       // Duplicate R1 (byte to search for) into the 16 1-byte elements of V0
+       VMOV    R1, V0.B16
+       // Clear the low 64-bit element of V7 and V8
+       VEOR    V7.B8, V7.B8, V7.B8
+       VEOR    V8.B8, V8.B8, V8.B8
+       // Count the target byte in each 32-byte chunk
+chunk_loop:
+       VLD1.P  (R0), [V1.B16, V2.B16] // load the next 32 bytes; post-indexed, so R0 advances
+       CMP     R0, R3 // sets the flags tested by BNE at the bottom of the loop
+       VCMEQ   V0.B16, V1.B16, V3.B16
+       VCMEQ   V0.B16, V2.B16, V4.B16
+       // Mask each compare result with V5 so that a matching lane holds exactly 1
+       VAND    V5.B16, V3.B16, V3.B16
+       VAND    V5.B16, V4.B16, V4.B16
+       // Count the lanes that match the requested byte
+       VADDP   V4.B16, V3.B16, V6.B16 // 32B->16B
+       VUADDLV V6.B16, V7 // sum the 16 lanes of V6 into V7
+       // Accumulate the per-chunk count in the low 64-bit element of V8 while inside the loop
+       VADD    V7, V8
+       BNE     chunk_loop // uses the flags from CMP R0, R3 above
+       VMOV    V8.D[0], R6
+       ADD     R6, R11, R11 // fold the vector total into the scalar count
+       CBZ     R2, done // nothing left after the chunks: skip the tail loop, which runs at least once
+tail:
+       // Process the remaining bytes (fewer than 32) one at a time
+       MOVBU.P 1(R0), R5
+       SUB     $1, R2, R2
+       CMP     R5, R1
+       CINC    EQ, R11, R11
+       CBNZ    R2, tail
+done:
+       MOVD    R11, ret+32(FP)
+       RET
diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go

index b30e53bf2e73c77ac5349516a2cd8643e285c779..0e7d33f09ade8a5bd4d457a5b15492610ad59056 100644 (file)
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@@ -2,7 +2,7 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-// +build !amd64,!s390x
+// +build !amd64,!s390x,!arm64
  
  package bytes
  
diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s

index 269e363f7e5898e5651b3db342553395570bca82..ab6ad5bcb79796d0d413a6f78b3733f4ce29fe50 100644 (file)
--- a/src/cmd/asm/internal/asm/testdata/arm64.s
+++ b/src/cmd/asm/internal/asm/testdata/arm64.s
@@ -51,6 +51,11 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
         SHA1P   V11.S4, V10, V9                 // 49110b5e
         VADDV   V0.S4, V0                       // 00b8b14e
         VMOVI   $82, V0.B16                     // 40e6024f
+       VUADDLV V6.B16, V6                      // c638306e
+       VADD    V1, V2, V3                      // 4384e15e
+       VADD    V1, V3, V3                      // 6384e15e
+       VSUB    V12, V30, V30                   // de87ec7e
+       VSUB    V12, V20, V30                   // 9e86ec7e
  
  //     LTYPE1 imsr ',' spreg ','
  //     {
diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go

index 2b0b27f6c5cc5724818e639c28247f733ff5ec7d..6087b74acfd5818e2d19044564cbf7e6733975b2 100644 (file)
--- a/src/cmd/internal/obj/arm64/a.out.go
+++ b/src/cmd/internal/obj/arm64/a.out.go
@@ -745,6 +745,8 @@ const (
         AVMOVS
         AVADDV
         AVMOVI
+       AVUADDLV
+       AVSUB
         ALAST
         AB  = obj.AJMP
         ABL = obj.ACALL
diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go

index 3fe8025e80ceb82decf3f5fafd70f0126fff97ec..4070a436412ad3113372a1e2bec4d2af63e109d5 100644 (file)
--- a/src/cmd/internal/obj/arm64/anames.go
+++ b/src/cmd/internal/obj/arm64/anames.go
@@ -380,5 +380,7 @@ var Anames = []string{
         "VMOVS",
         "VADDV",
         "VMOVI",
+       "VUADDLV",
+       "VSUB",
         "LAST",
  }
diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go

index e15124073a5967275b1f32782d7375278864608c..824fece5505201e253040761a9f88d5e97a5fec8 100644 (file)
--- a/src/cmd/internal/obj/arm64/asm7.go
+++ b/src/cmd/internal/obj/arm64/asm7.go
@@ -579,6 +579,9 @@ var optab = []Optab{
         {ASHA1SU0, C_ARNG, C_ARNG, C_ARNG, 1, 4, 0, 0, 0},
         {ASHA256H, C_ARNG, C_VREG, C_VREG, 1, 4, 0, 0, 0},
         {AVADDP, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0},
+       {AVADD, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0},
+       {AVADD, C_VREG, C_VREG, C_VREG, 89, 4, 0, 0, 0},
+       {AVADD, C_VREG, C_NONE, C_VREG, 89, 4, 0, 0, 0},
         {AVLD1, C_ZOREG, C_NONE, C_LIST, 81, 4, 0, 0, 0},
         {AVLD1, C_LOREG, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
         {AVMOV, C_ELEM, C_NONE, C_REG, 73, 4, 0, 0, 0},
@@ -2063,9 +2066,11 @@ func buildop(ctxt *obj.Link) {
                         oprangeset(AVAND, t)
                         oprangeset(AVCMEQ, t)
                         oprangeset(AVORR, t)
-                       oprangeset(AVADD, t)
                         oprangeset(AVEOR, t)
  
+               case AVADD:
+                       oprangeset(AVSUB, t)
+
                 case AAESD:
                         oprangeset(AAESE, t)
                         oprangeset(AAESMC, t)
@@ -2083,6 +2088,9 @@ func buildop(ctxt *obj.Link) {
                 case ASHA1SU0:
                         oprangeset(ASHA256SU1, t)
  
+               case AVADDV:
+                       oprangeset(AVUADDLV, t)
+
                 case ASHA1H,
                         AVMOV,
                         AVLD1,
@@ -2090,7 +2098,6 @@ func buildop(ctxt *obj.Link) {
                         AVST1,
                         AVDUP,
                         AVMOVS,
-                       AVADDV,
                         AVMOVI:
                         break
  
@@ -3612,7 +3619,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
                 o1 |= uint32(p.From.Offset)
                 o1 |= uint32(r&31) << 5
  
-       case 85: /* vaddv Vn.<T>, Vd*/
+       case 85: /* vaddv/vuaddlv Vn.<T>, Vd*/
                 af := int((p.From.Reg >> 5) & 15)
                 o1 = c.oprrr(p, p.As)
                 rf := int((p.From.Reg) & 31)
@@ -3681,6 +3688,27 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
                 rel.Type = objabi.R_ADDRARM64
                 o3 |= 2<<30 | 5<<27 | 2<<23 | 1<<22 | uint32(p.To.Offset&31)<<10 | (REGTMP&31)<<5 | uint32(p.To.Reg&31)
  
+       case 89: /* vadd/vsub Vm, Vn, Vd */
+               switch p.As {
+               case AVADD:
+                       o1 = 5<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10
+
+               case AVSUB:
+                       o1 = 7<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10
+
+               default:
+                       c.ctxt.Diag("bad opcode: %v\n", p)
+                       break
+               }
+
+               rf := int(p.From.Reg)
+               rt := int(p.To.Reg)
+               r := int(p.Reg)
+               if r == 0 {
+                       r = rt
+               }
+               o1 |= (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31)
+
         // This is supposed to be something that stops execution.
         // It's not supposed to be reached, ever, but if it is, we'd
         // like to be able to tell how we got there. Assemble as
@@ -4245,6 +4273,9 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 {
  
         case AVADDV:
                 return 7<<25 | 3<<20 | 3<<15 | 7<<11
+
+       case AVUADDLV:
+               return 1<<29 | 7<<25 | 3<<20 | 7<<11
         }
  
         c.ctxt.Diag("%v: bad rrr %d %v", p, a, a)
diff --git a/src/cmd/internal/obj/arm64/doc.go b/src/cmd/internal/obj/arm64/doc.go

index 9f8606a5ece7f0dafd96720ca8158b0032655533..f75f49fb9c42c105544b99efb94c6e64a9e59db0 100644 (file)
--- a/src/cmd/internal/obj/arm64/doc.go
+++ b/src/cmd/internal/obj/arm64/doc.go
@@ -15,6 +15,10 @@ Go Assembly for ARM64 Reference Manual
      // TODO
  
  3. Alphabetical list of SIMD instructions
+    VADD: Add (scalar)
+      VADD     <Vm>, <Vn>, <Vd>
+        Add corresponding low 64-bit elements in <Vm> and <Vn>,
+        place the result into low 64-bit element of <Vd>.
  
      VADD: Add (vector).
        VADD     <Vm>.T, <Vn>.<T>, <Vd>.<T>
@@ -115,6 +119,16 @@ Go Assembly for ARM64 Reference Manual
          <T> Is an arrangement specifier and can have the following values:
          B8, B16, H4, H8, S2, S4, D1, D2
  
+    VSUB: Sub (scalar)
+      VSUB     <Vm>, <Vn>, <Vd>
+        Subtract low 64-bit element in <Vm> from the corresponding element in <Vn>,
+        place the result into low 64-bit element of <Vd>.
+
+    VUADDLV: Unsigned sum Long across Vector.
+      VUADDLV  <Vn>.<T>, Vd
+        <T> Is an arrangement specifier and can have the following values:
+        8B, 16B, H4, H8, S4
+
  4. Alphabetical list of cryptographic extension instructions
  
      SHA1C, SHA1M, SHA1P: SHA1 hash update.
author	Wei Xiao <wei.xiao@arm.com>
	Fri, 16 Jun 2017 06:45:14 +0000 (06:45 +0000)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Tue, 21 Nov 2017 19:07:38 +0000 (19:07 +0000)
src/bytes/bytes_arm64.go	[new file with mode: 0644]	patch \| blob
src/bytes/bytes_arm64.s	[new file with mode: 0644]	patch \| blob
src/bytes/bytes_generic.go		patch \| blob \| history
src/cmd/asm/internal/asm/testdata/arm64.s		patch \| blob \| history
src/cmd/internal/obj/arm64/a.out.go		patch \| blob \| history
src/cmd/internal/obj/arm64/anames.go		patch \| blob \| history
src/cmd/internal/obj/arm64/asm7.go		patch \| blob \| history
src/cmd/internal/obj/arm64/doc.go		patch \| blob \| history