cmd/compile/internal/gc: speed up small array comparison

author Ilya Tocar <ilya.tocar@intel.com>

Fri, 14 Apr 2017 18:53:40 +0000 (13:53 -0500)

committer Ilya Tocar <ilya.tocar@intel.com>

Thu, 1 Jun 2017 15:46:16 +0000 (15:46 +0000)
author Ilya Tocar <ilya.tocar@intel.com>
Fri, 14 Apr 2017 18:53:40 +0000 (13:53 -0500)
committer Ilya Tocar <ilya.tocar@intel.com>
Thu, 1 Jun 2017 15:46:16 +0000 (15:46 +0000)
diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go

index bac09ef295784eb5c2d9088fc2782c544cbe72b9..221b8497f15984d834118d81045eb2a3bd1d73dd 100644 (file)
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -166,7 +166,7 @@ var allAsmTests = []*asmTests{
         {
                 arch:    "amd64",
                 os:      "linux",
-               imports: []string{"encoding/binary", "math/bits"},
+               imports: []string{"encoding/binary", "math/bits", "unsafe"},
                 tests:   linuxAMD64Tests,
         },
         {
@@ -869,6 +869,35 @@ var linuxAMD64Tests = []*asmTest{
                 }`,
                 []string{"\tRORB\t"},
         },
+       // Check that array compare uses 2/4/8 byte compares
+       {
+               `
+               func f68(a,b [2]byte) bool {
+                   return a == b
+               }`,
+               []string{"\tCMPW\t[A-Z]"},
+       },
+       {
+               `
+               func f69(a,b [3]uint16) bool {
+                   return a == b
+               }`,
+               []string{"\tCMPL\t[A-Z]"},
+       },
+       {
+               `
+               func f70(a,b [15]byte) bool {
+                   return a == b
+               }`,
+               []string{"\tCMPQ\t[A-Z]"},
+       },
+       {
+               `
+               func f71(a,b unsafe.Pointer) bool { // This was a TODO in mapaccess1_faststr
+                   return *((*[4]byte)(a)) != *((*[4]byte)(b))
+               }`,
+               []string{"\tCMPL\t[A-Z]"},
+       },
  }
  
  var linux386Tests = []*asmTest{
diff --git a/src/cmd/compile/internal/gc/walk.go b/src/cmd/compile/internal/gc/walk.go

index 557293b9f01bcdf38969e5a12b3836f08f6db736..15108e6e5737a50fdf8daf3a1f2b29a15284ce09 100644 (file)
--- a/src/cmd/compile/internal/gc/walk.go
+++ b/src/cmd/compile/internal/gc/walk.go
@@ -3243,11 +3243,25 @@ func walkcompare(n *Node, init *Nodes) *Node {
         // inline or call an eq alg.
         t := n.Left.Type
         var inline bool
+
+       maxcmpsize := int64(4)
+       unalignedLoad := false
+       switch thearch.LinkArch.Family {
+       case sys.AMD64, sys.ARM64, sys.S390X:
+               // Keep this low enough to generate less code than a function call.
+               maxcmpsize = 16
+               unalignedLoad = true
+       case sys.I386:
+               maxcmpsize = 8
+               unalignedLoad = true
+       }
+
         switch t.Etype {
         default:
                 return n
         case TARRAY:
-               inline = t.NumElem() <= 1 || (t.NumElem() <= 4 && issimple[t.Elem().Etype])
+               // We can compare several elements at once with 2/4/8 byte integer compares
+               inline = t.NumElem() <= 1 || (issimple[t.Elem().Etype] && (t.NumElem() <= 4 || t.Elem().Width*t.NumElem() <= maxcmpsize))
         case TSTRUCT:
                 inline = t.NumFields() <= 4
         }
@@ -3333,11 +3347,54 @@ func walkcompare(n *Node, init *Nodes) *Node {
                         )
                 }
         } else {
-               for i := 0; int64(i) < t.NumElem(); i++ {
-                       compare(
-                               nod(OINDEX, cmpl, nodintconst(int64(i))),
-                               nod(OINDEX, cmpr, nodintconst(int64(i))),
-                       )
+               step := int64(1)
+               remains := t.NumElem() * t.Elem().Width
+               combine64bit := unalignedLoad && Widthreg == 8 && t.Elem().Width <= 4 && t.Elem().IsInteger()
+               combine32bit := unalignedLoad && t.Elem().Width <= 2 && t.Elem().IsInteger()
+               combine16bit := unalignedLoad && t.Elem().Width == 1 && t.Elem().IsInteger()
+               for i := int64(0); remains > 0; {
+                       var convType *types.Type
+                       switch {
+                       case remains >= 8 && combine64bit:
+                               convType = types.Types[TINT64]
+                               step = 8 / t.Elem().Width
+                       case remains >= 4 && combine32bit:
+                               convType = types.Types[TUINT32]
+                               step = 4 / t.Elem().Width
+                       case remains >= 2 && combine16bit:
+                               convType = types.Types[TUINT16]
+                               step = 2 / t.Elem().Width
+                       default:
+                               step = 1
+                       }
+                       if step == 1 {
+                               compare(
+                                       nod(OINDEX, cmpl, nodintconst(int64(i))),
+                                       nod(OINDEX, cmpr, nodintconst(int64(i))),
+                               )
+                               i++
+                               remains -= t.Elem().Width
+                       } else {
+                               cmplw := nod(OINDEX, cmpl, nodintconst(int64(i)))
+                               cmplw = conv(cmplw, convType)
+                               cmprw := nod(OINDEX, cmpr, nodintconst(int64(i)))
+                               cmprw = conv(cmprw, convType)
+                               // For code like this:  uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 ...
+                               // ssa will generate a single large load.
+                               for offset := int64(1); offset < step; offset++ {
+                                       lb := nod(OINDEX, cmpl, nodintconst(int64(i+offset)))
+                                       lb = conv(lb, convType)
+                                       lb = nod(OLSH, lb, nodintconst(int64(8*t.Elem().Width*offset)))
+                                       cmplw = nod(OOR, cmplw, lb)
+                                       rb := nod(OINDEX, cmpr, nodintconst(int64(i+offset)))
+                                       rb = conv(rb, convType)
+                                       rb = nod(OLSH, rb, nodintconst(int64(8*t.Elem().Width*offset)))
+                                       cmprw = nod(OOR, cmprw, rb)
+                               }
+                               compare(cmplw, cmprw)
+                               i += step
+                               remains -= step * t.Elem().Width
+                       }
                 }
         }
         if expr == nil {
diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go

index ebba001d46bd4387110caea5f4e73566eae061ed..1f9b313219c52c6a31696c2970219cb155d2ca44 100644 (file)
--- a/src/runtime/hashmap_fast.go
+++ b/src/runtime/hashmap_fast.go
@@ -252,8 +252,6 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
                                 return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
                         }
                         // check first 4 bytes
-                       // TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
-                       // four 1-byte comparisons.
                         if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
                                 continue
                         }
author	Ilya Tocar <ilya.tocar@intel.com>
	Fri, 14 Apr 2017 18:53:40 +0000 (13:53 -0500)
committer	Ilya Tocar <ilya.tocar@intel.com>
	Thu, 1 Jun 2017 15:46:16 +0000 (15:46 +0000)
src/cmd/compile/internal/gc/asm_test.go		patch \| blob \| history
src/cmd/compile/internal/gc/walk.go		patch \| blob \| history
src/runtime/hashmap_fast.go		patch \| blob \| history