From: Ilya Tocar
Date: Fri, 14 Apr 2017 18:53:40 +0000 (-0500)
Subject: cmd/compile/internal/gc: speed up small array comparison
X-Git-Tag: go1.9beta1~133
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=3bdc2f3abf0f9cffc8f4e294ef22a23b82e88415;p=gostls13.git

cmd/compile/internal/gc: speed up small array comparison

Currently we inline array comparisons only for arrays with at most
4 elements. Compare small arrays with more than 4 elements (e.g.
[16]byte) using wider 2/4/8-byte compares. This gives slightly
smaller binaries and faster code.

ArrayEqual-6              7.41ns ± 0%  3.17ns ± 0%  -57.15%  (p=0.000 n=10+10)

For the go tool: global text (code) = -559 bytes (-0.014566%)

This also helps mapaccess1_faststr, and maps in general:

MapDelete/Str/1-6          195ns ± 1%   186ns ± 2%   -4.47%  (p=0.000 n=10+10)
MapDelete/Str/2-6          211ns ± 1%   177ns ± 1%  -16.01%  (p=0.000 n=10+10)
MapDelete/Str/4-6          225ns ± 1%   183ns ± 1%  -18.49%  (p=0.000 n=8+10)
MapStringKeysEight_16-6   31.3ns ± 0%  28.6ns ± 0%   -8.63%  (p=0.000 n=6+9)
MapStringKeysEight_32-6   29.2ns ± 0%  27.6ns ± 0%   -5.45%  (p=0.000 n=10+10)
MapStringKeysEight_64-6   29.1ns ± 1%  27.5ns ± 0%   -5.46%  (p=0.000 n=10+10)
MapStringKeysEight_1M-6   29.1ns ± 1%  27.6ns ± 0%   -5.49%  (p=0.000 n=10+10)

Change-Id: I9ec98e41b233031e0e96c4e13d86a324f628ed4a
Reviewed-on: https://go-review.googlesource.com/40771
Run-TryBot: Ilya Tocar
TryBot-Result: Gobot Gobot
Reviewed-by: Keith Randall
---

diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index bac09ef295..221b8497f1 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -166,7 +166,7 @@ var allAsmTests = []*asmTests{
     {
         arch:    "amd64",
         os:      "linux",
-        imports: []string{"encoding/binary", "math/bits"},
+        imports: []string{"encoding/binary", "math/bits", "unsafe"},
         tests:   linuxAMD64Tests,
     },
     {
@@ -869,6 +869,35 @@ var linuxAMD64Tests = []*asmTest{
         }`,
         []string{"\tRORB\t"},
     },
+    // Check that array compares use 2/4/8-byte compares.
+    {
+        `
+        func f68(a, b [2]byte) bool {
+            return a == b
+        }`,
+        []string{"\tCMPW\t[A-Z]"},
+    },
+    {
+        `
+        func f69(a, b [3]uint16) bool {
+            return a == b
+        }`,
+        []string{"\tCMPL\t[A-Z]"},
+    },
+    {
+        `
+        func f70(a, b [15]byte) bool {
+            return a == b
+        }`,
+        []string{"\tCMPQ\t[A-Z]"},
+    },
+    {
+        `
+        func f71(a, b unsafe.Pointer) bool { // This was a TODO in mapaccess1_faststr.
+            return *((*[4]byte)(a)) != *((*[4]byte)(b))
+        }`,
+        []string{"\tCMPL\t[A-Z]"},
+    },
 }
 
 var linux386Tests = []*asmTest{
diff --git a/src/cmd/compile/internal/gc/walk.go b/src/cmd/compile/internal/gc/walk.go
index 557293b9f0..15108e6e57 100644
--- a/src/cmd/compile/internal/gc/walk.go
+++ b/src/cmd/compile/internal/gc/walk.go
@@ -3243,11 +3243,25 @@ func walkcompare(n *Node, init *Nodes) *Node {
     // inline or call an eq alg.
     t := n.Left.Type
     var inline bool
+
+    maxcmpsize := int64(4)
+    unalignedLoad := false
+    switch thearch.LinkArch.Family {
+    case sys.AMD64, sys.ARM64, sys.S390X:
+        // Keep this low enough to generate less code than a function call.
+        maxcmpsize = 16
+        unalignedLoad = true
+    case sys.I386:
+        maxcmpsize = 8
+        unalignedLoad = true
+    }
+
     switch t.Etype {
     default:
         return n
     case TARRAY:
-        inline = t.NumElem() <= 1 || (t.NumElem() <= 4 && issimple[t.Elem().Etype])
+        // We can compare several elements at once with 2/4/8-byte integer compares.
+        inline = t.NumElem() <= 1 || (issimple[t.Elem().Etype] && (t.NumElem() <= 4 || t.Elem().Width*t.NumElem() <= maxcmpsize))
     case TSTRUCT:
         inline = t.NumFields() <= 4
     }
@@ -3333,11 +3347,54 @@ func walkcompare(n *Node, init *Nodes) *Node {
             )
         }
     } else {
-        for i := 0; int64(i) < t.NumElem(); i++ {
-            compare(
-                nod(OINDEX, cmpl, nodintconst(int64(i))),
-                nod(OINDEX, cmpr, nodintconst(int64(i))),
-            )
+        step := int64(1)
+        remains := t.NumElem() * t.Elem().Width
+        combine64bit := unalignedLoad && Widthreg == 8 && t.Elem().Width <= 4 && t.Elem().IsInteger()
+        combine32bit := unalignedLoad && t.Elem().Width <= 2 && t.Elem().IsInteger()
+        combine16bit := unalignedLoad && t.Elem().Width == 1 && t.Elem().IsInteger()
+        for i := int64(0); remains > 0; {
+            var convType *types.Type
+            switch {
+            case remains >= 8 && combine64bit:
+                convType = types.Types[TINT64]
+                step = 8 / t.Elem().Width
+            case remains >= 4 && combine32bit:
+                convType = types.Types[TUINT32]
+                step = 4 / t.Elem().Width
+            case remains >= 2 && combine16bit:
+                convType = types.Types[TUINT16]
+                step = 2 / t.Elem().Width
+            default:
+                step = 1
+            }
+            if step == 1 {
+                compare(
+                    nod(OINDEX, cmpl, nodintconst(int64(i))),
+                    nod(OINDEX, cmpr, nodintconst(int64(i))),
+                )
+                i++
+                remains -= t.Elem().Width
+            } else {
+                cmplw := nod(OINDEX, cmpl, nodintconst(int64(i)))
+                cmplw = conv(cmplw, convType)
+                cmprw := nod(OINDEX, cmpr, nodintconst(int64(i)))
+                cmprw = conv(cmprw, convType)
+                // For code like this: uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 ...
+                // ssa will generate a single large load.
+                for offset := int64(1); offset < step; offset++ {
+                    lb := nod(OINDEX, cmpl, nodintconst(int64(i+offset)))
+                    lb = conv(lb, convType)
+                    lb = nod(OLSH, lb, nodintconst(int64(8*t.Elem().Width*offset)))
+                    cmplw = nod(OOR, cmplw, lb)
+                    rb := nod(OINDEX, cmpr, nodintconst(int64(i+offset)))
+                    rb = conv(rb, convType)
+                    rb = nod(OLSH, rb, nodintconst(int64(8*t.Elem().Width*offset)))
+                    cmprw = nod(OOR, cmprw, rb)
+                }
+                compare(cmplw, cmprw)
+                i += step
+                remains -= step * t.Elem().Width
+            }
         }
     }
     if expr == nil {
diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go
index ebba001d46..1f9b313219 100644
--- a/src/runtime/hashmap_fast.go
+++ b/src/runtime/hashmap_fast.go
@@ -252,8 +252,6 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
             return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
         }
         // check first 4 bytes
-        // TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
-        // four 1-byte comparisons.
         if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
             continue
         }
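
A note for readers of the walkcompare change: the rewrite leans on an
existing SSA optimization that recognizes bytes combined with shifts and
ORs as a single wide load. Below is a minimal standalone sketch of that
pattern in plain Go, assuming amd64; the names load32 and eq4 are
illustrative and do not appear in the patch.

    package main

    import "fmt"

    // load32 combines four adjacent bytes with shifts and ORs.
    // On amd64, the SSA backend recognizes this pattern and emits a
    // single 4-byte load instead of four 1-byte loads.
    func load32(s *[4]byte) uint32 {
        return uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
    }

    // eq4 is roughly the shape walkcompare now generates for a [4]byte
    // comparison: one 4-byte compare (CMPL) instead of four 1-byte ones.
    func eq4(a, b *[4]byte) bool {
        return load32(a) == load32(b)
    }

    func main() {
        x := [4]byte{'h', 'a', 's', 'h'}
        y := [4]byte{'h', 'a', 's', 'h'}
        fmt.Println(eq4(&x, &y)) // true
    }

This is also why the hashmap_fast.go TODO could simply be deleted: the
4-byte prefix check in mapaccess1_faststr should now compile to a single
CMPL, which is exactly what the new f71 asm test verifies for the
equivalent unsafe.Pointer form.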
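To make the new chunking loop concrete, here is a sketch of how it
carves an array into compare-sized chunks, assuming an amd64-class
target (unaligned loads allowed, 8-byte registers, simple integer
elements). The helper chunkWidths is hypothetical and only mirrors the
step selection; it is not part of the patch.

    package main

    import "fmt"

    // chunkWidths returns the byte width of each compare emitted for an
    // array of numElem elements of elemWidth bytes: greedily take 8-,
    // 4-, then 2-byte chunks while elements can be combined, falling
    // back to one element at a time.
    func chunkWidths(numElem, elemWidth int64) []int64 {
        remains := numElem * elemWidth
        var widths []int64
        for remains > 0 {
            var step int64
            switch {
            case remains >= 8 && elemWidth <= 4:
                step = 8 / elemWidth // one 64-bit compare
            case remains >= 4 && elemWidth <= 2:
                step = 4 / elemWidth // one 32-bit compare
            case remains >= 2 && elemWidth == 1:
                step = 2 // one 16-bit compare
            default:
                step = 1 // compare a single element
            }
            widths = append(widths, step*elemWidth)
            remains -= step * elemWidth
        }
        return widths
    }

    func main() {
        fmt.Println(chunkWidths(15, 1)) // [15]byte:  [8 4 2 1]
        fmt.Println(chunkWidths(3, 2))  // [3]uint16: [4 2]
        fmt.Println(chunkWidths(2, 1))  // [2]byte:   [2]
    }

For [15]byte this yields an 8-byte, a 4-byte, a 2-byte, and a 1-byte
compare, which is why the new f70 test expects a CMPQ; for [3]uint16 it
yields a 4-byte plus a 2-byte compare, matching the CMPL expected by
f69.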