From 9b875bc037407b47c4922871390fbae8e3f16592 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 7 Dec 2011 15:09:56 -0500 Subject: [PATCH] bytes: faster Count, Index, Equal Benchmarks are from GOARCH=amd64 on a MacPro5,1. benchmark old MB/s new MB/s speedup bytes_test.BenchmarkEqual32 452.89 891.07 1.97x bytes_test.BenchmarkEqual4K 852.71 1700.44 1.99x bytes_test.BenchmarkEqual4M 841.53 1587.93 1.89x bytes_test.BenchmarkEqual64M 838.22 1578.14 1.88x bytes_test.BenchmarkIndex32 58.02 48.99 0.84x bytes_test.BenchmarkIndex4K 48.26 41.32 0.86x bytes_test.BenchmarkIndex4M 48.20 41.24 0.86x bytes_test.BenchmarkIndex64M 48.08 41.21 0.86x bytes_test.BenchmarkIndexEasy32 410.04 546.82 1.33x bytes_test.BenchmarkIndexEasy4K 849.26 14257.37 16.79x bytes_test.BenchmarkIndexEasy4M 854.54 17222.15 20.15x bytes_test.BenchmarkIndexEasy64M 843.57 11060.40 13.11x bytes_test.BenchmarkCount32 57.24 50.68 0.89x bytes_test.BenchmarkCount4K 48.19 41.82 0.87x bytes_test.BenchmarkCount4M 48.18 41.74 0.87x bytes_test.BenchmarkCount64M 48.17 41.71 0.87x bytes_test.BenchmarkCountEasy32 433.11 547.44 1.26x bytes_test.BenchmarkCountEasy4K 1130.59 14194.06 12.55x bytes_test.BenchmarkCountEasy4M 1131.23 17231.18 15.23x bytes_test.BenchmarkCountEasy64M 1111.40 11068.88 9.96x The non-easy Count/Index benchmarks are a worst case input. regexp.BenchmarkMatchEasy0_32 237.46 221.47 0.93x regexp.BenchmarkMatchEasy0_1K 553.53 1019.72 1.84x regexp.BenchmarkMatchEasy0_32K 693.99 1672.06 2.41x regexp.BenchmarkMatchEasy0_1M 688.72 1611.68 2.34x regexp.BenchmarkMatchEasy0_32M 680.70 1565.05 2.30x regexp.BenchmarkMatchEasy1_32 165.56 243.08 1.47x regexp.BenchmarkMatchEasy1_1K 336.45 496.32 1.48x regexp.BenchmarkMatchEasy1_32K 302.80 425.63 1.41x regexp.BenchmarkMatchEasy1_1M 300.42 414.20 1.38x regexp.BenchmarkMatchEasy1_32M 299.64 413.47 1.38x R=golang-dev, r, iant CC=golang-dev https://golang.org/cl/5451116 --- src/pkg/bytes/asm_386.s | 16 ++++ src/pkg/bytes/asm_amd64.s | 16 ++++ src/pkg/bytes/asm_arm.s | 3 + src/pkg/bytes/bytes.go | 53 ++++++++++--- src/pkg/bytes/bytes_test.go | 147 +++++++++++++++++++++++++++++++---- src/pkg/bytes/export_test.go | 1 + 6 files changed, 210 insertions(+), 26 deletions(-) diff --git a/src/pkg/bytes/asm_386.s b/src/pkg/bytes/asm_386.s index f3391740be..e7833de0c8 100644 --- a/src/pkg/bytes/asm_386.s +++ b/src/pkg/bytes/asm_386.s @@ -15,3 +15,19 @@ TEXT ·IndexByte(SB),7,$0 SUBL $1, DI MOVL DI, ret+16(FP) RET + +TEXT ·Equal(SB),7,$0 + MOVL len+4(FP), BX + MOVL len1+16(FP), CX + MOVL $0, AX + CMPL BX, CX + JNE eqret + MOVL p+0(FP), SI + MOVL q+12(FP), DI + CLD + REP; CMPSB + JNE eqret + MOVL $1, AX +eqret: + MOVB AX, ret+24(FP) + RET diff --git a/src/pkg/bytes/asm_amd64.s b/src/pkg/bytes/asm_amd64.s index c6793cbdcc..bc6e886bda 100644 --- a/src/pkg/bytes/asm_amd64.s +++ b/src/pkg/bytes/asm_amd64.s @@ -90,3 +90,19 @@ success: MOVL DI, ret+24(FP) RET +TEXT ·Equal(SB),7,$0 + MOVL len+8(FP), BX + MOVL len1+24(FP), CX + MOVL $0, AX + MOVL $1, DX + CMPL BX, CX + JNE eqret + MOVQ p+0(FP), SI + MOVQ q+16(FP), DI + CLD + REP; CMPSB + CMOVLEQ DX, AX +eqret: + MOVB AX, ret+32(FP) + RET + diff --git a/src/pkg/bytes/asm_arm.s b/src/pkg/bytes/asm_arm.s index f32fca1366..4ed0c1580a 100644 --- a/src/pkg/bytes/asm_arm.s +++ b/src/pkg/bytes/asm_arm.s @@ -6,3 +6,6 @@ TEXT ·IndexByte(SB),7,$0 B ·indexBytePortable(SB) +// no memcmp implementation on arm yet +TEXT ·Equal(SB),7,$0 + B ·equalPortable(SB) diff --git a/src/pkg/bytes/bytes.go b/src/pkg/bytes/bytes.go index 9bfd88fa39..307c89aa3d 100644 --- a/src/pkg/bytes/bytes.go +++ b/src/pkg/bytes/bytes.go @@ -37,7 +37,9 @@ func Compare(a, b []byte) int { } // Equal returns a boolean reporting whether a == b. -func Equal(a, b []byte) bool { +func Equal(a, b []byte) bool + +func equalPortable(a, b []byte) bool { if len(a) != len(b) { return false } @@ -74,18 +76,33 @@ func explode(s []byte, n int) [][]byte { // Count counts the number of non-overlapping instances of sep in s. func Count(s, sep []byte) int { - if len(sep) == 0 { + n := len(sep) + if n == 0 { return utf8.RuneCount(s) + 1 } + if n > len(s) { + return 0 + } + count := 0 c := sep[0] - n := 0 - for i := 0; i+len(sep) <= len(s); i++ { - if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) { - n++ - i += len(sep) - 1 + i := 0 + t := s[:len(s)-n+1] + for i < len(t) { + if t[i] != c { + o := IndexByte(t[i:], c) + if o < 0 { + break + } + i += o } + if n == 1 || Equal(s[i:i+n], sep) { + count++ + i += n + continue + } + i++ } - return n + return count } // Contains returns whether subslice is within b. @@ -99,11 +116,27 @@ func Index(s, sep []byte) int { if n == 0 { return 0 } + if n > len(s) { + return -1 + } c := sep[0] - for i := 0; i+n <= len(s); i++ { - if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) { + if n == 1 { + return IndexByte(s, c) + } + i := 0 + t := s[:len(s)-n+1] + for i < len(t) { + if t[i] != c { + o := IndexByte(t[i:], c) + if o < 0 { + break + } + i += o + } + if Equal(s[i:i+n], sep) { return i } + i++ } return -1 } diff --git a/src/pkg/bytes/bytes_test.go b/src/pkg/bytes/bytes_test.go index 829ef05319..a2a08c20db 100644 --- a/src/pkg/bytes/bytes_test.go +++ b/src/pkg/bytes/bytes_test.go @@ -64,13 +64,17 @@ func TestCompare(t *testing.T) { a := []byte(tt.a) b := []byte(tt.b) cmp := Compare(a, b) - eql := Equal(a, b) if cmp != tt.i { t.Errorf(`Compare(%q, %q) = %v`, tt.a, tt.b, cmp) } + eql := Equal(a, b) if eql != (tt.i == 0) { t.Errorf(`Equal(%q, %q) = %v`, tt.a, tt.b, eql) } + eql = EqualPortable(a, b) + if eql != (tt.i == 0) { + t.Errorf(`EqualPortable(%q, %q) = %v`, tt.a, tt.b, eql) + } } } @@ -264,41 +268,152 @@ func TestIndexRune(t *testing.T) { } } -func BenchmarkIndexByte4K(b *testing.B) { bmIndex(b, IndexByte, 4<<10) } - -func BenchmarkIndexByte4M(b *testing.B) { bmIndex(b, IndexByte, 4<<20) } +var bmbuf []byte -func BenchmarkIndexByte64M(b *testing.B) { bmIndex(b, IndexByte, 64<<20) } +func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) } +func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) } +func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) } +func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) } +func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) } +func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) } +func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) } +func BenchmarkIndexBytePortable64M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 64<<20) } -func BenchmarkIndexBytePortable4K(b *testing.B) { - bmIndex(b, IndexBytePortable, 4<<10) +func bmIndexByte(b *testing.B, index func([]byte, byte) int, n int) { + if len(bmbuf) < n { + bmbuf = make([]byte, n) + } + b.SetBytes(int64(n)) + buf := bmbuf[0:n] + buf[n-1] = 'x' + for i := 0; i < b.N; i++ { + j := index(buf, 'x') + if j != n-1 { + println("bad index", j) + panic("bad index") + } + } + buf[n-1] = '\x00' } -func BenchmarkIndexBytePortable4M(b *testing.B) { - bmIndex(b, IndexBytePortable, 4<<20) +func BenchmarkEqual32(b *testing.B) { bmEqual(b, Equal, 32) } +func BenchmarkEqual4K(b *testing.B) { bmEqual(b, Equal, 4<<10) } +func BenchmarkEqual4M(b *testing.B) { bmEqual(b, Equal, 4<<20) } +func BenchmarkEqual64M(b *testing.B) { bmEqual(b, Equal, 64<<20) } +func BenchmarkEqualPort32(b *testing.B) { bmEqual(b, EqualPortable, 32) } +func BenchmarkEqualPort4K(b *testing.B) { bmEqual(b, EqualPortable, 4<<10) } +func BenchmarkEqualPortable4M(b *testing.B) { bmEqual(b, EqualPortable, 4<<20) } +func BenchmarkEqualPortable64M(b *testing.B) { bmEqual(b, EqualPortable, 64<<20) } + +func bmEqual(b *testing.B, equal func([]byte, []byte) bool, n int) { + if len(bmbuf) < 2*n { + bmbuf = make([]byte, 2*n) + } + b.SetBytes(int64(n)) + buf1 := bmbuf[0:n] + buf2 := bmbuf[n : 2*n] + buf1[n-1] = 'x' + buf2[n-1] = 'x' + for i := 0; i < b.N; i++ { + eq := equal(buf1, buf2) + if !eq { + panic("bad equal") + } + } + buf1[n-1] = '\x00' + buf2[n-1] = '\x00' } -func BenchmarkIndexBytePortable64M(b *testing.B) { - bmIndex(b, IndexBytePortable, 64<<20) +func BenchmarkIndex32(b *testing.B) { bmIndex(b, Index, 32) } +func BenchmarkIndex4K(b *testing.B) { bmIndex(b, Index, 4<<10) } +func BenchmarkIndex4M(b *testing.B) { bmIndex(b, Index, 4<<20) } +func BenchmarkIndex64M(b *testing.B) { bmIndex(b, Index, 64<<20) } + +func bmIndex(b *testing.B, index func([]byte, []byte) int, n int) { + if len(bmbuf) < n { + bmbuf = make([]byte, n) + } + b.SetBytes(int64(n)) + buf := bmbuf[0:n] + buf[n-1] = 'x' + for i := 0; i < b.N; i++ { + j := index(buf, buf[n-7:]) + if j != n-7 { + println("bad index", j) + panic("bad index") + } + } + buf[n-1] = '\x00' } -var bmbuf []byte +func BenchmarkIndexEasy32(b *testing.B) { bmIndexEasy(b, Index, 32) } +func BenchmarkIndexEasy4K(b *testing.B) { bmIndexEasy(b, Index, 4<<10) } +func BenchmarkIndexEasy4M(b *testing.B) { bmIndexEasy(b, Index, 4<<20) } +func BenchmarkIndexEasy64M(b *testing.B) { bmIndexEasy(b, Index, 64<<20) } -func bmIndex(b *testing.B, index func([]byte, byte) int, n int) { +func bmIndexEasy(b *testing.B, index func([]byte, []byte) int, n int) { if len(bmbuf) < n { bmbuf = make([]byte, n) } b.SetBytes(int64(n)) buf := bmbuf[0:n] buf[n-1] = 'x' + buf[n-7] = 'x' for i := 0; i < b.N; i++ { - j := index(buf, 'x') - if j != n-1 { + j := index(buf, buf[n-7:]) + if j != n-7 { println("bad index", j) panic("bad index") } } - buf[n-1] = '0' + buf[n-1] = '\x00' + buf[n-7] = '\x00' +} + +func BenchmarkCount32(b *testing.B) { bmCount(b, Count, 32) } +func BenchmarkCount4K(b *testing.B) { bmCount(b, Count, 4<<10) } +func BenchmarkCount4M(b *testing.B) { bmCount(b, Count, 4<<20) } +func BenchmarkCount64M(b *testing.B) { bmCount(b, Count, 64<<20) } + +func bmCount(b *testing.B, count func([]byte, []byte) int, n int) { + if len(bmbuf) < n { + bmbuf = make([]byte, n) + } + b.SetBytes(int64(n)) + buf := bmbuf[0:n] + buf[n-1] = 'x' + for i := 0; i < b.N; i++ { + j := count(buf, buf[n-7:]) + if j != 1 { + println("bad count", j) + panic("bad count") + } + } + buf[n-1] = '\x00' +} + +func BenchmarkCountEasy32(b *testing.B) { bmCountEasy(b, Count, 32) } +func BenchmarkCountEasy4K(b *testing.B) { bmCountEasy(b, Count, 4<<10) } +func BenchmarkCountEasy4M(b *testing.B) { bmCountEasy(b, Count, 4<<20) } +func BenchmarkCountEasy64M(b *testing.B) { bmCountEasy(b, Count, 64<<20) } + +func bmCountEasy(b *testing.B, count func([]byte, []byte) int, n int) { + if len(bmbuf) < n { + bmbuf = make([]byte, n) + } + b.SetBytes(int64(n)) + buf := bmbuf[0:n] + buf[n-1] = 'x' + buf[n-7] = 'x' + for i := 0; i < b.N; i++ { + j := count(buf, buf[n-7:]) + if j != 1 { + println("bad count", j) + panic("bad count") + } + } + buf[n-1] = '\x00' + buf[n-7] = '\x00' } type ExplodeTest struct { diff --git a/src/pkg/bytes/export_test.go b/src/pkg/bytes/export_test.go index b65428d9ce..f61523e60b 100644 --- a/src/pkg/bytes/export_test.go +++ b/src/pkg/bytes/export_test.go @@ -6,3 +6,4 @@ package bytes // Export func for testing var IndexBytePortable = indexBytePortable +var EqualPortable = equalPortable -- 2.48.1