From 2181653be637cdcc7a6efee8ec0a719df1d83c00 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Justin=20Nu=C3=9F?= Date: Tue, 25 Apr 2017 21:23:34 +0200 Subject: [PATCH] encoding/csv: add option to reuse slices returned by Read MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In many cases the records returned by Reader.Read will only be used between calls to Read and become garbage once a new record is read. In this case, instead of allocating a new slice on each call to Read, we can reuse the last allocated slice for successive calls to avoid unnecessary allocations. This change adds a new field ReuseRecord to the Reader struct to enable this reuse. ReuseRecord is false by default to avoid breaking existing code which dependss on the current behaviour. I also added 4 new benchmarks, corresponding to the existing Read benchmarks, which set ReuseRecord to true. Benchstat on my local machine (old is ReuseRecord = false, new is ReuseRecord = true) name old time/op new time/op delta Read-8 2.75µs ± 2% 1.88µs ± 1% -31.52% (p=0.000 n=14+15) ReadWithFieldsPerRecord-8 2.75µs ± 0% 1.89µs ± 1% -31.43% (p=0.000 n=13+13) ReadWithoutFieldsPerRecord-8 2.77µs ± 1% 1.88µs ± 1% -32.06% (p=0.000 n=15+15) ReadLargeFields-8 55.4µs ± 1% 54.2µs ± 0% -2.07% (p=0.000 n=15+14) name old alloc/op new alloc/op delta Read-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadWithFieldsPerRecord-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadWithoutFieldsPerRecord-8 664B ± 0% 24B ± 0% -96.39% (p=0.000 n=15+15) ReadLargeFields-8 3.94kB ± 0% 2.98kB ± 0% -24.39% (p=0.000 n=15+15) name old allocs/op new allocs/op delta Read-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadWithFieldsPerRecord-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadWithoutFieldsPerRecord-8 18.0 ± 0% 8.0 ± 0% -55.56% (p=0.000 n=15+15) ReadLargeFields-8 24.0 ± 0% 12.0 ± 0% -50.00% (p=0.000 n=15+15) Fixes #19721 Change-Id: I79b14128bb9bb3465f53f40f93b1b528a9da6f58 Reviewed-on: https://go-review.googlesource.com/41730 Reviewed-by: Brad Fitzpatrick Run-TryBot: Brad Fitzpatrick TryBot-Result: Gobot Gobot --- src/encoding/csv/reader.go | 69 +++++++++++++++++++++++---------- src/encoding/csv/reader_test.go | 31 +++++++++++++++ 2 files changed, 80 insertions(+), 20 deletions(-) diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go index c8c4ca7758..a3497c84f9 100644 --- a/src/encoding/csv/reader.go +++ b/src/encoding/csv/reader.go @@ -110,6 +110,10 @@ type Reader struct { // If TrimLeadingSpace is true, leading white space in a field is ignored. // This is done even if the field delimiter, Comma, is white space. TrimLeadingSpace bool + // ReuseRecord controls whether calls to Read may return a slice sharing + // the backing array of the previous call's returned slice for performance. + // By default, each call to Read returns newly allocated memory owned by the caller. + ReuseRecord bool line int column int @@ -122,6 +126,9 @@ type Reader struct { // Indexes of fields inside lineBuffer // The i'th field starts at offset fieldIndexes[i] in lineBuffer. fieldIndexes []int + + // only used when ReuseRecord == true + lastRecord []string } // NewReader returns a new Reader that reads from r. @@ -147,26 +154,17 @@ func (r *Reader) error(err error) error { // Except for that case, Read always returns either a non-nil // record or a non-nil error, but not both. // If there is no data left to be read, Read returns nil, io.EOF. +// If ReuseRecord is true, the returned slice may be shared +// between multiple calls to Read. func (r *Reader) Read() (record []string, err error) { - for { - record, err = r.parseRecord() - if record != nil { - break - } - if err != nil { - return nil, err - } + if r.ReuseRecord { + record, err = r.readRecord(r.lastRecord) + r.lastRecord = record + } else { + record, err = r.readRecord(nil) } - if r.FieldsPerRecord > 0 { - if len(record) != r.FieldsPerRecord { - r.column = 0 // report at start of record - return record, r.error(ErrFieldCount) - } - } else if r.FieldsPerRecord == 0 { - r.FieldsPerRecord = len(record) - } - return record, nil + return record, err } // ReadAll reads all the remaining records from r. @@ -176,7 +174,7 @@ func (r *Reader) Read() (record []string, err error) { // reported. func (r *Reader) ReadAll() (records [][]string, err error) { for { - record, err := r.Read() + record, err := r.readRecord(nil) if err == io.EOF { return records, nil } @@ -187,6 +185,31 @@ func (r *Reader) ReadAll() (records [][]string, err error) { } } +// readRecord reads and parses a single csv record from r. +// Unlike parseRecord, readRecord handles FieldsPerRecord. +// If dst has enough capacity it will be used for the returned record. +func (r *Reader) readRecord(dst []string) (record []string, err error) { + for { + record, err = r.parseRecord(dst) + if record != nil { + break + } + if err != nil { + return nil, err + } + } + + if r.FieldsPerRecord > 0 { + if len(record) != r.FieldsPerRecord { + r.column = 0 // report at start of record + return record, r.error(ErrFieldCount) + } + } else if r.FieldsPerRecord == 0 { + r.FieldsPerRecord = len(record) + } + return record, nil +} + // readRune reads one rune from r, folding \r\n to \n and keeping track // of how far into the line we have read. r.column will point to the start // of this rune, not the end of this rune. @@ -223,7 +246,8 @@ func (r *Reader) skip(delim rune) error { } // parseRecord reads and parses a single csv record from r. -func (r *Reader) parseRecord() (fields []string, err error) { +// If dst has enough capacity it will be used for the returned fields. +func (r *Reader) parseRecord(dst []string) (fields []string, err error) { // Each record starts on a new line. We increment our line // number (lines start at 1, not 0) and set column to -1 // so as we increment in readRune it points to the character we read. @@ -275,7 +299,12 @@ func (r *Reader) parseRecord() (fields []string, err error) { // minimal and a tradeoff for better performance through the combined // allocations. line := r.lineBuffer.String() - fields = make([]string, fieldCount) + + if cap(dst) >= fieldCount { + fields = dst[:fieldCount] + } else { + fields = make([]string, fieldCount) + } for i, idx := range r.fieldIndexes { if i == fieldCount-1 { diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go index 7b3aca4c5f..5ab1b61256 100644 --- a/src/encoding/csv/reader_test.go +++ b/src/encoding/csv/reader_test.go @@ -24,6 +24,7 @@ var readTests = []struct { LazyQuotes bool TrailingComma bool TrimLeadingSpace bool + ReuseRecord bool Error string Line int // Expected error line if != 0 @@ -260,6 +261,15 @@ x,,, {"c", "d", "e"}, }, }, + { + Name: "ReadAllReuseRecord", + ReuseRecord: true, + Input: "a,b\nc,d", + Output: [][]string{ + {"a", "b"}, + {"c", "d"}, + }, + }, } func TestRead(t *testing.T) { @@ -274,6 +284,7 @@ func TestRead(t *testing.T) { r.LazyQuotes = tt.LazyQuotes r.TrailingComma = tt.TrailingComma r.TrimLeadingSpace = tt.TrimLeadingSpace + r.ReuseRecord = tt.ReuseRecord if tt.Comma != 0 { r.Comma = tt.Comma } @@ -369,3 +380,23 @@ xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzz xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv `, 3)) } + +func BenchmarkReadReuseRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData) +} + +func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData) +} + +func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData) +} + +func BenchmarkReadReuseRecordLargeFields(b *testing.B) { + benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv +,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv +`, 3)) +} -- 2.48.1