From 89ccfe496224bc92f2d2af860cae2f5d7e830f8d Mon Sep 17 00:00:00 2001 From: Joe Tsai Date: Fri, 20 Oct 2017 05:26:43 -0700 Subject: [PATCH] encoding/csv: simplify and optimize Reader MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The Reader implementation is slow because it operates on a rune-by-rune basis via bufio.Reader.ReadRune. We speed this up by operating on entire lines that we read from bufio.Reader.ReadSlice. In order to ensure that we read the full line, we augment ReadSlice in our Reader.readLine method to automatically expand the slice if bufio.ErrBufferFull is every hit. This change happens to fix #19410 because it no longer relies on rune-by-rune parsing and only searches for the relevant delimiter rune. In order to keep column accounting simple and consistent, this change reverts parts of CL 52830. This CL is an alternative to CL 36270 and builds on some of the ideas from that change by Diogo Pinela. name old time/op new time/op delta Read-8 3.12µs ± 1% 2.54µs ± 2% -18.76% (p=0.000 n=10+9) ReadWithFieldsPerRecord-8 3.12µs ± 1% 2.53µs ± 1% -18.91% (p=0.000 n=9+9) ReadWithoutFieldsPerRecord-8 3.13µs ± 0% 2.57µs ± 3% -18.07% (p=0.000 n=10+10) ReadLargeFields-8 52.3µs ± 1% 5.3µs ± 2% -89.93% (p=0.000 n=10+9) ReadReuseRecord-8 2.05µs ± 1% 1.40µs ± 1% -31.48% (p=0.000 n=10+9) ReadReuseRecordWithFieldsPerRecord-8 2.05µs ± 1% 1.41µs ± 0% -31.03% (p=0.000 n=10+9) ReadReuseRecordWithoutFieldsPerRecord-8 2.06µs ± 1% 1.40µs ± 1% -31.70% (p=0.000 n=9+10) ReadReuseRecordLargeFields-8 50.9µs ± 0% 4.1µs ± 3% -92.01% (p=0.000 n=10+10) name old alloc/op new alloc/op Read-8 664B ± 0% 664B ± 0% ReadWithFieldsPerRecord-8 664B ± 0% 664B ± 0% ReadWithoutFieldsPerRecord-8 664B ± 0% 664B ± 0% ReadLargeFields-8 3.94kB ± 0% 3.94kB ± 0% ReadReuseRecord-8 24.0B ± 0% 24.0B ± 0% ReadReuseRecordWithFieldsPerRecord-8 24.0B ± 0% 24.0B ± 0% ReadReuseRecordWithoutFieldsPerRecord-8 24.0B ± 0% 24.0B ± 0% ReadReuseRecordLargeFields-8 2.98kB ± 0% 2.98kB ± 0% name old allocs/op new allocs/op Read-8 18.0 ± 0% 18.0 ± 0% ReadWithFieldsPerRecord-8 18.0 ± 0% 18.0 ± 0% ReadWithoutFieldsPerRecord-8 18.0 ± 0% 18.0 ± 0% ReadLargeFields-8 24.0 ± 0% 24.0 ± 0% ReadReuseRecord-8 8.00 ± 0% 8.00 ± 0% ReadReuseRecordWithFieldsPerRecord-8 8.00 ± 0% 8.00 ± 0% ReadReuseRecordWithoutFieldsPerRecord-8 8.00 ± 0% 8.00 ± 0% ReadReuseRecordLargeFields-8 12.0 ± 0% 12.0 ± 0% Updates #22352 Updates #19019 Fixes #16791 Fixes #19410 Change-Id: I31c27cfcc56880e6abac262f36c947179b550bbf Reviewed-on: https://go-review.googlesource.com/72150 Reviewed-by: Ian Lance Taylor Run-TryBot: Joe Tsai TryBot-Result: Gobot Gobot --- src/encoding/csv/reader.go | 387 ++++++++++++++------------------ src/encoding/csv/reader_test.go | 135 ++++++++--- 2 files changed, 270 insertions(+), 252 deletions(-) diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go index e49240fb53..3c08b9f9d1 100644 --- a/src/encoding/csv/reader.go +++ b/src/encoding/csv/reader.go @@ -58,6 +58,7 @@ import ( "fmt" "io" "unicode" + "unicode/utf8" ) // A ParseError is returned for parsing errors. @@ -115,20 +116,25 @@ type Reader struct { // By default, each call to Read returns newly allocated memory owned by the caller. ReuseRecord bool - line int - recordLine int // line where the current record started - column int - r *bufio.Reader - // lineBuffer holds the unescaped fields read by readField, one after another. + r *bufio.Reader + + // numLine is the current line being read in the CSV file. + numLine int + + // rawBuffer is a line buffer only used by the readLine method. + rawBuffer []byte + + // recordBuffer holds the unescaped fields, one after another. // The fields can be accessed by using the indexes in fieldIndexes. - // Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and - // fieldIndexes will contain the indexes 0, 1, 2, 5. - lineBuffer bytes.Buffer - // Indexes of fields inside lineBuffer - // The i'th field starts at offset fieldIndexes[i] in lineBuffer. + // E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de` + // and fieldIndexes will contain the indexes [1, 2, 5, 6]. + recordBuffer []byte + + // fieldIndexes is an index of fields inside recordBuffer. + // The i'th field ends at offset fieldIndexes[i] in recordBuffer. fieldIndexes []int - // only used when ReuseRecord == true + // lastRecord is a record cache and only used when ReuseRecord == true. lastRecord []string } @@ -140,15 +146,6 @@ func NewReader(r io.Reader) *Reader { } } -// error creates a new ParseError based on err. -func (r *Reader) error(err error) error { - return &ParseError{ - Line: r.recordLine, - Column: r.column, - Err: err, - } -} - // Read reads one record (a slice of fields) from r. // If the record has an unexpected number of fields, // Read returns the record along with the error ErrFieldCount. @@ -164,7 +161,6 @@ func (r *Reader) Read() (record []string, err error) { } else { record, err = r.readRecord(nil) } - return record, err } @@ -186,237 +182,182 @@ func (r *Reader) ReadAll() (records [][]string, err error) { } } -// readRecord reads and parses a single csv record from r. -// Unlike parseRecord, readRecord handles FieldsPerRecord. -// If dst has enough capacity it will be used for the returned record. -func (r *Reader) readRecord(dst []string) (record []string, err error) { - for { - record, err = r.parseRecord(dst) - if record != nil { - break - } - if err != nil { - return nil, err +// readLine reads the next line (with the trailing endline). +// If EOF is hit without a trailing endline, it will be omitted. +// If some bytes were read, then the error is never io.EOF. +// The result is only valid until the next call to readLine. +func (r *Reader) readLine() ([]byte, error) { + line, err := r.r.ReadSlice('\n') + if err == bufio.ErrBufferFull { + r.rawBuffer = append(r.rawBuffer[:0], line...) + for err == bufio.ErrBufferFull { + line, err = r.r.ReadSlice('\n') + r.rawBuffer = append(r.rawBuffer, line...) } + line = r.rawBuffer } - - if r.FieldsPerRecord > 0 { - if len(record) != r.FieldsPerRecord { - r.column = 0 // report at start of record - return record, r.error(ErrFieldCount) - } - } else if r.FieldsPerRecord == 0 { - r.FieldsPerRecord = len(record) + if len(line) > 0 && err == io.EOF { + err = nil } - return record, nil + r.numLine++ + return line, err } -// readRune reads one rune from r, folding \r\n to \n and keeping track -// of how far into the line we have read. r.column will point to the start -// of this rune, not the end of this rune. -func (r *Reader) readRune() (rune, error) { - r1, _, err := r.r.ReadRune() - - // Handle \r\n here. We make the simplifying assumption that - // anytime \r is followed by \n that it can be folded to \n. - // We will not detect files which contain both \r\n and bare \n. - if r1 == '\r' { - r1, _, err = r.r.ReadRune() - if err == nil { - if r1 != '\n' { - r.r.UnreadRune() - r1 = '\r' - } +// lengthCRLF reports the number of bytes for a trailing "\r\n". +func lengthCRLF(b []byte) int { + if j := len(b) - 1; j >= 0 && b[j] == '\n' { + if j := len(b) - 2; j >= 0 && b[j] == '\r' { + return 2 } + return 1 } - r.column++ - return r1, err + return 0 } -// readRawRune works the same way as readRune, but does not fold \r\n to \n. -func (r *Reader) readRawRune() (rune, error) { - r1, _, err := r.r.ReadRune() - r.column++ - return r1, err +// nextRune returns the next rune in b or utf8.RuneError. +func nextRune(b []byte) rune { + r, _ := utf8.DecodeRune(b) + return r } -// skip reads runes up to and including the rune delim or until error. -func (r *Reader) skip(delim rune) error { - for { - r1, err := r.readRune() - if err != nil { - return err +func (r *Reader) readRecord(dst []string) ([]string, error) { + // Read line (automatically skipping past empty lines and any comments). + var line, fullLine []byte + var errRead error + for errRead == nil { + line, errRead = r.readLine() + if r.Comment != 0 && nextRune(line) == r.Comment { + line = nil + continue // Skip comment lines } - if r1 == delim { - return nil + if errRead == nil && len(line) == lengthCRLF(line) { + line = nil + continue // Skip empty lines } + fullLine = line + break } -} - -// parseRecord reads and parses a single csv record from r. -// If dst has enough capacity it will be used for the returned fields. -func (r *Reader) parseRecord(dst []string) (fields []string, err error) { - // Each record starts on a new line. We increment our line - // number (lines start at 1, not 0) and set column to -1 - // so as we increment in readRune it points to the character we read. - // We track the line where the record starts in recordLine for use in errors. - r.line++ - r.recordLine = r.line - r.column = -1 - - // Peek at the first rune. If it is an error we are done. - // If we support comments and it is the comment character - // then skip to the end of line. - - r1, _, err := r.r.ReadRune() - if err != nil { - return nil, err - } - - if r.Comment != 0 && r1 == r.Comment { - return nil, r.skip('\n') + if errRead == io.EOF { + return nil, errRead } - r.r.UnreadRune() - r.lineBuffer.Reset() + // Parse each field in the record. + var err error + const quoteLen = len(`"`) + commaLen := utf8.RuneLen(r.Comma) + recLine := r.numLine // Starting line for record + r.recordBuffer = r.recordBuffer[:0] r.fieldIndexes = r.fieldIndexes[:0] - - // At this point we have at least one field. +parseField: for { - idx := r.lineBuffer.Len() - - haveField, delim, err := r.parseField() - if haveField { - r.fieldIndexes = append(r.fieldIndexes, idx) + if r.TrimLeadingSpace { + line = bytes.TrimLeftFunc(line, unicode.IsSpace) } - - if delim == '\n' || err == io.EOF { - if len(r.fieldIndexes) == 0 { - return nil, err + if len(line) == 0 || line[0] != '"' { + // Non-quoted string field + i := bytes.IndexRune(line, r.Comma) + field := line + if i >= 0 { + field = field[:i] + } else { + field = field[:len(field)-lengthCRLF(field)] } - break - } - - if err != nil { - return nil, err - } - } - - fieldCount := len(r.fieldIndexes) - // Using this approach (creating a single string and taking slices of it) - // means that a single reference to any of the fields will retain the whole - // string. The risk of a nontrivial space leak caused by this is considered - // minimal and a tradeoff for better performance through the combined - // allocations. - line := r.lineBuffer.String() - - if cap(dst) >= fieldCount { - fields = dst[:fieldCount] - } else { - fields = make([]string, fieldCount) - } - - for i, idx := range r.fieldIndexes { - if i == fieldCount-1 { - fields[i] = line[idx:] - } else { - fields[i] = line[idx:r.fieldIndexes[i+1]] - } - } - - return fields, nil -} - -// parseField parses the next field in the record. The read field is -// appended to r.lineBuffer. Delim is the first character not part of the field -// (r.Comma or '\n'). -func (r *Reader) parseField() (haveField bool, delim rune, err error) { - r1, err := r.readRune() - for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) { - r1, err = r.readRune() - } - - if err == io.EOF && r.column != 0 { - return true, 0, err - } - if err != nil { - return false, 0, err - } - - switch r1 { - case r.Comma: - // will check below - - case '\n': - // We are a trailing empty field or a blank line - if r.column == 0 { - return false, r1, nil - } - return true, r1, nil - - case '"': - // quoted field - Quoted: - for { - // use readRawRune instead of readRune to preserve \r\n - // in quotes fields. - r1, err = r.readRawRune() - if err != nil { - if err == io.EOF { - if r.LazyQuotes { - return true, 0, err - } - return false, 0, r.error(ErrQuote) + // Check to make sure a quote does not appear in field. + if !r.LazyQuotes { + if j := bytes.IndexByte(field, '"'); j >= 0 { + col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])]) + err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote} + break parseField } - return false, 0, err } - switch r1 { - case '"': - r1, err = r.readRune() - if err != nil || r1 == r.Comma { - break Quoted - } - if r1 == '\n' { - return true, r1, nil - } - if r1 != '"' { - if !r.LazyQuotes { - r.column-- - return false, 0, r.error(ErrQuote) + r.recordBuffer = append(r.recordBuffer, field...) + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + if i >= 0 { + line = line[i+commaLen:] + continue parseField + } + break parseField + } else { + // Quoted string field + line = line[quoteLen:] + for { + i := bytes.IndexByte(line, '"') + if i >= 0 { + // Hit next quote. + r.recordBuffer = append(r.recordBuffer, line[:i]...) + line = line[i+quoteLen:] + switch rn := nextRune(line); { + case rn == '"': + // `""` sequence (append quote). + r.recordBuffer = append(r.recordBuffer, '"') + line = line[quoteLen:] + case rn == r.Comma: + // `",` sequence (end of field). + line = line[commaLen:] + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + continue parseField + case lengthCRLF(line) == len(line): + // `"\n` sequence (end of line). + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + break parseField + case r.LazyQuotes: + // `"` sequence (bare quote). + r.recordBuffer = append(r.recordBuffer, '"') + default: + // `"*` squence (invalid non-escaped quote). + col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen]) + err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote} + break parseField + } + } else if len(line) > 0 { + // Hit end of line (copy all data so far). + r.recordBuffer = append(r.recordBuffer, line...) + if errRead != nil { + break parseField + } + line, errRead = r.readLine() + if errRead == io.EOF { + errRead = nil } - // accept the bare quote - r.lineBuffer.WriteRune('"') + fullLine = line + } else { + // Abrupt end of file (EOF or error). + if !r.LazyQuotes && errRead == nil { + col := utf8.RuneCount(fullLine) + err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote} + break parseField + } + r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) + break parseField } - case '\n': - r.line++ - r.column = -1 } - r.lineBuffer.WriteRune(r1) } + } + if err == nil { + err = errRead + } - default: - // unquoted field - for { - r.lineBuffer.WriteRune(r1) - r1, err = r.readRune() - if err != nil || r1 == r.Comma { - break - } - if r1 == '\n' { - return true, r1, nil - } - if !r.LazyQuotes && r1 == '"' { - return false, 0, r.error(ErrBareQuote) - } - } + // Create a single string and create slices out of it. + // This pins the memory of the fields together, but allocates once. + str := string(r.recordBuffer) // Convert to string once to batch allocations + dst = dst[:0] + if cap(dst) < len(r.fieldIndexes) { + dst = make([]string, len(r.fieldIndexes)) + } + dst = dst[:len(r.fieldIndexes)] + var preIdx int + for i, idx := range r.fieldIndexes { + dst[i] = str[preIdx:idx] + preIdx = idx } - if err != nil { - if err == io.EOF { - return true, 0, err + // Check or update the expected fields per record. + if r.FieldsPerRecord > 0 { + if len(dst) != r.FieldsPerRecord && err == nil { + err = &ParseError{Line: recLine, Err: ErrFieldCount} } - return false, 0, err + } else if r.FieldsPerRecord == 0 { + r.FieldsPerRecord = len(dst) } - - return true, r1, nil + return dst, err } diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go index 3811629aad..781847cefa 100644 --- a/src/encoding/csv/reader_test.go +++ b/src/encoding/csv/reader_test.go @@ -26,9 +26,8 @@ var readTests = []struct { TrimLeadingSpace bool ReuseRecord bool - Error string - Line int // Expected error line if != 0 - Column int // Expected error column if line != 0 + Error error + Line int // Expected error line if != 0 }{ { Name: "Simple", @@ -140,7 +139,7 @@ field"`, { Name: "BadDoubleQuotes", Input: `a""b,c`, - Error: `bare " in non-quoted-field`, Line: 1, Column: 1, + Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote}, }, { Name: "TrimQuote", @@ -151,30 +150,30 @@ field"`, { Name: "BadBareQuote", Input: `a "word","b"`, - Error: `bare " in non-quoted-field`, Line: 1, Column: 2, + Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote}, }, { Name: "BadTrailingQuote", Input: `"a word",b"`, - Error: `bare " in non-quoted-field`, Line: 1, Column: 10, + Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote}, }, { Name: "ExtraneousQuote", Input: `"a "word","b"`, - Error: `extraneous " in field`, Line: 1, Column: 3, + Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote}, }, { Name: "BadFieldCount", UseFieldsPerRecord: true, Input: "a,b,c\nd,e", - Error: "wrong number of fields", Line: 2, + Error: &ParseError{Line: 2, Err: ErrFieldCount}, }, { Name: "BadFieldCount1", UseFieldsPerRecord: true, FieldsPerRecord: 2, Input: `a,b,c`, - Error: "wrong number of fields", Line: 1, + Error: &ParseError{Line: 1, Err: ErrFieldCount}, }, { Name: "FieldCount", @@ -271,18 +270,14 @@ x,,, }, }, { // issue 19019 - Name: "RecordLine1", - Input: "a,\"b\nc\"d,e", - Error: `extraneous " in field`, - Line: 1, - Column: 1, + Name: "RecordLine1", + Input: "a,\"b\nc\"d,e", + Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote}, }, { - Name: "RecordLine2", - Input: "a,b\n\"d\n\n,e", - Error: `extraneous " in field`, - Line: 2, - Column: 2, + Name: "RecordLine2", + Input: "a,b\n\"d\n\n,e", + Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote}, }, { // issue 21201 Name: "CRLFInQuotedField", @@ -291,6 +286,95 @@ x,,, {"Hello\r\nHi"}, }, }, + { // issue 19410 + Name: "BinaryBlobField", + Input: "x09\x41\xb4\x1c,aktau", + Output: [][]string{{"x09A\xb4\x1c", "aktau"}}, + }, + { + Name: "TrailingCR", + Input: "field1,field2\r", + Output: [][]string{{"field1", "field2\r"}}, + }, + { + Name: "NonASCIICommaAndComment", + TrimLeadingSpace: true, + Comma: '£', + Comment: '€', + Input: "a£b,c£ \td,e\n€ comment\n", + Output: [][]string{{"a", "b,c", "d,e"}}, + }, + { + Name: "NonASCIICommaAndCommentWithQuotes", + Comma: '€', + Comment: 'λ', + Input: "a€\" b,\"€ c\nλ comment\n", + Output: [][]string{{"a", " b,", " c"}}, + }, + { + Name: "NonASCIICommaConfusion", + Comma: 'λ', + Comment: '€', + // λ and θ start with the same byte. This test is intended to ensure the parser doesn't + // confuse such characters. + Input: "\"abθcd\"λefθgh", + Output: [][]string{{"abθcd", "efθgh"}}, + }, + { + Name: "NonASCIICommentConfusion", + Comment: 'θ', + Input: "λ\nλ\nθ\nλ\n", + Output: [][]string{{"λ"}, {"λ"}, {"λ"}}, + }, + { + Name: "QuotedFieldMultipleLF", + Input: "\"\n\n\n\n\"", + Output: [][]string{{"\n\n\n\n"}}, + }, + { + Name: "MultipleCRLF", + Input: "\r\n\r\n\r\n\r\n", + }, + { + // The implementation may read each line in several chunks if it doesn't fit entirely + // in the read buffer, so we should test the code to handle that condition. + Name: "HugeLines", + Comment: '#', + Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000), + Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}}, + }, + { + Name: "QuoteWithTrailingCRLF", + Input: "\"foo\"bar\"\r\n", + Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote}, + }, + { + Name: "LazyQuoteWithTrailingCRLF", + Input: "\"foo\"bar\"\r\n", + LazyQuotes: true, + Output: [][]string{{`foo"bar`}}, + }, + { + Name: "DoubleQuoteWithTrailingCRLF", + Input: "\"foo\"\"bar\"\r\n", + Output: [][]string{{`foo"bar`}}, + }, + { + Name: "EvenQuotes", + Input: `""""""""`, + Output: [][]string{{`"""`}}, + }, + { + Name: "OddQuotes", + Input: `"""""""`, + Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote}, + }, + { + Name: "LazyOddQuotes", + Input: `"""""""`, + LazyQuotes: true, + Output: [][]string{{`"""`}}, + }, } func TestRead(t *testing.T) { @@ -310,17 +394,10 @@ func TestRead(t *testing.T) { r.Comma = tt.Comma } out, err := r.ReadAll() - perr, _ := err.(*ParseError) - if tt.Error != "" { - if err == nil || !strings.Contains(err.Error(), tt.Error) { - t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error) - } else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) { - t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column) - } - } else if err != nil { - t.Errorf("%s: unexpected error %v", tt.Name, err) + if !reflect.DeepEqual(err, tt.Error) { + t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error) } else if !reflect.DeepEqual(out, tt.Output) { - t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output) + t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output) } } } -- 2.48.1