encoding/csv: simplify and optimize Reader

author Joe Tsai <joetsai@digital-static.net>

Fri, 20 Oct 2017 12:26:43 +0000 (05:26 -0700)

committer Joe Tsai <thebrokentoaster@gmail.com>

Fri, 20 Oct 2017 23:20:46 +0000 (23:20 +0000)
author Joe Tsai <joetsai@digital-static.net>
Fri, 20 Oct 2017 12:26:43 +0000 (05:26 -0700)
committer Joe Tsai <thebrokentoaster@gmail.com>
Fri, 20 Oct 2017 23:20:46 +0000 (23:20 +0000)
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go

index e49240fb53d17f830fca7bf9efb278f83075dfe7..3c08b9f9d14c1628b2bc68474ef0eecc40fe2b4e 100644 (file)
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@@ -58,6 +58,7 @@ import (
         "fmt"
         "io"
         "unicode"
+       "unicode/utf8"
  )
  
  // A ParseError is returned for parsing errors.
@@ -115,20 +116,25 @@ type Reader struct {
         // By default, each call to Read returns newly allocated memory owned by the caller.
         ReuseRecord bool
  
-       line       int
-       recordLine int // line where the current record started
-       column     int
-       r          *bufio.Reader
-       // lineBuffer holds the unescaped fields read by readField, one after another.
+       r *bufio.Reader
+
+       // numLine is the current line being read in the CSV file.
+       numLine int
+
+       // rawBuffer is a line buffer only used by the readLine method.
+       rawBuffer []byte
+
+       // recordBuffer holds the unescaped fields, one after another.
         // The fields can be accessed by using the indexes in fieldIndexes.
-       // Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
-       // fieldIndexes will contain the indexes 0, 1, 2, 5.
-       lineBuffer bytes.Buffer
-       // Indexes of fields inside lineBuffer
-       // The i'th field starts at offset fieldIndexes[i] in lineBuffer.
+       // E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
+       // and fieldIndexes will contain the indexes [1, 2, 5, 6].
+       recordBuffer []byte
+
+       // fieldIndexes is an index of fields inside recordBuffer.
+       // The i'th field ends at offset fieldIndexes[i] in recordBuffer.
         fieldIndexes []int
  
-       // only used when ReuseRecord == true
+       // lastRecord is a record cache and only used when ReuseRecord == true.
         lastRecord []string
  }
  
@@ -140,15 +146,6 @@ func NewReader(r io.Reader) *Reader {
         }
  }
  
-// error creates a new ParseError based on err.
-func (r *Reader) error(err error) error {
-       return &ParseError{
-               Line:   r.recordLine,
-               Column: r.column,
-               Err:    err,
-       }
-}
-
  // Read reads one record (a slice of fields) from r.
  // If the record has an unexpected number of fields,
  // Read returns the record along with the error ErrFieldCount.
@@ -164,7 +161,6 @@ func (r *Reader) Read() (record []string, err error) {
         } else {
                 record, err = r.readRecord(nil)
         }
-
         return record, err
  }
  
@@ -186,237 +182,182 @@ func (r *Reader) ReadAll() (records [][]string, err error) {
         }
  }
  
-// readRecord reads and parses a single csv record from r.
-// Unlike parseRecord, readRecord handles FieldsPerRecord.
-// If dst has enough capacity it will be used for the returned record.
-func (r *Reader) readRecord(dst []string) (record []string, err error) {
-       for {
-               record, err = r.parseRecord(dst)
-               if record != nil {
-                       break
-               }
-               if err != nil {
-                       return nil, err
+// readLine reads the next line (with the trailing endline).
+// If EOF is hit without a trailing endline, it will be omitted.
+// If some bytes were read, then the error is never io.EOF.
+// The result is only valid until the next call to readLine.
+func (r *Reader) readLine() ([]byte, error) {
+       line, err := r.r.ReadSlice('\n')
+       if err == bufio.ErrBufferFull {
+               r.rawBuffer = append(r.rawBuffer[:0], line...)
+               for err == bufio.ErrBufferFull {
+                       line, err = r.r.ReadSlice('\n')
+                       r.rawBuffer = append(r.rawBuffer, line...)
                 }
+               line = r.rawBuffer
         }
-
-       if r.FieldsPerRecord > 0 {
-               if len(record) != r.FieldsPerRecord {
-                       r.column = 0 // report at start of record
-                       return record, r.error(ErrFieldCount)
-               }
-       } else if r.FieldsPerRecord == 0 {
-               r.FieldsPerRecord = len(record)
+       if len(line) > 0 && err == io.EOF {
+               err = nil
         }
-       return record, nil
+       r.numLine++
+       return line, err
  }
  
-// readRune reads one rune from r, folding \r\n to \n and keeping track
-// of how far into the line we have read.  r.column will point to the start
-// of this rune, not the end of this rune.
-func (r *Reader) readRune() (rune, error) {
-       r1, _, err := r.r.ReadRune()
-
-       // Handle \r\n here. We make the simplifying assumption that
-       // anytime \r is followed by \n that it can be folded to \n.
-       // We will not detect files which contain both \r\n and bare \n.
-       if r1 == '\r' {
-               r1, _, err = r.r.ReadRune()
-               if err == nil {
-                       if r1 != '\n' {
-                               r.r.UnreadRune()
-                               r1 = '\r'
-                       }
+// lengthCRLF reports the number of bytes for a trailing "\r\n".
+func lengthCRLF(b []byte) int {
+       if j := len(b) - 1; j >= 0 && b[j] == '\n' {
+               if j := len(b) - 2; j >= 0 && b[j] == '\r' {
+                       return 2
                 }
+               return 1
         }
-       r.column++
-       return r1, err
+       return 0
  }
  
-// readRawRune works the same way as readRune, but does not fold \r\n to \n.
-func (r *Reader) readRawRune() (rune, error) {
-       r1, _, err := r.r.ReadRune()
-       r.column++
-       return r1, err
+// nextRune returns the next rune in b or utf8.RuneError.
+func nextRune(b []byte) rune {
+       r, _ := utf8.DecodeRune(b)
+       return r
  }
  
-// skip reads runes up to and including the rune delim or until error.
-func (r *Reader) skip(delim rune) error {
-       for {
-               r1, err := r.readRune()
-               if err != nil {
-                       return err
+func (r *Reader) readRecord(dst []string) ([]string, error) {
+       // Read line (automatically skipping past empty lines and any comments).
+       var line, fullLine []byte
+       var errRead error
+       for errRead == nil {
+               line, errRead = r.readLine()
+               if r.Comment != 0 && nextRune(line) == r.Comment {
+                       line = nil
+                       continue // Skip comment lines
                 }
-               if r1 == delim {
-                       return nil
+               if errRead == nil && len(line) == lengthCRLF(line) {
+                       line = nil
+                       continue // Skip empty lines
                 }
+               fullLine = line
+               break
         }
-}
-
-// parseRecord reads and parses a single csv record from r.
-// If dst has enough capacity it will be used for the returned fields.
-func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
-       // Each record starts on a new line. We increment our line
-       // number (lines start at 1, not 0) and set column to -1
-       // so as we increment in readRune it points to the character we read.
-       // We track the line where the record starts in recordLine for use in errors.
-       r.line++
-       r.recordLine = r.line
-       r.column = -1
-
-       // Peek at the first rune. If it is an error we are done.
-       // If we support comments and it is the comment character
-       // then skip to the end of line.
-
-       r1, _, err := r.r.ReadRune()
-       if err != nil {
-               return nil, err
-       }
-
-       if r.Comment != 0 && r1 == r.Comment {
-               return nil, r.skip('\n')
+       if errRead == io.EOF {
+               return nil, errRead
         }
-       r.r.UnreadRune()
  
-       r.lineBuffer.Reset()
+       // Parse each field in the record.
+       var err error
+       const quoteLen = len(`"`)
+       commaLen := utf8.RuneLen(r.Comma)
+       recLine := r.numLine // Starting line for record
+       r.recordBuffer = r.recordBuffer[:0]
         r.fieldIndexes = r.fieldIndexes[:0]
-
-       // At this point we have at least one field.
+parseField:
         for {
-               idx := r.lineBuffer.Len()
-
-               haveField, delim, err := r.parseField()
-               if haveField {
-                       r.fieldIndexes = append(r.fieldIndexes, idx)
+               if r.TrimLeadingSpace {
+                       line = bytes.TrimLeftFunc(line, unicode.IsSpace)
                 }
-
-               if delim == '\n' || err == io.EOF {
-                       if len(r.fieldIndexes) == 0 {
-                               return nil, err
+               if len(line) == 0 || line[0] != '"' {
+                       // Non-quoted string field
+                       i := bytes.IndexRune(line, r.Comma)
+                       field := line
+                       if i >= 0 {
+                               field = field[:i]
+                       } else {
+                               field = field[:len(field)-lengthCRLF(field)]
                         }
-                       break
-               }
-
-               if err != nil {
-                       return nil, err
-               }
-       }
-
-       fieldCount := len(r.fieldIndexes)
-       // Using this approach (creating a single string and taking slices of it)
-       // means that a single reference to any of the fields will retain the whole
-       // string. The risk of a nontrivial space leak caused by this is considered
-       // minimal and a tradeoff for better performance through the combined
-       // allocations.
-       line := r.lineBuffer.String()
-
-       if cap(dst) >= fieldCount {
-               fields = dst[:fieldCount]
-       } else {
-               fields = make([]string, fieldCount)
-       }
-
-       for i, idx := range r.fieldIndexes {
-               if i == fieldCount-1 {
-                       fields[i] = line[idx:]
-               } else {
-                       fields[i] = line[idx:r.fieldIndexes[i+1]]
-               }
-       }
-
-       return fields, nil
-}
-
-// parseField parses the next field in the record. The read field is
-// appended to r.lineBuffer. Delim is the first character not part of the field
-// (r.Comma or '\n').
-func (r *Reader) parseField() (haveField bool, delim rune, err error) {
-       r1, err := r.readRune()
-       for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
-               r1, err = r.readRune()
-       }
-
-       if err == io.EOF && r.column != 0 {
-               return true, 0, err
-       }
-       if err != nil {
-               return false, 0, err
-       }
-
-       switch r1 {
-       case r.Comma:
-               // will check below
-
-       case '\n':
-               // We are a trailing empty field or a blank line
-               if r.column == 0 {
-                       return false, r1, nil
-               }
-               return true, r1, nil
-
-       case '"':
-               // quoted field
-       Quoted:
-               for {
-                       // use readRawRune instead of readRune to preserve \r\n
-                       // in quotes fields.
-                       r1, err = r.readRawRune()
-                       if err != nil {
-                               if err == io.EOF {
-                                       if r.LazyQuotes {
-                                               return true, 0, err
-                                       }
-                                       return false, 0, r.error(ErrQuote)
+                       // Check to make sure a quote does not appear in field.
+                       if !r.LazyQuotes {
+                               if j := bytes.IndexByte(field, '"'); j >= 0 {
+                                       col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
+                                       err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote}
+                                       break parseField
                                 }
-                               return false, 0, err
                         }
-                       switch r1 {
-                       case '"':
-                               r1, err = r.readRune()
-                               if err != nil || r1 == r.Comma {
-                                       break Quoted
-                               }
-                               if r1 == '\n' {
-                                       return true, r1, nil
-                               }
-                               if r1 != '"' {
-                                       if !r.LazyQuotes {
-                                               r.column--
-                                               return false, 0, r.error(ErrQuote)
+                       r.recordBuffer = append(r.recordBuffer, field...)
+                       r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+                       if i >= 0 {
+                               line = line[i+commaLen:]
+                               continue parseField
+                       }
+                       break parseField
+               } else {
+                       // Quoted string field
+                       line = line[quoteLen:]
+                       for {
+                               i := bytes.IndexByte(line, '"')
+                               if i >= 0 {
+                                       // Hit next quote.
+                                       r.recordBuffer = append(r.recordBuffer, line[:i]...)
+                                       line = line[i+quoteLen:]
+                                       switch rn := nextRune(line); {
+                                       case rn == '"':
+                                               // `""` sequence (append quote).
+                                               r.recordBuffer = append(r.recordBuffer, '"')
+                                               line = line[quoteLen:]
+                                       case rn == r.Comma:
+                                               // `",` sequence (end of field).
+                                               line = line[commaLen:]
+                                               r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+                                               continue parseField
+                                       case lengthCRLF(line) == len(line):
+                                               // `"\n` sequence (end of line).
+                                               r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+                                               break parseField
+                                       case r.LazyQuotes:
+                                               // `"` sequence (bare quote).
+                                               r.recordBuffer = append(r.recordBuffer, '"')
+                                       default:
+                                               // `"*` squence (invalid non-escaped quote).
+                                               col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
+                                               err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
+                                               break parseField
+                                       }
+                               } else if len(line) > 0 {
+                                       // Hit end of line (copy all data so far).
+                                       r.recordBuffer = append(r.recordBuffer, line...)
+                                       if errRead != nil {
+                                               break parseField
+                                       }
+                                       line, errRead = r.readLine()
+                                       if errRead == io.EOF {
+                                               errRead = nil
                                         }
-                                       // accept the bare quote
-                                       r.lineBuffer.WriteRune('"')
+                                       fullLine = line
+                               } else {
+                                       // Abrupt end of file (EOF or error).
+                                       if !r.LazyQuotes && errRead == nil {
+                                               col := utf8.RuneCount(fullLine)
+                                               err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
+                                               break parseField
+                                       }
+                                       r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+                                       break parseField
                                 }
-                       case '\n':
-                               r.line++
-                               r.column = -1
                         }
-                       r.lineBuffer.WriteRune(r1)
                 }
+       }
+       if err == nil {
+               err = errRead
+       }
  
-       default:
-               // unquoted field
-               for {
-                       r.lineBuffer.WriteRune(r1)
-                       r1, err = r.readRune()
-                       if err != nil || r1 == r.Comma {
-                               break
-                       }
-                       if r1 == '\n' {
-                               return true, r1, nil
-                       }
-                       if !r.LazyQuotes && r1 == '"' {
-                               return false, 0, r.error(ErrBareQuote)
-                       }
-               }
+       // Create a single string and create slices out of it.
+       // This pins the memory of the fields together, but allocates once.
+       str := string(r.recordBuffer) // Convert to string once to batch allocations
+       dst = dst[:0]
+       if cap(dst) < len(r.fieldIndexes) {
+               dst = make([]string, len(r.fieldIndexes))
+       }
+       dst = dst[:len(r.fieldIndexes)]
+       var preIdx int
+       for i, idx := range r.fieldIndexes {
+               dst[i] = str[preIdx:idx]
+               preIdx = idx
         }
  
-       if err != nil {
-               if err == io.EOF {
-                       return true, 0, err
+       // Check or update the expected fields per record.
+       if r.FieldsPerRecord > 0 {
+               if len(dst) != r.FieldsPerRecord && err == nil {
+                       err = &ParseError{Line: recLine, Err: ErrFieldCount}
                 }
-               return false, 0, err
+       } else if r.FieldsPerRecord == 0 {
+               r.FieldsPerRecord = len(dst)
         }
-
-       return true, r1, nil
+       return dst, err
  }
diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go

index 3811629aad681e516f6e77c0b5ef6a8ad7a8f93f..781847cefa4f08a8ec850ffd032b8e00062c310f 100644 (file)
--- a/src/encoding/csv/reader_test.go
+++ b/src/encoding/csv/reader_test.go
@@ -26,9 +26,8 @@ var readTests = []struct {
         TrimLeadingSpace bool
         ReuseRecord      bool
  
-       Error  string
-       Line   int // Expected error line if != 0
-       Column int // Expected error column if line != 0
+       Error error
+       Line  int // Expected error line if != 0
  }{
         {
                 Name:   "Simple",
@@ -140,7 +139,7 @@ field"`,
         {
                 Name:  "BadDoubleQuotes",
                 Input: `a""b,c`,
-               Error: `bare " in non-quoted-field`, Line: 1, Column: 1,
+               Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
         },
         {
                 Name:             "TrimQuote",
@@ -151,30 +150,30 @@ field"`,
         {
                 Name:  "BadBareQuote",
                 Input: `a "word","b"`,
-               Error: `bare " in non-quoted-field`, Line: 1, Column: 2,
+               Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
         },
         {
                 Name:  "BadTrailingQuote",
                 Input: `"a word",b"`,
-               Error: `bare " in non-quoted-field`, Line: 1, Column: 10,
+               Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
         },
         {
                 Name:  "ExtraneousQuote",
                 Input: `"a "word","b"`,
-               Error: `extraneous " in field`, Line: 1, Column: 3,
+               Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
         },
         {
                 Name:               "BadFieldCount",
                 UseFieldsPerRecord: true,
                 Input:              "a,b,c\nd,e",
-               Error:              "wrong number of fields", Line: 2,
+               Error:              &ParseError{Line: 2, Err: ErrFieldCount},
         },
         {
                 Name:               "BadFieldCount1",
                 UseFieldsPerRecord: true,
                 FieldsPerRecord:    2,
                 Input:              `a,b,c`,
-               Error:              "wrong number of fields", Line: 1,
+               Error:              &ParseError{Line: 1, Err: ErrFieldCount},
         },
         {
                 Name:   "FieldCount",
@@ -271,18 +270,14 @@ x,,,
                 },
         },
         { // issue 19019
-               Name:   "RecordLine1",
-               Input:  "a,\"b\nc\"d,e",
-               Error:  `extraneous " in field`,
-               Line:   1,
-               Column: 1,
+               Name:  "RecordLine1",
+               Input: "a,\"b\nc\"d,e",
+               Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
         },
         {
-               Name:   "RecordLine2",
-               Input:  "a,b\n\"d\n\n,e",
-               Error:  `extraneous " in field`,
-               Line:   2,
-               Column: 2,
+               Name:  "RecordLine2",
+               Input: "a,b\n\"d\n\n,e",
+               Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
         },
         { // issue 21201
                 Name:  "CRLFInQuotedField",
@@ -291,6 +286,95 @@ x,,,
                         {"Hello\r\nHi"},
                 },
         },
+       { // issue 19410
+               Name:   "BinaryBlobField",
+               Input:  "x09\x41\xb4\x1c,aktau",
+               Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
+       },
+       {
+               Name:   "TrailingCR",
+               Input:  "field1,field2\r",
+               Output: [][]string{{"field1", "field2\r"}},
+       },
+       {
+               Name:             "NonASCIICommaAndComment",
+               TrimLeadingSpace: true,
+               Comma:            '£',
+               Comment:          '€',
+               Input:            "a£b,c£ \td,e\n€ comment\n",
+               Output:           [][]string{{"a", "b,c", "d,e"}},
+       },
+       {
+               Name:    "NonASCIICommaAndCommentWithQuotes",
+               Comma:   '€',
+               Comment: 'λ',
+               Input:   "a€\"  b,\"€ c\nλ comment\n",
+               Output:  [][]string{{"a", "  b,", " c"}},
+       },
+       {
+               Name:    "NonASCIICommaConfusion",
+               Comma:   'λ',
+               Comment: '€',
+               // λ and θ start with the same byte. This test is intended to ensure the parser doesn't
+               // confuse such characters.
+               Input:  "\"abθcd\"λefθgh",
+               Output: [][]string{{"abθcd", "efθgh"}},
+       },
+       {
+               Name:    "NonASCIICommentConfusion",
+               Comment: 'θ',
+               Input:   "λ\nλ\nθ\nλ\n",
+               Output:  [][]string{{"λ"}, {"λ"}, {"λ"}},
+       },
+       {
+               Name:   "QuotedFieldMultipleLF",
+               Input:  "\"\n\n\n\n\"",
+               Output: [][]string{{"\n\n\n\n"}},
+       },
+       {
+               Name:  "MultipleCRLF",
+               Input: "\r\n\r\n\r\n\r\n",
+       },
+       {
+               // The implementation may read each line in several chunks if it doesn't fit entirely
+               // in the read buffer, so we should test the code to handle that condition.
+               Name:    "HugeLines",
+               Comment: '#',
+               Input:   strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
+               Output:  [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
+       },
+       {
+               Name:  "QuoteWithTrailingCRLF",
+               Input: "\"foo\"bar\"\r\n",
+               Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
+       },
+       {
+               Name:       "LazyQuoteWithTrailingCRLF",
+               Input:      "\"foo\"bar\"\r\n",
+               LazyQuotes: true,
+               Output:     [][]string{{`foo"bar`}},
+       },
+       {
+               Name:   "DoubleQuoteWithTrailingCRLF",
+               Input:  "\"foo\"\"bar\"\r\n",
+               Output: [][]string{{`foo"bar`}},
+       },
+       {
+               Name:   "EvenQuotes",
+               Input:  `""""""""`,
+               Output: [][]string{{`"""`}},
+       },
+       {
+               Name:  "OddQuotes",
+               Input: `"""""""`,
+               Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
+       },
+       {
+               Name:       "LazyOddQuotes",
+               Input:      `"""""""`,
+               LazyQuotes: true,
+               Output:     [][]string{{`"""`}},
+       },
  }
  
  func TestRead(t *testing.T) {
@@ -310,17 +394,10 @@ func TestRead(t *testing.T) {
                         r.Comma = tt.Comma
                 }
                 out, err := r.ReadAll()
-               perr, _ := err.(*ParseError)
-               if tt.Error != "" {
-                       if err == nil || !strings.Contains(err.Error(), tt.Error) {
-                               t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error)
-                       } else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) {
-                               t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column)
-                       }
-               } else if err != nil {
-                       t.Errorf("%s: unexpected error %v", tt.Name, err)
+               if !reflect.DeepEqual(err, tt.Error) {
+                       t.Errorf("%s: ReadAll() error:\ngot  %v\nwant %v", tt.Name, err, tt.Error)
                 } else if !reflect.DeepEqual(out, tt.Output) {
-                       t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output)
+                       t.Errorf("%s: ReadAll() output:\ngot  %q\nwant %q", tt.Name, out, tt.Output)
                 }
         }
  }
author	Joe Tsai <joetsai@digital-static.net>
	Fri, 20 Oct 2017 12:26:43 +0000 (05:26 -0700)
committer	Joe Tsai <thebrokentoaster@gmail.com>
	Fri, 20 Oct 2017 23:20:46 +0000 (23:20 +0000)
src/encoding/csv/reader.go		patch \| blob \| history
src/encoding/csv/reader_test.go		patch \| blob \| history