"fmt"
"io"
"unicode"
+ "unicode/utf8"
)
// A ParseError is returned for parsing errors.
// By default, each call to Read returns newly allocated memory owned by the caller.
ReuseRecord bool
- line int
- recordLine int // line where the current record started
- column int
- r *bufio.Reader
- // lineBuffer holds the unescaped fields read by readField, one after another.
+ r *bufio.Reader
+
+ // numLine is the current line being read in the CSV file.
+ numLine int
+
+ // rawBuffer is a line buffer only used by the readLine method.
+ rawBuffer []byte
+
+ // recordBuffer holds the unescaped fields, one after another.
// The fields can be accessed by using the indexes in fieldIndexes.
- // Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
- // fieldIndexes will contain the indexes 0, 1, 2, 5.
- lineBuffer bytes.Buffer
- // Indexes of fields inside lineBuffer
- // The i'th field starts at offset fieldIndexes[i] in lineBuffer.
+ // E.g., for the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
+ // and fieldIndexes will contain the indexes [1, 2, 5, 6].
+ recordBuffer []byte
+
+ // fieldIndexes is an index of fields inside recordBuffer.
+ // The i'th field ends at offset fieldIndexes[i] in recordBuffer.
fieldIndexes []int
- // only used when ReuseRecord == true
+ // lastRecord is a record cache and only used when ReuseRecord == true.
lastRecord []string
}
}
}
-// error creates a new ParseError based on err.
-func (r *Reader) error(err error) error {
- return &ParseError{
- Line: r.recordLine,
- Column: r.column,
- Err: err,
- }
-}
-
// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with the error ErrFieldCount.
} else {
record, err = r.readRecord(nil)
}
-
return record, err
}
}
}
-// readRecord reads and parses a single csv record from r.
-// Unlike parseRecord, readRecord handles FieldsPerRecord.
-// If dst has enough capacity it will be used for the returned record.
-func (r *Reader) readRecord(dst []string) (record []string, err error) {
- for {
- record, err = r.parseRecord(dst)
- if record != nil {
- break
- }
- if err != nil {
- return nil, err
+// readLine reads the next line (with the trailing endline).
+// If EOF is hit before an endline is seen, the returned line lacks the trailing endline.
+// If some bytes were read, then the error is never io.EOF.
+// The result is only valid until the next call to readLine.
+func (r *Reader) readLine() ([]byte, error) {
+ line, err := r.r.ReadSlice('\n')
+ if err == bufio.ErrBufferFull {
+ r.rawBuffer = append(r.rawBuffer[:0], line...)
+ for err == bufio.ErrBufferFull {
+ line, err = r.r.ReadSlice('\n')
+ r.rawBuffer = append(r.rawBuffer, line...)
}
+ line = r.rawBuffer
}
-
- if r.FieldsPerRecord > 0 {
- if len(record) != r.FieldsPerRecord {
- r.column = 0 // report at start of record
- return record, r.error(ErrFieldCount)
- }
- } else if r.FieldsPerRecord == 0 {
- r.FieldsPerRecord = len(record)
+ if len(line) > 0 && err == io.EOF {
+ err = nil
}
- return record, nil
+ r.numLine++
+ return line, err
}
-// readRune reads one rune from r, folding \r\n to \n and keeping track
-// of how far into the line we have read. r.column will point to the start
-// of this rune, not the end of this rune.
-func (r *Reader) readRune() (rune, error) {
- r1, _, err := r.r.ReadRune()
-
- // Handle \r\n here. We make the simplifying assumption that
- // anytime \r is followed by \n that it can be folded to \n.
- // We will not detect files which contain both \r\n and bare \n.
- if r1 == '\r' {
- r1, _, err = r.r.ReadRune()
- if err == nil {
- if r1 != '\n' {
- r.r.UnreadRune()
- r1 = '\r'
- }
+// lengthCRLF reports the length in bytes of the newline that ends b:
+// 2 for a trailing "\r\n", 1 for a bare trailing "\n", and 0 otherwise.
+func lengthCRLF(b []byte) int {
+ if j := len(b) - 1; j >= 0 && b[j] == '\n' {
+ if j := len(b) - 2; j >= 0 && b[j] == '\r' {
+ return 2
}
+ return 1
}
- r.column++
- return r1, err
+ return 0
}
-// readRawRune works the same way as readRune, but does not fold \r\n to \n.
-func (r *Reader) readRawRune() (rune, error) {
- r1, _, err := r.r.ReadRune()
- r.column++
- return r1, err
+// nextRune returns the first rune in b, or utf8.RuneError if b is empty or does not begin with valid UTF-8.
+func nextRune(b []byte) rune {
+ r, _ := utf8.DecodeRune(b)
+ return r
}
-// skip reads runes up to and including the rune delim or until error.
-func (r *Reader) skip(delim rune) error {
- for {
- r1, err := r.readRune()
- if err != nil {
- return err
+func (r *Reader) readRecord(dst []string) ([]string, error) {
+ // Read line (automatically skipping past empty lines and any comments).
+ var line, fullLine []byte
+ var errRead error
+ for errRead == nil {
+ line, errRead = r.readLine()
+ if r.Comment != 0 && nextRune(line) == r.Comment {
+ line = nil
+ continue // Skip comment lines
}
- if r1 == delim {
- return nil
+ if errRead == nil && len(line) == lengthCRLF(line) {
+ line = nil
+ continue // Skip empty lines
}
+ fullLine = line
+ break
}
-}
-
-// parseRecord reads and parses a single csv record from r.
-// If dst has enough capacity it will be used for the returned fields.
-func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
- // Each record starts on a new line. We increment our line
- // number (lines start at 1, not 0) and set column to -1
- // so as we increment in readRune it points to the character we read.
- // We track the line where the record starts in recordLine for use in errors.
- r.line++
- r.recordLine = r.line
- r.column = -1
-
- // Peek at the first rune. If it is an error we are done.
- // If we support comments and it is the comment character
- // then skip to the end of line.
-
- r1, _, err := r.r.ReadRune()
- if err != nil {
- return nil, err
- }
-
- if r.Comment != 0 && r1 == r.Comment {
- return nil, r.skip('\n')
+ if errRead == io.EOF {
+ return nil, errRead
}
- r.r.UnreadRune()
- r.lineBuffer.Reset()
+ // Parse each field in the record.
+ var err error
+ const quoteLen = len(`"`)
+ commaLen := utf8.RuneLen(r.Comma)
+ recLine := r.numLine // Starting line for record
+ r.recordBuffer = r.recordBuffer[:0]
r.fieldIndexes = r.fieldIndexes[:0]
-
- // At this point we have at least one field.
+parseField:
for {
- idx := r.lineBuffer.Len()
-
- haveField, delim, err := r.parseField()
- if haveField {
- r.fieldIndexes = append(r.fieldIndexes, idx)
+ if r.TrimLeadingSpace {
+ line = bytes.TrimLeftFunc(line, unicode.IsSpace)
}
-
- if delim == '\n' || err == io.EOF {
- if len(r.fieldIndexes) == 0 {
- return nil, err
+ if len(line) == 0 || line[0] != '"' {
+ // Non-quoted string field
+ i := bytes.IndexRune(line, r.Comma)
+ field := line
+ if i >= 0 {
+ field = field[:i]
+ } else {
+ field = field[:len(field)-lengthCRLF(field)]
}
- break
- }
-
- if err != nil {
- return nil, err
- }
- }
-
- fieldCount := len(r.fieldIndexes)
- // Using this approach (creating a single string and taking slices of it)
- // means that a single reference to any of the fields will retain the whole
- // string. The risk of a nontrivial space leak caused by this is considered
- // minimal and a tradeoff for better performance through the combined
- // allocations.
- line := r.lineBuffer.String()
-
- if cap(dst) >= fieldCount {
- fields = dst[:fieldCount]
- } else {
- fields = make([]string, fieldCount)
- }
-
- for i, idx := range r.fieldIndexes {
- if i == fieldCount-1 {
- fields[i] = line[idx:]
- } else {
- fields[i] = line[idx:r.fieldIndexes[i+1]]
- }
- }
-
- return fields, nil
-}
-
-// parseField parses the next field in the record. The read field is
-// appended to r.lineBuffer. Delim is the first character not part of the field
-// (r.Comma or '\n').
-func (r *Reader) parseField() (haveField bool, delim rune, err error) {
- r1, err := r.readRune()
- for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
- r1, err = r.readRune()
- }
-
- if err == io.EOF && r.column != 0 {
- return true, 0, err
- }
- if err != nil {
- return false, 0, err
- }
-
- switch r1 {
- case r.Comma:
- // will check below
-
- case '\n':
- // We are a trailing empty field or a blank line
- if r.column == 0 {
- return false, r1, nil
- }
- return true, r1, nil
-
- case '"':
- // quoted field
- Quoted:
- for {
- // use readRawRune instead of readRune to preserve \r\n
- // in quotes fields.
- r1, err = r.readRawRune()
- if err != nil {
- if err == io.EOF {
- if r.LazyQuotes {
- return true, 0, err
- }
- return false, 0, r.error(ErrQuote)
+ // Check to make sure a quote does not appear in field.
+ if !r.LazyQuotes {
+ if j := bytes.IndexByte(field, '"'); j >= 0 {
+ col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
+ err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote}
+ break parseField
}
- return false, 0, err
}
- switch r1 {
- case '"':
- r1, err = r.readRune()
- if err != nil || r1 == r.Comma {
- break Quoted
- }
- if r1 == '\n' {
- return true, r1, nil
- }
- if r1 != '"' {
- if !r.LazyQuotes {
- r.column--
- return false, 0, r.error(ErrQuote)
+ r.recordBuffer = append(r.recordBuffer, field...)
+ r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+ if i >= 0 {
+ line = line[i+commaLen:]
+ continue parseField
+ }
+ break parseField
+ } else {
+ // Quoted string field
+ line = line[quoteLen:]
+ for {
+ i := bytes.IndexByte(line, '"')
+ if i >= 0 {
+ // Hit next quote.
+ r.recordBuffer = append(r.recordBuffer, line[:i]...)
+ line = line[i+quoteLen:]
+ switch rn := nextRune(line); {
+ case rn == '"':
+ // `""` sequence (append quote).
+ r.recordBuffer = append(r.recordBuffer, '"')
+ line = line[quoteLen:]
+ case rn == r.Comma:
+ // `",` sequence (end of field).
+ line = line[commaLen:]
+ r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+ continue parseField
+ case lengthCRLF(line) == len(line):
+ // `"\n` sequence (end of line).
+ r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+ break parseField
+ case r.LazyQuotes:
+ // `"` sequence (bare quote).
+ r.recordBuffer = append(r.recordBuffer, '"')
+ default:
+ // `"*` sequence (invalid non-escaped quote).
+ col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
+ err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
+ break parseField
+ }
+ } else if len(line) > 0 {
+ // Hit end of line (copy all data so far).
+ r.recordBuffer = append(r.recordBuffer, line...)
+ if errRead != nil {
+ break parseField
+ }
+ line, errRead = r.readLine()
+ if errRead == io.EOF {
+ errRead = nil
}
- // accept the bare quote
- r.lineBuffer.WriteRune('"')
+ fullLine = line
+ } else {
+ // Abrupt end of file (EOF or error).
+ if !r.LazyQuotes && errRead == nil {
+ col := utf8.RuneCount(fullLine)
+ err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
+ break parseField
+ }
+ r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+ break parseField
}
- case '\n':
- r.line++
- r.column = -1
}
- r.lineBuffer.WriteRune(r1)
}
+ }
+ if err == nil {
+ err = errRead
+ }
- default:
- // unquoted field
- for {
- r.lineBuffer.WriteRune(r1)
- r1, err = r.readRune()
- if err != nil || r1 == r.Comma {
- break
- }
- if r1 == '\n' {
- return true, r1, nil
- }
- if !r.LazyQuotes && r1 == '"' {
- return false, 0, r.error(ErrBareQuote)
- }
- }
+ // Create a single string and create slices out of it.
+ // This pins the memory of the fields together, but allocates once.
+ str := string(r.recordBuffer) // Convert to string once to batch allocations
+ dst = dst[:0]
+ if cap(dst) < len(r.fieldIndexes) {
+ dst = make([]string, len(r.fieldIndexes))
+ }
+ dst = dst[:len(r.fieldIndexes)]
+ var preIdx int
+ for i, idx := range r.fieldIndexes {
+ dst[i] = str[preIdx:idx]
+ preIdx = idx
}
- if err != nil {
- if err == io.EOF {
- return true, 0, err
+ // Check or update the expected fields per record.
+ if r.FieldsPerRecord > 0 {
+ if len(dst) != r.FieldsPerRecord && err == nil {
+ err = &ParseError{Line: recLine, Err: ErrFieldCount}
}
- return false, 0, err
+ } else if r.FieldsPerRecord == 0 {
+ r.FieldsPerRecord = len(dst)
}
-
- return true, r1, nil
+ return dst, err
}
TrimLeadingSpace bool
ReuseRecord bool
- Error string
- Line int // Expected error line if != 0
- Column int // Expected error column if line != 0
+ Error error
+ Line int // Expected error line if != 0
}{
{
Name: "Simple",
{
Name: "BadDoubleQuotes",
Input: `a""b,c`,
- Error: `bare " in non-quoted-field`, Line: 1, Column: 1,
+ Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
},
{
Name: "TrimQuote",
{
Name: "BadBareQuote",
Input: `a "word","b"`,
- Error: `bare " in non-quoted-field`, Line: 1, Column: 2,
+ Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
},
{
Name: "BadTrailingQuote",
Input: `"a word",b"`,
- Error: `bare " in non-quoted-field`, Line: 1, Column: 10,
+ Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
},
{
Name: "ExtraneousQuote",
Input: `"a "word","b"`,
- Error: `extraneous " in field`, Line: 1, Column: 3,
+ Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
},
{
Name: "BadFieldCount",
UseFieldsPerRecord: true,
Input: "a,b,c\nd,e",
- Error: "wrong number of fields", Line: 2,
+ Error: &ParseError{Line: 2, Err: ErrFieldCount},
},
{
Name: "BadFieldCount1",
UseFieldsPerRecord: true,
FieldsPerRecord: 2,
Input: `a,b,c`,
- Error: "wrong number of fields", Line: 1,
+ Error: &ParseError{Line: 1, Err: ErrFieldCount},
},
{
Name: "FieldCount",
},
},
{ // issue 19019
- Name: "RecordLine1",
- Input: "a,\"b\nc\"d,e",
- Error: `extraneous " in field`,
- Line: 1,
- Column: 1,
+ Name: "RecordLine1",
+ Input: "a,\"b\nc\"d,e",
+ Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
},
{
- Name: "RecordLine2",
- Input: "a,b\n\"d\n\n,e",
- Error: `extraneous " in field`,
- Line: 2,
- Column: 2,
+ Name: "RecordLine2",
+ Input: "a,b\n\"d\n\n,e",
+ Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
},
{ // issue 21201
Name: "CRLFInQuotedField",
{"Hello\r\nHi"},
},
},
+ { // issue 19410
+ Name: "BinaryBlobField",
+ Input: "x09\x41\xb4\x1c,aktau",
+ Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
+ },
+ {
+ Name: "TrailingCR",
+ Input: "field1,field2\r",
+ Output: [][]string{{"field1", "field2\r"}},
+ },
+ {
+ Name: "NonASCIICommaAndComment",
+ TrimLeadingSpace: true,
+ Comma: '£',
+ Comment: '€',
+ Input: "a£b,c£ \td,e\n€ comment\n",
+ Output: [][]string{{"a", "b,c", "d,e"}},
+ },
+ {
+ Name: "NonASCIICommaAndCommentWithQuotes",
+ Comma: '€',
+ Comment: 'λ',
+ Input: "a€\" b,\"€ c\nλ comment\n",
+ Output: [][]string{{"a", " b,", " c"}},
+ },
+ {
+ Name: "NonASCIICommaConfusion",
+ Comma: 'λ',
+ Comment: '€',
+ // λ and θ start with the same byte. This test is intended to ensure the parser doesn't
+ // confuse such characters.
+ Input: "\"abθcd\"λefθgh",
+ Output: [][]string{{"abθcd", "efθgh"}},
+ },
+ {
+ Name: "NonASCIICommentConfusion",
+ Comment: 'θ',
+ Input: "λ\nλ\nθ\nλ\n",
+ Output: [][]string{{"λ"}, {"λ"}, {"λ"}},
+ },
+ {
+ Name: "QuotedFieldMultipleLF",
+ Input: "\"\n\n\n\n\"",
+ Output: [][]string{{"\n\n\n\n"}},
+ },
+ {
+ Name: "MultipleCRLF",
+ Input: "\r\n\r\n\r\n\r\n",
+ },
+ {
+ // The implementation may read each line in several chunks if it doesn't fit entirely
+ // in the read buffer, so we should test the code to handle that condition.
+ Name: "HugeLines",
+ Comment: '#',
+ Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
+ Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
+ },
+ {
+ Name: "QuoteWithTrailingCRLF",
+ Input: "\"foo\"bar\"\r\n",
+ Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
+ },
+ {
+ Name: "LazyQuoteWithTrailingCRLF",
+ Input: "\"foo\"bar\"\r\n",
+ LazyQuotes: true,
+ Output: [][]string{{`foo"bar`}},
+ },
+ {
+ Name: "DoubleQuoteWithTrailingCRLF",
+ Input: "\"foo\"\"bar\"\r\n",
+ Output: [][]string{{`foo"bar`}},
+ },
+ {
+ Name: "EvenQuotes",
+ Input: `""""""""`,
+ Output: [][]string{{`"""`}},
+ },
+ {
+ Name: "OddQuotes",
+ Input: `"""""""`,
+ Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
+ },
+ {
+ Name: "LazyOddQuotes",
+ Input: `"""""""`,
+ LazyQuotes: true,
+ Output: [][]string{{`"""`}},
+ },
}
func TestRead(t *testing.T) {
r.Comma = tt.Comma
}
out, err := r.ReadAll()
- perr, _ := err.(*ParseError)
- if tt.Error != "" {
- if err == nil || !strings.Contains(err.Error(), tt.Error) {
- t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error)
- } else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) {
- t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column)
- }
- } else if err != nil {
- t.Errorf("%s: unexpected error %v", tt.Name, err)
+ if !reflect.DeepEqual(err, tt.Error) {
+ t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error)
} else if !reflect.DeepEqual(out, tt.Output) {
- t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output)
+ t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output)
}
}
}