]> Cypherpunks repositories - gostls13.git/commitdiff
encoding/csv: add ParseError.RecordLine
authorJoe Tsai <joetsai@digital-static.net>
Sat, 21 Oct 2017 00:04:17 +0000 (17:04 -0700)
committerJoe Tsai <thebrokentoaster@gmail.com>
Sat, 21 Oct 2017 01:32:28 +0000 (01:32 +0000)
CL 72150 fixes #22352 by reverting the problematic parts of that CL
where the line number and column number were inconsistent with each other.
This CL adds back functionality to address the issue that CL 72150
was trying to solve in the first place. That is, it reports the starting
line of the record, so that users have a frame of reference to start with
when debugging what went wrong.

In the event of gnarly CSV files with multiline quoted strings, a parse
failure likely occurs somewhere between the start of the record and
the point where the parser finally detected an error.
Since ParserError.{Line,Column} reports where the *error* occurs, we
add a RecordLine field to report where the record starts.

Also take this time to cleanup and modernize TestRead.

Fixes #19019
Fixes #22352

Change-Id: I16cebf0b81922c35f75804c7073e9cddbfd11a04
Reviewed-on: https://go-review.googlesource.com/72310
Reviewed-by: Ian Lance Taylor <iant@golang.org>
src/encoding/csv/reader.go
src/encoding/csv/reader_test.go

index 3c08b9f9d14c1628b2bc68474ef0eecc40fe2b4e..e646740b4f717225c58af71820294e668535eb76 100644 (file)
@@ -62,23 +62,30 @@ import (
 )
 
 // A ParseError is returned for parsing errors.
-// The first line is 1.  The first column is 0.
+// Line numbers are 1-indexed and columns are 0-indexed.
 type ParseError struct {
-       Line   int   // Line where the error occurred
-       Column int   // Column (rune index) where the error occurred
-       Err    error // The actual error
+       RecordLine int   // Line where the record starts
+       Line       int   // Line where the error occurred
+       Column     int   // Column (rune index) where the error occurred
+       Err        error // The actual error
 }
 
 func (e *ParseError) Error() string {
-       return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
+       if e.Err == ErrFieldCount {
+               return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
+       }
+       if e.RecordLine != e.Line {
+               return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.RecordLine, e.Line, e.Column, e.Err)
+       }
+       return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
 }
 
 // These are the errors that can be returned in ParseError.Error
 var (
-       ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
+       ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used.
        ErrBareQuote     = errors.New("bare \" in non-quoted-field")
-       ErrQuote         = errors.New("extraneous \" in field")
-       ErrFieldCount    = errors.New("wrong number of fields in line")
+       ErrQuote         = errors.New("extraneous or missing \" in field")
+       ErrFieldCount    = errors.New("wrong number of fields")
 )
 
 // A Reader reads records from a CSV-encoded file.
@@ -86,17 +93,17 @@ var (
 // As returned by NewReader, a Reader expects input conforming to RFC 4180.
 // The exported fields can be changed to customize the details before the
 // first call to Read or ReadAll.
-//
-//
 type Reader struct {
        // Comma is the field delimiter.
        // It is set to comma (',') by NewReader.
        Comma rune
+
        // Comment, if not 0, is the comment character. Lines beginning with the
        // Comment character without preceding whitespace are ignored.
        // With leading whitespace the Comment character becomes part of the
        // field, even if TrimLeadingSpace is true.
        Comment rune
+
        // FieldsPerRecord is the number of expected fields per record.
        // If FieldsPerRecord is positive, Read requires each record to
        // have the given number of fields. If FieldsPerRecord is 0, Read sets it to
@@ -104,18 +111,22 @@ type Reader struct {
        // have the same field count. If FieldsPerRecord is negative, no check is
        // made and records may have a variable number of fields.
        FieldsPerRecord int
+
        // If LazyQuotes is true, a quote may appear in an unquoted field and a
        // non-doubled quote may appear in a quoted field.
-       LazyQuotes    bool
-       TrailingComma bool // ignored; here for backwards compatibility
+       LazyQuotes bool
+
        // If TrimLeadingSpace is true, leading white space in a field is ignored.
        // This is done even if the field delimiter, Comma, is white space.
        TrimLeadingSpace bool
+
        // ReuseRecord controls whether calls to Read may return a slice sharing
        // the backing array of the previous call's returned slice for performance.
        // By default, each call to Read returns newly allocated memory owned by the caller.
        ReuseRecord bool
 
+       TrailingComma bool // Deprecated: No longer used.
+
        r *bufio.Reader
 
        // numLine is the current line being read in the CSV file.
@@ -266,7 +277,7 @@ parseField:
                        if !r.LazyQuotes {
                                if j := bytes.IndexByte(field, '"'); j >= 0 {
                                        col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
-                                       err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote}
+                                       err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
                                        break parseField
                                }
                        }
@@ -306,7 +317,7 @@ parseField:
                                        default:
                                                // `"*` squence (invalid non-escaped quote).
                                                col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
-                                               err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
+                                               err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
                                                break parseField
                                        }
                                } else if len(line) > 0 {
@@ -324,7 +335,7 @@ parseField:
                                        // Abrupt end of file (EOF or error).
                                        if !r.LazyQuotes && errRead == nil {
                                                col := utf8.RuneCount(fullLine)
-                                               err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
+                                               err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
                                                break parseField
                                        }
                                        r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
@@ -354,7 +365,7 @@ parseField:
        // Check or update the expected fields per record.
        if r.FieldsPerRecord > 0 {
                if len(dst) != r.FieldsPerRecord && err == nil {
-                       err = &ParseError{Line: recLine, Err: ErrFieldCount}
+                       err = &ParseError{RecordLine: recLine, Line: recLine, Err: ErrFieldCount}
                }
        } else if r.FieldsPerRecord == 0 {
                r.FieldsPerRecord = len(dst)
index 781847cefa4f08a8ec850ffd032b8e00062c310f..ed7d89dfe0645a83b335b096fb975180854327f9 100644 (file)
@@ -11,42 +11,35 @@ import (
        "testing"
 )
 
-var readTests = []struct {
-       Name               string
-       Input              string
-       Output             [][]string
-       UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1
-
-       // These fields are copied into the Reader
-       Comma            rune
-       Comment          rune
-       FieldsPerRecord  int
-       LazyQuotes       bool
-       TrailingComma    bool
-       TrimLeadingSpace bool
-       ReuseRecord      bool
+func TestRead(t *testing.T) {
+       tests := []struct {
+               Name   string
+               Input  string
+               Output [][]string
+               Error  error
 
-       Error error
-       Line  int // Expected error line if != 0
-}{
-       {
+               // These fields are copied into the Reader
+               Comma              rune
+               Comment            rune
+               UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1
+               FieldsPerRecord    int
+               LazyQuotes         bool
+               TrimLeadingSpace   bool
+               ReuseRecord        bool
+       }{{
                Name:   "Simple",
                Input:  "a,b,c\n",
                Output: [][]string{{"a", "b", "c"}},
-       },
-       {
+       }, {
                Name:   "CRLF",
                Input:  "a,b\r\nc,d\r\n",
                Output: [][]string{{"a", "b"}, {"c", "d"}},
-       },
-       {
+       }, {
                Name:   "BareCR",
                Input:  "a,b\rc,d\r\n",
                Output: [][]string{{"a", "b\rc", "d"}},
-       },
-       {
-               Name:               "RFC4180test",
-               UseFieldsPerRecord: true,
+       }, {
+               Name: "RFC4180test",
                Input: `#field1,field2,field3
 "aaa","bb
 b","ccc"
@@ -59,163 +52,139 @@ zzz,yyy,xxx
                        {"a,a", `b"bb`, "ccc"},
                        {"zzz", "yyy", "xxx"},
                },
-       },
-       {
+               UseFieldsPerRecord: true,
+               FieldsPerRecord:    0,
+       }, {
                Name:   "NoEOLTest",
                Input:  "a,b,c",
                Output: [][]string{{"a", "b", "c"}},
-       },
-       {
+       }, {
                Name:   "Semicolon",
-               Comma:  ';',
                Input:  "a;b;c\n",
                Output: [][]string{{"a", "b", "c"}},
-       },
-       {
+               Comma:  ';',
+       }, {
                Name: "MultiLine",
                Input: `"two
 line","one line","three
 line
 field"`,
                Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}},
-       },
-       {
+       }, {
                Name:  "BlankLine",
                Input: "a,b,c\n\nd,e,f\n\n",
                Output: [][]string{
                        {"a", "b", "c"},
                        {"d", "e", "f"},
                },
-       },
-       {
-               Name:               "BlankLineFieldCount",
-               Input:              "a,b,c\n\nd,e,f\n\n",
-               UseFieldsPerRecord: true,
+       }, {
+               Name:  "BlankLineFieldCount",
+               Input: "a,b,c\n\nd,e,f\n\n",
                Output: [][]string{
                        {"a", "b", "c"},
                        {"d", "e", "f"},
                },
-       },
-       {
+               UseFieldsPerRecord: true,
+               FieldsPerRecord:    0,
+       }, {
                Name:             "TrimSpace",
                Input:            " a,  b,   c\n",
-               TrimLeadingSpace: true,
                Output:           [][]string{{"a", "b", "c"}},
-       },
-       {
+               TrimLeadingSpace: true,
+       }, {
                Name:   "LeadingSpace",
                Input:  " a,  b,   c\n",
                Output: [][]string{{" a", "  b", "   c"}},
-       },
-       {
+       }, {
                Name:    "Comment",
-               Comment: '#',
                Input:   "#1,2,3\na,b,c\n#comment",
                Output:  [][]string{{"a", "b", "c"}},
-       },
-       {
+               Comment: '#',
+       }, {
                Name:   "NoComment",
                Input:  "#1,2,3\na,b,c",
                Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}},
-       },
-       {
+       }, {
                Name:       "LazyQuotes",
-               LazyQuotes: true,
                Input:      `a "word","1"2",a","b`,
                Output:     [][]string{{`a "word"`, `1"2`, `a"`, `b`}},
-       },
-       {
-               Name:       "BareQuotes",
                LazyQuotes: true,
+       }, {
+               Name:       "BareQuotes",
                Input:      `a "word","1"2",a"`,
                Output:     [][]string{{`a "word"`, `1"2`, `a"`}},
-       },
-       {
-               Name:       "BareDoubleQuotes",
                LazyQuotes: true,
+       }, {
+               Name:       "BareDoubleQuotes",
                Input:      `a""b,c`,
                Output:     [][]string{{`a""b`, `c`}},
-       },
-       {
+               LazyQuotes: true,
+       }, {
                Name:  "BadDoubleQuotes",
                Input: `a""b,c`,
-               Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 1, Column: 1, Err: ErrBareQuote},
+       }, {
                Name:             "TrimQuote",
                Input:            ` "a"," b",c`,
-               TrimLeadingSpace: true,
                Output:           [][]string{{"a", " b", "c"}},
-       },
-       {
+               TrimLeadingSpace: true,
+       }, {
                Name:  "BadBareQuote",
                Input: `a "word","b"`,
-               Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 1, Column: 2, Err: ErrBareQuote},
+       }, {
                Name:  "BadTrailingQuote",
                Input: `"a word",b"`,
-               Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 1, Column: 10, Err: ErrBareQuote},
+       }, {
                Name:  "ExtraneousQuote",
                Input: `"a "word","b"`,
-               Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 1, Column: 3, Err: ErrQuote},
+       }, {
                Name:               "BadFieldCount",
-               UseFieldsPerRecord: true,
                Input:              "a,b,c\nd,e",
-               Error:              &ParseError{Line: 2, Err: ErrFieldCount},
-       },
-       {
+               Error:              &ParseError{RecordLine: 2, Line: 2, Err: ErrFieldCount},
+               UseFieldsPerRecord: true,
+               FieldsPerRecord:    0,
+       }, {
                Name:               "BadFieldCount1",
+               Input:              `a,b,c`,
+               Error:              &ParseError{RecordLine: 1, Line: 1, Err: ErrFieldCount},
                UseFieldsPerRecord: true,
                FieldsPerRecord:    2,
-               Input:              `a,b,c`,
-               Error:              &ParseError{Line: 1, Err: ErrFieldCount},
-       },
-       {
+       }, {
                Name:   "FieldCount",
                Input:  "a,b,c\nd,e",
                Output: [][]string{{"a", "b", "c"}, {"d", "e"}},
-       },
-       {
+       }, {
                Name:   "TrailingCommaEOF",
                Input:  "a,b,c,",
                Output: [][]string{{"a", "b", "c", ""}},
-       },
-       {
+       }, {
                Name:   "TrailingCommaEOL",
                Input:  "a,b,c,\n",
                Output: [][]string{{"a", "b", "c", ""}},
-       },
-       {
+       }, {
                Name:             "TrailingCommaSpaceEOF",
-               TrimLeadingSpace: true,
                Input:            "a,b,c, ",
                Output:           [][]string{{"a", "b", "c", ""}},
-       },
-       {
-               Name:             "TrailingCommaSpaceEOL",
                TrimLeadingSpace: true,
+       }, {
+               Name:             "TrailingCommaSpaceEOL",
                Input:            "a,b,c, \n",
                Output:           [][]string{{"a", "b", "c", ""}},
-       },
-       {
-               Name:             "TrailingCommaLine3",
                TrimLeadingSpace: true,
+       }, {
+               Name:             "TrailingCommaLine3",
                Input:            "a,b,c\nd,e,f\ng,hi,",
                Output:           [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}},
-       },
-       {
+               TrimLeadingSpace: true,
+       }, {
                Name:   "NotTrailingComma3",
                Input:  "a,b,c, \n",
                Output: [][]string{{"a", "b", "c", " "}},
-       },
-       {
-               Name:          "CommaFieldTest",
-               TrailingComma: true,
+       }, {
+               Name: "CommaFieldTest",
                Input: `x,y,z,w
 x,y,z,
 x,y,,
@@ -239,166 +208,136 @@ x,,,
                        {"x", "", "", ""},
                        {"", "", "", ""},
                },
-       },
-       {
-               Name:             "TrailingCommaIneffective1",
-               TrailingComma:    true,
-               TrimLeadingSpace: true,
-               Input:            "a,b,\nc,d,e",
+       }, {
+               Name:  "TrailingCommaIneffective1",
+               Input: "a,b,\nc,d,e",
                Output: [][]string{
                        {"a", "b", ""},
                        {"c", "d", "e"},
                },
-       },
-       {
-               Name:             "TrailingCommaIneffective2",
-               TrailingComma:    false,
                TrimLeadingSpace: true,
-               Input:            "a,b,\nc,d,e",
-               Output: [][]string{
-                       {"a", "b", ""},
-                       {"c", "d", "e"},
-               },
-       },
-       {
-               Name:        "ReadAllReuseRecord",
-               ReuseRecord: true,
-               Input:       "a,b\nc,d",
+       }, {
+               Name:  "ReadAllReuseRecord",
+               Input: "a,b\nc,d",
                Output: [][]string{
                        {"a", "b"},
                        {"c", "d"},
                },
-       },
-       { // issue 19019
-               Name:  "RecordLine1",
+               ReuseRecord: true,
+       }, {
+               Name:  "RecordLine1", // Issue 19019
                Input: "a,\"b\nc\"d,e",
-               Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 2, Column: 1, Err: ErrQuote},
+       }, {
                Name:  "RecordLine2",
                Input: "a,b\n\"d\n\n,e",
-               Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
-       },
-       { // issue 21201
-               Name:  "CRLFInQuotedField",
+               Error: &ParseError{RecordLine: 2, Line: 5, Column: 0, Err: ErrQuote},
+       }, {
+               Name:  "CRLFInQuotedField", // Issue 21201
                Input: "\"Hello\r\nHi\"",
                Output: [][]string{
                        {"Hello\r\nHi"},
                },
-       },
-       { // issue 19410
-               Name:   "BinaryBlobField",
+       }, {
+               Name:   "BinaryBlobField", // Issue 19410
                Input:  "x09\x41\xb4\x1c,aktau",
                Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
-       },
-       {
+       }, {
                Name:   "TrailingCR",
                Input:  "field1,field2\r",
                Output: [][]string{{"field1", "field2\r"}},
-       },
-       {
+       }, {
                Name:             "NonASCIICommaAndComment",
+               Input:            "a£b,c£ \td,e\n€ comment\n",
+               Output:           [][]string{{"a", "b,c", "d,e"}},
                TrimLeadingSpace: true,
                Comma:            '£',
                Comment:          '€',
-               Input:            "a£b,c£ \td,e\n€ comment\n",
-               Output:           [][]string{{"a", "b,c", "d,e"}},
-       },
-       {
+       }, {
                Name:    "NonASCIICommaAndCommentWithQuotes",
-               Comma:   '€',
-               Comment: 'λ',
                Input:   "a€\"  b,\"€ c\nλ comment\n",
                Output:  [][]string{{"a", "  b,", " c"}},
-       },
-       {
+               Comma:   '€',
+               Comment: 'λ',
+       }, {
+               // λ and θ start with the same byte.
+               // This tests that the parser doesn't confuse such characters.
                Name:    "NonASCIICommaConfusion",
+               Input:   "\"abθcd\"λefθgh",
+               Output:  [][]string{{"abθcd", "efθgh"}},
                Comma:   'λ',
                Comment: '€',
-               // λ and θ start with the same byte. This test is intended to ensure the parser doesn't
-               // confuse such characters.
-               Input:  "\"abθcd\"λefθgh",
-               Output: [][]string{{"abθcd", "efθgh"}},
-       },
-       {
+       }, {
                Name:    "NonASCIICommentConfusion",
-               Comment: 'θ',
                Input:   "λ\nλ\nθ\nλ\n",
                Output:  [][]string{{"λ"}, {"λ"}, {"λ"}},
-       },
-       {
+               Comment: 'θ',
+       }, {
                Name:   "QuotedFieldMultipleLF",
                Input:  "\"\n\n\n\n\"",
                Output: [][]string{{"\n\n\n\n"}},
-       },
-       {
+       }, {
                Name:  "MultipleCRLF",
                Input: "\r\n\r\n\r\n\r\n",
-       },
-       {
+       }, {
                // The implementation may read each line in several chunks if it doesn't fit entirely
                // in the read buffer, so we should test the code to handle that condition.
                Name:    "HugeLines",
-               Comment: '#',
                Input:   strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
                Output:  [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
-       },
-       {
+               Comment: '#',
+       }, {
                Name:  "QuoteWithTrailingCRLF",
                Input: "\"foo\"bar\"\r\n",
-               Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 1, Column: 4, Err: ErrQuote},
+       }, {
                Name:       "LazyQuoteWithTrailingCRLF",
                Input:      "\"foo\"bar\"\r\n",
-               LazyQuotes: true,
                Output:     [][]string{{`foo"bar`}},
-       },
-       {
+               LazyQuotes: true,
+       }, {
                Name:   "DoubleQuoteWithTrailingCRLF",
                Input:  "\"foo\"\"bar\"\r\n",
                Output: [][]string{{`foo"bar`}},
-       },
-       {
+       }, {
                Name:   "EvenQuotes",
                Input:  `""""""""`,
                Output: [][]string{{`"""`}},
-       },
-       {
+       }, {
                Name:  "OddQuotes",
                Input: `"""""""`,
-               Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
-       },
-       {
+               Error: &ParseError{RecordLine: 1, Line: 1, Column: 7, Err: ErrQuote},
+       }, {
                Name:       "LazyOddQuotes",
                Input:      `"""""""`,
-               LazyQuotes: true,
                Output:     [][]string{{`"""`}},
-       },
-}
+               LazyQuotes: true,
+       }}
 
-func TestRead(t *testing.T) {
-       for _, tt := range readTests {
-               r := NewReader(strings.NewReader(tt.Input))
-               r.Comment = tt.Comment
-               if tt.UseFieldsPerRecord {
-                       r.FieldsPerRecord = tt.FieldsPerRecord
-               } else {
-                       r.FieldsPerRecord = -1
-               }
-               r.LazyQuotes = tt.LazyQuotes
-               r.TrailingComma = tt.TrailingComma
-               r.TrimLeadingSpace = tt.TrimLeadingSpace
-               r.ReuseRecord = tt.ReuseRecord
-               if tt.Comma != 0 {
-                       r.Comma = tt.Comma
-               }
-               out, err := r.ReadAll()
-               if !reflect.DeepEqual(err, tt.Error) {
-                       t.Errorf("%s: ReadAll() error:\ngot  %v\nwant %v", tt.Name, err, tt.Error)
-               } else if !reflect.DeepEqual(out, tt.Output) {
-                       t.Errorf("%s: ReadAll() output:\ngot  %q\nwant %q", tt.Name, out, tt.Output)
-               }
+       for _, tt := range tests {
+               t.Run(tt.Name, func(t *testing.T) {
+                       r := NewReader(strings.NewReader(tt.Input))
+
+                       if tt.Comma != 0 {
+                               r.Comma = tt.Comma
+                       }
+                       r.Comment = tt.Comment
+                       if tt.UseFieldsPerRecord {
+                               r.FieldsPerRecord = tt.FieldsPerRecord
+                       } else {
+                               r.FieldsPerRecord = -1
+                       }
+                       r.LazyQuotes = tt.LazyQuotes
+                       r.TrimLeadingSpace = tt.TrimLeadingSpace
+                       r.ReuseRecord = tt.ReuseRecord
+
+                       out, err := r.ReadAll()
+                       if !reflect.DeepEqual(err, tt.Error) {
+                               t.Errorf("ReadAll() error:\ngot  %v\nwant %v", err, tt.Error)
+                       } else if !reflect.DeepEqual(out, tt.Output) {
+                               t.Errorf("ReadAll() output:\ngot  %q\nwant %q", out, tt.Output)
+                       }
+               })
        }
 }