From 29ea82d072731aedb2c117bef3aecdb6d035a8d0 Mon Sep 17 00:00:00 2001 From: Joe Tsai Date: Fri, 20 Oct 2017 17:04:17 -0700 Subject: [PATCH] encoding/csv: add ParseError.RecordLine CL 72150 fixes #22352 by reverting the problematic parts of that CL where the line number and column number were inconsistent with each other. This CL adds back functionality to address the issue that CL 72150 was trying to solve in the first place. That is, it reports the starting line of the record, so that users have a frame of reference to start with when debugging what went wrong. In the event of gnarly CSV files with multiline quoted strings, a parse failure likely occurs somewhere between the start of the record and the point where the parser finally detected an error. Since ParserError.{Line,Column} reports where the *error* occurs, we add a RecordLine field to report where the record starts. Also take this time to cleanup and modernize TestRead. Fixes #19019 Fixes #22352 Change-Id: I16cebf0b81922c35f75804c7073e9cddbfd11a04 Reviewed-on: https://go-review.googlesource.com/72310 Reviewed-by: Ian Lance Taylor --- src/encoding/csv/reader.go | 43 ++-- src/encoding/csv/reader_test.go | 343 +++++++++++++------------------- 2 files changed, 168 insertions(+), 218 deletions(-) diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go index 3c08b9f9d1..e646740b4f 100644 --- a/src/encoding/csv/reader.go +++ b/src/encoding/csv/reader.go @@ -62,23 +62,30 @@ import ( ) // A ParseError is returned for parsing errors. -// The first line is 1. The first column is 0. +// Line numbers are 1-indexed and columns are 0-indexed. type ParseError struct { - Line int // Line where the error occurred - Column int // Column (rune index) where the error occurred - Err error // The actual error + RecordLine int // Line where the record starts + Line int // Line where the error occurred + Column int // Column (rune index) where the error occurred + Err error // The actual error } func (e *ParseError) Error() string { - return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err) + if e.Err == ErrFieldCount { + return fmt.Sprintf("record on line %d: %v", e.Line, e.Err) + } + if e.RecordLine != e.Line { + return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.RecordLine, e.Line, e.Column, e.Err) + } + return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err) } // These are the errors that can be returned in ParseError.Error var ( - ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used + ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used. ErrBareQuote = errors.New("bare \" in non-quoted-field") - ErrQuote = errors.New("extraneous \" in field") - ErrFieldCount = errors.New("wrong number of fields in line") + ErrQuote = errors.New("extraneous or missing \" in field") + ErrFieldCount = errors.New("wrong number of fields") ) // A Reader reads records from a CSV-encoded file. @@ -86,17 +93,17 @@ var ( // As returned by NewReader, a Reader expects input conforming to RFC 4180. // The exported fields can be changed to customize the details before the // first call to Read or ReadAll. -// -// type Reader struct { // Comma is the field delimiter. // It is set to comma (',') by NewReader. Comma rune + // Comment, if not 0, is the comment character. Lines beginning with the // Comment character without preceding whitespace are ignored. // With leading whitespace the Comment character becomes part of the // field, even if TrimLeadingSpace is true. Comment rune + // FieldsPerRecord is the number of expected fields per record. // If FieldsPerRecord is positive, Read requires each record to // have the given number of fields. If FieldsPerRecord is 0, Read sets it to @@ -104,18 +111,22 @@ type Reader struct { // have the same field count. If FieldsPerRecord is negative, no check is // made and records may have a variable number of fields. FieldsPerRecord int + // If LazyQuotes is true, a quote may appear in an unquoted field and a // non-doubled quote may appear in a quoted field. - LazyQuotes bool - TrailingComma bool // ignored; here for backwards compatibility + LazyQuotes bool + // If TrimLeadingSpace is true, leading white space in a field is ignored. // This is done even if the field delimiter, Comma, is white space. TrimLeadingSpace bool + // ReuseRecord controls whether calls to Read may return a slice sharing // the backing array of the previous call's returned slice for performance. // By default, each call to Read returns newly allocated memory owned by the caller. ReuseRecord bool + TrailingComma bool // Deprecated: No longer used. + r *bufio.Reader // numLine is the current line being read in the CSV file. @@ -266,7 +277,7 @@ parseField: if !r.LazyQuotes { if j := bytes.IndexByte(field, '"'); j >= 0 { col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])]) - err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote} + err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote} break parseField } } @@ -306,7 +317,7 @@ parseField: default: // `"*` squence (invalid non-escaped quote). col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen]) - err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote} + err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote} break parseField } } else if len(line) > 0 { @@ -324,7 +335,7 @@ parseField: // Abrupt end of file (EOF or error). if !r.LazyQuotes && errRead == nil { col := utf8.RuneCount(fullLine) - err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote} + err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote} break parseField } r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) @@ -354,7 +365,7 @@ parseField: // Check or update the expected fields per record. if r.FieldsPerRecord > 0 { if len(dst) != r.FieldsPerRecord && err == nil { - err = &ParseError{Line: recLine, Err: ErrFieldCount} + err = &ParseError{RecordLine: recLine, Line: recLine, Err: ErrFieldCount} } } else if r.FieldsPerRecord == 0 { r.FieldsPerRecord = len(dst) diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go index 781847cefa..ed7d89dfe0 100644 --- a/src/encoding/csv/reader_test.go +++ b/src/encoding/csv/reader_test.go @@ -11,42 +11,35 @@ import ( "testing" ) -var readTests = []struct { - Name string - Input string - Output [][]string - UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1 - - // These fields are copied into the Reader - Comma rune - Comment rune - FieldsPerRecord int - LazyQuotes bool - TrailingComma bool - TrimLeadingSpace bool - ReuseRecord bool +func TestRead(t *testing.T) { + tests := []struct { + Name string + Input string + Output [][]string + Error error - Error error - Line int // Expected error line if != 0 -}{ - { + // These fields are copied into the Reader + Comma rune + Comment rune + UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1 + FieldsPerRecord int + LazyQuotes bool + TrimLeadingSpace bool + ReuseRecord bool + }{{ Name: "Simple", Input: "a,b,c\n", Output: [][]string{{"a", "b", "c"}}, - }, - { + }, { Name: "CRLF", Input: "a,b\r\nc,d\r\n", Output: [][]string{{"a", "b"}, {"c", "d"}}, - }, - { + }, { Name: "BareCR", Input: "a,b\rc,d\r\n", Output: [][]string{{"a", "b\rc", "d"}}, - }, - { - Name: "RFC4180test", - UseFieldsPerRecord: true, + }, { + Name: "RFC4180test", Input: `#field1,field2,field3 "aaa","bb b","ccc" @@ -59,163 +52,139 @@ zzz,yyy,xxx {"a,a", `b"bb`, "ccc"}, {"zzz", "yyy", "xxx"}, }, - }, - { + UseFieldsPerRecord: true, + FieldsPerRecord: 0, + }, { Name: "NoEOLTest", Input: "a,b,c", Output: [][]string{{"a", "b", "c"}}, - }, - { + }, { Name: "Semicolon", - Comma: ';', Input: "a;b;c\n", Output: [][]string{{"a", "b", "c"}}, - }, - { + Comma: ';', + }, { Name: "MultiLine", Input: `"two line","one line","three line field"`, Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}}, - }, - { + }, { Name: "BlankLine", Input: "a,b,c\n\nd,e,f\n\n", Output: [][]string{ {"a", "b", "c"}, {"d", "e", "f"}, }, - }, - { - Name: "BlankLineFieldCount", - Input: "a,b,c\n\nd,e,f\n\n", - UseFieldsPerRecord: true, + }, { + Name: "BlankLineFieldCount", + Input: "a,b,c\n\nd,e,f\n\n", Output: [][]string{ {"a", "b", "c"}, {"d", "e", "f"}, }, - }, - { + UseFieldsPerRecord: true, + FieldsPerRecord: 0, + }, { Name: "TrimSpace", Input: " a, b, c\n", - TrimLeadingSpace: true, Output: [][]string{{"a", "b", "c"}}, - }, - { + TrimLeadingSpace: true, + }, { Name: "LeadingSpace", Input: " a, b, c\n", Output: [][]string{{" a", " b", " c"}}, - }, - { + }, { Name: "Comment", - Comment: '#', Input: "#1,2,3\na,b,c\n#comment", Output: [][]string{{"a", "b", "c"}}, - }, - { + Comment: '#', + }, { Name: "NoComment", Input: "#1,2,3\na,b,c", Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}}, - }, - { + }, { Name: "LazyQuotes", - LazyQuotes: true, Input: `a "word","1"2",a","b`, Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}}, - }, - { - Name: "BareQuotes", LazyQuotes: true, + }, { + Name: "BareQuotes", Input: `a "word","1"2",a"`, Output: [][]string{{`a "word"`, `1"2`, `a"`}}, - }, - { - Name: "BareDoubleQuotes", LazyQuotes: true, + }, { + Name: "BareDoubleQuotes", Input: `a""b,c`, Output: [][]string{{`a""b`, `c`}}, - }, - { + LazyQuotes: true, + }, { Name: "BadDoubleQuotes", Input: `a""b,c`, - Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 1, Column: 1, Err: ErrBareQuote}, + }, { Name: "TrimQuote", Input: ` "a"," b",c`, - TrimLeadingSpace: true, Output: [][]string{{"a", " b", "c"}}, - }, - { + TrimLeadingSpace: true, + }, { Name: "BadBareQuote", Input: `a "word","b"`, - Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 1, Column: 2, Err: ErrBareQuote}, + }, { Name: "BadTrailingQuote", Input: `"a word",b"`, - Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 1, Column: 10, Err: ErrBareQuote}, + }, { Name: "ExtraneousQuote", Input: `"a "word","b"`, - Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 1, Column: 3, Err: ErrQuote}, + }, { Name: "BadFieldCount", - UseFieldsPerRecord: true, Input: "a,b,c\nd,e", - Error: &ParseError{Line: 2, Err: ErrFieldCount}, - }, - { + Error: &ParseError{RecordLine: 2, Line: 2, Err: ErrFieldCount}, + UseFieldsPerRecord: true, + FieldsPerRecord: 0, + }, { Name: "BadFieldCount1", + Input: `a,b,c`, + Error: &ParseError{RecordLine: 1, Line: 1, Err: ErrFieldCount}, UseFieldsPerRecord: true, FieldsPerRecord: 2, - Input: `a,b,c`, - Error: &ParseError{Line: 1, Err: ErrFieldCount}, - }, - { + }, { Name: "FieldCount", Input: "a,b,c\nd,e", Output: [][]string{{"a", "b", "c"}, {"d", "e"}}, - }, - { + }, { Name: "TrailingCommaEOF", Input: "a,b,c,", Output: [][]string{{"a", "b", "c", ""}}, - }, - { + }, { Name: "TrailingCommaEOL", Input: "a,b,c,\n", Output: [][]string{{"a", "b", "c", ""}}, - }, - { + }, { Name: "TrailingCommaSpaceEOF", - TrimLeadingSpace: true, Input: "a,b,c, ", Output: [][]string{{"a", "b", "c", ""}}, - }, - { - Name: "TrailingCommaSpaceEOL", TrimLeadingSpace: true, + }, { + Name: "TrailingCommaSpaceEOL", Input: "a,b,c, \n", Output: [][]string{{"a", "b", "c", ""}}, - }, - { - Name: "TrailingCommaLine3", TrimLeadingSpace: true, + }, { + Name: "TrailingCommaLine3", Input: "a,b,c\nd,e,f\ng,hi,", Output: [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}}, - }, - { + TrimLeadingSpace: true, + }, { Name: "NotTrailingComma3", Input: "a,b,c, \n", Output: [][]string{{"a", "b", "c", " "}}, - }, - { - Name: "CommaFieldTest", - TrailingComma: true, + }, { + Name: "CommaFieldTest", Input: `x,y,z,w x,y,z, x,y,, @@ -239,166 +208,136 @@ x,,, {"x", "", "", ""}, {"", "", "", ""}, }, - }, - { - Name: "TrailingCommaIneffective1", - TrailingComma: true, - TrimLeadingSpace: true, - Input: "a,b,\nc,d,e", + }, { + Name: "TrailingCommaIneffective1", + Input: "a,b,\nc,d,e", Output: [][]string{ {"a", "b", ""}, {"c", "d", "e"}, }, - }, - { - Name: "TrailingCommaIneffective2", - TrailingComma: false, TrimLeadingSpace: true, - Input: "a,b,\nc,d,e", - Output: [][]string{ - {"a", "b", ""}, - {"c", "d", "e"}, - }, - }, - { - Name: "ReadAllReuseRecord", - ReuseRecord: true, - Input: "a,b\nc,d", + }, { + Name: "ReadAllReuseRecord", + Input: "a,b\nc,d", Output: [][]string{ {"a", "b"}, {"c", "d"}, }, - }, - { // issue 19019 - Name: "RecordLine1", + ReuseRecord: true, + }, { + Name: "RecordLine1", // Issue 19019 Input: "a,\"b\nc\"d,e", - Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 2, Column: 1, Err: ErrQuote}, + }, { Name: "RecordLine2", Input: "a,b\n\"d\n\n,e", - Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote}, - }, - { // issue 21201 - Name: "CRLFInQuotedField", + Error: &ParseError{RecordLine: 2, Line: 5, Column: 0, Err: ErrQuote}, + }, { + Name: "CRLFInQuotedField", // Issue 21201 Input: "\"Hello\r\nHi\"", Output: [][]string{ {"Hello\r\nHi"}, }, - }, - { // issue 19410 - Name: "BinaryBlobField", + }, { + Name: "BinaryBlobField", // Issue 19410 Input: "x09\x41\xb4\x1c,aktau", Output: [][]string{{"x09A\xb4\x1c", "aktau"}}, - }, - { + }, { Name: "TrailingCR", Input: "field1,field2\r", Output: [][]string{{"field1", "field2\r"}}, - }, - { + }, { Name: "NonASCIICommaAndComment", + Input: "a£b,c£ \td,e\n€ comment\n", + Output: [][]string{{"a", "b,c", "d,e"}}, TrimLeadingSpace: true, Comma: '£', Comment: '€', - Input: "a£b,c£ \td,e\n€ comment\n", - Output: [][]string{{"a", "b,c", "d,e"}}, - }, - { + }, { Name: "NonASCIICommaAndCommentWithQuotes", - Comma: '€', - Comment: 'λ', Input: "a€\" b,\"€ c\nλ comment\n", Output: [][]string{{"a", " b,", " c"}}, - }, - { + Comma: '€', + Comment: 'λ', + }, { + // λ and θ start with the same byte. + // This tests that the parser doesn't confuse such characters. Name: "NonASCIICommaConfusion", + Input: "\"abθcd\"λefθgh", + Output: [][]string{{"abθcd", "efθgh"}}, Comma: 'λ', Comment: '€', - // λ and θ start with the same byte. This test is intended to ensure the parser doesn't - // confuse such characters. - Input: "\"abθcd\"λefθgh", - Output: [][]string{{"abθcd", "efθgh"}}, - }, - { + }, { Name: "NonASCIICommentConfusion", - Comment: 'θ', Input: "λ\nλ\nθ\nλ\n", Output: [][]string{{"λ"}, {"λ"}, {"λ"}}, - }, - { + Comment: 'θ', + }, { Name: "QuotedFieldMultipleLF", Input: "\"\n\n\n\n\"", Output: [][]string{{"\n\n\n\n"}}, - }, - { + }, { Name: "MultipleCRLF", Input: "\r\n\r\n\r\n\r\n", - }, - { + }, { // The implementation may read each line in several chunks if it doesn't fit entirely // in the read buffer, so we should test the code to handle that condition. Name: "HugeLines", - Comment: '#', Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000), Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}}, - }, - { + Comment: '#', + }, { Name: "QuoteWithTrailingCRLF", Input: "\"foo\"bar\"\r\n", - Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 1, Column: 4, Err: ErrQuote}, + }, { Name: "LazyQuoteWithTrailingCRLF", Input: "\"foo\"bar\"\r\n", - LazyQuotes: true, Output: [][]string{{`foo"bar`}}, - }, - { + LazyQuotes: true, + }, { Name: "DoubleQuoteWithTrailingCRLF", Input: "\"foo\"\"bar\"\r\n", Output: [][]string{{`foo"bar`}}, - }, - { + }, { Name: "EvenQuotes", Input: `""""""""`, Output: [][]string{{`"""`}}, - }, - { + }, { Name: "OddQuotes", Input: `"""""""`, - Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote}, - }, - { + Error: &ParseError{RecordLine: 1, Line: 1, Column: 7, Err: ErrQuote}, + }, { Name: "LazyOddQuotes", Input: `"""""""`, - LazyQuotes: true, Output: [][]string{{`"""`}}, - }, -} + LazyQuotes: true, + }} -func TestRead(t *testing.T) { - for _, tt := range readTests { - r := NewReader(strings.NewReader(tt.Input)) - r.Comment = tt.Comment - if tt.UseFieldsPerRecord { - r.FieldsPerRecord = tt.FieldsPerRecord - } else { - r.FieldsPerRecord = -1 - } - r.LazyQuotes = tt.LazyQuotes - r.TrailingComma = tt.TrailingComma - r.TrimLeadingSpace = tt.TrimLeadingSpace - r.ReuseRecord = tt.ReuseRecord - if tt.Comma != 0 { - r.Comma = tt.Comma - } - out, err := r.ReadAll() - if !reflect.DeepEqual(err, tt.Error) { - t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error) - } else if !reflect.DeepEqual(out, tt.Output) { - t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output) - } + for _, tt := range tests { + t.Run(tt.Name, func(t *testing.T) { + r := NewReader(strings.NewReader(tt.Input)) + + if tt.Comma != 0 { + r.Comma = tt.Comma + } + r.Comment = tt.Comment + if tt.UseFieldsPerRecord { + r.FieldsPerRecord = tt.FieldsPerRecord + } else { + r.FieldsPerRecord = -1 + } + r.LazyQuotes = tt.LazyQuotes + r.TrimLeadingSpace = tt.TrimLeadingSpace + r.ReuseRecord = tt.ReuseRecord + + out, err := r.ReadAll() + if !reflect.DeepEqual(err, tt.Error) { + t.Errorf("ReadAll() error:\ngot %v\nwant %v", err, tt.Error) + } else if !reflect.DeepEqual(out, tt.Output) { + t.Errorf("ReadAll() output:\ngot %q\nwant %q", out, tt.Output) + } + }) } } -- 2.48.1