From: Russ Cox Date: Thu, 16 Nov 2017 15:29:50 +0000 (-0500) Subject: encoding/csv: restore Go 1.9 quoted \r\n handling in Reader X-Git-Tag: go1.10beta1~233 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=1d47a145913f76f64414bbbc14bff3c95450535e;p=gostls13.git encoding/csv: restore Go 1.9 quoted \r\n handling in Reader CL 52810 changed Reader to interpret a quoted \r\n as a raw \r\n when reading fields. This seems likely to break existing users, and discussion on both #21201 (the original issue that triggered the change) and #22746 (discussing whether to revert the change) failed to identify a single motivating example for this change. To avoid breaking existing users for no clear reason, revert the change. The Reader has been rewritten in the interim so this is not a git revert but instead and adjustment (and slight simplification) of the new Reader. Fixes #22746. Change-Id: Ie857b2f4b1359a207d085b6d3c3a6d440a997d12 Reviewed-on: https://go-review.googlesource.com/78295 Reviewed-by: Joe Tsai --- diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go index 09f0dac5d0..1350f3ebdd 100644 --- a/src/encoding/csv/reader.go +++ b/src/encoding/csv/reader.go @@ -99,15 +99,24 @@ func validDelim(r rune) bool { // As returned by NewReader, a Reader expects input conforming to RFC 4180. // The exported fields can be changed to customize the details before the // first call to Read or ReadAll. +// +// The Reader converts all \r\n sequences in its input to plain \n, +// including in multiline field values, so that the returned data does +// not depend on which line-ending convention an input file uses. type Reader struct { // Comma is the field delimiter. // It is set to comma (',') by NewReader. + // Comma must be a valid rune and must not be \r, \n, + // or the Unicode replacement character (0xFFFD). Comma rune // Comment, if not 0, is the comment character. Lines beginning with the // Comment character without preceding whitespace are ignored. // With leading whitespace the Comment character becomes part of the // field, even if TrimLeadingSpace is true. + // Comment must be a valid rune and must not be \r, \n, + // or the Unicode replacement character (0xFFFD). + // It must also not be equal to Comma. Comment rune // FieldsPerRecord is the number of expected fields per record. @@ -217,15 +226,17 @@ func (r *Reader) readLine() ([]byte, error) { err = nil } r.numLine++ + // Normalize \r\n to \n on all input lines. + if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' { + line[n-2] = '\n' + line = line[:n-1] + } return line, err } -// lengthCRLF reports the number of bytes for a trailing "\r\n". -func lengthCRLF(b []byte) int { - if j := len(b) - 1; j >= 0 && b[j] == '\n' { - if j := len(b) - 2; j >= 0 && b[j] == '\r' { - return 2 - } +// lengthNL reports the number of bytes for the trailing \n. +func lengthNL(b []byte) int { + if len(b) > 0 && b[len(b)-1] == '\n' { return 1 } return 0 @@ -251,7 +262,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) { line = nil continue // Skip comment lines } - if errRead == nil && len(line) == lengthCRLF(line) { + if errRead == nil && len(line) == lengthNL(line) { line = nil continue // Skip empty lines } @@ -281,7 +292,7 @@ parseField: if i >= 0 { field = field[:i] } else { - field = field[:len(field)-lengthCRLF(field)] + field = field[:len(field)-lengthNL(field)] } // Check to make sure a quote does not appear in field. if !r.LazyQuotes { @@ -317,7 +328,7 @@ parseField: line = line[commaLen:] r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) continue parseField - case lengthCRLF(line) == len(line): + case lengthNL(line) == len(line): // `"\n` sequence (end of line). r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) break parseField diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go index 69e2e2becd..d62aa77382 100644 --- a/src/encoding/csv/reader_test.go +++ b/src/encoding/csv/reader_test.go @@ -235,9 +235,9 @@ x,,, Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote}, }, { Name: "CRLFInQuotedField", // Issue 21201 - Input: "\"Hello\r\nHi\"", + Input: "A,\"Hello\r\nHi\",B\r\n", Output: [][]string{ - {"Hello\r\nHi"}, + {"A", "Hello\nHi", "B"}, }, }, { Name: "BinaryBlobField", // Issue 19410 diff --git a/src/encoding/csv/writer.go b/src/encoding/csv/writer.go index b23cae4517..ef3594e523 100644 --- a/src/encoding/csv/writer.go +++ b/src/encoding/csv/writer.go @@ -20,7 +20,7 @@ import ( // // Comma is the field delimiter. // -// If UseCRLF is true, the Writer ends each record with \r\n instead of \n. +// If UseCRLF is true, the Writer ends each output line with \r\n instead of \n. type Writer struct { Comma rune // Field delimiter (set to ',' by NewWriter) UseCRLF bool // True to use \r\n as the line terminator