encoding/csv: restore Go 1.9 quoted \r\n handling in Reader

author Russ Cox <rsc@golang.org>

Thu, 16 Nov 2017 15:29:50 +0000 (10:29 -0500)

committer Russ Cox <rsc@golang.org>

Thu, 16 Nov 2017 16:29:37 +0000 (16:29 +0000)
author Russ Cox <rsc@golang.org>
Thu, 16 Nov 2017 15:29:50 +0000 (10:29 -0500)
committer Russ Cox <rsc@golang.org>
Thu, 16 Nov 2017 16:29:37 +0000 (16:29 +0000)
diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go

index 09f0dac5d0880bd2efab6b642393ef8e5687b611..1350f3ebdd87cad6dbfd4200bb0cd748c8dcf7c9 100644 (file)
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@@ -99,15 +99,24 @@ func validDelim(r rune) bool {
  // As returned by NewReader, a Reader expects input conforming to RFC 4180.
  // The exported fields can be changed to customize the details before the
  // first call to Read or ReadAll.
+//
+// The Reader converts all \r\n sequences in its input to plain \n,
+// including in multiline field values, so that the returned data does
+// not depend on which line-ending convention an input file uses.
  type Reader struct {
         // Comma is the field delimiter.
         // It is set to comma (',') by NewReader.
+       // Comma must be a valid rune and must not be \r, \n,
+       // or the Unicode replacement character (0xFFFD).
         Comma rune
  
         // Comment, if not 0, is the comment character. Lines beginning with the
         // Comment character without preceding whitespace are ignored.
         // With leading whitespace the Comment character becomes part of the
         // field, even if TrimLeadingSpace is true.
+       // Comment must be a valid rune and must not be \r, \n,
+       // or the Unicode replacement character (0xFFFD).
+       // It must also not be equal to Comma.
         Comment rune
  
         // FieldsPerRecord is the number of expected fields per record.
@@ -217,15 +226,17 @@ func (r *Reader) readLine() ([]byte, error) {
                 err = nil
         }
         r.numLine++
+       // Normalize \r\n to \n on all input lines.
+       if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
+               line[n-2] = '\n'
+               line = line[:n-1]
+       }
         return line, err
  }
  
-// lengthCRLF reports the number of bytes for a trailing "\r\n".
-func lengthCRLF(b []byte) int {
-       if j := len(b) - 1; j >= 0 && b[j] == '\n' {
-               if j := len(b) - 2; j >= 0 && b[j] == '\r' {
-                       return 2
-               }
+// lengthNL reports the number of bytes for the trailing \n.
+func lengthNL(b []byte) int {
+       if len(b) > 0 && b[len(b)-1] == '\n' {
                 return 1
         }
         return 0
@@ -251,7 +262,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
                         line = nil
                         continue // Skip comment lines
                 }
-               if errRead == nil && len(line) == lengthCRLF(line) {
+               if errRead == nil && len(line) == lengthNL(line) {
                         line = nil
                         continue // Skip empty lines
                 }
@@ -281,7 +292,7 @@ parseField:
                         if i >= 0 {
                                 field = field[:i]
                         } else {
-                               field = field[:len(field)-lengthCRLF(field)]
+                               field = field[:len(field)-lengthNL(field)]
                         }
                         // Check to make sure a quote does not appear in field.
                         if !r.LazyQuotes {
@@ -317,7 +328,7 @@ parseField:
                                                 line = line[commaLen:]
                                                 r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
                                                 continue parseField
-                                       case lengthCRLF(line) == len(line):
+                                       case lengthNL(line) == len(line):
                                                 // `"\n` sequence (end of line).
                                                 r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
                                                 break parseField
diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go

index 69e2e2becd7a3eee36eed77dc82f1e94014e9026..d62aa77382e43c8a78f09c321994c8d23a66b25e 100644 (file)
--- a/src/encoding/csv/reader_test.go
+++ b/src/encoding/csv/reader_test.go
@@ -235,9 +235,9 @@ x,,,
                 Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
         }, {
                 Name:  "CRLFInQuotedField", // Issue 21201
-               Input: "\"Hello\r\nHi\"",
+               Input: "A,\"Hello\r\nHi\",B\r\n",
                 Output: [][]string{
-                       {"Hello\r\nHi"},
+                       {"A", "Hello\nHi", "B"},
                 },
         }, {
                 Name:   "BinaryBlobField", // Issue 19410
diff --git a/src/encoding/csv/writer.go b/src/encoding/csv/writer.go

index b23cae4517ef60f5a3146230035ec93f0df8acd7..ef3594e523c9edeadd8d4008701c19db7beaf6df 100644 (file)
--- a/src/encoding/csv/writer.go
+++ b/src/encoding/csv/writer.go
@@ -20,7 +20,7 @@ import (
  //
  // Comma is the field delimiter.
  //
-// If UseCRLF is true, the Writer ends each record with \r\n instead of \n.
+// If UseCRLF is true, the Writer ends each output line with \r\n instead of \n.
  type Writer struct {
         Comma   rune // Field delimiter (set to ',' by NewWriter)
         UseCRLF bool // True to use \r\n as the line terminator
author	Russ Cox <rsc@golang.org>
	Thu, 16 Nov 2017 15:29:50 +0000 (10:29 -0500)
committer	Russ Cox <rsc@golang.org>
	Thu, 16 Nov 2017 16:29:37 +0000 (16:29 +0000)
src/encoding/csv/reader.go		patch \| blob \| history
src/encoding/csv/reader_test.go		patch \| blob \| history
src/encoding/csv/writer.go		patch \| blob \| history