fmt: document and adjust Scanf space handling to eliminate a few paradoxes

author Russ Cox <rsc@golang.org>

Thu, 6 Oct 2016 21:08:44 +0000 (17:08 -0400)

committer Russ Cox <rsc@golang.org>

Fri, 7 Oct 2016 03:46:50 +0000 (03:46 +0000)
author Russ Cox <rsc@golang.org>
Thu, 6 Oct 2016 21:08:44 +0000 (17:08 -0400)
committer Russ Cox <rsc@golang.org>
Fri, 7 Oct 2016 03:46:50 +0000 (03:46 +0000)
diff --git a/src/fmt/doc.go b/src/fmt/doc.go

index eb97e51d5d1cd090843df34a009e9c213b5e0588..8b0c7d10af7af26ecfa4329332d9c7b11ac3e228 100644 (file)
--- a/src/fmt/doc.go
+++ b/src/fmt/doc.go
@@ -244,31 +244,42 @@
         Scanln, Fscanln and Sscanln stop scanning at a newline and
         require that the items be followed by a newline or EOF.
  
-       Scanf, Fscanf and Sscanf require that (after skipping spaces)
-       newlines in the format are matched by newlines in the input
-       and vice versa.  This behavior differs from the corresponding
-       routines in C, which uniformly treat newlines as spaces.
-
-       When scanning with Scanf, Fscanf, and Sscanf, all non-empty
-       runs of space characters (except newline) are equivalent
-       to a single space in both the format and the input.  With
-       that proviso, text in the format string must match the input
-       text; scanning stops if it does not, with the return value
-       of the function indicating the number of arguments scanned.
-
         Scanf, Fscanf, and Sscanf parse the arguments according to a
-       format string, analogous to that of Printf.  For example, %x
-       will scan an integer as a hexadecimal number, and %v will scan
-       the default representation format for the value.
-
-       The formats behave analogously to those of Printf with the
-       following exceptions:
-
-               %p is not implemented
-               %T is not implemented
-               %e %E %f %F %g %G are all equivalent and scan any floating point or complex value
-               %s and %v on strings scan a space-delimited token
-               Flags # and + are not implemented.
+       format string, analogous to that of Printf. In the text that
+       follows, 'space' means any Unicode whitespace character
+       except newline.
+
+       In the format string, a verb introduced by the % character
+       consumes and parses input; these verbs are described in more
+       detail below. A character other than %, space, or newline in
+       the format consumes exactly that input character, which must
+       be present. A newline with zero or more spaces before it in
+       the format string consumes zero or more spaces in the input
+       followed by a single newline or the end of the input. A space
+       following a newline in the format string consumes zero or more
+       spaces in the input. Otherwise, any run of one or more spaces
+       in the format string consumes as many spaces as possible in
+       the input. Unless the run of spaces in the format string
+       appears adjacent to a newline, the run must consume at least
+       one space from the input or find the end of the input.
+
+       The handling of spaces and newlines differs from that of C's
+       scanf family: in C, newlines are treated as any other space,
+       and it is never an error when a run of spaces in the format
+       string finds no spaces to consume in the input.
+
+       The verbs behave analogously to those of Printf.
+       For example, %x will scan an integer as a hexadecimal number,
+       and %v will scan the default representation format for the value.
+       The Printf verbs %p and %T and the flags # and + are not implemented,
+       and the verbs %e %E %f %F %g and %G are all equivalent and scan any
+       floating-point or complex value.
+
+       Input processed by verbs is implicitly space-delimited: the
+       implementation of every verb except %c starts by discarding
+       leading spaces from the remaining input, and the %s verb
+       (and %v reading into a string) stops consuming input at the first
+       space or newline character.
  
         The familiar base-setting prefixes 0 (octal) and 0x
         (hexadecimal) are accepted when scanning integers without
@@ -297,6 +308,9 @@
         All arguments to be scanned must be either pointers to basic
         types or implementations of the Scanner interface.
  
+       Like Scanf and Fscanf, Sscanf need not consume its entire input.
+       There is no way to recover how much of the input string Sscanf used.
+
         Note: Fscan etc. can read one character (rune) past the input
         they return, which means that a loop calling a scan routine
         may skip some of the input.  This is usually a problem only
diff --git a/src/fmt/scan.go b/src/fmt/scan.go

index fdf419795d9ffcb1041ba8df94e0b7cb1e529ecf..cd7232c33c4f04b90d131773e55885afe0dc4e61 100644 (file)
--- a/src/fmt/scan.go
+++ b/src/fmt/scan.go
@@ -1075,6 +1075,58 @@ func (s *ss) doScan(a []interface{}) (numProcessed int, err error) {
  func (s *ss) advance(format string) (i int) {
         for i < len(format) {
                 fmtc, w := utf8.DecodeRuneInString(format[i:])
+
+               // Space processing.
+               // In the rest of this comment "space" means spaces other than newline.
+               // Newline in the format matches input of zero or more spaces and then newline or end-of-input.
+               // Spaces in the format before the newline are collapsed into the newline.
+               // Spaces in the format after the newline match zero or more spaces after the corresponding input newline.
+               // Other spaces in the format match input of one or more spaces or end-of-input.
+               if isSpace(fmtc) {
+                       newlines := 0
+                       trailingSpace := false
+                       for isSpace(fmtc) && i < len(format) {
+                               if fmtc == '\n' {
+                                       newlines++
+                                       trailingSpace = false
+                               } else {
+                                       trailingSpace = true
+                               }
+                               i += w
+                               fmtc, w = utf8.DecodeRuneInString(format[i:])
+                       }
+                       for j := 0; j < newlines; j++ {
+                               inputc := s.getRune()
+                               for isSpace(inputc) && inputc != '\n' {
+                                       inputc = s.getRune()
+                               }
+                               if inputc != '\n' && inputc != eof {
+                                       s.errorString("newline in format does not match input")
+                               }
+                       }
+                       if trailingSpace {
+                               inputc := s.getRune()
+                               if newlines == 0 {
+                                       // If the trailing space stood alone (did not follow a newline),
+                                       // it must find at least one space to consume.
+                                       if !isSpace(inputc) && inputc != eof {
+                                               s.errorString("expected space in input to match format")
+                                       }
+                                       if inputc == '\n' {
+                                               s.errorString("newline in input does not match format")
+                                       }
+                               }
+                               for isSpace(inputc) && inputc != '\n' {
+                                       inputc = s.getRune()
+                               }
+                               if inputc != eof {
+                                       s.UnreadRune()
+                               }
+                       }
+                       continue
+               }
+
+               // Verbs.
                 if fmtc == '%' {
                         // % at end of string is an error.
                         if i+w == len(format) {
@@ -1087,48 +1139,8 @@ func (s *ss) advance(format string) (i int) {
                         }
                         i += w // skip the first %
                 }
-               sawSpace := false
-               wasNewline := false
-               // Skip spaces in format but absorb at most one newline.
-               for isSpace(fmtc) && i < len(format) {
-                       if fmtc == '\n' {
-                               if wasNewline { // Already saw one; stop here.
-                                       break
-                               }
-                               wasNewline = true
-                       }
-                       sawSpace = true
-                       i += w
-                       fmtc, w = utf8.DecodeRuneInString(format[i:])
-               }
-               if sawSpace {
-                       // There was space in the format, so there should be space
-                       // in the input.
-                       inputc := s.getRune()
-                       if inputc == eof {
-                               return
-                       }
-                       if !isSpace(inputc) {
-                               // Space in format but not in input.
-                               s.errorString("expected space in input to match format")
-                       }
-                       // Skip spaces but stop at newline.
-                       for inputc != '\n' && isSpace(inputc) {
-                               inputc = s.getRune()
-                       }
-                       if inputc == '\n' {
-                               if !wasNewline {
-                                       s.errorString("newline in input does not match format")
-                               }
-                               // We've reached a newline, stop now; don't read further.
-                               return
-                       }
-                       s.UnreadRune()
-                       if wasNewline {
-                               s.errorString("newline in format does not match input")
-                       }
-                       continue
-               }
+
+               // Literals.
                 inputc := s.mustReadRune()
                 if fmtc != inputc {
                         s.UnreadRune()
diff --git a/src/fmt/scan_test.go b/src/fmt/scan_test.go

index 9e58a6a5f7562bbb890005eda61f4e73d2d68521..d7019d943942c7d297d6f80eff1372479e3140f8 100644 (file)
--- a/src/fmt/scan_test.go
+++ b/src/fmt/scan_test.go
@@ -349,12 +349,12 @@ var scanfTests = []ScanfTest{
         {"X%dX", " X27X ", &intVal, nil}, // input does not match format
  
         {"X%dX\n", "X27X", &intVal, 27},
-       {"X%dX\n", "X27X ", &intVal, nil}, // newline in format does not match input
+       {"X%dX \n", "X27X ", &intVal, 27},
         {"X%dX\n", "X27X\n", &intVal, 27},
         {"X%dX\n", "X27X \n", &intVal, 27},
  
         {"X%dX \n", "X27X", &intVal, 27},
-       {"X%dX \n", "X27X ", &intVal, nil}, // newline in format does not match input
+       {"X%dX \n", "X27X ", &intVal, 27},
         {"X%dX \n", "X27X\n", &intVal, 27},
         {"X%dX \n", "X27X \n", &intVal, 27},
  
@@ -363,7 +363,7 @@ var scanfTests = []ScanfTest{
         {"X %c", "X!", &runeVal, nil},  // expected space in input to match format
         {"X %c", "X\n", &runeVal, nil}, // newline in input does not match format
         {"X %c", "X !", &runeVal, '!'},
-       {"X %c", "X \n", &runeVal, nil}, // newline in input does not match format
+       {"X %c", "X \n", &runeVal, '\n'},
  
         {" X%dX", "X27X", &intVal, nil},  // expected space in input to match format
         {" X%dX", "X27X ", &intVal, nil}, // expected space in input to match format
@@ -381,7 +381,7 @@ var scanfTests = []ScanfTest{
         {" X%dX ", " X27X ", &intVal, 27},
  
         {"%d\nX", "27\nX", &intVal, 27},
-       {"%dX\n X", "27X\n X", &intVal, nil}, // input does not match format
+       {"%dX\n X", "27X\n X", &intVal, 27},
  }
  
  var overflowTests = []ScanTest{
@@ -1230,18 +1230,18 @@ func TestScanfNewlineMatchFormat(t *testing.T) {
                 {"space vs newline no-percent 0001", "1\n2", "1\n 2", 0, true},
                 {"space vs newline no-percent 0010", "1\n2", "1 \n2", 0, true},
                 {"space vs newline no-percent 0011", "1\n2", "1 \n 2", 0, true},
-               {"space vs newline no-percent 0100", "1\n 2", "1\n2", 0, false},   // fails: space after nl in input but not pattern
-               {"space vs newline no-percent 0101", "1\n 2", "1\n2 ", 0, false},  // fails: space after nl in input but not pattern
-               {"space vs newline no-percent 0110", "1\n 2", "1 \n2", 0, false},  // fails: space after nl in input but not pattern
-               {"space vs newline no-percent 0111", "1\n 2", "1 \n 2", 0, false}, // fails: hard to explain
+               {"space vs newline no-percent 0100", "1\n 2", "1\n2", 0, false},  // fails: space after nl in input but not pattern
+               {"space vs newline no-percent 0101", "1\n 2", "1\n2 ", 0, false}, // fails: space after nl in input but not pattern
+               {"space vs newline no-percent 0110", "1\n 2", "1 \n2", 0, false}, // fails: space after nl in input but not pattern
+               {"space vs newline no-percent 0111", "1\n 2", "1 \n 2", 0, true},
                 {"space vs newline no-percent 1000", "1 \n2", "1\n2", 0, true},
                 {"space vs newline no-percent 1001", "1 \n2", "1\n 2", 0, true},
                 {"space vs newline no-percent 1010", "1 \n2", "1 \n2", 0, true},
                 {"space vs newline no-percent 1011", "1 \n2", "1 \n 2", 0, true},
-               {"space vs newline no-percent 1100", "1 \n 2", "1\n2", 0, false},   // fails: space after nl in input but not pattern
-               {"space vs newline no-percent 1101", "1 \n 2", "1\n 2", 0, false},  // fails: hard to explain
-               {"space vs newline no-percent 1110", "1 \n 2", "1 \n2", 0, false},  // fails: space after nl in input but not pattern
-               {"space vs newline no-percent 1111", "1 \n 2", "1 \n 2", 0, false}, // fails: hard to explain
+               {"space vs newline no-percent 1100", "1 \n 2", "1\n2", 0, false}, // fails: space after nl in input but not pattern
+               {"space vs newline no-percent 1101", "1 \n 2", "1\n 2", 0, true},
+               {"space vs newline no-percent 1110", "1 \n 2", "1 \n2", 0, false}, // fails: space after nl in input but not pattern
+               {"space vs newline no-percent 1111", "1 \n 2", "1 \n 2", 0, true},
         }
         for _, test := range tests {
                 var n int
author	Russ Cox <rsc@golang.org>
	Thu, 6 Oct 2016 21:08:44 +0000 (17:08 -0400)
committer	Russ Cox <rsc@golang.org>
	Fri, 7 Oct 2016 03:46:50 +0000 (03:46 +0000)
src/fmt/doc.go		patch \| blob \| history
src/fmt/scan.go		patch \| blob \| history
src/fmt/scan_test.go		patch \| blob \| history