From: Rob Pike Date: Thu, 24 Sep 2015 20:10:00 +0000 (-0700) Subject: bufio: fix scanning with a final empty token. X-Git-Tag: go1.6beta1~980 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=ec12754700c5635c916361c6cd95718f57a8f1c9;p=gostls13.git bufio: fix scanning with a final empty token. The Scan function's interface to the split function was not sufficient to handle an empty final token in a pure function; state was required. This was ugly. We introduce a special error value that a split function can return that signals that this token is OK, but is the last one and scanning should stop immediately _after_ this token. The same effect could be achieved using the same trick (a special error value) and checking for that error after Scan finishes, but it's a little clumsy. Providing a published sentinel value in bufio is cleaner and means everyone can use the same trick. The result is an error-free scan. Rewrite the test (that was only barely working) to use the value and be more robust. Also write a new example showing how to do it. Fixes #11836 Change-Id: Iaae77d0f95b4a2efa0175ced94d93c66353079e8 Reviewed-on: https://go-review.googlesource.com/14924 Reviewed-by: Ian Lance Taylor --- diff --git a/src/bufio/example_test.go b/src/bufio/example_test.go index 3da9141421..4666e6d985 100644 --- a/src/bufio/example_test.go +++ b/src/bufio/example_test.go @@ -80,3 +80,32 @@ func ExampleScanner_custom() { // 5678 // Invalid input: strconv.ParseInt: parsing "1234567901234567890": value out of range } + +// Use a Scanner with a custom split function to parse a comma-separated +// list with an empty final value. +func ExampleScanner_emptyFinalToken() { + // Comma-separated list; last entry is empty. + const input = "1,2,3,4," + scanner := bufio.NewScanner(strings.NewReader(input)) + // Define a split function that separates on commas. + onComma := func(data []byte, atEOF bool) (advance int, token []byte, err error) { + for i := 0; i < len(data); i++ { + if data[i] == ',' { + return i + 1, data[:i], nil + } + } + // There is one final token to be delivered, which may be the empty string. + // Returning bufio.ErrFinalToken here tells Scan there are no more tokens after this + // but does not trigger an error to be returned from Scan itself. + return 0, data, bufio.ErrFinalToken + } + scanner.Split(onComma) + // Scan. + for scanner.Scan() { + fmt.Printf("%q ", scanner.Text()) + } + if err := scanner.Err(); err != nil { + fmt.Fprintln(os.Stderr, "reading input:", err) + } + // Output: "1" "2" "3" "4" "" +} diff --git a/src/bufio/scan.go b/src/bufio/scan.go index 0ec584b027..27a0f00459 100644 --- a/src/bufio/scan.go +++ b/src/bufio/scan.go @@ -38,6 +38,7 @@ type Scanner struct { err error // Sticky error. empties int // Count of successive empty tokens. scanCalled bool // Scan has been called; buffer is in use. + done bool // Scan has finished. } // SplitFunc is the signature of the split function used to tokenize the @@ -106,6 +107,16 @@ func (s *Scanner) Text() string { return string(s.token) } +// ErrFinalToken is a special sentinel error value. It is intended to be +// returned by a Split function to indicate that the token being delivered +// with the error is the last token and scanning should stop after this one. +// After ErrFinalToken is received by Scan, scanning stops with no error. +// The value is useful to stop processing early or when it is necessary to +// deliver a final empty token. One could achieve the same behavior +// with a custom error value but providing one here is tidier. +// See the emptyFinalToken example for a use of this value. +var ErrFinalToken = errors.New("final token") + // Scan advances the Scanner to the next token, which will then be // available through the Bytes or Text method. It returns false when the // scan stops, either by reaching the end of the input or an error. @@ -115,6 +126,9 @@ func (s *Scanner) Text() string { // Scan panics if the split function returns 100 empty tokens without // advancing the input. This is a common error mode for scanners. func (s *Scanner) Scan() bool { + if s.done { + return false + } s.scanCalled = true // Loop until we have a token. for { @@ -124,6 +138,11 @@ func (s *Scanner) Scan() bool { if s.end > s.start || s.err != nil { advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil) if err != nil { + if err == ErrFinalToken { + s.token = token + s.done = true + return true + } s.setErr(err) return false } diff --git a/src/bufio/scan_test.go b/src/bufio/scan_test.go index ac65de9c44..07b1a56dc0 100644 --- a/src/bufio/scan_test.go +++ b/src/bufio/scan_test.go @@ -429,33 +429,37 @@ func commaSplit(data []byte, atEOF bool) (advance int, token []byte, err error) return i + 1, data[:i], nil } } - if !atEOF { - return 0, nil, nil - } - return 0, data, nil + return 0, data, ErrFinalToken } -func TestEmptyTokens(t *testing.T) { - s := NewScanner(strings.NewReader("1,2,3,")) - values := []string{"1", "2", "3", ""} +func testEmptyTokens(t *testing.T, text string, values []string) { + s := NewScanner(strings.NewReader(text)) s.Split(commaSplit) var i int - for i = 0; i < len(values); i++ { - if !s.Scan() { - break + for i = 0; s.Scan(); i++ { + if i >= len(values) { + t.Fatalf("got %d fields, expected %d", i+1, len(values)) } if s.Text() != values[i] { t.Errorf("%d: expected %q got %q", i, values[i], s.Text()) } } if i != len(values) { - t.Errorf("got %d fields, expected %d", i, len(values)) + t.Fatalf("got %d fields, expected %d", i, len(values)) } if err := s.Err(); err != nil { t.Fatal(err) } } +func TestEmptyTokens(t *testing.T) { + testEmptyTokens(t, "1,2,3,", []string{"1", "2", "3", ""}) +} + +func TestWithNoEmptyTokens(t *testing.T) { + testEmptyTokens(t, "1,2,3", []string{"1", "2", "3"}) +} + func loopAtEOFSplit(data []byte, atEOF bool) (advance int, token []byte, err error) { if len(data) > 0 { return 1, data[:1], nil