bufio: allow Scanner to accept a user-provided buffer

author Rob Pike <r@golang.org>

Tue, 15 Sep 2015 21:14:44 +0000 (14:14 -0700)

committer Rob Pike <r@golang.org>

Fri, 18 Sep 2015 18:56:49 +0000 (18:56 +0000)
author Rob Pike <r@golang.org>
Tue, 15 Sep 2015 21:14:44 +0000 (14:14 -0700)
committer Rob Pike <r@golang.org>
Fri, 18 Sep 2015 18:56:49 +0000 (18:56 +0000)
diff --git a/src/bufio/scan.go b/src/bufio/scan.go

index 7a349fa8fab7e5d0a7dc37c12af4cb8279b2a612..4f06f9764f0f717b124b3847749efaa3b1caf401 100644 (file)
--- a/src/bufio/scan.go
+++ b/src/bufio/scan.go
@@ -37,6 +37,7 @@ type Scanner struct {
         end          int       // End of data in buf.
         err          error     // Sticky error.
         empties      int       // Count of successive empty tokens.
+       scanCalled   bool      // Scan has been called; buffer is in use.
  }
  
  // SplitFunc is the signature of the split function used to tokenize the
@@ -65,10 +66,13 @@ var (
  )
  
  const (
-       // MaxScanTokenSize is the maximum size used to buffer a token.
+       // MaxScanTokenSize is the maximum size used to buffer a token
+       // unless the user provides an explicit buffer with Scan.Buffer.
         // The actual maximum token size may be smaller as the buffer
         // may need to include, for instance, a newline.
         MaxScanTokenSize = 64 * 1024
+
+       startBufSize = 4096 // Size of initial allocation for buffer.
  )
  
  // NewScanner returns a new Scanner to read from r.
@@ -78,7 +82,6 @@ func NewScanner(r io.Reader) *Scanner {
                 r:            r,
                 split:        ScanLines,
                 maxTokenSize: MaxScanTokenSize,
-               buf:          make([]byte, 4096), // Plausible starting size; needn't be large.
         }
  }
  
@@ -112,6 +115,7 @@ func (s *Scanner) Text() string {
  // Scan panics if the split function returns 100 empty tokens without
  // advancing the input. This is a common error mode for scanners.
  func (s *Scanner) Scan() bool {
+       s.scanCalled = true
         // Loop until we have a token.
         for {
                 // See if we can get a token with what we already have.
@@ -162,7 +166,10 @@ func (s *Scanner) Scan() bool {
                                 s.setErr(ErrTooLong)
                                 return false
                         }
-                       newSize := len(s.buf) * 2
+                       newSize := len(s.buf) * 2 // See protection against overflow in Buffer.
+                       if newSize == 0 {
+                               newSize = startBufSize
+                       }
                         if newSize > s.maxTokenSize {
                                 newSize = s.maxTokenSize
                         }
@@ -217,9 +224,37 @@ func (s *Scanner) setErr(err error) {
         }
  }
  
-// Split sets the split function for the Scanner. If called, it must be
-// called before Scan. The default split function is ScanLines.
+// Buffer sets the initial buffer to use when scanning and the maximum
+// size of buffer that may be allocated during scanning. The maximum
+// token size is the larger of max and cap(buf). If max <= cap(buf),
+// Scan will use this buffer only and do no allocation.
+//
+// By default, Scan uses an internal buffer and sets the
+// maximum token size to MaxScanTokenSize.
+//
+// Buffer panics if it is called after scanning has started.
+func (s *Scanner) Buffer(buf []byte, max int) {
+       if s.scanCalled {
+               panic("Buffer called after Scan")
+       }
+       s.buf = buf[0:cap(buf)]
+       // Guarantee no overflow: we multiply len(s.buf) by two in Scan,
+       // but only if it exceeds maxTokenSize.
+       const maxInt = int(^uint(0) >> 1)
+       if max > maxInt {
+               max = maxInt
+       }
+       s.maxTokenSize = max
+}
+
+// Split sets the split function for the Scanner.
+// The default split function is ScanLines.
+//
+// Split panics if it is called after scanning has started.
  func (s *Scanner) Split(split SplitFunc) {
+       if s.scanCalled {
+               panic("Split called after Scan")
+       }
         s.split = split
  }
  
diff --git a/src/bufio/scan_test.go b/src/bufio/scan_test.go

index eea87cbf7b39de26059bb7f1dd673b37820d868e..ac65de9c44d198d1c04f15f8013725508bdc5bfe 100644 (file)
--- a/src/bufio/scan_test.go
+++ b/src/bufio/scan_test.go
@@ -522,3 +522,19 @@ func TestEmptyLinesOK(t *testing.T) {
                 t.Fatalf("stopped with %d left to process", c)
         }
  }
+
+// Make sure we can read a huge token if a big enough buffer is provided.
+func TestHugeBuffer(t *testing.T) {
+       text := strings.Repeat("x", 2*MaxScanTokenSize)
+       s := NewScanner(strings.NewReader(text + "\n"))
+       s.Buffer(make([]byte, 100), 3*MaxScanTokenSize)
+       for s.Scan() {
+               token := s.Text()
+               if token != text {
+                       t.Errorf("scan got incorrect token of length %d", len(token))
+               }
+       }
+       if s.Err() != nil {
+               t.Fatal("after scan:", s.Err())
+       }
+}
author	Rob Pike <r@golang.org>
	Tue, 15 Sep 2015 21:14:44 +0000 (14:14 -0700)
committer	Rob Pike <r@golang.org>
	Fri, 18 Sep 2015 18:56:49 +0000 (18:56 +0000)
src/bufio/scan.go		patch \| blob \| history
src/bufio/scan_test.go		patch \| blob \| history