- fixed bug with unicode text formatting: the number of bytes

author Robert Griesemer <gri@golang.org>

Tue, 9 Dec 2008 21:03:15 +0000 (13:03 -0800)

committer Robert Griesemer <gri@golang.org>

Tue, 9 Dec 2008 21:03:15 +0000 (13:03 -0800)
author Robert Griesemer <gri@golang.org>
Tue, 9 Dec 2008 21:03:15 +0000 (13:03 -0800)
committer Robert Griesemer <gri@golang.org>
Tue, 9 Dec 2008 21:03:15 +0000 (13:03 -0800)
diff --git a/src/lib/tabwriter/tabwriter.go b/src/lib/tabwriter/tabwriter.go

index 53a7961e5e81f8d610c5c716031f2408731abd9e..b9b9365d4d76f5965098380b89703227bb978f66 100644 (file)
--- a/src/lib/tabwriter/tabwriter.go
+++ b/src/lib/tabwriter/tabwriter.go
@@ -13,7 +13,7 @@ import (
  
  
  // ----------------------------------------------------------------------------
-// ByteArray
+// Basic ByteArray support
  
  type ByteArray struct {
         a *[]byte;
@@ -25,6 +25,11 @@ func (b *ByteArray) Init(initial_size int) {
  }
  
  
+func (b *ByteArray) Len() int {
+       return len(b.a);
+}
+
+
  func (b *ByteArray) Clear() {
         b.a = b.a[0 : 0];
  }
@@ -77,15 +82,16 @@ func (b *ByteArray) Append(s *[]byte) {
  //
  // Formatting can be controlled via parameters:
  //
-// cellwidth  minimal cell width
-// padding    additional cell padding
-// padchar    ASCII char used for padding
-//            if padchar == '\t', the Writer will assume that the
-//            width of a '\t' in the formatted output is tabwith,
-//            and cells are left-aligned independent of align_left
-//            (for correct-looking results, cellwidth must correspond
-//            to the tabwidth in the editor used to look at the result)
-
+// cellwidth   minimal cell width
+// padding      additional cell padding
+// padchar      ASCII char used for padding
+//              if padchar == '\t', the Writer will assume that the
+//              width of a '\t' in the formatted output is cellwidth,
+//              and cells are left-aligned independent of align_left
+//              (for correct-looking results, cellwidth must correspond
+//              to the tabwidth in the viewer displaying the result)
+// filter_html  ignores html tags and handles entities (starting with '&'
+//              and ending in ';') as single characters (width = 1)
  
  export type Writer struct {
         // TODO should not export any of the fields
@@ -95,16 +101,42 @@ export type Writer struct {
         padding int;
         padbytes [8]byte;
         align_left bool;
+       filter_html bool;
  
         // current state
+       html_char byte;  // terminating char of html tag/entity, or 0 ('>', ';', or 0)
         buf ByteArray;  // collected text w/o tabs and newlines
-       size int;  // size of last incomplete cell in bytes
-       width int;  // width of last incomplete cell in runes
+       size int;  // size of incomplete cell in bytes
+       width int;  // width of incomplete cell in runes up to buf[pos] w/o ignored sections
+       pos int;  // buffer position up to which width of incomplete cell has been computed
         lines_size array.Array;  // list of lines; each line is a list of cell sizes in bytes
         lines_width array.Array;  // list of lines; each line is a list of cell widths in runes
         widths array.IntArray;  // list of column widths in runes - re-used during formatting
  }
  
+// Internal representation (current state):
+//
+// - all text written is appended to buf; tabs and newlines are stripped away
+// - at any given time there is a (possibly empty) incomplete cell at the end
+//   (the cell starts after a tab or newline)
+// - size is the number of bytes belonging to the cell so far
+// - width is text width in runes of that cell from the start of the cell to
+//   position pos; html tags and entities are excluded from this width if html
+//   filtering is enabled
+// - the sizes and widths of processed text are kept in the lines_size and
+//   lines_width arrays, which contain an array of sizes or widths for each line
+// - the widths array is a temporary array with current widths used during
+//   formatting; it is kept in Writer because it's re-used
+//
+//                    |<---------- size ---------->|
+//                    |                            |
+//                    |<- width ->|<- ignored ->|  |
+//                    |           |             |  |
+// [---processed---tab------------<tag>...</tag>...]
+// ^                  ^                         ^
+// |                  |                         |
+// buf                start of incomplete cell  pos
+
  
  func (b *Writer) AddLine() {
         b.lines_size.Push(array.NewIntArray(0));
@@ -112,7 +144,7 @@ func (b *Writer) AddLine() {
  }
  
  
-func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, align_left bool) *Writer {
+func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, align_left, filter_html bool) *Writer {
         if cellwidth < 0 {
                 panic("negative cellwidth");
         }
@@ -126,7 +158,8 @@ func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, ali
                 b.padbytes[i] = padchar;
         }
         b.align_left = align_left || padchar == '\t';  // tab enforces left-alignment
-       
+       b.filter_html = filter_html;
+
         b.buf.Init(1024);
         b.lines_size.Init(0);
         b.lines_width.Init(0);
@@ -209,7 +242,7 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error)
                         s, w := line_size.At(j), line_width.At(j);
  
                         if b.align_left {
-                               err = b.Write0(b.buf.a[pos : pos + s]);
+                               err = b.Write0(b.buf.Slice(pos, pos + s));
                                 if err != nil {
                                         goto exit;
                                 }
@@ -229,7 +262,7 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error)
                                                 goto exit;
                                         }
                                 }
-                               err = b.Write0(b.buf.a[pos : pos + s]);
+                               err = b.Write0(b.buf.Slice(pos, pos + s));
                                 if err != nil {
                                         goto exit;
                                 }
@@ -240,9 +273,8 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error)
                 if i+1 == b.lines_size.Len() {
                         // last buffered line - we don't have a newline, so just write
                         // any outstanding buffered data
-                       err = b.Write0(b.buf.a[pos : pos + b.size]);
+                       err = b.Write0(b.buf.Slice(pos, pos + b.size));
                         pos += b.size;
-                       b.size, b.width = 0, 0;
                 } else {
                         // not the last line - write newline
                         err = b.Write0(Newline);
@@ -308,6 +340,19 @@ exit:
  }
  
  
+/* export */ func (b *Writer) Flush() *os.Error {
+       dummy, err := b.Format(0, 0, b.lines_size.Len());
+       // reset (even in the presence of errors)
+       b.buf.Clear();
+       b.size, b.width = 0, 0;
+       b.pos = 0;
+       b.lines_size.Init(0);
+       b.lines_width.Init(0);
+       b.AddLine();
+       return err;
+}
+
+
  func UnicodeLen(buf *[]byte) int {
         l := 0;
         for i := 0; i < len(buf); {
@@ -326,50 +371,71 @@ func UnicodeLen(buf *[]byte) int {
  func (b *Writer) Append(buf *[]byte) {
         b.buf.Append(buf);
         b.size += len(buf);
-       b.width += UnicodeLen(buf);
-}
-
-
-/* export */ func (b *Writer) Flush() *os.Error {
-       dummy, err := b.Format(0, 0, b.lines_size.Len());
-       // reset (even in the presence of errors)
-       b.buf.Clear();
-       b.size, b.width = 0, 0;
-       b.lines_size.Init(0);
-       b.lines_width.Init(0);
-       b.AddLine();
-       return err;
  }
  
  
  /* export */ func (b *Writer) Write(buf *[]byte) (written int, err *os.Error) {
         i0, n := 0, len(buf);
-       
+
         // split text into cells
         for i := 0; i < n; i++ {
-               if ch := buf[i]; ch == '\t' || ch == '\n' {
-                       b.Append(buf[i0 : i]);
-                       i0 = i + 1;  // exclude ch from (next) cell
-
-                       // terminate cell
-                       last_size, last_width := b.Line(b.lines_size.Len() - 1);
-                       last_size.Push(b.size);
-                       last_width.Push(b.width);
-                       b.size, b.width = 0, 0;
-
-                       if ch == '\n' {
-                               b.AddLine();
-                               if last_size.Len() == 1 {
-                                       // The previous line has only one cell which does not have
-                                       // an impact on the formatting of the following lines (the
-                                       // last cell per line is ignored by Format), thus we can
-                                       // flush the Writer contents.
-                                       err = b.Flush();
-                                       if err != nil {
-                                               return i0, err;
+               ch := buf[i];
+
+               if b.html_char == 0 {
+                       // outside html tag/entity
+                       switch ch {
+                       case '\t', '\n':
+                               b.Append(buf[i0 : i]);
+                               i0 = i + 1;  // exclude ch from (next) cell
+                               b.width += UnicodeLen(b.buf.Slice(b.pos, b.buf.Len()));
+                               b.pos = b.buf.Len();
+
+                               // terminate cell
+                               last_size, last_width := b.Line(b.lines_size.Len() - 1);
+                               last_size.Push(b.size);
+                               last_width.Push(b.width);
+                               b.size, b.width = 0, 0;
+
+                               if ch == '\n' {
+                                       b.AddLine();
+                                       if last_size.Len() == 1 {
+                                               // The previous line has only one cell which does not have
+                                               // an impact on the formatting of the following lines (the
+                                               // last cell per line is ignored by Format), thus we can
+                                               // flush the Writer contents.
+                                               err = b.Flush();
+                                               if err != nil {
+                                                       return i0, err;
+                                               }
+                                       }
+                               }
+
+                       case '<', '&':
+                               if b.filter_html {
+                                       b.Append(buf[i0 : i]);
+                                       i0 = i;
+                                       b.width += UnicodeLen(b.buf.Slice(b.pos, b.buf.Len()));
+                                       b.pos = -1;  // preventative - should not be used (will cause index out of bounds)
+                                       if ch == '<' {
+                                               b.html_char = '>';
+                                       } else {
+                                               b.html_char = ';';
                                         }
                                 }
                         }
+
+               } else {
+                       // inside html tag/entity
+                       if ch == b.html_char {
+                               // reached the end of tag/entity
+                               b.Append(buf[i0 : i + 1]);
+                               i0 = i + 1;
+                               if b.html_char == ';' {
+                                       b.width++;  // count as one char
+                               }
+                               b.pos = b.buf.Len();
+                               b.html_char = 0;
+                       }
                 }
         }
         
@@ -379,6 +445,6 @@ func (b *Writer) Append(buf *[]byte) {
  }
  
  
-export func New(writer io.Write, cellwidth, padding int, padchar byte, align_left bool) *Writer {
-       return new(Writer).Init(writer, cellwidth, padding, padchar, align_left)
+export func New(writer io.Write, cellwidth, padding int, padchar byte, align_left, filter_html bool) *Writer {
+       return new(Writer).Init(writer, cellwidth, padding, padchar, align_left, filter_html)
  }
diff --git a/src/lib/tabwriter/tabwriter_test.go b/src/lib/tabwriter/tabwriter_test.go

index 097a894823a515670d5fb5c70b750f2836fb8e3f..0ff4964af465203437b58d4a2a911be7327ececc 100644 (file)
--- a/src/lib/tabwriter/tabwriter_test.go
+++ b/src/lib/tabwriter/tabwriter_test.go
@@ -22,6 +22,11 @@ func (b *Buffer) Init(n int) {
  }
  
  
+func (b *Buffer) Clear() {
+       b.a = b.a[0 : 0];
+}
+
+
  func (b *Buffer) Write(buf *[]byte) (written int, err *os.Error) {
         n := len(b.a);
         m := len(buf);
@@ -42,22 +47,19 @@ func (b *Buffer) String() string {
  }
  
  
-func Check(t *testing.T, tabwidth, padding int, padchar byte, align_left bool, src, expected string) {
-       var b Buffer;
-       b.Init(1000);
-
-       var w tabwriter.Writer;
-       w.Init(&b, tabwidth, padding, padchar, align_left);
-
-       written, err := io.WriteString(&w, src);
+func Write(t *testing.T, w *tabwriter.Writer, src string) {
+       written, err := io.WriteString(w, src);
         if err != nil {
                 t.Errorf("--- src:\n%s\n--- write error: %v\n", src, err);
         }
         if written != len(src) {
                 t.Errorf("--- src:\n%s\n--- written = %d, len(src) = %d\n", src, written, len(src));
         }
+}
+
  
-       err = w.Flush();
+func Verify(t *testing.T, w *tabwriter.Writer, b *Buffer, src, expected string) {
+       err := w.Flush();
         if err != nil {
                 t.Errorf("--- src:\n%s\n--- flush error: %v\n", src, err);
         }
@@ -69,105 +71,143 @@ func Check(t *testing.T, tabwidth, padding int, padchar byte, align_left bool, s
  }
  
  
+func Check(t *testing.T, tabwidth, padding int, padchar byte, align_left, filter_html bool, src, expected string) {
+       var b Buffer;
+       b.Init(1000);
+
+       var w tabwriter.Writer;
+       w.Init(&b, tabwidth, padding, padchar, align_left, filter_html);
+
+       // write all at once
+       b.Clear();
+       Write(t, &w, src);
+       Verify(t, &w, &b, src, expected);
+
+       // write byte-by-byte
+       b.Clear();
+       for i := 0; i < len(src); i++ {
+               Write(t, &w, src[i : i+1]);
+       }
+       Verify(t, &w, &b, src, expected);
+
+       // write using Fibonacci slice sizes
+       b.Clear();
+       for i, d := 0, 0; i < len(src); {
+               Write(t, &w, src[i : i+d]);
+               i, d = i+d, d+1;
+               if i+d > len(src) {
+                       d = len(src) - i;
+               }
+       }
+       Verify(t, &w, &b, src, expected);
+}
+
+
  export func Test(t *testing.T) {
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "",
                 ""
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "\n\n\n",
                 "\n\n\n"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "a\nb\nc",
                 "a\nb\nc"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "\t",  // '\t' terminates an empty cell on last line - nothing to print
                 ""
         );
  
         Check(
-               t, 8, 1, '.', false,
+               t, 8, 1, '.', false, false,
                 "\t",  // '\t' terminates an empty cell on last line - nothing to print
                 ""
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "*\t*",
                 "**"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "*\t*\n",
                 "*.......*\n"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "*\t*\t",
                 "*.......*"
         );
  
         Check(
-               t, 8, 1, '.', false,
+               t, 8, 1, '.', false, false,
                 "*\t*\t",
                 ".......**"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "\t\n",
                 "........\n"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "a) foo",
                 "a) foo"
         );
  
         Check(
-               t, 8, 1, ' ', true,
+               t, 8, 1, ' ', true, false,
                 "b) foo\tbar",  // "bar" is not in any cell - not formatted, just flushed
                 "b) foobar"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "c) foo\tbar\t",
                 "c) foo..bar"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "d) foo\tbar\n",
                 "d) foo..bar\n"
         );
  
         Check(
-               t, 8, 1, '.', true,
+               t, 8, 1, '.', true, false,
                 "e) foo\tbar\t\n",
                 "e) foo..bar.....\n"
         );
  
         Check(
-               t, 8, 1, '*', true,
+               t, 8, 1, '.', true, true,
+               "e) f&lt;o\t<b>bar</b>\t\n",
+               "e) f&lt;o..<b>bar</b>.....\n"
+       );
+
+       Check(
+               t, 8, 1, '*', true, false,
                 "Hello, world!\n",
                 "Hello, world!\n"
         );
  
         Check(
-               t, 0, 0, '.', true,
+               t, 0, 0, '.', true, false,
                 "1\t2\t3\t4\n"
                 "11\t222\t3333\t44444\n",
  
@@ -176,30 +216,30 @@ export func Test(t *testing.T) {
         );
  
         Check(
-               t, 5, 0, '.', true,
+               t, 5, 0, '.', true, false,
                 "1\t2\t3\t4\n",
                 "1....2....3....4\n"
         );
  
         Check(
-               t, 5, 0, '.', true,
+               t, 5, 0, '.', true, false,
                 "1\t2\t3\t4\t\n",
                 "1....2....3....4....\n"
         );
  
         Check(
-               t, 8, 1, ' ', true,
+               t, 8, 1, '.', true, false,
                 "本\tb\tc\n"
                 "aa\t\u672c\u672c\u672c\tcccc\tddddd\n"
                 "aaa\tbbbb\n",
  
-               "本       b       c\n"
-               "aa      本本本     cccc    ddddd\n"
-               "aaa     bbbb\n"
+               "本.......b.......c\n"
+               "aa......本本本.....cccc....ddddd\n"
+               "aaa.....bbbb\n"
         );
  
         Check(
-               t, 8, 1, ' ', false,
+               t, 8, 1, ' ', false, false,
                 "a\tè\tc\t\n"
                 "aa\tèèè\tcccc\tddddd\t\n"
                 "aaa\tèèèè\t\n",
@@ -210,7 +250,7 @@ export func Test(t *testing.T) {
         );
  
         Check(
-               t, 2, 0, ' ', true,
+               t, 2, 0, ' ', true, false,
                 "a\tb\tc\n"
                 "aa\tbbb\tcccc\n"
                 "aaa\tbbbb\n",
@@ -221,7 +261,7 @@ export func Test(t *testing.T) {
         );
  
         Check(
-               t, 8, 1, '_', true,
+               t, 8, 1, '_', true, false,
                 "a\tb\tc\n"
                 "aa\tbbb\tcccc\n"
                 "aaa\tbbbb\n",
@@ -232,7 +272,7 @@ export func Test(t *testing.T) {
         );
  
         Check(
-               t, 4, 1, '-', true,
+               t, 4, 1, '-', true, false,
                 "4444\t日本語\t22\t1\t333\n"
                 "999999999\t22\n"
                 "7\t22\n"
@@ -251,7 +291,7 @@ export func Test(t *testing.T) {
         );
  
         Check(
-               t, 4, 3, '.', true,
+               t, 4, 3, '.', true, false,
                 "4444\t333\t22\t1\t333\n"
                 "999999999\t22\n"
                 "7\t22\n"
@@ -270,14 +310,14 @@ export func Test(t *testing.T) {
         );
  
         Check(
-               t, 8, 1, '\t', true,
+               t, 8, 1, '\t', true, true,
                 "4444\t333\t22\t1\t333\n"
                 "999999999\t22\n"
                 "7\t22\n"
                 "\t\t\t88888888\n"
                 "\n"
                 "666666\t666666\t666666\t4444\n"
-               "1\t1\t999999999\t0000000000\n",
+               "1\t1\t<font color=red attr=日本語>999999999</font>\t0000000000\n",
  
                 "4444\t\t333\t22\t1\t333\n"
                 "999999999\t22\n"
@@ -285,11 +325,11 @@ export func Test(t *testing.T) {
                 "\t\t\t\t88888888\n"
                 "\n"
                 "666666\t666666\t666666\t\t4444\n"
-               "1\t1\t999999999\t0000000000\n"
+               "1\t1\t<font color=red attr=日本語>999999999</font>\t0000000000\n"
         );
  
         Check(
-               t, 0, 2, ' ', false,
+               t, 0, 2, ' ', false, false,
                 ".0\t.3\t2.4\t-5.1\t\n"
                 "23.0\t12345678.9\t2.4\t-989.4\t\n"
                 "5.1\t12.0\t2.4\t-7.0\t\n"
author	Robert Griesemer <gri@golang.org>
	Tue, 9 Dec 2008 21:03:15 +0000 (13:03 -0800)
committer	Robert Griesemer <gri@golang.org>
	Tue, 9 Dec 2008 21:03:15 +0000 (13:03 -0800)
src/lib/tabwriter/tabwriter.go		patch \| blob \| history
src/lib/tabwriter/tabwriter_test.go		patch \| blob \| history