From 6cbdeb3f8810a7acb20d166fe399ab087587a353 Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Tue, 9 Dec 2008 13:03:15 -0800 Subject: [PATCH] - fixed bug with unicode text formatting: the number of bytes per rune cannot be computed correctly if we have only parts of a rune - delay computation - added html filtering mode: html tags and entities are ignored for width computations - expanded tests: - extra tests for html text - extra tests that write text in various portions R=r DELTA=227 (126 added, 20 deleted, 81 changed) OCL=20833 CL=20835 --- src/lib/tabwriter/tabwriter.go | 174 +++++++++++++++++++--------- src/lib/tabwriter/tabwriter_test.go | 122 ++++++++++++------- 2 files changed, 201 insertions(+), 95 deletions(-) diff --git a/src/lib/tabwriter/tabwriter.go b/src/lib/tabwriter/tabwriter.go index 53a7961e5e..b9b9365d4d 100644 --- a/src/lib/tabwriter/tabwriter.go +++ b/src/lib/tabwriter/tabwriter.go @@ -13,7 +13,7 @@ import ( // ---------------------------------------------------------------------------- -// ByteArray +// Basic ByteArray support type ByteArray struct { a *[]byte; @@ -25,6 +25,11 @@ func (b *ByteArray) Init(initial_size int) { } +func (b *ByteArray) Len() int { + return len(b.a); +} + + func (b *ByteArray) Clear() { b.a = b.a[0 : 0]; } @@ -77,15 +82,16 @@ func (b *ByteArray) Append(s *[]byte) { // // Formatting can be controlled via parameters: // -// cellwidth minimal cell width -// padding additional cell padding -// padchar ASCII char used for padding -// if padchar == '\t', the Writer will assume that the -// width of a '\t' in the formatted output is tabwith, -// and cells are left-aligned independent of align_left -// (for correct-looking results, cellwidth must correspond -// to the tabwidth in the editor used to look at the result) - +// cellwidth minimal cell width +// padding additional cell padding +// padchar ASCII char used for padding +// if padchar == '\t', the Writer will assume that the +// width of a '\t' in the formatted output is cellwidth, +// and cells are left-aligned independent of align_left +// (for correct-looking results, cellwidth must correspond +// to the tabwidth in the viewer displaying the result) +// filter_html ignores html tags and handles entities (starting with '&' +// and ending in ';') as single characters (width = 1) export type Writer struct { // TODO should not export any of the fields @@ -95,16 +101,42 @@ export type Writer struct { padding int; padbytes [8]byte; align_left bool; + filter_html bool; // current state + html_char byte; // terminating char of html tag/entity, or 0 ('>', ';', or 0) buf ByteArray; // collected text w/o tabs and newlines - size int; // size of last incomplete cell in bytes - width int; // width of last incomplete cell in runes + size int; // size of incomplete cell in bytes + width int; // width of incomplete cell in runes up to buf[pos] w/o ignored sections + pos int; // buffer position up to which width of incomplete cell has been computed lines_size array.Array; // list of lines; each line is a list of cell sizes in bytes lines_width array.Array; // list of lines; each line is a list of cell widths in runes widths array.IntArray; // list of column widths in runes - re-used during formatting } +// Internal representation (current state): +// +// - all text written is appended to buf; tabs and newlines are stripped away +// - at any given time there is a (possibly empty) incomplete cell at the end +// (the cell starts after a tab or newline) +// - size is the number of bytes belonging to the cell so far +// - width is text width in runes of that cell from the start of the cell to +// position pos; html tags and entities are excluded from this width if html +// filtering is enabled +// - the sizes and widths of processed text are kept in the lines_size and +// lines_width arrays, which contain an array of sizes or widths for each line +// - the widths array is a temporary array with current widths used during +// formatting; it is kept in Writer because it's re-used +// +// |<---------- size ---------->| +// | | +// |<- width ->|<- ignored ->| | +// | | | | +// [---processed---tab------------......] +// ^ ^ ^ +// | | | +// buf start of incomplete cell pos + func (b *Writer) AddLine() { b.lines_size.Push(array.NewIntArray(0)); @@ -112,7 +144,7 @@ func (b *Writer) AddLine() { } -func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, align_left bool) *Writer { +func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, align_left, filter_html bool) *Writer { if cellwidth < 0 { panic("negative cellwidth"); } @@ -126,7 +158,8 @@ func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, ali b.padbytes[i] = padchar; } b.align_left = align_left || padchar == '\t'; // tab enforces left-alignment - + b.filter_html = filter_html; + b.buf.Init(1024); b.lines_size.Init(0); b.lines_width.Init(0); @@ -209,7 +242,7 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error) s, w := line_size.At(j), line_width.At(j); if b.align_left { - err = b.Write0(b.buf.a[pos : pos + s]); + err = b.Write0(b.buf.Slice(pos, pos + s)); if err != nil { goto exit; } @@ -229,7 +262,7 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error) goto exit; } } - err = b.Write0(b.buf.a[pos : pos + s]); + err = b.Write0(b.buf.Slice(pos, pos + s)); if err != nil { goto exit; } @@ -240,9 +273,8 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error) if i+1 == b.lines_size.Len() { // last buffered line - we don't have a newline, so just write // any outstanding buffered data - err = b.Write0(b.buf.a[pos : pos + b.size]); + err = b.Write0(b.buf.Slice(pos, pos + b.size)); pos += b.size; - b.size, b.width = 0, 0; } else { // not the last line - write newline err = b.Write0(Newline); @@ -308,6 +340,19 @@ exit: } +/* export */ func (b *Writer) Flush() *os.Error { + dummy, err := b.Format(0, 0, b.lines_size.Len()); + // reset (even in the presence of errors) + b.buf.Clear(); + b.size, b.width = 0, 0; + b.pos = 0; + b.lines_size.Init(0); + b.lines_width.Init(0); + b.AddLine(); + return err; +} + + func UnicodeLen(buf *[]byte) int { l := 0; for i := 0; i < len(buf); { @@ -326,50 +371,71 @@ func UnicodeLen(buf *[]byte) int { func (b *Writer) Append(buf *[]byte) { b.buf.Append(buf); b.size += len(buf); - b.width += UnicodeLen(buf); -} - - -/* export */ func (b *Writer) Flush() *os.Error { - dummy, err := b.Format(0, 0, b.lines_size.Len()); - // reset (even in the presence of errors) - b.buf.Clear(); - b.size, b.width = 0, 0; - b.lines_size.Init(0); - b.lines_width.Init(0); - b.AddLine(); - return err; } /* export */ func (b *Writer) Write(buf *[]byte) (written int, err *os.Error) { i0, n := 0, len(buf); - + // split text into cells for i := 0; i < n; i++ { - if ch := buf[i]; ch == '\t' || ch == '\n' { - b.Append(buf[i0 : i]); - i0 = i + 1; // exclude ch from (next) cell - - // terminate cell - last_size, last_width := b.Line(b.lines_size.Len() - 1); - last_size.Push(b.size); - last_width.Push(b.width); - b.size, b.width = 0, 0; - - if ch == '\n' { - b.AddLine(); - if last_size.Len() == 1 { - // The previous line has only one cell which does not have - // an impact on the formatting of the following lines (the - // last cell per line is ignored by Format), thus we can - // flush the Writer contents. - err = b.Flush(); - if err != nil { - return i0, err; + ch := buf[i]; + + if b.html_char == 0 { + // outside html tag/entity + switch ch { + case '\t', '\n': + b.Append(buf[i0 : i]); + i0 = i + 1; // exclude ch from (next) cell + b.width += UnicodeLen(b.buf.Slice(b.pos, b.buf.Len())); + b.pos = b.buf.Len(); + + // terminate cell + last_size, last_width := b.Line(b.lines_size.Len() - 1); + last_size.Push(b.size); + last_width.Push(b.width); + b.size, b.width = 0, 0; + + if ch == '\n' { + b.AddLine(); + if last_size.Len() == 1 { + // The previous line has only one cell which does not have + // an impact on the formatting of the following lines (the + // last cell per line is ignored by Format), thus we can + // flush the Writer contents. + err = b.Flush(); + if err != nil { + return i0, err; + } + } + } + + case '<', '&': + if b.filter_html { + b.Append(buf[i0 : i]); + i0 = i; + b.width += UnicodeLen(b.buf.Slice(b.pos, b.buf.Len())); + b.pos = -1; // preventative - should not be used (will cause index out of bounds) + if ch == '<' { + b.html_char = '>'; + } else { + b.html_char = ';'; } } } + + } else { + // inside html tag/entity + if ch == b.html_char { + // reached the end of tag/entity + b.Append(buf[i0 : i + 1]); + i0 = i + 1; + if b.html_char == ';' { + b.width++; // count as one char + } + b.pos = b.buf.Len(); + b.html_char = 0; + } } } @@ -379,6 +445,6 @@ func (b *Writer) Append(buf *[]byte) { } -export func New(writer io.Write, cellwidth, padding int, padchar byte, align_left bool) *Writer { - return new(Writer).Init(writer, cellwidth, padding, padchar, align_left) +export func New(writer io.Write, cellwidth, padding int, padchar byte, align_left, filter_html bool) *Writer { + return new(Writer).Init(writer, cellwidth, padding, padchar, align_left, filter_html) } diff --git a/src/lib/tabwriter/tabwriter_test.go b/src/lib/tabwriter/tabwriter_test.go index 097a894823..0ff4964af4 100644 --- a/src/lib/tabwriter/tabwriter_test.go +++ b/src/lib/tabwriter/tabwriter_test.go @@ -22,6 +22,11 @@ func (b *Buffer) Init(n int) { } +func (b *Buffer) Clear() { + b.a = b.a[0 : 0]; +} + + func (b *Buffer) Write(buf *[]byte) (written int, err *os.Error) { n := len(b.a); m := len(buf); @@ -42,22 +47,19 @@ func (b *Buffer) String() string { } -func Check(t *testing.T, tabwidth, padding int, padchar byte, align_left bool, src, expected string) { - var b Buffer; - b.Init(1000); - - var w tabwriter.Writer; - w.Init(&b, tabwidth, padding, padchar, align_left); - - written, err := io.WriteString(&w, src); +func Write(t *testing.T, w *tabwriter.Writer, src string) { + written, err := io.WriteString(w, src); if err != nil { t.Errorf("--- src:\n%s\n--- write error: %v\n", src, err); } if written != len(src) { t.Errorf("--- src:\n%s\n--- written = %d, len(src) = %d\n", src, written, len(src)); } +} + - err = w.Flush(); +func Verify(t *testing.T, w *tabwriter.Writer, b *Buffer, src, expected string) { + err := w.Flush(); if err != nil { t.Errorf("--- src:\n%s\n--- flush error: %v\n", src, err); } @@ -69,105 +71,143 @@ func Check(t *testing.T, tabwidth, padding int, padchar byte, align_left bool, s } +func Check(t *testing.T, tabwidth, padding int, padchar byte, align_left, filter_html bool, src, expected string) { + var b Buffer; + b.Init(1000); + + var w tabwriter.Writer; + w.Init(&b, tabwidth, padding, padchar, align_left, filter_html); + + // write all at once + b.Clear(); + Write(t, &w, src); + Verify(t, &w, &b, src, expected); + + // write byte-by-byte + b.Clear(); + for i := 0; i < len(src); i++ { + Write(t, &w, src[i : i+1]); + } + Verify(t, &w, &b, src, expected); + + // write using Fibonacci slice sizes + b.Clear(); + for i, d := 0, 0; i < len(src); { + Write(t, &w, src[i : i+d]); + i, d = i+d, d+1; + if i+d > len(src) { + d = len(src) - i; + } + } + Verify(t, &w, &b, src, expected); +} + + export func Test(t *testing.T) { Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "", "" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "\n\n\n", "\n\n\n" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "a\nb\nc", "a\nb\nc" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "\t", // '\t' terminates an empty cell on last line - nothing to print "" ); Check( - t, 8, 1, '.', false, + t, 8, 1, '.', false, false, "\t", // '\t' terminates an empty cell on last line - nothing to print "" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "*\t*", "**" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "*\t*\n", "*.......*\n" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "*\t*\t", "*.......*" ); Check( - t, 8, 1, '.', false, + t, 8, 1, '.', false, false, "*\t*\t", ".......**" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "\t\n", "........\n" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "a) foo", "a) foo" ); Check( - t, 8, 1, ' ', true, + t, 8, 1, ' ', true, false, "b) foo\tbar", // "bar" is not in any cell - not formatted, just flushed "b) foobar" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "c) foo\tbar\t", "c) foo..bar" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "d) foo\tbar\n", "d) foo..bar\n" ); Check( - t, 8, 1, '.', true, + t, 8, 1, '.', true, false, "e) foo\tbar\t\n", "e) foo..bar.....\n" ); Check( - t, 8, 1, '*', true, + t, 8, 1, '.', true, true, + "e) f<o\tbar\t\n", + "e) f<o..bar.....\n" + ); + + Check( + t, 8, 1, '*', true, false, "Hello, world!\n", "Hello, world!\n" ); Check( - t, 0, 0, '.', true, + t, 0, 0, '.', true, false, "1\t2\t3\t4\n" "11\t222\t3333\t44444\n", @@ -176,30 +216,30 @@ export func Test(t *testing.T) { ); Check( - t, 5, 0, '.', true, + t, 5, 0, '.', true, false, "1\t2\t3\t4\n", "1....2....3....4\n" ); Check( - t, 5, 0, '.', true, + t, 5, 0, '.', true, false, "1\t2\t3\t4\t\n", "1....2....3....4....\n" ); Check( - t, 8, 1, ' ', true, + t, 8, 1, '.', true, false, "本\tb\tc\n" "aa\t\u672c\u672c\u672c\tcccc\tddddd\n" "aaa\tbbbb\n", - "本 b c\n" - "aa 本本本 cccc ddddd\n" - "aaa bbbb\n" + "本.......b.......c\n" + "aa......本本本.....cccc....ddddd\n" + "aaa.....bbbb\n" ); Check( - t, 8, 1, ' ', false, + t, 8, 1, ' ', false, false, "a\tè\tc\t\n" "aa\tèèè\tcccc\tddddd\t\n" "aaa\tèèèè\t\n", @@ -210,7 +250,7 @@ export func Test(t *testing.T) { ); Check( - t, 2, 0, ' ', true, + t, 2, 0, ' ', true, false, "a\tb\tc\n" "aa\tbbb\tcccc\n" "aaa\tbbbb\n", @@ -221,7 +261,7 @@ export func Test(t *testing.T) { ); Check( - t, 8, 1, '_', true, + t, 8, 1, '_', true, false, "a\tb\tc\n" "aa\tbbb\tcccc\n" "aaa\tbbbb\n", @@ -232,7 +272,7 @@ export func Test(t *testing.T) { ); Check( - t, 4, 1, '-', true, + t, 4, 1, '-', true, false, "4444\t日本語\t22\t1\t333\n" "999999999\t22\n" "7\t22\n" @@ -251,7 +291,7 @@ export func Test(t *testing.T) { ); Check( - t, 4, 3, '.', true, + t, 4, 3, '.', true, false, "4444\t333\t22\t1\t333\n" "999999999\t22\n" "7\t22\n" @@ -270,14 +310,14 @@ export func Test(t *testing.T) { ); Check( - t, 8, 1, '\t', true, + t, 8, 1, '\t', true, true, "4444\t333\t22\t1\t333\n" "999999999\t22\n" "7\t22\n" "\t\t\t88888888\n" "\n" "666666\t666666\t666666\t4444\n" - "1\t1\t999999999\t0000000000\n", + "1\t1\t999999999\t0000000000\n", "4444\t\t333\t22\t1\t333\n" "999999999\t22\n" @@ -285,11 +325,11 @@ export func Test(t *testing.T) { "\t\t\t\t88888888\n" "\n" "666666\t666666\t666666\t\t4444\n" - "1\t1\t999999999\t0000000000\n" + "1\t1\t999999999\t0000000000\n" ); Check( - t, 0, 2, ' ', false, + t, 0, 2, ' ', false, false, ".0\t.3\t2.4\t-5.1\t\n" "23.0\t12345678.9\t2.4\t-989.4\t\n" "5.1\t12.0\t2.4\t-7.0\t\n" -- 2.48.1