From: Robert Griesemer Date: Fri, 5 Dec 2008 17:22:13 +0000 (-0800) Subject: - handle UTF-8 text in tabwriter X-Git-Tag: weekly.2009-11-06~2564 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=8aeb8647c5be40ef4e85649453da9ca3c52a42e5;p=gostls13.git - handle UTF-8 text in tabwriter R=r DELTA=84 (27 added, 3 deleted, 54 changed) OCL=20539 CL=20584 --- diff --git a/src/lib/tabwriter/tabwriter.go b/src/lib/tabwriter/tabwriter.go index ec6fadad1f..53a7961e5e 100644 --- a/src/lib/tabwriter/tabwriter.go +++ b/src/lib/tabwriter/tabwriter.go @@ -8,12 +8,12 @@ import ( "os"; "io"; "array"; + "utf8"; ) // ---------------------------------------------------------------------------- // ByteArray -// TODO should use a ByteArray library eventually type ByteArray struct { a *[]byte; @@ -62,11 +62,13 @@ func (b *ByteArray) Append(s *[]byte) { // ---------------------------------------------------------------------------- // Writer is a filter implementing the io.Write interface. It assumes -// that the incoming bytes represent ASCII encoded text consisting of +// that the incoming bytes represent UTF-8 encoded text consisting of // lines of tab-terminated "cells". Cells in adjacent lines constitute // a column. Writer rewrites the incoming text such that all cells in // a column have the same width; thus it effectively aligns cells. It -// does this by adding padding where necessary. +// does this by adding padding where necessary. All characters (ASCII +// or not) are assumed to be of the same width - this may not be true +// for arbitrary UTF-8 characters visualized on the screen. // // Note that any text at the end of a line that is not tab-terminated // is not a cell and does not enforce alignment of cells in adjacent @@ -84,8 +86,6 @@ func (b *ByteArray) Append(s *[]byte) { // (for correct-looking results, cellwidth must correspond // to the tabwidth in the editor used to look at the result) -// TODO Should support UTF-8 (requires more complicated width bookkeeping) - export type Writer struct { // TODO should not export any of the fields @@ -97,15 +97,18 @@ export type Writer struct { align_left bool; // current state - buf ByteArray; // the collected text w/o tabs and newlines - width int; // width of last incomplete cell - lines array.Array; // list of lines; each line is a list of cell widths - widths array.IntArray; // list of column widths - re-used during formatting + buf ByteArray; // collected text w/o tabs and newlines + size int; // size of last incomplete cell in bytes + width int; // width of last incomplete cell in runes + lines_size array.Array; // list of lines; each line is a list of cell sizes in bytes + lines_width array.Array; // list of lines; each line is a list of cell widths in runes + widths array.IntArray; // list of column widths in runes - re-used during formatting } func (b *Writer) AddLine() { - b.lines.Push(array.NewIntArray(0)); + b.lines_size.Push(array.NewIntArray(0)); + b.lines_width.Push(array.NewIntArray(0)); } @@ -125,7 +128,8 @@ func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, ali b.align_left = align_left || padchar == '\t'; // tab enforces left-alignment b.buf.Init(1024); - b.lines.Init(0); + b.lines_size.Init(0); + b.lines_width.Init(0); b.widths.Init(0); b.AddLine(); // the very first line @@ -133,21 +137,23 @@ func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, ali } -func (b *Writer) Line(i int) *array.IntArray { - return b.lines.At(i).(*array.IntArray); +func (b *Writer) Line(i int) (*array.IntArray, *array.IntArray) { + return + b.lines_size.At(i).(*array.IntArray), + b.lines_width.At(i).(*array.IntArray); } // debugging support func (b *Writer) Dump() { pos := 0; - for i := 0; i < b.lines.Len(); i++ { - line := b.Line(i); + for i := 0; i < b.lines_size.Len(); i++ { + line_size, line_width := b.Line(i); print("(", i, ") "); - for j := 0; j < line.Len(); j++ { - w := line.At(j); - print("[", string(b.buf.Slice(pos, pos + w)), "]"); - pos += w; + for j := 0; j < line_size.Len(); j++ { + s := line_size.At(j); + print("[", string(b.buf.Slice(pos, pos + s)), "]"); + pos += s; } print("\n"); } @@ -198,16 +204,16 @@ exit: func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error) { pos = pos0; for i := line0; i < line1; i++ { - line := b.Line(i); - for j := 0; j < line.Len(); j++ { - w := line.At(j); + line_size, line_width := b.Line(i); + for j := 0; j < line_size.Len(); j++ { + s, w := line_size.At(j), line_width.At(j); if b.align_left { - err = b.Write0(b.buf.a[pos : pos + w]); + err = b.Write0(b.buf.a[pos : pos + s]); if err != nil { goto exit; } - pos += w; + pos += s; if j < b.widths.Len() { err = b.WritePadding(w, b.widths.At(j)); if err != nil { @@ -223,20 +229,20 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error) goto exit; } } - err = b.Write0(b.buf.a[pos : pos + w]); + err = b.Write0(b.buf.a[pos : pos + s]); if err != nil { goto exit; } - pos += w; + pos += s; } } - if i+1 == b.lines.Len() { + if i+1 == b.lines_size.Len() { // last buffered line - we don't have a newline, so just write // any outstanding buffered data - err = b.Write0(b.buf.a[pos : pos + b.width]); - pos += b.width; - b.width = 0; + err = b.Write0(b.buf.a[pos : pos + b.size]); + pos += b.size; + b.size, b.width = 0, 0; } else { // not the last line - write newline err = b.Write0(Newline); @@ -256,9 +262,9 @@ func (b *Writer) Format(pos0 int, line0, line1 int) (pos int, err *os.Error) { column := b.widths.Len(); last := line0; for this := line0; this < line1; this++ { - line := b.Line(this); + line_size, line_width := b.Line(this); - if column < line.Len() - 1 { + if column < line_size.Len() - 1 { // cell exists in this column // (note that the last cell per line is ignored) @@ -272,10 +278,10 @@ func (b *Writer) Format(pos0 int, line0, line1 int) (pos int, err *os.Error) { // column block begin width := b.cellwidth; // minimal width for ; this < line1; this++ { - line = b.Line(this); - if column < line.Len() - 1 { + line_size, line_width = b.Line(this); + if column < line_size.Len() - 1 { // cell exists in this column => update width - w := line.At(column) + b.padding; + w := line_width.At(column) + b.padding; if w > width { width = w; } @@ -302,18 +308,35 @@ exit: } +func UnicodeLen(buf *[]byte) int { + l := 0; + for i := 0; i < len(buf); { + if buf[i] < utf8.RuneSelf { + i++; + } else { + rune, size := utf8.DecodeRune(buf[i : len(buf)]); + i += size; + } + l++; + } + return l; +} + + func (b *Writer) Append(buf *[]byte) { b.buf.Append(buf); - b.width += len(buf); + b.size += len(buf); + b.width += UnicodeLen(buf); } /* export */ func (b *Writer) Flush() *os.Error { - dummy, err := b.Format(0, 0, b.lines.Len()); + dummy, err := b.Format(0, 0, b.lines_size.Len()); // reset (even in the presence of errors) b.buf.Clear(); - b.width = 0; - b.lines.Init(0); + b.size, b.width = 0, 0; + b.lines_size.Init(0); + b.lines_width.Init(0); b.AddLine(); return err; } @@ -329,13 +352,14 @@ func (b *Writer) Append(buf *[]byte) { i0 = i + 1; // exclude ch from (next) cell // terminate cell - last := b.Line(b.lines.Len() - 1); - last.Push(b.width); - b.width = 0; + last_size, last_width := b.Line(b.lines_size.Len() - 1); + last_size.Push(b.size); + last_width.Push(b.width); + b.size, b.width = 0, 0; if ch == '\n' { b.AddLine(); - if last.Len() == 1 { + if last_size.Len() == 1 { // The previous line has only one cell which does not have // an impact on the formatting of the following lines (the // last cell per line is ignored by Format), thus we can diff --git a/src/lib/tabwriter/tabwriter_test.go b/src/lib/tabwriter/tabwriter_test.go index 03b0409c90..097a894823 100644 --- a/src/lib/tabwriter/tabwriter_test.go +++ b/src/lib/tabwriter/tabwriter_test.go @@ -189,24 +189,24 @@ export func Test(t *testing.T) { Check( t, 8, 1, ' ', true, - "a\tb\tc\n" - "aa\tbbb\tcccc\tddddd\n" + "本\tb\tc\n" + "aa\t\u672c\u672c\u672c\tcccc\tddddd\n" "aaa\tbbbb\n", - "a b c\n" - "aa bbb cccc ddddd\n" + "本 b c\n" + "aa 本本本 cccc ddddd\n" "aaa bbbb\n" ); Check( t, 8, 1, ' ', false, - "a\tb\tc\t\n" - "aa\tbbb\tcccc\tddddd\t\n" - "aaa\tbbbb\t\n", + "a\tè\tc\t\n" + "aa\tèèè\tcccc\tddddd\t\n" + "aaa\tèèèè\t\n", - " a b c\n" - " aa bbb cccc ddddd\n" - " aaa bbbb\n" + " a è c\n" + " aa èèè cccc ddddd\n" + " aaa èèèè\n" ); Check( @@ -233,7 +233,7 @@ export func Test(t *testing.T) { Check( t, 4, 1, '-', true, - "4444\t333\t22\t1\t333\n" + "4444\t日本語\t22\t1\t333\n" "999999999\t22\n" "7\t22\n" "\t\t\t88888888\n" @@ -241,7 +241,7 @@ export func Test(t *testing.T) { "666666\t666666\t666666\t4444\n" "1\t1\t999999999\t0000000000\n", - "4444------333-22--1---333\n" + "4444------日本語-22--1---333\n" "999999999-22\n" "7---------22\n" "------------------88888888\n"