- handle UTF-8 text in tabwriter

author Robert Griesemer <gri@golang.org>
Fri, 5 Dec 2008 17:22:13 +0000 (09:22 -0800)

committer Robert Griesemer <gri@golang.org>
Fri, 5 Dec 2008 17:22:13 +0000 (09:22 -0800)
diff --git a/src/lib/tabwriter/tabwriter.go b/src/lib/tabwriter/tabwriter.go

index ec6fadad1f1ad116ed4e105494c87c2cb9777b09..53a7961e5e81f8d610c5c716031f2408731abd9e 100644 (file)
--- a/src/lib/tabwriter/tabwriter.go
+++ b/src/lib/tabwriter/tabwriter.go
@@ -8,12 +8,12 @@ import (
         "os";
         "io";
         "array";
+       "utf8";
  )
  
  
  // ----------------------------------------------------------------------------
  // ByteArray
-// TODO should use a ByteArray library eventually
  
  type ByteArray struct {
         a *[]byte;
@@ -62,11 +62,13 @@ func (b *ByteArray) Append(s *[]byte) {
  
  // ----------------------------------------------------------------------------
  // Writer is a filter implementing the io.Write interface. It assumes
-// that the incoming bytes represent ASCII encoded text consisting of
+// that the incoming bytes represent UTF-8 encoded text consisting of
  // lines of tab-terminated "cells". Cells in adjacent lines constitute
  // a column. Writer rewrites the incoming text such that all cells in
  // a column have the same width; thus it effectively aligns cells. It
-// does this by adding padding where necessary.
+// does this by adding padding where necessary. All characters (ASCII
+// or not) are assumed to be of the same width - this may not be true
+// for arbitrary UTF-8 characters visualized on the screen.
  //
  // Note that any text at the end of a line that is not tab-terminated
  // is not a cell and does not enforce alignment of cells in adjacent
@@ -84,8 +86,6 @@ func (b *ByteArray) Append(s *[]byte) {
  //            (for correct-looking results, cellwidth must correspond
  //            to the tabwidth in the editor used to look at the result)
  
-// TODO Should support UTF-8 (requires more complicated width bookkeeping)
-
  
  export type Writer struct {
         // TODO should not export any of the fields
@@ -97,15 +97,18 @@ export type Writer struct {
         align_left bool;
  
         // current state
-       buf ByteArray;  // the collected text w/o tabs and newlines
-       width int;  // width of last incomplete cell
-       lines array.Array;  // list of lines; each line is a list of cell widths
-       widths array.IntArray;  // list of column widths - re-used during formatting
+       buf ByteArray;  // collected text w/o tabs and newlines
+       size int;  // size of last incomplete cell in bytes
+       width int;  // width of last incomplete cell in runes
+       lines_size array.Array;  // list of lines; each line is a list of cell sizes in bytes
+       lines_width array.Array;  // list of lines; each line is a list of cell widths in runes
+       widths array.IntArray;  // list of column widths in runes - re-used during formatting
  }
  
  
  func (b *Writer) AddLine() {
-       b.lines.Push(array.NewIntArray(0));
+       b.lines_size.Push(array.NewIntArray(0));
+       b.lines_width.Push(array.NewIntArray(0));
  }
  
  
@@ -125,7 +128,8 @@ func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, ali
         b.align_left = align_left || padchar == '\t';  // tab enforces left-alignment
         
         b.buf.Init(1024);
-       b.lines.Init(0);
+       b.lines_size.Init(0);
+       b.lines_width.Init(0);
         b.widths.Init(0);
         b.AddLine();  // the very first line
         
@@ -133,21 +137,23 @@ func (b *Writer) Init(writer io.Write, cellwidth, padding int, padchar byte, ali
  }
  
  
-func (b *Writer) Line(i int) *array.IntArray {
-       return b.lines.At(i).(*array.IntArray);
+func (b *Writer) Line(i int) (*array.IntArray, *array.IntArray) {
+       return
+               b.lines_size.At(i).(*array.IntArray),
+               b.lines_width.At(i).(*array.IntArray);
  }
  
  
  // debugging support
  func (b *Writer) Dump() {
         pos := 0;
-       for i := 0; i < b.lines.Len(); i++ {
-               line := b.Line(i);
+       for i := 0; i < b.lines_size.Len(); i++ {
+               line_size, line_width := b.Line(i);
                 print("(", i, ") ");
-               for j := 0; j < line.Len(); j++ {
-                       w := line.At(j);
-                       print("[", string(b.buf.Slice(pos, pos + w)), "]");
-                       pos += w;
+               for j := 0; j < line_size.Len(); j++ {
+                       s := line_size.At(j);
+                       print("[", string(b.buf.Slice(pos, pos + s)), "]");
+                       pos += s;
                 }
                 print("\n");
         }
@@ -198,16 +204,16 @@ exit:
  func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error) {
         pos = pos0;
         for i := line0; i < line1; i++ {
-               line := b.Line(i);
-               for j := 0; j < line.Len(); j++ {
-                       w := line.At(j);
+               line_size, line_width := b.Line(i);
+               for j := 0; j < line_size.Len(); j++ {
+                       s, w := line_size.At(j), line_width.At(j);
  
                         if b.align_left {
-                               err = b.Write0(b.buf.a[pos : pos + w]);
+                               err = b.Write0(b.buf.a[pos : pos + s]);
                                 if err != nil {
                                         goto exit;
                                 }
-                               pos += w;
+                               pos += s;
                                 if j < b.widths.Len() {
                                         err = b.WritePadding(w, b.widths.At(j));
                                         if err != nil {
@@ -223,20 +229,20 @@ func (b *Writer) WriteLines(pos0 int, line0, line1 int) (pos int, err *os.Error)
                                                 goto exit;
                                         }
                                 }
-                               err = b.Write0(b.buf.a[pos : pos + w]);
+                               err = b.Write0(b.buf.a[pos : pos + s]);
                                 if err != nil {
                                         goto exit;
                                 }
-                               pos += w;
+                               pos += s;
                         }
                 }
                 
-               if i+1 == b.lines.Len() {
+               if i+1 == b.lines_size.Len() {
                         // last buffered line - we don't have a newline, so just write
                         // any outstanding buffered data
-                       err = b.Write0(b.buf.a[pos : pos + b.width]);
-                       pos += b.width;
-                       b.width = 0;
+                       err = b.Write0(b.buf.a[pos : pos + b.size]);
+                       pos += b.size;
+                       b.size, b.width = 0, 0;
                 } else {
                         // not the last line - write newline
                         err = b.Write0(Newline);
@@ -256,9 +262,9 @@ func (b *Writer) Format(pos0 int, line0, line1 int) (pos int, err *os.Error) {
         column := b.widths.Len();       
         last := line0;
         for this := line0; this < line1; this++ {
-               line := b.Line(this);
+               line_size, line_width := b.Line(this);
                 
-               if column < line.Len() - 1 {
+               if column < line_size.Len() - 1 {
                         // cell exists in this column
                         // (note that the last cell per line is ignored)
                         
@@ -272,10 +278,10 @@ func (b *Writer) Format(pos0 int, line0, line1 int) (pos int, err *os.Error) {
                         // column block begin
                         width := b.cellwidth;  // minimal width
                         for ; this < line1; this++ {
-                               line = b.Line(this);
-                               if column < line.Len() - 1 {
+                               line_size, line_width = b.Line(this);
+                               if column < line_size.Len() - 1 {
                                         // cell exists in this column => update width
-                                       w := line.At(column) + b.padding;
+                                       w := line_width.At(column) + b.padding;
                                         if w > width {
                                                 width = w;
                                         }
@@ -302,18 +308,35 @@ exit:
  }
  
  
+func UnicodeLen(buf *[]byte) int {
+       l := 0;
+       for i := 0; i < len(buf); {
+               if buf[i] < utf8.RuneSelf {
+                       i++;
+               } else {
+                       rune, size := utf8.DecodeRune(buf[i : len(buf)]);
+                       i += size;
+               }
+               l++;
+       }
+       return l;
+}
+ 
+
  func (b *Writer) Append(buf *[]byte) {
         b.buf.Append(buf);
-       b.width += len(buf);
+       b.size += len(buf);
+       b.width += UnicodeLen(buf);
  }
  
  
  /* export */ func (b *Writer) Flush() *os.Error {
-       dummy, err := b.Format(0, 0, b.lines.Len());
+       dummy, err := b.Format(0, 0, b.lines_size.Len());
         // reset (even in the presence of errors)
         b.buf.Clear();
-       b.width = 0;
-       b.lines.Init(0);
+       b.size, b.width = 0, 0;
+       b.lines_size.Init(0);
+       b.lines_width.Init(0);
         b.AddLine();
         return err;
  }
@@ -329,13 +352,14 @@ func (b *Writer) Append(buf *[]byte) {
                         i0 = i + 1;  // exclude ch from (next) cell
  
                         // terminate cell
-                       last := b.Line(b.lines.Len() - 1);
-                       last.Push(b.width);
-                       b.width = 0;
+                       last_size, last_width := b.Line(b.lines_size.Len() - 1);
+                       last_size.Push(b.size);
+                       last_width.Push(b.width);
+                       b.size, b.width = 0, 0;
  
                         if ch == '\n' {
                                 b.AddLine();
-                               if last.Len() == 1 {
+                               if last_size.Len() == 1 {
                                         // The previous line has only one cell which does not have
                                         // an impact on the formatting of the following lines (the
                                         // last cell per line is ignored by Format), thus we can
diff --git a/src/lib/tabwriter/tabwriter_test.go b/src/lib/tabwriter/tabwriter_test.go

index 03b0409c90e0ba160149ebbb1994695491563c9d..097a894823a515670d5fb5c70b750f2836fb8e3f 100644 (file)
--- a/src/lib/tabwriter/tabwriter_test.go
+++ b/src/lib/tabwriter/tabwriter_test.go
@@ -189,24 +189,24 @@ export func Test(t *testing.T) {
  
         Check(
                 t, 8, 1, ' ', true,
-               "a\tb\tc\n"
-               "aa\tbbb\tcccc\tddddd\n"
+               "本\tb\tc\n"
+               "aa\t\u672c\u672c\u672c\tcccc\tddddd\n"
                 "aaa\tbbbb\n",
  
-               "a       b       c\n"
-               "aa      bbb     cccc    ddddd\n"
+               "本       b       c\n"
+               "aa      本本本     cccc    ddddd\n"
                 "aaa     bbbb\n"
         );
  
         Check(
                 t, 8, 1, ' ', false,
-               "a\tb\tc\t\n"
-               "aa\tbbb\tcccc\tddddd\t\n"
-               "aaa\tbbbb\t\n",
+               "a\tè\tc\t\n"
+               "aa\tèèè\tcccc\tddddd\t\n"
+               "aaa\tèèèè\t\n",
  
-               "       a       b       c\n"
-               "      aa     bbb    cccc   ddddd\n"
-               "     aaa    bbbb\n"
+               "       a       è       c\n"
+               "      aa     èèè    cccc   ddddd\n"
+               "     aaa    èèèè\n"
         );
  
         Check(
@@ -233,7 +233,7 @@ export func Test(t *testing.T) {
  
         Check(
                 t, 4, 1, '-', true,
-               "4444\t333\t22\t1\t333\n"
+               "4444\t日本語\t22\t1\t333\n"
                 "999999999\t22\n"
                 "7\t22\n"
                 "\t\t\t88888888\n"
@@ -241,7 +241,7 @@ export func Test(t *testing.T) {
                 "666666\t666666\t666666\t4444\n"
                 "1\t1\t999999999\t0000000000\n",
  
-               "4444------333-22--1---333\n"
+               "4444------日本語-22--1---333\n"
                 "999999999-22\n"
                 "7---------22\n"
                 "------------------88888888\n"
author	Robert Griesemer <gri@golang.org>
	Fri, 5 Dec 2008 17:22:13 +0000 (09:22 -0800)
committer	Robert Griesemer <gri@golang.org>
	Fri, 5 Dec 2008 17:22:13 +0000 (09:22 -0800)
src/lib/tabwriter/tabwriter.go		patch | blob | history
src/lib/tabwriter/tabwriter_test.go		patch | blob | history