]> Cypherpunks repositories - gostls13.git/commitdiff
exp/locale/collate: Added functionality to parse and process LDML files
authorMarcel van Lohuizen <mpvl@golang.org>
Sat, 1 Sep 2012 12:15:00 +0000 (14:15 +0200)
committerMarcel van Lohuizen <mpvl@golang.org>
Sat, 1 Sep 2012 12:15:00 +0000 (14:15 +0200)
for both locale-specific exemplar characters and tailorings to
the collation table.
Some specifices:
- Moved stringSet to the beginning of the file and added some functionality
  to parse command line files.
- openReader now modifies the input URL for localFiles to guarantee that
  any http source listed in the generated file is indeed this source.
- Note that the implementation of the Tailoring API used by maketables.go
  is not yet checked in. So for now adding tailorings are simply no-ops.
- The generated file of exemplar characters will be used somewhere else.
  Here is a snippet of how the body of the generated file looks like:

type exemplarType int
const (
        exCharacters exemplarType = iota
        exContractions
        exPunctuation
        exAuxiliary
        exCurrency
        exIndex
        exN
)

var exemplarCharacters = map[string][exN]string{
        "af": [exN]string{
                0: "a á â b c d e é è ê ë f g h i î ï j k l m n o ô ö p q r s t u û v w x y z",
                3: "á à â ä ã æ ç é è ê ë í ì î ï ó ò ô ö ú ù û ü ý",
                4: "a b c d e f g h i j k l m n o p q r s t u v w x y z",
        },
        ...
}

R=r
CC=golang-dev
https://golang.org/cl/6501070

src/pkg/exp/locale/collate/maketables.go
src/pkg/exp/locale/collate/tables.go

index a42ce9b67b62c5fb24f3645aad6ec82dd277669a..b73c596b2ce55dea03fb3a50dce671a75c37ba5f 100644 (file)
 package main
 
 import (
+       "archive/zip"
        "bufio"
        "bytes"
+       "encoding/xml"
        "exp/locale/collate"
        "exp/locale/collate/build"
        "flag"
        "fmt"
        "io"
+       "io/ioutil"
        "log"
        "net/http"
        "os"
@@ -26,17 +29,155 @@ import (
        "strconv"
        "strings"
        "unicode"
+       "unicode/utf8"
 )
 
-var ducet = flag.String("ducet",
-       "http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
-       "URL of the Default Unicode Collation Element Table (DUCET).")
-var test = flag.Bool("test",
-       false,
-       "test existing tables; can be used to compare web data with package data")
-var localFiles = flag.Bool("local",
-       false,
-       "data files have been copied to the current directory; for debugging only")
+var (
+       // TODO: change default to "http://unicode.org/Public/UCA/"+unicode.Version+"/CollationAuxiliary.zip"
+       root = flag.String("root",
+               "http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
+               `URL of the Default Unicode Collation Element Table (DUCET). This can be a zip
+file containing the file allkeys_CLDR.txt or an allkeys.txt file.`)
+       cldr = flag.String("cldr",
+               "http://www.unicode.org/Public/cldr/2.0.1/core.zip",
+               "URL of CLDR archive.")
+       test = flag.Bool("test", false,
+               "test existing tables; can be used to compare web data with package data.")
+       localFiles = flag.Bool("local", false,
+               "data files have been copied to the current directory; for debugging only.")
+       short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
+       draft = flag.Bool("draft", false, `Use draft versions, when available.`)
+       tags  = flag.String("tags", "", "build tags to be included after +build directive")
+       pkg   = flag.String("package", "collate",
+               "the name of the package in which the generated file is to be included")
+
+       tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
+               "comma-spearated list of tables to generate.")
+       exclude = flagStringSet("exclude", "zh2", "",
+               "comma-separated list of languages to exclude.")
+       include = flagStringSet("include", "", "",
+               "comma-separated list of languages to include. Include trumps exclude.")
+       types = flagStringSetAllowAll("types", "", "",
+               "comma-separated list of types that should be included in addition to the standard type.")
+)
+
+// stringSet implements an ordered set based on a list.  It implements flag.Value
+// to allow a set to be specified as a comma-separated list.
+type stringSet struct {
+       s        []string
+       allowed  *stringSet
+       dirty    bool // needs compaction if true
+       all      bool
+       allowAll bool
+}
+
+func flagStringSet(name, def, allowed, usage string) *stringSet {
+       ss := &stringSet{}
+       if allowed != "" {
+               usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
+               ss.allowed = &stringSet{}
+               failOnError(ss.allowed.Set(allowed))
+       }
+       ss.Set(def)
+       flag.Var(ss, name, usage)
+       return ss
+}
+
+func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
+       ss := &stringSet{allowAll: true}
+       if allowed == "" {
+               flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
+       } else {
+               ss.allowed = &stringSet{}
+               failOnError(ss.allowed.Set(allowed))
+               flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
+       }
+       ss.Set(def)
+       return ss
+}
+
+func (ss stringSet) Len() int {
+       return len(ss.s)
+}
+
+func (ss stringSet) String() string {
+       return strings.Join(ss.s, ",")
+}
+
+func (ss *stringSet) Set(s string) error {
+       if ss.allowAll && s == "all" {
+               ss.s = nil
+               ss.all = true
+               return nil
+       }
+       ss.s = ss.s[:0]
+       for _, s := range strings.Split(s, ",") {
+               if s := strings.TrimSpace(s); s != "" {
+                       if ss.allowed != nil && !ss.allowed.contains(s) {
+                               return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
+                       }
+                       ss.add(s)
+               }
+       }
+       ss.compact()
+       return nil
+}
+
+func (ss *stringSet) add(s string) {
+       ss.s = append(ss.s, s)
+       ss.dirty = true
+}
+
+func (ss *stringSet) values() []string {
+       ss.compact()
+       return ss.s
+}
+
+func (ss *stringSet) contains(s string) bool {
+       if ss.all {
+               return true
+       }
+       for _, v := range ss.s {
+               if v == s {
+                       return true
+               }
+       }
+       return false
+}
+
+func (ss *stringSet) compact() {
+       if !ss.dirty {
+               return
+       }
+       a := ss.s
+       sort.Strings(a)
+       k := 0
+       for i := 1; i < len(a); i++ {
+               if a[k] != a[i] {
+                       a[k+1] = a[i]
+                       k++
+               }
+       }
+       ss.s = a[:k+1]
+       ss.dirty = false
+}
+
+func skipLang(l string) bool {
+       if include.Len() > 0 {
+               return !include.contains(l)
+       }
+       return exclude.contains(l)
+}
+
+func skipAlt(a string) bool {
+       if *draft && a == "proposed" {
+               return false
+       }
+       if *short && a == "short" {
+               return false
+       }
+       return true
+}
 
 func failOnError(e error) {
        if e != nil {
@@ -44,31 +185,55 @@ func failOnError(e error) {
        }
 }
 
-// openReader opens the url or file given by url and returns it as an io.ReadCloser
+// openReader opens the URL or file given by url and returns it as an io.ReadCloser
 // or nil on error.
-func openReader(url string) (io.ReadCloser, error) {
+func openReader(url *string) (io.ReadCloser, error) {
        if *localFiles {
                pwd, _ := os.Getwd()
-               url = "file://" + path.Join(pwd, path.Base(url))
+               *url = "file://" + path.Join(pwd, path.Base(*url))
        }
        t := &http.Transport{}
        t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
        c := &http.Client{Transport: t}
-       resp, err := c.Get(url)
+       resp, err := c.Get(*url)
        if err != nil {
                return nil, err
        }
        if resp.StatusCode != 200 {
-               return nil, fmt.Errorf(`bad GET status for "%s": %s`, url, resp.Status)
+               return nil, fmt.Errorf(`bad GET status for "%s": %s`, *url, resp.Status)
        }
        return resp.Body, nil
 }
 
+func openArchive(url *string) *zip.Reader {
+       f, err := openReader(url)
+       failOnError(err)
+       buffer, err := ioutil.ReadAll(f)
+       f.Close()
+       failOnError(err)
+       archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
+       failOnError(err)
+       return archive
+}
+
 // parseUCA parses a Default Unicode Collation Element Table of the format
 // specified in http://www.unicode.org/reports/tr10/#File_Format.
 // It returns the variable top.
 func parseUCA(builder *build.Builder) {
-       r, err := openReader(*ducet)
+       var r io.ReadCloser
+       var err error
+       if strings.HasSuffix(*root, ".zip") {
+               for _, f := range openArchive(root).File {
+                       if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
+                               r, err = f.Open()
+                       }
+               }
+               if r == nil {
+                       err = fmt.Errorf("file allkeys_CLDR.txt not found in archive %q", *root)
+               }
+       } else {
+               r, err = openReader(root)
+       }
        failOnError(err)
        defer r.Close()
        input := bufio.NewReader(r)
@@ -147,30 +312,350 @@ func convHex(line int, s string) int {
 
 var testInput = stringSet{}
 
-type stringSet struct {
-       set []string
+// LDML holds all collation information parsed from an LDML XML file.
+// The format of these files is defined in http://unicode.org/reports/tr35/.
+type LDML struct {
+       XMLName   xml.Name `xml:"ldml"`
+       Language  Attr     `xml:"identity>language"`
+       Territory Attr     `xml:"identity>territory"`
+       Chars     *struct {
+               ExemplarCharacters []AttrValue `xml:"exemplarCharacters"`
+               MoreInformaton     string      `xml:"moreInformation,omitempty"`
+       } `xml:"characters"`
+       Default    Attr        `xml:"collations>default"`
+       Collations []Collation `xml:"collations>collation"`
 }
 
-func (ss *stringSet) add(s string) {
-       ss.set = append(ss.set, s)
+type Attr struct {
+       XMLName xml.Name
+       Attr    string `xml:"type,attr"`
 }
 
-func (ss *stringSet) values() []string {
-       ss.compact()
-       return ss.set
+func (t Attr) String() string {
+       return t.Attr
 }
 
-func (ss *stringSet) compact() {
-       a := ss.set
-       sort.Strings(a)
-       k := 0
-       for i := 1; i < len(a); i++ {
-               if a[k] != a[i] {
-                       a[k+1] = a[i]
-                       k++
+type AttrValue struct {
+       Type  string `xml:"type,attr"`
+       Key   string `xml:"key,attr,omitempty"`
+       Draft string `xml:"draft,attr,omitempty"`
+       Value string `xml:",innerxml"`
+}
+
+type Collation struct {
+       Type                string    `xml:"type,attr"`
+       Alt                 string    `xml:"alt,attr"`
+       SuppressContraction string    `xml:"suppress_contractions,omitempty"`
+       Settings            *Settings `xml:"settings"`
+       Optimize            string    `xml:"optimize"`
+       Rules               Rules     `xml:"rules"`
+}
+
+type Optimize struct {
+       XMLName xml.Name `xml:"optimize"`
+       Data    string   `xml:"chardata"`
+}
+
+type Suppression struct {
+       XMLName xml.Name `xml:"suppress_contractions"`
+       Data    string   `xml:"chardata"`
+}
+
+type Settings struct {
+       Strength            string `xml:"strenght,attr,omitempty"`
+       Backwards           string `xml:"backwards,attr,omitempty"`
+       Normalization       string `xml:"normalization,attr,omitempty"`
+       CaseLevel           string `xml:"caseLevel,attr,omitempty"`
+       CaseFirst           string `xml:"caseFirst,attr,omitempty"`
+       HiraganaQuarternary string `xml:"hiraganaQuartenary,attr,omitempty"`
+       Numeric             string `xml:"numeric,attr,omitempty"`
+       VariableTop         string `xml:"variableTop,attr,omitempty"`
+}
+
+type Rules struct {
+       XMLName xml.Name   `xml:"rules"`
+       Any     []RuleElem `xml:",any"`
+}
+
+type RuleElem struct {
+       XMLName xml.Name
+       Value   string     `xml:",innerxml"`
+       Before  string     `xml:"before,attr"`
+       Any     []RuleElem `xml:",any"` // for <x> elements
+}
+
+var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
+var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`)
+
+func (r *RuleElem) rewrite() {
+       // Convert hexadecimal Unicode codepoint notation to a string.
+       if m := charRe.FindAllStringSubmatch(r.Value, -1); m != nil {
+               runes := []rune{}
+               for _, sa := range m {
+                       runes = append(runes, rune(convHex(-1, sa[1])))
+               }
+               r.Value = string(runes)
+       }
+       // Strip spaces from reset positions.
+       if m := tagRe.FindStringSubmatch(r.Value); m != nil {
+               r.Value = fmt.Sprintf("<%s/>", m[1])
+       }
+       for _, rr := range r.Any {
+               rr.rewrite()
+       }
+}
+
+func decodeXML(f *zip.File) *LDML {
+       r, err := f.Open()
+       failOnError(err)
+       d := xml.NewDecoder(r)
+       var x LDML
+       err = d.Decode(&x)
+       failOnError(err)
+       return &x
+}
+
+var mainLocales = []string{}
+
+// charsets holds a list of exemplar characters per category.
+type charSets map[string][]string
+
+func (p charSets) fprint(w io.Writer) {
+       fmt.Fprintln(w, "[exN]string{")
+       for i, k := range []string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"} {
+               if set := p[k]; len(set) != 0 {
+                       fmt.Fprintf(w, "\t\t%d: %q,\n", i, strings.Join(set, " "))
+               }
+       }
+       fmt.Fprintln(w, "\t},")
+}
+
+var localeChars = make(map[string]charSets)
+
+const exemplarHeader = `
+type exemplarType int
+const (
+       exCharacters exemplarType = iota
+       exContractions
+       exPunctuation
+       exAuxiliary
+       exCurrency
+       exIndex
+       exN
+)
+`
+
+func printExemplarCharacters(w io.Writer) {
+       fmt.Fprintln(w, exemplarHeader)
+       fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
+       for _, loc := range mainLocales {
+               fmt.Fprintf(w, "\t%q: ", loc)
+               localeChars[loc].fprint(w)
+       }
+       fmt.Fprintln(w, "}")
+}
+
+var mainRe = regexp.MustCompile(`.*/main/(.*)\.xml`)
+
+// parseMain parses XML files in the main directory of the CLDR core.zip file.
+func parseMain() {
+       for _, f := range openArchive(cldr).File {
+               if m := mainRe.FindStringSubmatch(f.Name); m != nil {
+                       locale := m[1]
+                       x := decodeXML(f)
+                       if skipLang(x.Language.Attr) {
+                               continue
+                       }
+                       if x.Chars != nil {
+                               for _, ec := range x.Chars.ExemplarCharacters {
+                                       if ec.Draft != "" {
+                                               continue
+                                       }
+                                       if _, ok := localeChars[locale]; !ok {
+                                               mainLocales = append(mainLocales, locale)
+                                               localeChars[locale] = make(charSets)
+                                       }
+                                       localeChars[locale][ec.Type] = parseCharacters(ec.Value)
+                               }
+                       }
+               }
+       }
+}
+
+func parseCharacters(chars string) []string {
+       parseSingle := func(s string) (r rune, tail string, escaped bool) {
+               if s[0] == '\\' {
+                       if s[1] == 'u' || s[1] == 'U' {
+                               r, _, tail, err := strconv.UnquoteChar(s, 0)
+                               failOnError(err)
+                               return r, tail, false
+                       } else if strings.HasPrefix(s[1:], "&amp;") {
+                               return '&', s[6:], false
+                       }
+                       return rune(s[1]), s[2:], true
+               } else if strings.HasPrefix(s, "&quot;") {
+                       return '"', s[6:], false
+               }
+               r, sz := utf8.DecodeRuneInString(s)
+               return r, s[sz:], false
+       }
+       chars = strings.Trim(chars, "[ ]")
+       list := []string{}
+       var r, last, end rune
+       for len(chars) > 0 {
+               if chars[0] == '{' { // character sequence
+                       buf := []rune{}
+                       for chars = chars[1:]; len(chars) > 0; {
+                               r, chars, _ = parseSingle(chars)
+                               if r == '}' {
+                                       break
+                               }
+                               if r == ' ' {
+                                       log.Fatalf("space not supported in sequence %q", chars)
+                               }
+                               buf = append(buf, r)
+                       }
+                       list = append(list, string(buf))
+                       last = 0
+               } else { // single character
+                       escaped := false
+                       r, chars, escaped = parseSingle(chars)
+                       if r != ' ' {
+                               if r == '-' && !escaped {
+                                       if last == 0 {
+                                               log.Fatal("'-' should be preceded by a character")
+                                       }
+                                       end, chars, _ = parseSingle(chars)
+                                       for ; last <= end; last++ {
+                                               list = append(list, string(last))
+                                       }
+                                       last = 0
+                               } else {
+                                       list = append(list, string(r))
+                                       last = r
+                               }
+                       }
+               }
+       }
+       return list
+}
+
+var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
+
+// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
+func parseCollation(b *build.Builder) {
+       for _, f := range openArchive(cldr).File {
+               if m := fileRe.FindStringSubmatch(f.Name); m != nil {
+                       lang := m[1]
+                       x := decodeXML(f)
+                       if skipLang(x.Language.Attr) {
+                               continue
+                       }
+                       def := "standard"
+                       if x.Default.Attr != "" {
+                               def = x.Default.Attr
+                       }
+                       todo := make(map[string]Collation)
+                       for _, c := range x.Collations {
+                               if c.Type != def && !types.contains(c.Type) {
+                                       continue
+                               }
+                               if c.Alt != "" && skipAlt(c.Alt) {
+                                       continue
+                               }
+                               for j, _ := range c.Rules.Any {
+                                       c.Rules.Any[j].rewrite()
+                               }
+                               locale := lang
+                               if c.Type != def {
+                                       locale += "_u_co_" + c.Type
+                               }
+                               _, exists := todo[locale]
+                               if c.Alt != "" || !exists {
+                                       todo[locale] = c
+                               }
+                       }
+                       for _, c := range x.Collations {
+                               locale := lang
+                               if c.Type != def {
+                                       locale += "_u_co_" + c.Type
+                               }
+                               if d, ok := todo[locale]; ok && d.Alt == c.Alt {
+                                       insertCollation(b, locale, &c)
+                               }
+                       }
+               }
+       }
+}
+
+var lmap = map[byte]collate.Level{
+       'p': collate.Primary,
+       's': collate.Secondary,
+       't': collate.Tertiary,
+       'i': collate.Identity,
+}
+
+// cldrIndex is a Unicode-reserved sentinel value used.
+// We ignore any rule that starts with this rune.
+// See http://unicode.org/reports/tr35/#Collation_Elements for details.
+const cldrIndex = 0xFDD0
+
+func insertTailoring(t *build.Tailoring, r RuleElem, context, extend string) {
+       switch l := r.XMLName.Local; l {
+       case "p", "s", "t", "i":
+               if []rune(r.Value)[0] != cldrIndex {
+                       str := context + r.Value
+                       if *test {
+                               testInput.add(str)
+                       }
+                       err := t.Insert(lmap[l[0]], str, extend)
+                       failOnError(err)
+               }
+       case "pc", "sc", "tc", "ic":
+               level := lmap[l[0]]
+               for _, s := range r.Value {
+                       str := context + string(s)
+                       if *test {
+                               testInput.add(str)
+                       }
+                       err := t.Insert(level, str, extend)
+                       failOnError(err)
+               }
+       default:
+               log.Fatalf("unsupported tag: %q", l)
+       }
+}
+
+func insertCollation(builder *build.Builder, locale string, c *Collation) {
+       t := builder.Tailoring(locale)
+       for _, r := range c.Rules.Any {
+               switch r.XMLName.Local {
+               case "reset":
+                       if r.Before == "" {
+                               failOnError(t.SetAnchor(r.Value))
+                       } else {
+                               failOnError(t.SetAnchorBefore(r.Value))
+                       }
+               case "x":
+                       var context, extend string
+                       for _, r1 := range r.Any {
+                               switch r1.XMLName.Local {
+                               case "context":
+                                       context = r1.Value
+                               case "extend":
+                                       extend = r1.Value
+                               }
+                       }
+                       for _, r1 := range r.Any {
+                               if t := r1.XMLName.Local; t == "context" || t == "extend" {
+                                       continue
+                               }
+                               insertTailoring(t, r1, context, extend)
+                       }
+               default:
+                       insertTailoring(t, r, "", "")
                }
        }
-       ss.set = a[:k+1]
 }
 
 func testCollator(c *collate.Collator) {
@@ -214,7 +699,16 @@ func printCollators(c *collate.Collator) {
 func main() {
        flag.Parse()
        b := build.NewBuilder()
-       parseUCA(b)
+       if *root != "" {
+               parseUCA(b)
+       }
+       if *cldr != "" {
+               if tables.contains("chars") {
+                       parseMain()
+               }
+               parseCollation(b)
+       }
+
        c, err := b.Build()
        failOnError(err)
 
@@ -222,18 +716,24 @@ func main() {
                testCollator(c)
        } else {
                fmt.Println("// Generated by running")
-               fmt.Printf("//  maketables --ducet=%s\n", *ducet)
+               fmt.Printf("//  maketables -root=%s -cldr=%s\n", *root, *cldr)
                fmt.Println("// DO NOT EDIT")
                fmt.Println("// TODO: implement more compact representation for sparse blocks.")
+               if *tags != "" {
+                       fmt.Printf("// +build %s\n", *tags)
+               }
                fmt.Println("")
-               fmt.Println("package collate")
-               fmt.Println("")
-               fmt.Println(`import "exp/norm"`)
-               fmt.Println("")
-
-               printCollators(c)
-
-               _, err = b.Print(os.Stdout)
-               failOnError(err)
+               fmt.Printf("package %s\n", *pkg)
+               if tables.contains("collate") {
+                       fmt.Println("")
+                       fmt.Println(`import "exp/norm"`)
+                       fmt.Println("")
+                       printCollators(c)
+                       _, err = b.Print(os.Stdout)
+                       failOnError(err)
+               }
+               if tables.contains("chars") {
+                       printExemplarCharacters(os.Stdout)
+               }
        }
 }
index 42cc74e22ab52db8aca8575dd50b64205a5798f9..f00b02e4bfb00f52dd2ad5ea92e58f8c95fd367f 100644 (file)
@@ -1,5 +1,5 @@
 // Generated by running
-//  maketables --ducet=http://unicode.org/Public/UCA/6.0.0/allkeys.txt
+//  maketables -root=file:///Users/mpvl/Projects/go/hg/gopub/src/pkg/exp/locale/collate/allkeys.txt -cldr=file:///Users/mpvl/Projects/go/hg/gopub/src/pkg/exp/locale/collate/core.zip
 // DO NOT EDIT
 // TODO: implement more compact representation for sparse blocks.