import (
"sort"
"strings"
+ "sync"
"unicode"
"unicode/utf8"
)
R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
}
+var asciiTable = &unicode.RangeTable{
+ R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}},
+}
+
+var asciiFoldTable = &unicode.RangeTable{
+ R16: []unicode.Range16{
+ {Lo: 0, Hi: 0x7F, Stride: 1},
+ {Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s.
+ {Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k.
+ },
+}
+
+// categoryAliases is a lazily constructed copy of unicode.CategoryAliases
+// but with the keys passed through canonicalName, to support inexact matches.
+var categoryAliases struct {
+ once sync.Once
+ m map[string]string
+}
+
+// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases.
+func initCategoryAliases() {
+ categoryAliases.m = make(map[string]string)
+ for name, actual := range unicode.CategoryAliases {
+ categoryAliases.m[canonicalName(name)] = actual
+ }
+}
+
+// canonicalName returns the canonical lookup string for name.
+// The canonical name has a leading uppercase letter and then lowercase letters,
+// and it omits all underscores, spaces, and hyphens.
+// (We could have used all lowercase, but this way most package unicode
+// map keys are already canonical.)
+func canonicalName(name string) string {
+ var b []byte
+ first := true
+ for i := range len(name) {
+ c := name[i]
+ switch {
+ case c == '_' || c == '-' || c == ' ':
+ c = ' '
+ case first:
+ if 'a' <= c && c <= 'z' {
+ c -= 'a' - 'A'
+ }
+ first = false
+ default:
+ if 'A' <= c && c <= 'Z' {
+ c += 'a' - 'A'
+ }
+ }
+ if b == nil {
+ if c == name[i] && c != ' ' {
+ // No changes so far, avoid allocating b.
+ continue
+ }
+ b = make([]byte, i, len(name))
+ copy(b, name[:i])
+ }
+ if c == ' ' {
+ continue
+ }
+ b = append(b, c)
+ }
+ if b == nil {
+ return name
+ }
+ return string(b)
+}
+
// unicodeTable returns the unicode.RangeTable identified by name
// and the table of additional fold-equivalent code points.
-func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
- // Special case: "Any" means any.
- if name == "Any" {
- return anyTable, anyTable
+// If sign < 0, the result should be inverted.
+func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) {
+ name = canonicalName(name)
+
+ // Special cases: Any, Assigned, and ASCII.
+ // Also LC is the only non-canonical Categories key, so handle it here.
+ switch name {
+ case "Any":
+ return anyTable, anyTable, +1
+ case "Assigned":
+ return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned)
+ case "Ascii":
+ return asciiTable, asciiFoldTable, +1
+ case "Lc":
+ return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1
}
if t := unicode.Categories[name]; t != nil {
- return t, unicode.FoldCategory[name]
+ return t, unicode.FoldCategory[name], +1
}
if t := unicode.Scripts[name]; t != nil {
- return t, unicode.FoldScript[name]
+ return t, unicode.FoldScript[name], +1
+ }
+
+ // unicode.CategoryAliases makes liberal use of underscores in its names
+ // (they are defined that way by Unicode), but we want to match ignoring
+ // the underscores, so make our own map with canonical names.
+ categoryAliases.once.Do(initCategoryAliases)
+ if actual := categoryAliases.m[name]; actual != "" {
+ t := unicode.Categories[actual]
+ return t, unicode.FoldCategory[actual], +1
}
- return nil, nil
+ return nil, nil, 0
}
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
name = name[1:]
}
- tab, fold := unicodeTable(name)
+ tab, fold, tsign := unicodeTable(name)
if tab == nil {
return nil, "", &Error{ErrInvalidCharRange, seq}
}
+ if tsign < 0 {
+ sign = -sign
+ }
if p.flags&FoldCase == 0 || fold == nil {
if sign > 0 {
{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
+ {`\p{Uppercase_Letter}`, mkCharClass(unicode.IsUpper)},
+ {`\p{upper case-let ter}`, mkCharClass(unicode.IsUpper)},
+ {`\p{__upper case-let ter}`, mkCharClass(unicode.IsUpper)},
{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
{`\p{Any}`, `dot{}`},
{`\p{^Any}`, `cc{}`},
+ {`(?i)\p{ascii}`, `cc{0x0-0x7f 0x17f 0x212a}`},
+ {`\p{Assigned}`, mkCharClass(func(r rune) bool { return !unicode.In(r, unicode.Cn) })},
+ {`\p{^Assigned}`, mkCharClass(func(r rune) bool { return unicode.In(r, unicode.Cn) })},
// Hex, octal.
{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},