regexp/syntax: recognize category aliases like \p{Letter}

author Russ Cox <rsc@golang.org>

Wed, 8 Jan 2025 16:27:07 +0000 (11:27 -0500)

committer Russ Cox <rsc@golang.org>

Fri, 18 Apr 2025 21:13:38 +0000 (14:13 -0700)
author Russ Cox <rsc@golang.org>
Wed, 8 Jan 2025 16:27:07 +0000 (11:27 -0500)
committer Russ Cox <rsc@golang.org>
Fri, 18 Apr 2025 21:13:38 +0000 (14:13 -0700)
diff --git a/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md b/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md

new file mode 100644 (file)

index 0000000..63794b4
--- /dev/null
+++ b/doc/next/6-stdlib/99-minor/regexp/syntax/70781.md
@@ -0,0 +1,4 @@
+The `\p{name}` and `\P{name}` character class syntaxes now accept the names
+Any, ASCII, Assigned, Cn, and LC, as well as Unicode category aliases like `\p{Letter}` for `\pL`.
+Following [Unicode TR18](https://unicode.org/reports/tr18/), they also now use
+case-insensitive name lookups, ignoring spaces, underscores, and hyphens.
diff --git a/src/regexp/syntax/doc.go b/src/regexp/syntax/doc.go

index 877f1043ddda8b4ead01122d7e9131ceb92fa98c..8a7d9992a2cfb8d8a99e7390062455f26019064c 100644 (file)
--- a/src/regexp/syntax/doc.go
+++ b/src/regexp/syntax/doc.go
@@ -137,6 +137,7 @@ ASCII character classes:
         [[:word:]]     word characters (== [0-9A-Za-z_])
         [[:xdigit:]]   hex digit (== [0-9A-Fa-f])
  
-Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
+Unicode character classes are those in [unicode.Categories],
+[unicode.CategoryAliases], and [unicode.Scripts].
  */
  package syntax
diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go

index ed239dafdf34710d5be12dcfd8d401212bf6d8c3..b77a7dab8e2e6028e2a220addf227e4dcb31f716 100644 (file)
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@@ -7,6 +7,7 @@ package syntax
  import (
         "sort"
         "strings"
+       "sync"
         "unicode"
         "unicode/utf8"
  )
@@ -1639,20 +1640,109 @@ var anyTable = &unicode.RangeTable{
         R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
  }
  
+var asciiTable = &unicode.RangeTable{
+       R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}},
+}
+
+var asciiFoldTable = &unicode.RangeTable{
+       R16: []unicode.Range16{
+               {Lo: 0, Hi: 0x7F, Stride: 1},
+               {Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s.
+               {Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k.
+       },
+}
+
+// categoryAliases is a lazily constructed copy of unicode.CategoryAliases
+// but with the keys passed through canonicalName, to support inexact matches.
+var categoryAliases struct {
+       once sync.Once
+       m    map[string]string
+}
+
+// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases.
+func initCategoryAliases() {
+       categoryAliases.m = make(map[string]string)
+       for name, actual := range unicode.CategoryAliases {
+               categoryAliases.m[canonicalName(name)] = actual
+       }
+}
+
+// canonicalName returns the canonical lookup string for name.
+// The canonical name has a leading uppercase letter and then lowercase letters,
+// and it omits all underscores, spaces, and hyphens.
+// (We could have used all lowercase, but this way most package unicode
+// map keys are already canonical.)
+func canonicalName(name string) string {
+       var b []byte
+       first := true
+       for i := range len(name) {
+               c := name[i]
+               switch {
+               case c == '_' || c == '-' || c == ' ':
+                       c = ' '
+               case first:
+                       if 'a' <= c && c <= 'z' {
+                               c -= 'a' - 'A'
+                       }
+                       first = false
+               default:
+                       if 'A' <= c && c <= 'Z' {
+                               c += 'a' - 'A'
+                       }
+               }
+               if b == nil {
+                       if c == name[i] && c != ' ' {
+                               // No changes so far, avoid allocating b.
+                               continue
+                       }
+                       b = make([]byte, i, len(name))
+                       copy(b, name[:i])
+               }
+               if c == ' ' {
+                       continue
+               }
+               b = append(b, c)
+       }
+       if b == nil {
+               return name
+       }
+       return string(b)
+}
+
  // unicodeTable returns the unicode.RangeTable identified by name
  // and the table of additional fold-equivalent code points.
-func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
-       // Special case: "Any" means any.
-       if name == "Any" {
-               return anyTable, anyTable
+// If sign < 0, the result should be inverted.
+func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) {
+       name = canonicalName(name)
+
+       // Special cases: Any, Assigned, and ASCII.
+       // Also LC is the only non-canonical Categories key, so handle it here.
+       switch name {
+       case "Any":
+               return anyTable, anyTable, +1
+       case "Assigned":
+               return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned)
+       case "Ascii":
+               return asciiTable, asciiFoldTable, +1
+       case "Lc":
+               return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1
         }
         if t := unicode.Categories[name]; t != nil {
-               return t, unicode.FoldCategory[name]
+               return t, unicode.FoldCategory[name], +1
         }
         if t := unicode.Scripts[name]; t != nil {
-               return t, unicode.FoldScript[name]
+               return t, unicode.FoldScript[name], +1
+       }
+
+       // unicode.CategoryAliases makes liberal use of underscores in its names
+       // (they are defined that way by Unicode), but we want to match ignoring
+       // the underscores, so make our own map with canonical names.
+       categoryAliases.once.Do(initCategoryAliases)
+       if actual := categoryAliases.m[name]; actual != "" {
+               t := unicode.Categories[actual]
+               return t, unicode.FoldCategory[actual], +1
         }
-       return nil, nil
+       return nil, nil, 0
  }
  
  // parseUnicodeClass parses a leading Unicode character class like \p{Han}
@@ -1700,10 +1790,13 @@ func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string,
                 name = name[1:]
         }
  
-       tab, fold := unicodeTable(name)
+       tab, fold, tsign := unicodeTable(name)
         if tab == nil {
                 return nil, "", &Error{ErrInvalidCharRange, seq}
         }
+       if tsign < 0 {
+               sign = -sign
+       }
  
         if p.flags&FoldCase == 0 || fold == nil {
                 if sign > 0 {
diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go

index 0f885bd5c8149fa4b4be69a8163c0a618234bb60..9d2f698e258d198aa5a898252ee3efd76b851c8e 100644 (file)
--- a/src/regexp/syntax/parse_test.go
+++ b/src/regexp/syntax/parse_test.go
@@ -107,10 +107,16 @@ var parseTests = []parseTest{
         {`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
         {`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
         {`\p{Lu}`, mkCharClass(unicode.IsUpper)},
+       {`\p{Uppercase_Letter}`, mkCharClass(unicode.IsUpper)},
+       {`\p{upper case-let ter}`, mkCharClass(unicode.IsUpper)},
+       {`\p{__upper case-let ter}`, mkCharClass(unicode.IsUpper)},
         {`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
         {`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
         {`\p{Any}`, `dot{}`},
         {`\p{^Any}`, `cc{}`},
+       {`(?i)\p{ascii}`, `cc{0x0-0x7f 0x17f 0x212a}`},
+       {`\p{Assigned}`, mkCharClass(func(r rune) bool { return !unicode.In(r, unicode.Cn) })},
+       {`\p{^Assigned}`, mkCharClass(func(r rune) bool { return unicode.In(r, unicode.Cn) })},
  
         // Hex, octal.
         {`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
author	Russ Cox <rsc@golang.org>
	Wed, 8 Jan 2025 16:27:07 +0000 (11:27 -0500)
committer	Russ Cox <rsc@golang.org>
	Fri, 18 Apr 2025 21:13:38 +0000 (14:13 -0700)
doc/next/6-stdlib/99-minor/regexp/syntax/70781.md	[new file with mode: 0644]	patch \| blob
src/regexp/syntax/doc.go		patch \| blob \| history
src/regexp/syntax/parse.go		patch \| blob \| history
src/regexp/syntax/parse_test.go		patch \| blob \| history