Cypherpunks repositories - gostls13.git/commitdiff
regexp/syntax: use more compact Regexp.String output
author Russ Cox <rsc@golang.org>
Wed, 28 Jun 2023 21:45:26 +0000 (17:45 -0400)
committer Gopher Robot <gobot@golang.org>
Wed, 16 Aug 2023 16:02:30 +0000 (16:02 +0000)
Compact the Regexp.String output. It was only ever intended for debugging,
but there are at least some uses in the wild where regexps are built up
using regexp/syntax and then formatted using the String method.
Compact the output to help that use case. Specifically:

 - Compact 2-element character class ranges: [a-b] -> [ab].
 - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E).

Fixes #57950.

Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/507015
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
Auto-Submit: Russ Cox <rsc@golang.org>

src/regexp/syntax/parse.go
src/regexp/syntax/parse_test.go
src/regexp/syntax/regexp.go
src/regexp/syntax/simplify_test.go

index a4ccfe3bdbc75ddcb69378b99f454f015ca5681a..6b360b8700028522581a26ba6f8d30c638419799 100644 (file)
@@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune {
        return r[:w]
 }
 
+// inCharClass reports whether r lies within one of the inclusive lo, hi pairs stored in class.
+// It searches the pairs with sort.Find and assumes the class has been cleaned by cleanClass.
+func inCharClass(r rune, class []rune) bool {
+       _, ok := sort.Find(len(class)/2, func(i int) int { // one candidate per lo, hi pair
+               lo, hi := class[2*i], class[2*i+1]
+               if r > hi {
+                       return +1 // r is past pair i
+               }
+               if r < lo {
+                       return -1 // r is before pair i
+               }
+               return 0 // lo <= r <= hi: pair i contains r
+       })
+       return ok
+}
+
 // appendLiteral returns the result of appending the literal x to the class r.
 func appendLiteral(r []rune, x rune, flags Flags) []rune {
        if flags&FoldCase != 0 {
index d7999046e0eaaa643a9dc29bb947954f1bb61653..0f885bd5c8149fa4b4be69a8163c0a618234bb60 100644 (file)
@@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) {
                }
        }
 }
+
+var stringTests = []struct { // cases for TestString
+       re  string // pattern passed to Parse
+       out string // expected String() output of the parsed regexp
+}{
+       {`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`},
+       {`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`},
+       {`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`},
+       {`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`},
+       {`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`},
+       {`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`},
+       {`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`},
+       {`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`},
+       {`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`},
+       {`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`},
+       {`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`},
+       {`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`},
+       {`[Aa][Bb]`, `(?i:AB)`},
+       {`[Aa][Bb]*[Cc]`, `(?i:AB*C)`},
+       {`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`},
+       {`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`},
+}
+
+func TestString(t *testing.T) { // checks String() against the expected output for every stringTests entry
+       for _, tt := range stringTests {
+               re, err := Parse(tt.re, Perl)
+               if err != nil {
+                       t.Errorf("Parse(%#q): %v", tt.re, err)
+                       continue // nothing to format when the pattern did not parse
+               }
+               out := re.String()
+               if out != tt.out {
+                       t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out)
+               }
+       }
+}
index 3a4d2d201cd418b1a111bf6e905490d0797a6fc7..4fa7d0e2f8330dca2cdf4388502c88459a05dafa 100644 (file)
@@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool {
        return true
 }
 
+// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
+type printFlags uint8
+
+const (
+       flagI    printFlags = 1 << iota // open a group with i set: (?i:
+       flagM                           // open a group with m set: (?m:
+       flagS                           // open a group with s set: (?s:
+       flagOff                         // close the flag group: )
+       flagPrec                        // wrap in plain non-capturing parens: (?: )
+       negShift = 5                    // a flag bit shifted left by negShift is its negated form; flagI<<negShift is (?-i:
+)
+
+// addSpan enables the flags f around start..last,
+// by setting flags[start] = f and adding flagOff to flags[last].
+func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
+       if *flags == nil { // allocate the map lazily, on the first span recorded
+               *flags = make(map[*Regexp]printFlags)
+       }
+       (*flags)[start] = f
+       (*flags)[last] |= flagOff // OR, not assign: maybe start==last, and then f must be kept
+}
+
+// calcFlags calculates the flags to print around each subexpression in re,
+// storing that information in (*flags)[sub] for each affected subexpression.
+// The first time an entry needs to be written to *flags, the map is allocated (by addSpan).
+// calcFlags also calculates the flags that must be active or can't be active
+// around re and returns those flags.
+func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
+       switch re.Op {
+       default:
+               return 0, 0
+
+       case OpLiteral:
+               // If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
+               // according to whether (?i) is active.
+               // If literal is not fold-sensitive, return 0, 0.
+               for _, r := range re.Rune {
+                       if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
+                               if re.Flags&FoldCase != 0 {
+                                       return flagI, 0
+                               } else {
+                                       return 0, flagI
+                               }
+                       }
+               }
+               return 0, 0
+
+       case OpCharClass:
+               // If class is fold-sensitive, return 0, flagI - (?i) has been compiled out.
+               // If class is not fold-sensitive, return 0, 0.
+               for i := 0; i < len(re.Rune); i += 2 {
+                       lo := max(minFold, re.Rune[i])
+                       hi := min(maxFold, re.Rune[i+1])
+                       for r := lo; r <= hi; r++ {
+                               for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
+                                       if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { // check the current range first, then the whole class
+                                               return 0, flagI
+                                       }
+                               }
+                       }
+               }
+               return 0, 0
+
+       case OpAnyCharNotNL: // (?-s).
+               return 0, flagS
+
+       case OpAnyChar: // (?s).
+               return flagS, 0
+
+       case OpBeginLine, OpEndLine: // (?m)^ (?m)$
+               return flagM, 0
+
+       case OpEndText:
+               if re.Flags&WasDollar != 0 { // (?-m)$
+                       return 0, flagM
+               }
+               return 0, 0
+
+       case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
+               return calcFlags(re.Sub[0], flags) // single-operand ops pass the operand's requirements through
+
+       case OpConcat, OpAlternate:
+               // Gather the must and cant for each subexpression.
+               // When we find a conflicting subexpression, insert the necessary
+               // flags around the previously identified span and start over.
+               var must, cant, allCant printFlags
+               start := 0
+               last := 0
+               did := false
+               for i, sub := range re.Sub {
+                       subMust, subCant := calcFlags(sub, flags)
+                       if must&subCant != 0 || subMust&cant != 0 {
+                               if must != 0 {
+                                       addSpan(re.Sub[start], re.Sub[last], must, flags)
+                               }
+                               must = 0
+                               cant = 0
+                               start = i
+                               did = true
+                       }
+                       must |= subMust
+                       cant |= subCant
+                       allCant |= subCant
+                       if subMust != 0 {
+                               last = i // the span ends at the latest subexpression that requires a flag
+                       }
+                       if must == 0 && start == i {
+                               start++ // nothing required yet, so the span need not begin at this subexpression
+                       }
+               }
+               if !did {
+                       // No conflicts: pass the accumulated must and cant upward.
+                       return must, cant
+               }
+               if must != 0 {
+                       // Conflicts found; need to finish final span.
+                       addSpan(re.Sub[start], re.Sub[last], must, flags)
+               }
+               return 0, allCant // required flags were attached to inner spans; only the prohibitions propagate
+       }
+}
+
 // writeRegexp writes the Perl syntax for the regular expression re to b.
-func writeRegexp(b *strings.Builder, re *Regexp) {
+func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
+       f |= flags[re]
+       if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
+               // flagPrec is redundant with other flags being added and terminated
+               f &^= flagPrec
+       }
+       if f&^(flagOff|flagPrec) != 0 {
+               b.WriteString(`(?`)
+               if f&flagI != 0 {
+                       b.WriteString(`i`)
+               }
+               if f&flagM != 0 {
+                       b.WriteString(`m`)
+               }
+               if f&flagS != 0 {
+                       b.WriteString(`s`)
+               }
+               if f&((flagM|flagS)<<negShift) != 0 {
+                       b.WriteString(`-`)
+                       if f&(flagM<<negShift) != 0 {
+                               b.WriteString(`m`)
+                       }
+                       if f&(flagS<<negShift) != 0 {
+                               b.WriteString(`s`)
+                       }
+               }
+               b.WriteString(`:`)
+       }
+       if f&flagOff != 0 {
+               defer b.WriteString(`)`)
+       }
+       if f&flagPrec != 0 {
+               b.WriteString(`(?:`)
+               defer b.WriteString(`)`)
+       }
+
        switch re.Op {
        default:
                b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
@@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
        case OpEmptyMatch:
                b.WriteString(`(?:)`)
        case OpLiteral:
-               if re.Flags&FoldCase != 0 {
-                       b.WriteString(`(?i:`)
-               }
                for _, r := range re.Rune {
                        escape(b, r, false)
                }
-               if re.Flags&FoldCase != 0 {
-                       b.WriteString(`)`)
-               }
        case OpCharClass:
                if len(re.Rune)%2 != 0 {
                        b.WriteString(`[invalid char class]`)
@@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
                                lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
                                escape(b, lo, lo == '-')
                                if lo != hi {
-                                       b.WriteRune('-')
+                                       if hi != lo+1 {
+                                               b.WriteRune('-')
+                                       }
                                        escape(b, hi, hi == '-')
                                }
                        }
@@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
                                lo, hi := re.Rune[i], re.Rune[i+1]
                                escape(b, lo, lo == '-')
                                if lo != hi {
-                                       b.WriteRune('-')
+                                       if hi != lo+1 {
+                                               b.WriteRune('-')
+                                       }
                                        escape(b, hi, hi == '-')
                                }
                        }
                }
                b.WriteRune(']')
-       case OpAnyCharNotNL:
-               b.WriteString(`(?-s:.)`)
-       case OpAnyChar:
-               b.WriteString(`(?s:.)`)
+       case OpAnyCharNotNL, OpAnyChar:
+               b.WriteString(`.`)
        case OpBeginLine:
-               b.WriteString(`(?m:^)`)
+               b.WriteString(`^`)
        case OpEndLine:
-               b.WriteString(`(?m:$)`)
+               b.WriteString(`$`)
        case OpBeginText:
                b.WriteString(`\A`)
        case OpEndText:
                if re.Flags&WasDollar != 0 {
-                       b.WriteString(`(?-m:$)`)
+                       b.WriteString(`$`)
                } else {
                        b.WriteString(`\z`)
                }
@@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
                        b.WriteRune('(')
                }
                if re.Sub[0].Op != OpEmptyMatch {
-                       writeRegexp(b, re.Sub[0])
+                       writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
                }
                b.WriteRune(')')
        case OpStar, OpPlus, OpQuest, OpRepeat:
-               if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
-                       b.WriteString(`(?:`)
-                       writeRegexp(b, sub)
-                       b.WriteString(`)`)
-               } else {
-                       writeRegexp(b, sub)
+               p := printFlags(0)
+               sub := re.Sub[0]
+               if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+                       p = flagPrec
                }
+               writeRegexp(b, sub, p, flags)
+
                switch re.Op {
                case OpStar:
                        b.WriteRune('*')
@@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
                }
        case OpConcat:
                for _, sub := range re.Sub {
+                       p := printFlags(0)
                        if sub.Op == OpAlternate {
-                               b.WriteString(`(?:`)
-                               writeRegexp(b, sub)
-                               b.WriteString(`)`)
-                       } else {
-                               writeRegexp(b, sub)
+                               p = flagPrec
                        }
+                       writeRegexp(b, sub, p, flags)
                }
        case OpAlternate:
                for i, sub := range re.Sub {
                        if i > 0 {
                                b.WriteRune('|')
                        }
-                       writeRegexp(b, sub)
+                       writeRegexp(b, sub, 0, flags)
                }
        }
 }
 
 func (re *Regexp) String() string {
        var b strings.Builder
-       writeRegexp(&b, re)
+       var flags map[*Regexp]printFlags // stays nil unless calcFlags records a span
+       must, cant := calcFlags(re, &flags)
+       must |= (cant &^ flagI) << negShift // flags that can't be active, other than i, are requested in negated form
+       if must != 0 {
+               must |= flagOff // a flag group opened around the whole regexp must also be closed
+       }
+       writeRegexp(&b, re, must, flags)
        return b.String()
 }
 
index 9877db3d0a312752aea8f8053fe9a01905e0b2e6..6d06f99c1b6e0cd845f8e8a7898b92a6ba8a698a 100644 (file)
@@ -13,7 +13,7 @@ var simplifyTests = []struct {
        // Already-simple constructs
        {`a`, `a`},
        {`ab`, `ab`},
-       {`a|b`, `[a-b]`},
+       {`a|b`, `[ab]`},
        {`ab|cd`, `ab|cd`},
        {`(ab)*`, `(ab)*`},
        {`(ab)+`, `(ab)+`},
@@ -40,16 +40,16 @@ var simplifyTests = []struct {
 
        // Perl character classes
        {`\d`, `[0-9]`},
-       {`\s`, `[\t-\n\f-\r ]`},
+       {`\s`, `[\t\n\f\r ]`},
        {`\w`, `[0-9A-Z_a-z]`},
        {`\D`, `[^0-9]`},
-       {`\S`, `[^\t-\n\f-\r ]`},
+       {`\S`, `[^\t\n\f\r ]`},
        {`\W`, `[^0-9A-Z_a-z]`},
        {`[\d]`, `[0-9]`},
-       {`[\s]`, `[\t-\n\f-\r ]`},
+       {`[\s]`, `[\t\n\f\r ]`},
        {`[\w]`, `[0-9A-Z_a-z]`},
        {`[\D]`, `[^0-9]`},
-       {`[\S]`, `[^\t-\n\f-\r ]`},
+       {`[\S]`, `[^\t\n\f\r ]`},
        {`[\W]`, `[^0-9A-Z_a-z]`},
 
        // Posix repetitions
@@ -82,7 +82,8 @@ var simplifyTests = []struct {
        {`a{0}`, `(?:)`},
 
        // Character class simplification
-       {`[ab]`, `[a-b]`},
+       {`[ab]`, `[ab]`},
+       {`[abc]`, `[a-c]`},
        {`[a-za-za-z]`, `[a-z]`},
        {`[A-Za-zA-Za-z]`, `[A-Za-z]`},
        {`[ABCDEFGH]`, `[A-H]`},
@@ -120,7 +121,8 @@ var simplifyTests = []struct {
        // interesting than they might otherwise be. String inserts
        // explicit (?:) in place of non-parenthesized empty strings,
        // to make them easier to spot for other parsers.
-       {`(a|b|)`, `([a-b]|(?:))`},
+       {`(a|b|c|)`, `([a-c]|(?:))`},
+       {`(a|b|)`, `([ab]|(?:))`},
        {`(|)`, `()`},
        {`a()`, `a()`},
        {`(()|())`, `(()|())`},