From 98c9f271d67b501ecf2ce995539abd2cdc81d505 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 28 Jun 2023 17:45:26 -0400 Subject: [PATCH] regexp/syntax: use more compact Regexp.String output Compact the Regexp.String output. It was only ever intended for debugging, but there are at least some uses in the wild where regexps are built up using regexp/syntax and then formatted using the String method. Compact the output to help that use case. Specifically: - Compact 2-element character class ranges: [a-b] -> [ab]. - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E). Fixes #57950. Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb Reviewed-on: https://go-review.googlesource.com/c/go/+/507015 TryBot-Result: Gopher Robot Reviewed-by: Than McIntosh Run-TryBot: Russ Cox Reviewed-by: Rob Pike Auto-Submit: Russ Cox --- src/regexp/syntax/parse.go | 16 +++ src/regexp/syntax/parse_test.go | 36 +++++ src/regexp/syntax/regexp.go | 217 +++++++++++++++++++++++++---- src/regexp/syntax/simplify_test.go | 16 ++- 4 files changed, 248 insertions(+), 37 deletions(-) diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index a4ccfe3bdb..6b360b8700 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune { return r[:w] } +// inCharClass reports whether r is in the class. +// It assumes the class has been cleaned by cleanClass. +func inCharClass(r rune, class []rune) bool { + _, ok := sort.Find(len(class)/2, func(i int) int { + lo, hi := class[2*i], class[2*i+1] + if r > hi { + return +1 + } + if r < lo { + return -1 + } + return 0 + }) + return ok +} + // appendLiteral returns the result of appending the literal x to the class r. func appendLiteral(r []rune, x rune, flags Flags) []rune { if flags&FoldCase != 0 { diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index d7999046e0..0f885bd5c8 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) { } } } + +var stringTests = []struct { + re string + out string +}{ + {`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`}, + {`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`}, + {`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`}, + {`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`}, + {`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`}, + {`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`}, + {`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`}, + {`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`}, + {`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`}, + {`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`}, + {`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`}, + {`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`}, + {`[Aa][Bb]`, `(?i:AB)`}, + {`[Aa][Bb]*[Cc]`, `(?i:AB*C)`}, + {`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`}, + {`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`}, +} + +func TestString(t *testing.T) { + for _, tt := range stringTests { + re, err := Parse(tt.re, Perl) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.re, err) + continue + } + out := re.String() + if out != tt.out { + t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out) + } + } +} diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go index 3a4d2d201c..4fa7d0e2f8 100644 --- a/src/regexp/syntax/regexp.go +++ b/src/regexp/syntax/regexp.go @@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool { return true } +// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. +type printFlags uint8 + +const ( + flagI printFlags = 1 << iota // (?i: + flagM // (?m: + flagS // (?s: + flagOff // ) + flagPrec // (?: ) + negShift = 5 // flagI<") @@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: - if re.Flags&FoldCase != 0 { - b.WriteString(`(?i:`) - } for _, r := range re.Rune { escape(b, r, false) } - if re.Flags&FoldCase != 0 { - b.WriteString(`)`) - } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) @@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } @@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i], re.Rune[i+1] escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } } b.WriteRune(']') - case OpAnyCharNotNL: - b.WriteString(`(?-s:.)`) - case OpAnyChar: - b.WriteString(`(?s:.)`) + case OpAnyCharNotNL, OpAnyChar: + b.WriteString(`.`) case OpBeginLine: - b.WriteString(`(?m:^)`) + b.WriteString(`^`) case OpEndLine: - b.WriteString(`(?m:$)`) + b.WriteString(`$`) case OpBeginText: b.WriteString(`\A`) case OpEndText: if re.Flags&WasDollar != 0 { - b.WriteString(`(?-m:$)`) + b.WriteString(`$`) } else { b.WriteString(`\z`) } @@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) { b.WriteRune('(') } if re.Sub[0].Op != OpEmptyMatch { - writeRegexp(b, re.Sub[0]) + writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: - if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p := printFlags(0) + sub := re.Sub[0] + if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { + p = flagPrec } + writeRegexp(b, sub, p, flags) + switch re.Op { case OpStar: b.WriteRune('*') @@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) { } case OpConcat: for _, sub := range re.Sub { + p := printFlags(0) if sub.Op == OpAlternate { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p = flagPrec } + writeRegexp(b, sub, p, flags) } case OpAlternate: for i, sub := range re.Sub { if i > 0 { b.WriteRune('|') } - writeRegexp(b, sub) + writeRegexp(b, sub, 0, flags) } } } func (re *Regexp) String() string { var b strings.Builder - writeRegexp(&b, re) + var flags map[*Regexp]printFlags + must, cant := calcFlags(re, &flags) + must |= (cant &^ flagI) << negShift + if must != 0 { + must |= flagOff + } + writeRegexp(&b, re, must, flags) return b.String() } diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go index 9877db3d0a..6d06f99c1b 100644 --- a/src/regexp/syntax/simplify_test.go +++ b/src/regexp/syntax/simplify_test.go @@ -13,7 +13,7 @@ var simplifyTests = []struct { // Already-simple constructs {`a`, `a`}, {`ab`, `ab`}, - {`a|b`, `[a-b]`}, + {`a|b`, `[ab]`}, {`ab|cd`, `ab|cd`}, {`(ab)*`, `(ab)*`}, {`(ab)+`, `(ab)+`}, @@ -40,16 +40,16 @@ var simplifyTests = []struct { // Perl character classes {`\d`, `[0-9]`}, - {`\s`, `[\t-\n\f-\r ]`}, + {`\s`, `[\t\n\f\r ]`}, {`\w`, `[0-9A-Z_a-z]`}, {`\D`, `[^0-9]`}, - {`\S`, `[^\t-\n\f-\r ]`}, + {`\S`, `[^\t\n\f\r ]`}, {`\W`, `[^0-9A-Z_a-z]`}, {`[\d]`, `[0-9]`}, - {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\s]`, `[\t\n\f\r ]`}, {`[\w]`, `[0-9A-Z_a-z]`}, {`[\D]`, `[^0-9]`}, - {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\S]`, `[^\t\n\f\r ]`}, {`[\W]`, `[^0-9A-Z_a-z]`}, // Posix repetitions @@ -82,7 +82,8 @@ var simplifyTests = []struct { {`a{0}`, `(?:)`}, // Character class simplification - {`[ab]`, `[a-b]`}, + {`[ab]`, `[ab]`}, + {`[abc]`, `[a-c]`}, {`[a-za-za-z]`, `[a-z]`}, {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, {`[ABCDEFGH]`, `[A-H]`}, @@ -120,7 +121,8 @@ var simplifyTests = []struct { // interesting than they might otherwise be. String inserts // explicit (?:) in place of non-parenthesized empty strings, // to make them easier to spot for other parsers. - {`(a|b|)`, `([a-b]|(?:))`}, + {`(a|b|c|)`, `([a-c]|(?:))`}, + {`(a|b|)`, `([ab]|(?:))`}, {`(|)`, `()`}, {`a()`, `a()`}, {`(()|())`, `(()|())`}, -- 2.50.0