From 98c9f271d67b501ecf2ce995539abd2cdc81d505 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Wed, 28 Jun 2023 17:45:26 -0400
Subject: [PATCH] regexp/syntax: use more compact Regexp.String output

Compact the Regexp.String output. It was only ever intended for debugging,
but there are at least some uses in the wild where regexps are built up
using regexp/syntax and then formatted using the String method.
Compact the output to help that use case. Specifically:

 - Compact 2-element character class ranges: [a-b] -> [ab].
 - Aggregate flags: (?i:A)(?i:B)*(?i:C)|(?i:D)?(?i:E) -> (?i:AB*C|D?E).

Fixes #57950.

Change-Id: I1161d0e3aa6c3ae5a302677032bb7cd55caae5fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/507015
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
Auto-Submit: Russ Cox <rsc@golang.org>
---
 src/regexp/syntax/parse.go         |  16 +++
 src/regexp/syntax/parse_test.go    |  36 +++++
 src/regexp/syntax/regexp.go        | 217 +++++++++++++++++++++++++----
 src/regexp/syntax/simplify_test.go |  16 ++-
 4 files changed, 248 insertions(+), 37 deletions(-)

diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go
index a4ccfe3bdb..6b360b8700 100644
--- a/src/regexp/syntax/parse.go
+++ b/src/regexp/syntax/parse.go
@@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune {
 	return r[:w]
 }
 
+// inCharClass reports whether r is in the class.
+// It assumes the class has been cleaned by cleanClass.
+func inCharClass(r rune, class []rune) bool {
+	_, ok := sort.Find(len(class)/2, func(i int) int {
+		lo, hi := class[2*i], class[2*i+1]
+		if r > hi {
+			return +1
+		}
+		if r < lo {
+			return -1
+		}
+		return 0
+	})
+	return ok
+}
+
 // appendLiteral returns the result of appending the literal x to the class r.
 func appendLiteral(r []rune, x rune, flags Flags) []rune {
 	if flags&FoldCase != 0 {
diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go
index d7999046e0..0f885bd5c8 100644
--- a/src/regexp/syntax/parse_test.go
+++ b/src/regexp/syntax/parse_test.go
@@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) {
 		}
 	}
 }
+
+var stringTests = []struct {
+	re  string
+	out string
+}{
+	{`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`},
+	{`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`},
+	{`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`},
+	{`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`},
+	{`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`},
+	{`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`},
+	{`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`},
+	{`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`},
+	{`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`},
+	{`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`},
+	{`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`},
+	{`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`},
+	{`[Aa][Bb]`, `(?i:AB)`},
+	{`[Aa][Bb]*[Cc]`, `(?i:AB*C)`},
+	{`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`},
+	{`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`},
+}
+
+func TestString(t *testing.T) {
+	for _, tt := range stringTests {
+		re, err := Parse(tt.re, Perl)
+		if err != nil {
+			t.Errorf("Parse(%#q): %v", tt.re, err)
+			continue
+		}
+		out := re.String()
+		if out != tt.out {
+			t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out)
+		}
+	}
+}
diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go
index 3a4d2d201c..4fa7d0e2f8 100644
--- a/src/regexp/syntax/regexp.go
+++ b/src/regexp/syntax/regexp.go
@@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool {
 	return true
 }
 
+// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
+type printFlags uint8
+
+const (
+	flagI    printFlags = 1 << iota // (?i:
+	flagM                           // (?m:
+	flagS                           // (?s:
+	flagOff                         // )
+	flagPrec                        // (?: )
+	negShift = 5                    // flagI<<negShift is (?-i:
+)
+
+// addSpan enables the flags f around start..last,
+// by setting flags[start] = f and flags[last] = flagOff.
+func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
+	if *flags == nil {
+		*flags = make(map[*Regexp]printFlags)
+	}
+	(*flags)[start] = f
+	(*flags)[last] |= flagOff // maybe start==last
+}
+
+// calcFlags calculates the flags to print around each subexpression in re,
+// storing that information in (*flags)[sub] for each affected subexpression.
+// The first time an entry needs to be written to *flags, calcFlags allocates the map.
+// calcFlags also calculates the flags that must be active or can't be active
+// around re and returns those flags.
+func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
+	switch re.Op {
+	default:
+		return 0, 0
+
+	case OpLiteral:
+		// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
+		// according to whether (?i) is active.
+		// If literal is not fold-sensitive, return 0, 0.
+		for _, r := range re.Rune {
+			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
+				if re.Flags&FoldCase != 0 {
+					return flagI, 0
+				} else {
+					return 0, flagI
+				}
+			}
+		}
+		return 0, 0
+
+	case OpCharClass:
+		// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
+		// If literal is not fold-sensitive, return 0, 0.
+		for i := 0; i < len(re.Rune); i += 2 {
+			lo := max(minFold, re.Rune[i])
+			hi := min(maxFold, re.Rune[i+1])
+			for r := lo; r <= hi; r++ {
+				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
+					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
+						return 0, flagI
+					}
+				}
+			}
+		}
+		return 0, 0
+
+	case OpAnyCharNotNL: // (?-s).
+		return 0, flagS
+
+	case OpAnyChar: // (?s).
+		return flagS, 0
+
+	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
+		return flagM, 0
+
+	case OpEndText:
+		if re.Flags&WasDollar != 0 { // (?-m)$
+			return 0, flagM
+		}
+		return 0, 0
+
+	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
+		return calcFlags(re.Sub[0], flags)
+
+	case OpConcat, OpAlternate:
+		// Gather the must and cant for each subexpression.
+		// When we find a conflicting subexpression, insert the necessary
+		// flags around the previously identified span and start over.
+		var must, cant, allCant printFlags
+		start := 0
+		last := 0
+		did := false
+		for i, sub := range re.Sub {
+			subMust, subCant := calcFlags(sub, flags)
+			if must&subCant != 0 || subMust&cant != 0 {
+				if must != 0 {
+					addSpan(re.Sub[start], re.Sub[last], must, flags)
+				}
+				must = 0
+				cant = 0
+				start = i
+				did = true
+			}
+			must |= subMust
+			cant |= subCant
+			allCant |= subCant
+			if subMust != 0 {
+				last = i
+			}
+			if must == 0 && start == i {
+				start++
+			}
+		}
+		if !did {
+			// No conflicts: pass the accumulated must and cant upward.
+			return must, cant
+		}
+		if must != 0 {
+			// Conflicts found; need to finish final span.
+			addSpan(re.Sub[start], re.Sub[last], must, flags)
+		}
+		return 0, allCant
+	}
+}
+
 // writeRegexp writes the Perl syntax for the regular expression re to b.
-func writeRegexp(b *strings.Builder, re *Regexp) {
+func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
+	f |= flags[re]
+	if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
+		// flagPrec is redundant with other flags being added and terminated
+		f &^= flagPrec
+	}
+	if f&^(flagOff|flagPrec) != 0 {
+		b.WriteString(`(?`)
+		if f&flagI != 0 {
+			b.WriteString(`i`)
+		}
+		if f&flagM != 0 {
+			b.WriteString(`m`)
+		}
+		if f&flagS != 0 {
+			b.WriteString(`s`)
+		}
+		if f&((flagM|flagS)<<negShift) != 0 {
+			b.WriteString(`-`)
+			if f&(flagM<<negShift) != 0 {
+				b.WriteString(`m`)
+			}
+			if f&(flagS<<negShift) != 0 {
+				b.WriteString(`s`)
+			}
+		}
+		b.WriteString(`:`)
+	}
+	if f&flagOff != 0 {
+		defer b.WriteString(`)`)
+	}
+	if f&flagPrec != 0 {
+		b.WriteString(`(?:`)
+		defer b.WriteString(`)`)
+	}
+
 	switch re.Op {
 	default:
 		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
@@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 	case OpEmptyMatch:
 		b.WriteString(`(?:)`)
 	case OpLiteral:
-		if re.Flags&FoldCase != 0 {
-			b.WriteString(`(?i:`)
-		}
 		for _, r := range re.Rune {
 			escape(b, r, false)
 		}
-		if re.Flags&FoldCase != 0 {
-			b.WriteString(`)`)
-		}
 	case OpCharClass:
 		if len(re.Rune)%2 != 0 {
 			b.WriteString(`[invalid char class]`)
@@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
 				escape(b, lo, lo == '-')
 				if lo != hi {
-					b.WriteRune('-')
+					if hi != lo+1 {
+						b.WriteRune('-')
+					}
 					escape(b, hi, hi == '-')
 				}
 			}
@@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 				lo, hi := re.Rune[i], re.Rune[i+1]
 				escape(b, lo, lo == '-')
 				if lo != hi {
-					b.WriteRune('-')
+					if hi != lo+1 {
+						b.WriteRune('-')
+					}
 					escape(b, hi, hi == '-')
 				}
 			}
 		}
 		b.WriteRune(']')
-	case OpAnyCharNotNL:
-		b.WriteString(`(?-s:.)`)
-	case OpAnyChar:
-		b.WriteString(`(?s:.)`)
+	case OpAnyCharNotNL, OpAnyChar:
+		b.WriteString(`.`)
 	case OpBeginLine:
-		b.WriteString(`(?m:^)`)
+		b.WriteString(`^`)
 	case OpEndLine:
-		b.WriteString(`(?m:$)`)
+		b.WriteString(`$`)
 	case OpBeginText:
 		b.WriteString(`\A`)
 	case OpEndText:
 		if re.Flags&WasDollar != 0 {
-			b.WriteString(`(?-m:$)`)
+			b.WriteString(`$`)
 		} else {
 			b.WriteString(`\z`)
 		}
@@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 			b.WriteRune('(')
 		}
 		if re.Sub[0].Op != OpEmptyMatch {
-			writeRegexp(b, re.Sub[0])
+			writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
 		}
 		b.WriteRune(')')
 	case OpStar, OpPlus, OpQuest, OpRepeat:
-		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
-			b.WriteString(`(?:`)
-			writeRegexp(b, sub)
-			b.WriteString(`)`)
-		} else {
-			writeRegexp(b, sub)
+		p := printFlags(0)
+		sub := re.Sub[0]
+		if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
+			p = flagPrec
 		}
+		writeRegexp(b, sub, p, flags)
+
 		switch re.Op {
 		case OpStar:
 			b.WriteRune('*')
@@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) {
 		}
 	case OpConcat:
 		for _, sub := range re.Sub {
+			p := printFlags(0)
 			if sub.Op == OpAlternate {
-				b.WriteString(`(?:`)
-				writeRegexp(b, sub)
-				b.WriteString(`)`)
-			} else {
-				writeRegexp(b, sub)
+				p = flagPrec
 			}
+			writeRegexp(b, sub, p, flags)
 		}
 	case OpAlternate:
 		for i, sub := range re.Sub {
 			if i > 0 {
 				b.WriteRune('|')
 			}
-			writeRegexp(b, sub)
+			writeRegexp(b, sub, 0, flags)
 		}
 	}
 }
 
 func (re *Regexp) String() string {
 	var b strings.Builder
-	writeRegexp(&b, re)
+	var flags map[*Regexp]printFlags
+	must, cant := calcFlags(re, &flags)
+	must |= (cant &^ flagI) << negShift
+	if must != 0 {
+		must |= flagOff
+	}
+	writeRegexp(&b, re, must, flags)
 	return b.String()
 }
 
diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go
index 9877db3d0a..6d06f99c1b 100644
--- a/src/regexp/syntax/simplify_test.go
+++ b/src/regexp/syntax/simplify_test.go
@@ -13,7 +13,7 @@ var simplifyTests = []struct {
 	// Already-simple constructs
 	{`a`, `a`},
 	{`ab`, `ab`},
-	{`a|b`, `[a-b]`},
+	{`a|b`, `[ab]`},
 	{`ab|cd`, `ab|cd`},
 	{`(ab)*`, `(ab)*`},
 	{`(ab)+`, `(ab)+`},
@@ -40,16 +40,16 @@ var simplifyTests = []struct {
 
 	// Perl character classes
 	{`\d`, `[0-9]`},
-	{`\s`, `[\t-\n\f-\r ]`},
+	{`\s`, `[\t\n\f\r ]`},
 	{`\w`, `[0-9A-Z_a-z]`},
 	{`\D`, `[^0-9]`},
-	{`\S`, `[^\t-\n\f-\r ]`},
+	{`\S`, `[^\t\n\f\r ]`},
 	{`\W`, `[^0-9A-Z_a-z]`},
 	{`[\d]`, `[0-9]`},
-	{`[\s]`, `[\t-\n\f-\r ]`},
+	{`[\s]`, `[\t\n\f\r ]`},
 	{`[\w]`, `[0-9A-Z_a-z]`},
 	{`[\D]`, `[^0-9]`},
-	{`[\S]`, `[^\t-\n\f-\r ]`},
+	{`[\S]`, `[^\t\n\f\r ]`},
 	{`[\W]`, `[^0-9A-Z_a-z]`},
 
 	// Posix repetitions
@@ -82,7 +82,8 @@ var simplifyTests = []struct {
 	{`a{0}`, `(?:)`},
 
 	// Character class simplification
-	{`[ab]`, `[a-b]`},
+	{`[ab]`, `[ab]`},
+	{`[abc]`, `[a-c]`},
 	{`[a-za-za-z]`, `[a-z]`},
 	{`[A-Za-zA-Za-z]`, `[A-Za-z]`},
 	{`[ABCDEFGH]`, `[A-H]`},
@@ -120,7 +121,8 @@ var simplifyTests = []struct {
 	// interesting than they might otherwise be. String inserts
 	// explicit (?:) in place of non-parenthesized empty strings,
 	// to make them easier to spot for other parsers.
-	{`(a|b|)`, `([a-b]|(?:))`},
+	{`(a|b|c|)`, `([a-c]|(?:))`},
+	{`(a|b|)`, `([ab]|(?:))`},
 	{`(|)`, `()`},
 	{`a()`, `a()`},
 	{`(()|())`, `(()|())`},
-- 
2.50.0