regexp.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // Note to implementers:
     8  // In this package, re is always a *Regexp and r is always a rune.
     9  
    10  import (
    11  	"slices"
    12  	"strconv"
    13  	"strings"
    14  	"unicode"
    15  )
    16  
    17  // A Regexp is a node in a regular expression syntax tree.
    18  type Regexp struct {
    19  	Op       Op // operator
    20  	Flags    Flags
    21  	Sub      []*Regexp  // subexpressions, if any
    22  	Sub0     [1]*Regexp // storage for short Sub
    23  	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
    24  	Rune0    [2]rune    // storage for short Rune
    25  	Min, Max int        // min, max for OpRepeat
    26  	Cap      int        // capturing index, for OpCapture
    27  	Name     string     // capturing name, for OpCapture
    28  }
    29  
    30  //go:generate stringer -type Op -trimprefix Op
    31  
    32  // An Op is a single regular expression operator.
    33  type Op uint8
    34  
    35  // Operators are listed in precedence order, tightest binding to weakest.
    36  // Character class operators are listed simplest to most complex
    37  // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
    38  
    39  const (
    40  	OpNoMatch        Op = 1 + iota // matches no strings
    41  	OpEmptyMatch                   // matches empty string
    42  	OpLiteral                      // matches Runes sequence
    43  	OpCharClass                    // matches Runes interpreted as range pair list
    44  	OpAnyCharNotNL                 // matches any character except newline
    45  	OpAnyChar                      // matches any character
    46  	OpBeginLine                    // matches empty string at beginning of line
    47  	OpEndLine                      // matches empty string at end of line
    48  	OpBeginText                    // matches empty string at beginning of text
    49  	OpEndText                      // matches empty string at end of text
    50  	OpWordBoundary                 // matches word boundary `\b`
    51  	OpNoWordBoundary               // matches word non-boundary `\B`
    52  	OpCapture                      // capturing subexpression with index Cap, optional name Name
    53  	OpStar                         // matches Sub[0] zero or more times
    54  	OpPlus                         // matches Sub[0] one or more times
    55  	OpQuest                        // matches Sub[0] zero or one times
    56  	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    57  	OpConcat                       // matches concatenation of Subs
    58  	OpAlternate                    // matches alternation of Subs
    59  )
    60  
    61  const opPseudo Op = 128 // where pseudo-ops start
    62  
    63  // Equal reports whether x and y have identical structure.
    64  func (x *Regexp) Equal(y *Regexp) bool {
    65  	if x == nil || y == nil {
    66  		return x == y
    67  	}
    68  	if x.Op != y.Op {
    69  		return false
    70  	}
    71  	switch x.Op {
    72  	case OpEndText:
    73  		// The parse flags remember whether this is \z or \Z.
    74  		if x.Flags&WasDollar != y.Flags&WasDollar {
    75  			return false
    76  		}
    77  
    78  	case OpLiteral, OpCharClass:
    79  		return slices.Equal(x.Rune, y.Rune)
    80  
    81  	case OpAlternate, OpConcat:
    82  		return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal)
    83  
    84  	case OpStar, OpPlus, OpQuest:
    85  		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
    86  			return false
    87  		}
    88  
    89  	case OpRepeat:
    90  		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
    91  			return false
    92  		}
    93  
    94  	case OpCapture:
    95  		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
    96  			return false
    97  		}
    98  	}
    99  	return true
   100  }
   101  
   102  // printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
   103  type printFlags uint8
   104  
   105  const (
   106  	flagI    printFlags = 1 << iota // (?i:
   107  	flagM                           // (?m:
   108  	flagS                           // (?s:
   109  	flagOff                         // )
   110  	flagPrec                        // (?: )
   111  	negShift = 5                    // flagI<<negShift is (?-i:
   112  )
   113  
   114  // addSpan enables the flags f around start..last,
   115  // by setting flags[start] = f and flags[last] = flagOff.
   116  func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
   117  	if *flags == nil {
   118  		*flags = make(map[*Regexp]printFlags)
   119  	}
   120  	(*flags)[start] = f
   121  	(*flags)[last] |= flagOff // maybe start==last
   122  }
   123  
   124  // calcFlags calculates the flags to print around each subexpression in re,
   125  // storing that information in (*flags)[sub] for each affected subexpression.
   126  // The first time an entry needs to be written to *flags, calcFlags allocates the map.
   127  // calcFlags also calculates the flags that must be active or can't be active
   128  // around re and returns those flags.
   129  func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
   130  	switch re.Op {
   131  	default:
   132  		return 0, 0
   133  
   134  	case OpLiteral:
   135  		// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
   136  		// according to whether (?i) is active.
   137  		// If literal is not fold-sensitive, return 0, 0.
   138  		for _, r := range re.Rune {
   139  			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
   140  				if re.Flags&FoldCase != 0 {
   141  					return flagI, 0
   142  				} else {
   143  					return 0, flagI
   144  				}
   145  			}
   146  		}
   147  		return 0, 0
   148  
   149  	case OpCharClass:
   150  		// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
   151  		// If literal is not fold-sensitive, return 0, 0.
   152  		for i := 0; i < len(re.Rune); i += 2 {
   153  			lo := max(minFold, re.Rune[i])
   154  			hi := min(maxFold, re.Rune[i+1])
   155  			for r := lo; r <= hi; r++ {
   156  				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
   157  					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
   158  						return 0, flagI
   159  					}
   160  				}
   161  			}
   162  		}
   163  		return 0, 0
   164  
   165  	case OpAnyCharNotNL: // (?-s).
   166  		return 0, flagS
   167  
   168  	case OpAnyChar: // (?s).
   169  		return flagS, 0
   170  
   171  	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
   172  		return flagM, 0
   173  
   174  	case OpEndText:
   175  		if re.Flags&WasDollar != 0 { // (?-m)$
   176  			return 0, flagM
   177  		}
   178  		return 0, 0
   179  
   180  	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
   181  		return calcFlags(re.Sub[0], flags)
   182  
   183  	case OpConcat, OpAlternate:
   184  		// Gather the must and cant for each subexpression.
   185  		// When we find a conflicting subexpression, insert the necessary
   186  		// flags around the previously identified span and start over.
   187  		var must, cant, allCant printFlags
   188  		start := 0
   189  		last := 0
   190  		did := false
   191  		for i, sub := range re.Sub {
   192  			subMust, subCant := calcFlags(sub, flags)
   193  			if must&subCant != 0 || subMust&cant != 0 {
   194  				if must != 0 {
   195  					addSpan(re.Sub[start], re.Sub[last], must, flags)
   196  				}
   197  				must = 0
   198  				cant = 0
   199  				start = i
   200  				did = true
   201  			}
   202  			must |= subMust
   203  			cant |= subCant
   204  			allCant |= subCant
   205  			if subMust != 0 {
   206  				last = i
   207  			}
   208  			if must == 0 && start == i {
   209  				start++
   210  			}
   211  		}
   212  		if !did {
   213  			// No conflicts: pass the accumulated must and cant upward.
   214  			return must, cant
   215  		}
   216  		if must != 0 {
   217  			// Conflicts found; need to finish final span.
   218  			addSpan(re.Sub[start], re.Sub[last], must, flags)
   219  		}
   220  		return 0, allCant
   221  	}
   222  }
   223  
   224  // writeRegexp writes the Perl syntax for the regular expression re to b.
   225  func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
   226  	f |= flags[re]
   227  	if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
   228  		// flagPrec is redundant with other flags being added and terminated
   229  		f &^= flagPrec
   230  	}
   231  	if f&^(flagOff|flagPrec) != 0 {
   232  		b.WriteString(`(?`)
   233  		if f&flagI != 0 {
   234  			b.WriteString(`i`)
   235  		}
   236  		if f&flagM != 0 {
   237  			b.WriteString(`m`)
   238  		}
   239  		if f&flagS != 0 {
   240  			b.WriteString(`s`)
   241  		}
   242  		if f&((flagM|flagS)<<negShift) != 0 {
   243  			b.WriteString(`-`)
   244  			if f&(flagM<<negShift) != 0 {
   245  				b.WriteString(`m`)
   246  			}
   247  			if f&(flagS<<negShift) != 0 {
   248  				b.WriteString(`s`)
   249  			}
   250  		}
   251  		b.WriteString(`:`)
   252  	}
   253  	if f&flagOff != 0 {
   254  		defer b.WriteString(`)`)
   255  	}
   256  	if f&flagPrec != 0 {
   257  		b.WriteString(`(?:`)
   258  		defer b.WriteString(`)`)
   259  	}
   260  
   261  	switch re.Op {
   262  	default:
   263  		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   264  	case OpNoMatch:
   265  		b.WriteString(`[^\x00-\x{10FFFF}]`)
   266  	case OpEmptyMatch:
   267  		b.WriteString(`(?:)`)
   268  	case OpLiteral:
   269  		for _, r := range re.Rune {
   270  			escape(b, r, false)
   271  		}
   272  	case OpCharClass:
   273  		if len(re.Rune)%2 != 0 {
   274  			b.WriteString(`[invalid char class]`)
   275  			break
   276  		}
   277  		b.WriteRune('[')
   278  		if len(re.Rune) == 0 {
   279  			b.WriteString(`^\x00-\x{10FFFF}`)
   280  		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
   281  			// Contains 0 and MaxRune. Probably a negated class.
   282  			// Print the gaps.
   283  			b.WriteRune('^')
   284  			for i := 1; i < len(re.Rune)-1; i += 2 {
   285  				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   286  				escape(b, lo, lo == '-')
   287  				if lo != hi {
   288  					if hi != lo+1 {
   289  						b.WriteRune('-')
   290  					}
   291  					escape(b, hi, hi == '-')
   292  				}
   293  			}
   294  		} else {
   295  			for i := 0; i < len(re.Rune); i += 2 {
   296  				lo, hi := re.Rune[i], re.Rune[i+1]
   297  				escape(b, lo, lo == '-')
   298  				if lo != hi {
   299  					if hi != lo+1 {
   300  						b.WriteRune('-')
   301  					}
   302  					escape(b, hi, hi == '-')
   303  				}
   304  			}
   305  		}
   306  		b.WriteRune(']')
   307  	case OpAnyCharNotNL, OpAnyChar:
   308  		b.WriteString(`.`)
   309  	case OpBeginLine:
   310  		b.WriteString(`^`)
   311  	case OpEndLine:
   312  		b.WriteString(`$`)
   313  	case OpBeginText:
   314  		b.WriteString(`\A`)
   315  	case OpEndText:
   316  		if re.Flags&WasDollar != 0 {
   317  			b.WriteString(`$`)
   318  		} else {
   319  			b.WriteString(`\z`)
   320  		}
   321  	case OpWordBoundary:
   322  		b.WriteString(`\b`)
   323  	case OpNoWordBoundary:
   324  		b.WriteString(`\B`)
   325  	case OpCapture:
   326  		if re.Name != "" {
   327  			b.WriteString(`(?P<`)
   328  			b.WriteString(re.Name)
   329  			b.WriteRune('>')
   330  		} else {
   331  			b.WriteRune('(')
   332  		}
   333  		if re.Sub[0].Op != OpEmptyMatch {
   334  			writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
   335  		}
   336  		b.WriteRune(')')
   337  	case OpStar, OpPlus, OpQuest, OpRepeat:
   338  		p := printFlags(0)
   339  		sub := re.Sub[0]
   340  		if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   341  			p = flagPrec
   342  		}
   343  		writeRegexp(b, sub, p, flags)
   344  
   345  		switch re.Op {
   346  		case OpStar:
   347  			b.WriteRune('*')
   348  		case OpPlus:
   349  			b.WriteRune('+')
   350  		case OpQuest:
   351  			b.WriteRune('?')
   352  		case OpRepeat:
   353  			b.WriteRune('{')
   354  			b.WriteString(strconv.Itoa(re.Min))
   355  			if re.Max != re.Min {
   356  				b.WriteRune(',')
   357  				if re.Max >= 0 {
   358  					b.WriteString(strconv.Itoa(re.Max))
   359  				}
   360  			}
   361  			b.WriteRune('}')
   362  		}
   363  		if re.Flags&NonGreedy != 0 {
   364  			b.WriteRune('?')
   365  		}
   366  	case OpConcat:
   367  		for _, sub := range re.Sub {
   368  			p := printFlags(0)
   369  			if sub.Op == OpAlternate {
   370  				p = flagPrec
   371  			}
   372  			writeRegexp(b, sub, p, flags)
   373  		}
   374  	case OpAlternate:
   375  		for i, sub := range re.Sub {
   376  			if i > 0 {
   377  				b.WriteRune('|')
   378  			}
   379  			writeRegexp(b, sub, 0, flags)
   380  		}
   381  	}
   382  }
   383  
   384  func (re *Regexp) String() string {
   385  	var b strings.Builder
   386  	var flags map[*Regexp]printFlags
   387  	must, cant := calcFlags(re, &flags)
   388  	must |= (cant &^ flagI) << negShift
   389  	if must != 0 {
   390  		must |= flagOff
   391  	}
   392  	writeRegexp(&b, re, must, flags)
   393  	return b.String()
   394  }
   395  
   396  const meta = `\.+*?()|[]{}^$`
   397  
   398  func escape(b *strings.Builder, r rune, force bool) {
   399  	if unicode.IsPrint(r) {
   400  		if strings.ContainsRune(meta, r) || force {
   401  			b.WriteRune('\\')
   402  		}
   403  		b.WriteRune(r)
   404  		return
   405  	}
   406  
   407  	switch r {
   408  	case '\a':
   409  		b.WriteString(`\a`)
   410  	case '\f':
   411  		b.WriteString(`\f`)
   412  	case '\n':
   413  		b.WriteString(`\n`)
   414  	case '\r':
   415  		b.WriteString(`\r`)
   416  	case '\t':
   417  		b.WriteString(`\t`)
   418  	case '\v':
   419  		b.WriteString(`\v`)
   420  	default:
   421  		if r < 0x100 {
   422  			b.WriteString(`\x`)
   423  			s := strconv.FormatInt(int64(r), 16)
   424  			if len(s) == 1 {
   425  				b.WriteRune('0')
   426  			}
   427  			b.WriteString(s)
   428  			break
   429  		}
   430  		b.WriteString(`\x{`)
   431  		b.WriteString(strconv.FormatInt(int64(r), 16))
   432  		b.WriteString(`}`)
   433  	}
   434  }
   435  
   436  // MaxCap walks the regexp to find the maximum capture index.
   437  func (re *Regexp) MaxCap() int {
   438  	m := 0
   439  	if re.Op == OpCapture {
   440  		m = re.Cap
   441  	}
   442  	for _, sub := range re.Sub {
   443  		if n := sub.MaxCap(); m < n {
   444  			m = n
   445  		}
   446  	}
   447  	return m
   448  }
   449  
   450  // CapNames walks the regexp to find the names of capturing groups.
   451  func (re *Regexp) CapNames() []string {
   452  	names := make([]string, re.MaxCap()+1)
   453  	re.capNames(names)
   454  	return names
   455  }
   456  
   457  func (re *Regexp) capNames(names []string) {
   458  	if re.Op == OpCapture {
   459  		names[re.Cap] = re.Name
   460  	}
   461  	for _, sub := range re.Sub {
   462  		sub.capNames(names)
   463  	}
   464  }
   465
View as plain text