1
2
3
4
5 package syntax
6
7
8
9
10 import (
11 "slices"
12 "strconv"
13 "strings"
14 "unicode"
15 )
16
17
18 type Regexp struct {
19 Op Op
20 Flags Flags
21 Sub []*Regexp
22 Sub0 [1]*Regexp
23 Rune []rune
24 Rune0 [2]rune
25 Min, Max int
26 Cap int
27 Name string
28 }
29
30
31
32
33 type Op uint8
34
35
36
37
38
39 const (
40 OpNoMatch Op = 1 + iota
41 OpEmptyMatch
42 OpLiteral
43 OpCharClass
44 OpAnyCharNotNL
45 OpAnyChar
46 OpBeginLine
47 OpEndLine
48 OpBeginText
49 OpEndText
50 OpWordBoundary
51 OpNoWordBoundary
52 OpCapture
53 OpStar
54 OpPlus
55 OpQuest
56 OpRepeat
57 OpConcat
58 OpAlternate
59 )
60
61 const opPseudo Op = 128
62
63
64 func (x *Regexp) Equal(y *Regexp) bool {
65 if x == nil || y == nil {
66 return x == y
67 }
68 if x.Op != y.Op {
69 return false
70 }
71 switch x.Op {
72 case OpEndText:
73
74 if x.Flags&WasDollar != y.Flags&WasDollar {
75 return false
76 }
77
78 case OpLiteral, OpCharClass:
79 return slices.Equal(x.Rune, y.Rune)
80
81 case OpAlternate, OpConcat:
82 return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal)
83
84 case OpStar, OpPlus, OpQuest:
85 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
86 return false
87 }
88
89 case OpRepeat:
90 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
91 return false
92 }
93
94 case OpCapture:
95 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
96 return false
97 }
98 }
99 return true
100 }
101
102
103 type printFlags uint8
104
105 const (
106 flagI printFlags = 1 << iota
107 flagM
108 flagS
109 flagOff
110 flagPrec
111 negShift = 5
112 )
113
114
115
116 func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
117 if *flags == nil {
118 *flags = make(map[*Regexp]printFlags)
119 }
120 (*flags)[start] = f
121 (*flags)[last] |= flagOff
122 }
123
124
125
126
127
128
129 func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
130 switch re.Op {
131 default:
132 return 0, 0
133
134 case OpLiteral:
135
136
137
138 for _, r := range re.Rune {
139 if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
140 if re.Flags&FoldCase != 0 {
141 return flagI, 0
142 } else {
143 return 0, flagI
144 }
145 }
146 }
147 return 0, 0
148
149 case OpCharClass:
150
151
152 for i := 0; i < len(re.Rune); i += 2 {
153 lo := max(minFold, re.Rune[i])
154 hi := min(maxFold, re.Rune[i+1])
155 for r := lo; r <= hi; r++ {
156 for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
157 if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
158 return 0, flagI
159 }
160 }
161 }
162 }
163 return 0, 0
164
165 case OpAnyCharNotNL:
166 return 0, flagS
167
168 case OpAnyChar:
169 return flagS, 0
170
171 case OpBeginLine, OpEndLine:
172 return flagM, 0
173
174 case OpEndText:
175 if re.Flags&WasDollar != 0 {
176 return 0, flagM
177 }
178 return 0, 0
179
180 case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
181 return calcFlags(re.Sub[0], flags)
182
183 case OpConcat, OpAlternate:
184
185
186
187 var must, cant, allCant printFlags
188 start := 0
189 last := 0
190 did := false
191 for i, sub := range re.Sub {
192 subMust, subCant := calcFlags(sub, flags)
193 if must&subCant != 0 || subMust&cant != 0 {
194 if must != 0 {
195 addSpan(re.Sub[start], re.Sub[last], must, flags)
196 }
197 must = 0
198 cant = 0
199 start = i
200 did = true
201 }
202 must |= subMust
203 cant |= subCant
204 allCant |= subCant
205 if subMust != 0 {
206 last = i
207 }
208 if must == 0 && start == i {
209 start++
210 }
211 }
212 if !did {
213
214 return must, cant
215 }
216 if must != 0 {
217
218 addSpan(re.Sub[start], re.Sub[last], must, flags)
219 }
220 return 0, allCant
221 }
222 }
223
224
225 func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
226 f |= flags[re]
227 if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
228
229 f &^= flagPrec
230 }
231 if f&^(flagOff|flagPrec) != 0 {
232 b.WriteString(`(?`)
233 if f&flagI != 0 {
234 b.WriteString(`i`)
235 }
236 if f&flagM != 0 {
237 b.WriteString(`m`)
238 }
239 if f&flagS != 0 {
240 b.WriteString(`s`)
241 }
242 if f&((flagM|flagS)<<negShift) != 0 {
243 b.WriteString(`-`)
244 if f&(flagM<<negShift) != 0 {
245 b.WriteString(`m`)
246 }
247 if f&(flagS<<negShift) != 0 {
248 b.WriteString(`s`)
249 }
250 }
251 b.WriteString(`:`)
252 }
253 if f&flagOff != 0 {
254 defer b.WriteString(`)`)
255 }
256 if f&flagPrec != 0 {
257 b.WriteString(`(?:`)
258 defer b.WriteString(`)`)
259 }
260
261 switch re.Op {
262 default:
263 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
264 case OpNoMatch:
265 b.WriteString(`[^\x00-\x{10FFFF}]`)
266 case OpEmptyMatch:
267 b.WriteString(`(?:)`)
268 case OpLiteral:
269 for _, r := range re.Rune {
270 escape(b, r, false)
271 }
272 case OpCharClass:
273 if len(re.Rune)%2 != 0 {
274 b.WriteString(`[invalid char class]`)
275 break
276 }
277 b.WriteRune('[')
278 if len(re.Rune) == 0 {
279 b.WriteString(`^\x00-\x{10FFFF}`)
280 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
281
282
283 b.WriteRune('^')
284 for i := 1; i < len(re.Rune)-1; i += 2 {
285 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
286 escape(b, lo, lo == '-')
287 if lo != hi {
288 if hi != lo+1 {
289 b.WriteRune('-')
290 }
291 escape(b, hi, hi == '-')
292 }
293 }
294 } else {
295 for i := 0; i < len(re.Rune); i += 2 {
296 lo, hi := re.Rune[i], re.Rune[i+1]
297 escape(b, lo, lo == '-')
298 if lo != hi {
299 if hi != lo+1 {
300 b.WriteRune('-')
301 }
302 escape(b, hi, hi == '-')
303 }
304 }
305 }
306 b.WriteRune(']')
307 case OpAnyCharNotNL, OpAnyChar:
308 b.WriteString(`.`)
309 case OpBeginLine:
310 b.WriteString(`^`)
311 case OpEndLine:
312 b.WriteString(`$`)
313 case OpBeginText:
314 b.WriteString(`\A`)
315 case OpEndText:
316 if re.Flags&WasDollar != 0 {
317 b.WriteString(`$`)
318 } else {
319 b.WriteString(`\z`)
320 }
321 case OpWordBoundary:
322 b.WriteString(`\b`)
323 case OpNoWordBoundary:
324 b.WriteString(`\B`)
325 case OpCapture:
326 if re.Name != "" {
327 b.WriteString(`(?P<`)
328 b.WriteString(re.Name)
329 b.WriteRune('>')
330 } else {
331 b.WriteRune('(')
332 }
333 if re.Sub[0].Op != OpEmptyMatch {
334 writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
335 }
336 b.WriteRune(')')
337 case OpStar, OpPlus, OpQuest, OpRepeat:
338 p := printFlags(0)
339 sub := re.Sub[0]
340 if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
341 p = flagPrec
342 }
343 writeRegexp(b, sub, p, flags)
344
345 switch re.Op {
346 case OpStar:
347 b.WriteRune('*')
348 case OpPlus:
349 b.WriteRune('+')
350 case OpQuest:
351 b.WriteRune('?')
352 case OpRepeat:
353 b.WriteRune('{')
354 b.WriteString(strconv.Itoa(re.Min))
355 if re.Max != re.Min {
356 b.WriteRune(',')
357 if re.Max >= 0 {
358 b.WriteString(strconv.Itoa(re.Max))
359 }
360 }
361 b.WriteRune('}')
362 }
363 if re.Flags&NonGreedy != 0 {
364 b.WriteRune('?')
365 }
366 case OpConcat:
367 for _, sub := range re.Sub {
368 p := printFlags(0)
369 if sub.Op == OpAlternate {
370 p = flagPrec
371 }
372 writeRegexp(b, sub, p, flags)
373 }
374 case OpAlternate:
375 for i, sub := range re.Sub {
376 if i > 0 {
377 b.WriteRune('|')
378 }
379 writeRegexp(b, sub, 0, flags)
380 }
381 }
382 }
383
384 func (re *Regexp) String() string {
385 var b strings.Builder
386 var flags map[*Regexp]printFlags
387 must, cant := calcFlags(re, &flags)
388 must |= (cant &^ flagI) << negShift
389 if must != 0 {
390 must |= flagOff
391 }
392 writeRegexp(&b, re, must, flags)
393 return b.String()
394 }
395
396 const meta = `\.+*?()|[]{}^$`
397
398 func escape(b *strings.Builder, r rune, force bool) {
399 if unicode.IsPrint(r) {
400 if strings.ContainsRune(meta, r) || force {
401 b.WriteRune('\\')
402 }
403 b.WriteRune(r)
404 return
405 }
406
407 switch r {
408 case '\a':
409 b.WriteString(`\a`)
410 case '\f':
411 b.WriteString(`\f`)
412 case '\n':
413 b.WriteString(`\n`)
414 case '\r':
415 b.WriteString(`\r`)
416 case '\t':
417 b.WriteString(`\t`)
418 case '\v':
419 b.WriteString(`\v`)
420 default:
421 if r < 0x100 {
422 b.WriteString(`\x`)
423 s := strconv.FormatInt(int64(r), 16)
424 if len(s) == 1 {
425 b.WriteRune('0')
426 }
427 b.WriteString(s)
428 break
429 }
430 b.WriteString(`\x{`)
431 b.WriteString(strconv.FormatInt(int64(r), 16))
432 b.WriteString(`}`)
433 }
434 }
435
436
437 func (re *Regexp) MaxCap() int {
438 m := 0
439 if re.Op == OpCapture {
440 m = re.Cap
441 }
442 for _, sub := range re.Sub {
443 if n := sub.MaxCap(); m < n {
444 m = n
445 }
446 }
447 return m
448 }
449
450
451 func (re *Regexp) CapNames() []string {
452 names := make([]string, re.MaxCap()+1)
453 re.capNames(names)
454 return names
455 }
456
457 func (re *Regexp) capNames(names []string) {
458 if re.Op == OpCapture {
459 names[re.Cap] = re.Name
460 }
461 for _, sub := range re.Sub {
462 sub.capNames(names)
463 }
464 }
465
View as plain text