1
2
3
4
5 package syntax
6
7 import (
8 "strconv"
9 "strings"
10 "unicode"
11 "unicode/utf8"
12 )
13
14
15
16
17
18 type Prog struct {
19 Inst []Inst
20 Start int
21 NumCap int
22 }
23
24
25 type InstOp uint8
26
27 const (
28 InstAlt InstOp = iota
29 InstAltMatch
30 InstCapture
31 InstEmptyWidth
32 InstMatch
33 InstFail
34 InstNop
35 InstRune
36 InstRune1
37 InstRuneAny
38 InstRuneAnyNotNL
39 )
40
41 var instOpNames = []string{
42 "InstAlt",
43 "InstAltMatch",
44 "InstCapture",
45 "InstEmptyWidth",
46 "InstMatch",
47 "InstFail",
48 "InstNop",
49 "InstRune",
50 "InstRune1",
51 "InstRuneAny",
52 "InstRuneAnyNotNL",
53 }
54
55 func (i InstOp) String() string {
56 if uint(i) >= uint(len(instOpNames)) {
57 return ""
58 }
59 return instOpNames[i]
60 }
61
62
63 type EmptyOp uint8
64
65 const (
66 EmptyBeginLine EmptyOp = 1 << iota
67 EmptyEndLine
68 EmptyBeginText
69 EmptyEndText
70 EmptyWordBoundary
71 EmptyNoWordBoundary
72 )
73
74
75
76
77
78
79
80 func EmptyOpContext(r1, r2 rune) EmptyOp {
81 var op EmptyOp = EmptyNoWordBoundary
82 var boundary byte
83 switch {
84 case IsWordChar(r1):
85 boundary = 1
86 case r1 == '\n':
87 op |= EmptyBeginLine
88 case r1 < 0:
89 op |= EmptyBeginText | EmptyBeginLine
90 }
91 switch {
92 case IsWordChar(r2):
93 boundary ^= 1
94 case r2 == '\n':
95 op |= EmptyEndLine
96 case r2 < 0:
97 op |= EmptyEndText | EmptyEndLine
98 }
99 if boundary != 0 {
100 op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
101 }
102 return op
103 }
104
105
106
107
108 func IsWordChar(r rune) bool {
109
110
111 return 'a' <= r && r <= 'z' || 'A' <= r && r <= 'Z' || '0' <= r && r <= '9' || r == '_'
112 }
113
114
115 type Inst struct {
116 Op InstOp
117 Out uint32
118 Arg uint32
119 Rune []rune
120 }
121
122 func (p *Prog) String() string {
123 var b strings.Builder
124 dumpProg(&b, p)
125 return b.String()
126 }
127
128
129 func (p *Prog) skipNop(pc uint32) *Inst {
130 i := &p.Inst[pc]
131 for i.Op == InstNop || i.Op == InstCapture {
132 i = &p.Inst[i.Out]
133 }
134 return i
135 }
136
137
138 func (i *Inst) op() InstOp {
139 op := i.Op
140 switch op {
141 case InstRune1, InstRuneAny, InstRuneAnyNotNL:
142 op = InstRune
143 }
144 return op
145 }
146
147
148
149
150 func (p *Prog) Prefix() (prefix string, complete bool) {
151 i := p.skipNop(uint32(p.Start))
152
153
154 if i.op() != InstRune || len(i.Rune) != 1 {
155 return "", i.Op == InstMatch
156 }
157
158
159 var buf strings.Builder
160 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
161 buf.WriteRune(i.Rune[0])
162 i = p.skipNop(i.Out)
163 }
164 return buf.String(), i.Op == InstMatch
165 }
166
167
168
169 func (p *Prog) StartCond() EmptyOp {
170 var flag EmptyOp
171 pc := uint32(p.Start)
172 i := &p.Inst[pc]
173 Loop:
174 for {
175 switch i.Op {
176 case InstEmptyWidth:
177 flag |= EmptyOp(i.Arg)
178 case InstFail:
179 return ^EmptyOp(0)
180 case InstCapture, InstNop:
181
182 default:
183 break Loop
184 }
185 pc = i.Out
186 i = &p.Inst[pc]
187 }
188 return flag
189 }
190
191 const noMatch = -1
192
193
194
195 func (i *Inst) MatchRune(r rune) bool {
196 return i.MatchRunePos(r) != noMatch
197 }
198
199
200
201
202
203
204 func (i *Inst) MatchRunePos(r rune) int {
205 rune := i.Rune
206
207 switch len(rune) {
208 case 0:
209 return noMatch
210
211 case 1:
212
213 r0 := rune[0]
214 if r == r0 {
215 return 0
216 }
217 if Flags(i.Arg)&FoldCase != 0 {
218 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
219 if r == r1 {
220 return 0
221 }
222 }
223 }
224 return noMatch
225
226 case 2:
227 if r >= rune[0] && r <= rune[1] {
228 return 0
229 }
230 return noMatch
231
232 case 4, 6, 8:
233
234
235 for j := 0; j < len(rune); j += 2 {
236 if r < rune[j] {
237 return noMatch
238 }
239 if r <= rune[j+1] {
240 return j / 2
241 }
242 }
243 return noMatch
244 }
245
246
247 lo := 0
248 hi := len(rune) / 2
249 for lo < hi {
250 m := int(uint(lo+hi) >> 1)
251 if c := rune[2*m]; c <= r {
252 if r <= rune[2*m+1] {
253 return m
254 }
255 lo = m + 1
256 } else {
257 hi = m
258 }
259 }
260 return noMatch
261 }
262
263
264
265
266 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
267 switch EmptyOp(i.Arg) {
268 case EmptyBeginLine:
269 return before == '\n' || before == -1
270 case EmptyEndLine:
271 return after == '\n' || after == -1
272 case EmptyBeginText:
273 return before == -1
274 case EmptyEndText:
275 return after == -1
276 case EmptyWordBoundary:
277 return IsWordChar(before) != IsWordChar(after)
278 case EmptyNoWordBoundary:
279 return IsWordChar(before) == IsWordChar(after)
280 }
281 panic("unknown empty width arg")
282 }
283
284 func (i *Inst) String() string {
285 var b strings.Builder
286 dumpInst(&b, i)
287 return b.String()
288 }
289
290 func bw(b *strings.Builder, args ...string) {
291 for _, s := range args {
292 b.WriteString(s)
293 }
294 }
295
296 func dumpProg(b *strings.Builder, p *Prog) {
297 for j := range p.Inst {
298 i := &p.Inst[j]
299 pc := strconv.Itoa(j)
300 if len(pc) < 3 {
301 b.WriteString(" "[len(pc):])
302 }
303 if j == p.Start {
304 pc += "*"
305 }
306 bw(b, pc, "\t")
307 dumpInst(b, i)
308 bw(b, "\n")
309 }
310 }
311
312 func u32(i uint32) string {
313 return strconv.FormatUint(uint64(i), 10)
314 }
315
316 func dumpInst(b *strings.Builder, i *Inst) {
317 switch i.Op {
318 case InstAlt:
319 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
320 case InstAltMatch:
321 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
322 case InstCapture:
323 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
324 case InstEmptyWidth:
325 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
326 case InstMatch:
327 bw(b, "match")
328 case InstFail:
329 bw(b, "fail")
330 case InstNop:
331 bw(b, "nop -> ", u32(i.Out))
332 case InstRune:
333 if i.Rune == nil {
334
335 bw(b, "rune <nil>")
336 }
337 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
338 if Flags(i.Arg)&FoldCase != 0 {
339 bw(b, "/i")
340 }
341 bw(b, " -> ", u32(i.Out))
342 case InstRune1:
343 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
344 case InstRuneAny:
345 bw(b, "any -> ", u32(i.Out))
346 case InstRuneAnyNotNL:
347 bw(b, "anynotnl -> ", u32(i.Out))
348 }
349 }
350
View as plain text