1
2
3
4
5
6
7
8
9
10
11
12
13 package syntax
14
15 import (
16 "fmt"
17 "io"
18 "unicode"
19 "unicode/utf8"
20 )
21
22
23
24
// Scanner mode flags; they may be or'ed together and are passed to init.
const (
	comments   uint = 1 << 0 // report every comment through the error handler
	directives uint = 1 << 1 // report only "go:" and "line " directive comments
)
29
30 type scanner struct {
31 source
32 mode uint
33 nlsemi bool
34
35
36 line, col uint
37 blank bool
38 tok token
39 lit string
40 bad bool
41 kind LitKind
42 op Operator
43 prec int
44 }
45
46 func (s *scanner) init(src io.Reader, errh func(line, col uint, msg string), mode uint) {
47 s.source.init(src, errh)
48 s.mode = mode
49 s.nlsemi = false
50 }
51
52
53 func (s *scanner) errorf(format string, args ...interface{}) {
54 s.error(fmt.Sprintf(format, args...))
55 }
56
57
58 func (s *scanner) errorAtf(offset int, format string, args ...interface{}) {
59 s.errh(s.line, s.col+uint(offset), fmt.Sprintf(format, args...))
60 }
61
62
63 func (s *scanner) setLit(kind LitKind, ok bool) {
64 s.nlsemi = true
65 s.tok = _Literal
66 s.lit = string(s.segment())
67 s.bad = !ok
68 s.kind = kind
69 }
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// next scans the next token and records it in s.tok (and, depending on
// the token, in s.lit, s.bad, s.kind, s.op, and s.prec). The token's
// start position is stored in s.line and s.col.
//
// If nlsemi was set by the previous token, a newline, an EOF, or a
// /*-style comment that extends past the line it starts on is reported
// as a _Semi token with s.lit set to "newline" or "EOF". An explicit
// ';' yields _Semi with s.lit "semicolon".
//
// Comments are consumed and scanning resumes after them. An invalid
// character is reported via errorf, skipped, and scanning resumes.
func (s *scanner) next() {
	// Remember whether a newline must become a semicolon, then reset the
	// flag; the token scanned below sets it again if appropriate.
	nlsemi := s.nlsemi
	s.nlsemi = false

redo:
	// skip white space; a newline is only skipped if it does not
	// have to be turned into a semicolon
	s.stop()
	startLine, startCol := s.pos()
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
		s.nextch()
	}

	// token start
	s.line, s.col = s.pos()
	s.blank = s.line > startLine || startCol == colbase
	s.start() // presumably marks where the token text returned by s.segment begins
	if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) {
		s.nextch()
		s.ident()
		return
	}

	switch s.ch {
	case -1:
		// end of input: produce a final semicolon first if one is due
		if nlsemi {
			s.lit = "EOF"
			s.tok = _Semi
			break
		}
		s.tok = _EOF

	case '\n':
		// only reached if nlsemi is set (otherwise skipped as white space)
		s.nextch()
		s.lit = "newline"
		s.tok = _Semi

	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		s.number(false)

	case '"':
		s.stdString()

	case '`':
		s.rawString()

	case '\'':
		s.rune()

	case '(':
		s.nextch()
		s.tok = _Lparen

	case '[':
		s.nextch()
		s.tok = _Lbrack

	case '{':
		s.nextch()
		s.tok = _Lbrace

	case ',':
		s.nextch()
		s.tok = _Comma

	case ';':
		s.nextch()
		s.lit = "semicolon"
		s.tok = _Semi

	case ')':
		s.nextch()
		s.nlsemi = true
		s.tok = _Rparen

	case ']':
		s.nextch()
		s.nlsemi = true
		s.tok = _Rbrack

	case '}':
		s.nextch()
		s.nlsemi = true
		s.tok = _Rbrace

	case ':':
		s.nextch()
		if s.ch == '=' {
			s.nextch()
			s.tok = _Define
			break
		}
		s.tok = _Colon

	case '.':
		s.nextch()
		if isDecimal(s.ch) {
			// a number starting with its radix point
			s.number(true)
			break
		}
		if s.ch == '.' {
			s.nextch()
			if s.ch == '.' {
				s.nextch()
				s.tok = _DotDotDot
				break
			}
			// Only two dots: back up and report a single _Dot.
			// NOTE(review): rewind presumably returns to the token start,
			// so the nextch below consumes the first '.' again - confirm.
			s.rewind()
			s.nextch()
		}
		s.tok = _Dot

	case '+':
		s.nextch()
		s.op, s.prec = Add, precAdd
		if s.ch != '+' {
			goto assignop
		}
		s.nextch()
		s.nlsemi = true
		s.tok = _IncOp

	case '-':
		s.nextch()
		s.op, s.prec = Sub, precAdd
		if s.ch != '-' {
			goto assignop
		}
		s.nextch()
		s.nlsemi = true
		s.tok = _IncOp

	case '*':
		s.nextch()
		s.op, s.prec = Mul, precMul
		// don't goto assignop: a lone '*' must yield _Star, not _Operator
		if s.ch == '=' {
			s.nextch()
			s.tok = _AssignOp
			break
		}
		s.tok = _Star

	case '/':
		s.nextch()
		if s.ch == '/' {
			s.nextch()
			s.lineComment()
			goto redo
		}
		if s.ch == '*' {
			s.nextch()
			s.fullComment()
			if line, _ := s.pos(); line > s.line && nlsemi {
				// A comment that ends on a later line than it started
				// acts like a newline; it becomes a ';' if nlsemi is set.
				s.lit = "newline"
				s.tok = _Semi
				break
			}
			goto redo
		}
		s.op, s.prec = Div, precMul
		goto assignop

	case '%':
		s.nextch()
		s.op, s.prec = Rem, precMul
		goto assignop

	case '&':
		s.nextch()
		if s.ch == '&' {
			s.nextch()
			s.op, s.prec = AndAnd, precAndAnd
			s.tok = _Operator
			break
		}
		s.op, s.prec = And, precMul
		if s.ch == '^' {
			// "&^" keeps the precedence of "&"
			s.nextch()
			s.op = AndNot
		}
		goto assignop

	case '|':
		s.nextch()
		if s.ch == '|' {
			s.nextch()
			s.op, s.prec = OrOr, precOrOr
			s.tok = _Operator
			break
		}
		s.op, s.prec = Or, precAdd
		goto assignop

	case '^':
		s.nextch()
		s.op, s.prec = Xor, precAdd
		goto assignop

	case '<':
		s.nextch()
		if s.ch == '=' {
			s.nextch()
			s.op, s.prec = Leq, precCmp
			s.tok = _Operator
			break
		}
		if s.ch == '<' {
			s.nextch()
			s.op, s.prec = Shl, precMul
			goto assignop
		}
		if s.ch == '-' {
			s.nextch()
			s.tok = _Arrow
			break
		}
		s.op, s.prec = Lss, precCmp
		s.tok = _Operator

	case '>':
		s.nextch()
		if s.ch == '=' {
			s.nextch()
			s.op, s.prec = Geq, precCmp
			s.tok = _Operator
			break
		}
		if s.ch == '>' {
			s.nextch()
			s.op, s.prec = Shr, precMul
			goto assignop
		}
		s.op, s.prec = Gtr, precCmp
		s.tok = _Operator

	case '=':
		s.nextch()
		if s.ch == '=' {
			s.nextch()
			s.op, s.prec = Eql, precCmp
			s.tok = _Operator
			break
		}
		s.tok = _Assign

	case '!':
		s.nextch()
		if s.ch == '=' {
			s.nextch()
			s.op, s.prec = Neq, precCmp
			s.tok = _Operator
			break
		}
		s.op, s.prec = Not, 0
		s.tok = _Operator

	case '~':
		s.nextch()
		s.op, s.prec = Tilde, 0
		s.tok = _Operator

	default:
		// report the offending character, skip it, and try again
		s.errorf("invalid character %#U", s.ch)
		s.nextch()
		goto redo
	}

	return

assignop:
	// s.op and s.prec are already set; a following '=' turns the
	// operator into an assignment operation.
	if s.ch == '=' {
		s.nextch()
		s.tok = _AssignOp
		return
	}
	s.tok = _Operator
}
367
368 func (s *scanner) ident() {
369
370 for isLetter(s.ch) || isDecimal(s.ch) {
371 s.nextch()
372 }
373
374
375 if s.ch >= utf8.RuneSelf {
376 for s.atIdentChar(false) {
377 s.nextch()
378 }
379 }
380
381
382 lit := s.segment()
383 if len(lit) >= 2 {
384 if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
385 s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
386 s.tok = tok
387 return
388 }
389 }
390
391 s.nlsemi = true
392 s.lit = string(lit)
393 s.tok = _Name
394 }
395
396
397
398 func tokStrFast(tok token) string {
399 return _token_name[_token_index[tok-1]:_token_index[tok]]
400 }
401
402 func (s *scanner) atIdentChar(first bool) bool {
403 switch {
404 case unicode.IsLetter(s.ch) || s.ch == '_':
405
406 case unicode.IsDigit(s.ch):
407 if first {
408 s.errorf("identifier cannot begin with digit %#U", s.ch)
409 }
410 case s.ch >= utf8.RuneSelf:
411 s.errorf("invalid character %#U in identifier", s.ch)
412 default:
413 return false
414 }
415 return true
416 }
417
418
419
420 func hash(s []byte) uint {
421 return (uint(s[0])<<4 ^ uint(s[1]) + uint(len(s))) & uint(len(keywordMap)-1)
422 }
423
// keywordMap is a hash table indexed by hash(keyword) that holds the
// keyword's token; unused slots are 0. Its length is a power of two
// because hash masks its result with len(keywordMap)-1. The table is
// populated by init.
var keywordMap [1 << 6]token
425
426 func init() {
427
428 for tok := _Break; tok <= _Var; tok++ {
429 h := hash([]byte(tok.String()))
430 if keywordMap[h] != 0 {
431 panic("imperfect hash")
432 }
433 keywordMap[h] = tok
434 }
435 }
436
// lower sets the bit that distinguishes ASCII lower case from upper
// case. For an ASCII letter the result is its lower-case form; other
// characters are changed as well, so the result is only meaningful
// when it is subsequently compared against lower-case letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}

// isLetter reports whether ch is an ASCII letter or '_'.
func isLetter(ch rune) bool {
	if ch == '_' {
		return true
	}
	lc := lower(ch)
	return 'a' <= lc && lc <= 'z'
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// isHex reports whether ch is an ASCII hexadecimal digit of either case.
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	lc := lower(ch)
	return 'a' <= lc && lc <= 'f'
}
441
442
443
444
445
446
447
448 func (s *scanner) digits(base int, invalid *int) (digsep int) {
449 if base <= 10 {
450 max := rune('0' + base)
451 for isDecimal(s.ch) || s.ch == '_' {
452 ds := 1
453 if s.ch == '_' {
454 ds = 2
455 } else if s.ch >= max && *invalid < 0 {
456 _, col := s.pos()
457 *invalid = int(col - s.col)
458 }
459 digsep |= ds
460 s.nextch()
461 }
462 } else {
463 for isHex(s.ch) || s.ch == '_' {
464 ds := 1
465 if s.ch == '_' {
466 ds = 2
467 }
468 digsep |= ds
469 s.nextch()
470 }
471 }
472 return
473 }
474
475 func (s *scanner) number(seenPoint bool) {
476 ok := true
477 kind := IntLit
478 base := 10
479 prefix := rune(0)
480 digsep := 0
481 invalid := -1
482
483
484 if !seenPoint {
485 if s.ch == '0' {
486 s.nextch()
487 switch lower(s.ch) {
488 case 'x':
489 s.nextch()
490 base, prefix = 16, 'x'
491 case 'o':
492 s.nextch()
493 base, prefix = 8, 'o'
494 case 'b':
495 s.nextch()
496 base, prefix = 2, 'b'
497 default:
498 base, prefix = 8, '0'
499 digsep = 1
500 }
501 }
502 digsep |= s.digits(base, &invalid)
503 if s.ch == '.' {
504 if prefix == 'o' || prefix == 'b' {
505 s.errorf("invalid radix point in %s literal", baseName(base))
506 ok = false
507 }
508 s.nextch()
509 seenPoint = true
510 }
511 }
512
513
514 if seenPoint {
515 kind = FloatLit
516 digsep |= s.digits(base, &invalid)
517 }
518
519 if digsep&1 == 0 && ok {
520 s.errorf("%s literal has no digits", baseName(base))
521 ok = false
522 }
523
524
525 if e := lower(s.ch); e == 'e' || e == 'p' {
526 if ok {
527 switch {
528 case e == 'e' && prefix != 0 && prefix != '0':
529 s.errorf("%q exponent requires decimal mantissa", s.ch)
530 ok = false
531 case e == 'p' && prefix != 'x':
532 s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
533 ok = false
534 }
535 }
536 s.nextch()
537 kind = FloatLit
538 if s.ch == '+' || s.ch == '-' {
539 s.nextch()
540 }
541 digsep = s.digits(10, nil) | digsep&2
542 if digsep&1 == 0 && ok {
543 s.errorf("exponent has no digits")
544 ok = false
545 }
546 } else if prefix == 'x' && kind == FloatLit && ok {
547 s.errorf("hexadecimal mantissa requires a 'p' exponent")
548 ok = false
549 }
550
551
552 if s.ch == 'i' {
553 kind = ImagLit
554 s.nextch()
555 }
556
557 s.setLit(kind, ok)
558
559 if kind == IntLit && invalid >= 0 && ok {
560 s.errorAtf(invalid, "invalid digit %q in %s literal", s.lit[invalid], baseName(base))
561 ok = false
562 }
563
564 if digsep&2 != 0 && ok {
565 if i := invalidSep(s.lit); i >= 0 {
566 s.errorAtf(i, "'_' must separate successive digits")
567 ok = false
568 }
569 }
570
571 s.bad = !ok
572 }
573
// baseName returns the English name of a number base for use in error
// messages. It panics for any base other than 2, 8, 10, or 16.
func baseName(base int) string {
	name := ""
	switch base {
	case 2:
		name = "binary"
	case 8:
		name = "octal"
	case 10:
		name = "decimal"
	case 16:
		name = "hexadecimal"
	default:
		panic("invalid base")
	}
	return name
}
587
588
589 func invalidSep(x string) int {
590 x1 := ' '
591 d := '.'
592 i := 0
593
594
595 if len(x) >= 2 && x[0] == '0' {
596 x1 = lower(rune(x[1]))
597 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
598 d = '0'
599 i = 2
600 }
601 }
602
603
604 for ; i < len(x); i++ {
605 p := d
606 d = rune(x[i])
607 switch {
608 case d == '_':
609 if p != '0' {
610 return i
611 }
612 case isDecimal(d) || x1 == 'x' && isHex(d):
613 d = '0'
614 default:
615 if p == '_' {
616 return i - 1
617 }
618 d = '.'
619 }
620 }
621 if d == '_' {
622 return len(x) - 1
623 }
624
625 return -1
626 }
627
628 func (s *scanner) rune() {
629 ok := true
630 s.nextch()
631
632 n := 0
633 for ; ; n++ {
634 if s.ch == '\'' {
635 if ok {
636 if n == 0 {
637 s.errorf("empty rune literal or unescaped '")
638 ok = false
639 } else if n != 1 {
640 s.errorAtf(0, "more than one character in rune literal")
641 ok = false
642 }
643 }
644 s.nextch()
645 break
646 }
647 if s.ch == '\\' {
648 s.nextch()
649 if !s.escape('\'') {
650 ok = false
651 }
652 continue
653 }
654 if s.ch == '\n' {
655 if ok {
656 s.errorf("newline in rune literal")
657 ok = false
658 }
659 break
660 }
661 if s.ch < 0 {
662 if ok {
663 s.errorAtf(0, "rune literal not terminated")
664 ok = false
665 }
666 break
667 }
668 s.nextch()
669 }
670
671 s.setLit(RuneLit, ok)
672 }
673
674 func (s *scanner) stdString() {
675 ok := true
676 s.nextch()
677
678 for {
679 if s.ch == '"' {
680 s.nextch()
681 break
682 }
683 if s.ch == '\\' {
684 s.nextch()
685 if !s.escape('"') {
686 ok = false
687 }
688 continue
689 }
690 if s.ch == '\n' {
691 s.errorf("newline in string")
692 ok = false
693 break
694 }
695 if s.ch < 0 {
696 s.errorAtf(0, "string not terminated")
697 ok = false
698 break
699 }
700 s.nextch()
701 }
702
703 s.setLit(StringLit, ok)
704 }
705
706 func (s *scanner) rawString() {
707 ok := true
708 s.nextch()
709
710 for {
711 if s.ch == '`' {
712 s.nextch()
713 break
714 }
715 if s.ch < 0 {
716 s.errorAtf(0, "string not terminated")
717 ok = false
718 break
719 }
720 s.nextch()
721 }
722
723
724
725
726 s.setLit(StringLit, ok)
727 }
728
729 func (s *scanner) comment(text string) {
730 s.errorAtf(0, "%s", text)
731 }
732
733 func (s *scanner) skipLine() {
734
735 for s.ch >= 0 && s.ch != '\n' {
736 s.nextch()
737 }
738 }
739
740 func (s *scanner) lineComment() {
741
742
743 if s.mode&comments != 0 {
744 s.skipLine()
745 s.comment(string(s.segment()))
746 return
747 }
748
749
750 if s.mode&directives == 0 || (s.ch != 'g' && s.ch != 'l') {
751 s.stop()
752 s.skipLine()
753 return
754 }
755
756
757 prefix := "go:"
758 if s.ch == 'l' {
759 prefix = "line "
760 }
761 for _, m := range prefix {
762 if s.ch != m {
763 s.stop()
764 s.skipLine()
765 return
766 }
767 s.nextch()
768 }
769
770
771 s.skipLine()
772 s.comment(string(s.segment()))
773 }
774
775 func (s *scanner) skipComment() bool {
776 for s.ch >= 0 {
777 for s.ch == '*' {
778 s.nextch()
779 if s.ch == '/' {
780 s.nextch()
781 return true
782 }
783 }
784 s.nextch()
785 }
786 s.errorAtf(0, "comment not terminated")
787 return false
788 }
789
790 func (s *scanner) fullComment() {
791
792
793 if s.mode&comments != 0 {
794 if s.skipComment() {
795 s.comment(string(s.segment()))
796 }
797 return
798 }
799
800 if s.mode&directives == 0 || s.ch != 'l' {
801 s.stop()
802 s.skipComment()
803 return
804 }
805
806
807 const prefix = "line "
808 for _, m := range prefix {
809 if s.ch != m {
810 s.stop()
811 s.skipComment()
812 return
813 }
814 s.nextch()
815 }
816
817
818 if s.skipComment() {
819 s.comment(string(s.segment()))
820 }
821 }
822
823 func (s *scanner) escape(quote rune) bool {
824 var n int
825 var base, max uint32
826
827 switch s.ch {
828 case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
829 s.nextch()
830 return true
831 case '0', '1', '2', '3', '4', '5', '6', '7':
832 n, base, max = 3, 8, 255
833 case 'x':
834 s.nextch()
835 n, base, max = 2, 16, 255
836 case 'u':
837 s.nextch()
838 n, base, max = 4, 16, unicode.MaxRune
839 case 'U':
840 s.nextch()
841 n, base, max = 8, 16, unicode.MaxRune
842 default:
843 if s.ch < 0 {
844 return true
845 }
846 s.errorf("unknown escape")
847 return false
848 }
849
850 var x uint32
851 for i := n; i > 0; i-- {
852 if s.ch < 0 {
853 return true
854 }
855 d := base
856 if isDecimal(s.ch) {
857 d = uint32(s.ch) - '0'
858 } else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' {
859 d = uint32(lower(s.ch)) - 'a' + 10
860 }
861 if d >= base {
862 s.errorf("invalid character %q in %s escape", s.ch, baseName(int(base)))
863 return false
864 }
865
866 x = x*base + d
867 s.nextch()
868 }
869
870 if x > max && base == 8 {
871 s.errorf("octal escape value %d > 255", x)
872 return false
873 }
874
875 if x > max || 0xD800 <= x && x < 0xE000 {
876 s.errorf("escape is invalid Unicode code point %#U", x)
877 return false
878 }
879
880 return true
881 }
882
View as plain text