Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8 package scanner
9
10 import (
11 "bytes"
12 "fmt"
13 "go/token"
14 "path/filepath"
15 "strconv"
16 "unicode"
17 "unicode/utf8"
18 )
19
20
21
22
23
// An ErrorHandler is the callback type used to report scan errors.
// When a Scanner has a non-nil handler, the handler is called for each
// reported error with the position of the offending source text and a
// message describing the problem.
type ErrorHandler func(pos token.Position, msg string)
25
26
27
28
// A Scanner holds the scanner's internal state while processing
// a given text. It must be initialized via Init before use.
type Scanner struct {
	// immutable state (set by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; may be nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune      // current character (eof once the source is exhausted)
	offset     int       // byte offset of ch
	rdOffset   int       // reading offset (byte offset just after ch)
	lineOffset int       // byte offset at which the current line starts
	insertSemi bool      // if set, a newline or EOF produces a SEMICOLON token
	nlPos      token.Pos // position of the first newline in a preceding /*...*/ comment, if a ';' is pending

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
48
const (
	bom = 0xFEFF // byte order mark; skipped by Init when it is the first character, reported as an error elsewhere
	eof = -1     // value stored in Scanner.ch once the end of the source has been reached
)
53
54
55
56
57
58
59 func (s *Scanner) next() {
60 if s.rdOffset < len(s.src) {
61 s.offset = s.rdOffset
62 if s.ch == '\n' {
63 s.lineOffset = s.offset
64 s.file.AddLine(s.offset)
65 }
66 r, w := rune(s.src[s.rdOffset]), 1
67 switch {
68 case r == 0:
69 s.error(s.offset, "illegal character NUL")
70 case r >= utf8.RuneSelf:
71
72 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
73 if r == utf8.RuneError && w == 1 {
74 s.error(s.offset, "illegal UTF-8 encoding")
75 } else if r == bom && s.offset > 0 {
76 s.error(s.offset, "illegal byte order mark")
77 }
78 }
79 s.rdOffset += w
80 s.ch = r
81 } else {
82 s.offset = len(s.src)
83 if s.ch == '\n' {
84 s.lineOffset = s.offset
85 s.file.AddLine(s.offset)
86 }
87 s.ch = eof
88 }
89 }
90
91
92
93 func (s *Scanner) peek() byte {
94 if s.rdOffset < len(s.src) {
95 return s.src[s.rdOffset]
96 }
97 return 0
98 }
99
100
101
// A Mode value is a set of flags (or 0) that controls scanner behavior.
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens; when unset, Scan skips them
	dontInsertSemis                  // when set, Scan never updates the automatic semicolon insertion state
)
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
124
125 if file.Size() != len(src) {
126 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
127 }
128 s.file = file
129 s.dir, _ = filepath.Split(file.Name())
130 s.src = src
131 s.err = err
132 s.mode = mode
133
134 s.ch = ' '
135 s.offset = 0
136 s.rdOffset = 0
137 s.lineOffset = 0
138 s.insertSemi = false
139 s.ErrorCount = 0
140
141 s.next()
142 if s.ch == bom {
143 s.next()
144 }
145 }
146
147 func (s *Scanner) error(offs int, msg string) {
148 if s.err != nil {
149 s.err(s.file.Position(s.file.Pos(offs)), msg)
150 }
151 s.ErrorCount++
152 }
153
154 func (s *Scanner) errorf(offs int, format string, args ...any) {
155 s.error(offs, fmt.Sprintf(format, args...))
156 }
157
158
159
160
// scanComment returns the text of the comment and (if nonzero)
// the offset of the first newline within it, which implies a
// /*...*/ comment.
//
// It expects the comment's initial '/' to have been consumed already and
// s.ch to hold the second character ('/' or '*').
func (s *Scanner) scanComment() (string, int) {
	offs := s.offset - 1 // position of initial '/'
	next := -1           // position immediately following the comment; < 0 means the comment was not terminated
	numCR := 0           // number of '\r' characters seen in the comment
	nlOffset := 0        // offset of first newline within /*...*/ comment

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		// if we are at '\n', the position following the comment is afterwards
		next = s.offset
		if s.ch == '\n' {
			next++
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		} else if ch == '\n' && nlOffset == 0 {
			nlOffset = s.offset
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			next = s.offset
			goto exit
		}
	}

	// Only reached when the source ended inside a /*-style comment.
	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// A //-comment may end in '\r' when its line is terminated by "\r\n".
	// Remove that final '\r' before the text is examined for a line
	// directive; any remaining '\r' are removed by stripCR below.
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}

	// Interpret line directives. Only terminated comments qualify, and
	// //-style directives must start at the beginning of the current line.
	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
		s.updateLineInfo(next, offs, lit)
	}

	if numCR > 0 {
		lit = stripCR(lit, lit[1] == '*')
	}

	return string(lit), nlOffset
}
230
// prefix is the text that must directly follow the two-character comment
// opener for scanComment to treat the comment as a line directive.
var prefix = []byte("line ")
232
233
234
235
// updateLineInfo parses the incoming comment text at offset offs
// as a line directive. If successful, it updates the line info table
// for the position next per the line directive.
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7

	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // no ':' in the text: ignore (not a line directive)
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// Upper bound accepted for line and column numbers.
	const maxLineCol = 1 << 30
	var line, col int
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		//line filename:line:col
		// After the swap, i indexes the line number and i2 the column number.
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 || col > maxLineCol {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		//line filename:line
		line = n
	}

	if line == 0 || line > maxLineCol {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// If we have a column (//line filename:line:col form),
	// an empty filename means to use the filename currently
	// reported for this position.
	filename := string(text[:i-1]) // lop off ":line", and trim white space
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the source file's directory.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}
298
// trailingDigits looks for the last ':' in text and parses everything
// after it as an unsigned decimal number.
//
// It returns the index just past that ':' (0 if text contains no ':'),
// the parsed value, and whether parsing succeeded. A value too large
// for an int is saturated at the largest int instead of wrapping around
// to a negative number, so that callers' upper-bound checks reject it.
func trailingDigits(text []byte) (int, int, bool) {
	const maxInt = int(^uint(0) >> 1)

	i := bytes.LastIndexByte(text, ':') // look from the right: the text before may itself contain ':'
	if i < 0 {
		return 0, 0, false // no ":"
	}
	// i >= 0
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	if n > uint64(maxInt) {
		// int(n) would be negative and slip past "== 0 || > max" checks.
		n = uint64(maxInt)
	}
	return i + 1, int(n), err == nil
}
308
309 func isLetter(ch rune) bool {
310 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
311 }
312
313 func isDigit(ch rune) bool {
314 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
315 }
316
317
318
319
320
321
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// Be careful when making changes to this function: the loop below bypasses
// next for ASCII input and maintains s.ch, s.offset and s.rdOffset by hand.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Fast path for ASCII identifiers: walk the raw bytes that follow the
	// current character and only update the scanner state once a byte that
	// is not an ASCII letter, digit or underscore is found.
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// ASCII identifier byte: keep going without touching s.ch.
			continue
		}
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// b is an ASCII byte other than NUL that cannot be part of the
			// identifier. Make it the current character directly instead of
			// calling s.next(): the skipped bytes were identifier characters,
			// so none of the newline, NUL or UTF-8 handling in next applies.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// b is NUL or starts a non-ASCII sequence: hand over to s.next()
		// and continue with the general character-by-character loop.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier runs to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
366
367 func digitVal(ch rune) int {
368 switch {
369 case '0' <= ch && ch <= '9':
370 return int(ch - '0')
371 case 'a' <= lower(ch) && lower(ch) <= 'f':
372 return int(lower(ch) - 'a' + 10)
373 }
374 return 16
375 }
376
// lower returns ch with the bit that distinguishes ASCII upper-case from
// lower-case letters set; for an ASCII letter this is its lower-case form.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}

// isDecimal reports whether ch is an ASCII decimal digit.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// isHex reports whether ch is an ASCII hexadecimal digit (0-9, a-f, A-F).
func isHex(ch rune) bool {
	if isDecimal(ch) {
		return true
	}
	lc := lower(ch)
	return lc >= 'a' && lc <= 'f'
}
380
381
382
383
384
385
386
387 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
388 if base <= 10 {
389 max := rune('0' + base)
390 for isDecimal(s.ch) || s.ch == '_' {
391 ds := 1
392 if s.ch == '_' {
393 ds = 2
394 } else if s.ch >= max && *invalid < 0 {
395 *invalid = s.offset
396 }
397 digsep |= ds
398 s.next()
399 }
400 } else {
401 for isHex(s.ch) || s.ch == '_' {
402 ds := 1
403 if s.ch == '_' {
404 ds = 2
405 }
406 digsep |= ds
407 s.next()
408 }
409 }
410 return
411 }
412
// scanNumber scans the numeric literal starting at the current character
// and returns its token kind (INT, FLOAT or IMAG) together with the
// literal text. Malformed literals are reported through s.error but are
// still consumed and returned.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // offset of the first invalid digit in the literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		// Exponent digits are always decimal; with base 10 digits never
		// dereferences its invalid argument, so nil is fine here.
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	// An out-of-range digit is only an error if the literal stayed an integer.
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
499
// litname returns the name used in error messages for a number literal
// with the given prefix: 'x' is hexadecimal, 'o' and '0' are octal, 'b'
// is binary, and anything else is decimal.
func litname(prefix rune) string {
	if prefix == 'x' {
		return "hexadecimal literal"
	}
	if prefix == 'o' || prefix == '0' {
		return "octal literal"
	}
	if prefix == 'b' {
		return "binary literal"
	}
	return "decimal literal"
}
511
512
513 func invalidSep(x string) int {
514 x1 := ' '
515 d := '.'
516 i := 0
517
518
519 if len(x) >= 2 && x[0] == '0' {
520 x1 = lower(rune(x[1]))
521 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
522 d = '0'
523 i = 2
524 }
525 }
526
527
528 for ; i < len(x); i++ {
529 p := d
530 d = rune(x[i])
531 switch {
532 case d == '_':
533 if p != '0' {
534 return i
535 }
536 case isDecimal(d) || x1 == 'x' && isHex(d):
537 d = '0'
538 default:
539 if p == '_' {
540 return i - 1
541 }
542 d = '.'
543 }
544 }
545 if d == '_' {
546 return len(x) - 1
547 }
548
549 return -1
550 }
551
552
553
554
555
556 func (s *Scanner) scanEscape(quote rune) bool {
557 offs := s.offset
558
559 var n int
560 var base, max uint32
561 switch s.ch {
562 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
563 s.next()
564 return true
565 case '0', '1', '2', '3', '4', '5', '6', '7':
566 n, base, max = 3, 8, 255
567 case 'x':
568 s.next()
569 n, base, max = 2, 16, 255
570 case 'u':
571 s.next()
572 n, base, max = 4, 16, unicode.MaxRune
573 case 'U':
574 s.next()
575 n, base, max = 8, 16, unicode.MaxRune
576 default:
577 msg := "unknown escape sequence"
578 if s.ch < 0 {
579 msg = "escape sequence not terminated"
580 }
581 s.error(offs, msg)
582 return false
583 }
584
585 var x uint32
586 for n > 0 {
587 d := uint32(digitVal(s.ch))
588 if d >= base {
589 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
590 if s.ch < 0 {
591 msg = "escape sequence not terminated"
592 }
593 s.error(s.offset, msg)
594 return false
595 }
596 x = x*base + d
597 s.next()
598 n--
599 }
600
601 if x > max || 0xD800 <= x && x < 0xE000 {
602 s.error(offs, "escape sequence is invalid Unicode code point")
603 return false
604 }
605
606 return true
607 }
608
609 func (s *Scanner) scanRune() string {
610
611 offs := s.offset - 1
612
613 valid := true
614 n := 0
615 for {
616 ch := s.ch
617 if ch == '\n' || ch < 0 {
618
619 if valid {
620 s.error(offs, "rune literal not terminated")
621 valid = false
622 }
623 break
624 }
625 s.next()
626 if ch == '\'' {
627 break
628 }
629 n++
630 if ch == '\\' {
631 if !s.scanEscape('\'') {
632 valid = false
633 }
634
635 }
636 }
637
638 if valid && n != 1 {
639 s.error(offs, "illegal rune literal")
640 }
641
642 return string(s.src[offs:s.offset])
643 }
644
645 func (s *Scanner) scanString() string {
646
647 offs := s.offset - 1
648
649 for {
650 ch := s.ch
651 if ch == '\n' || ch < 0 {
652 s.error(offs, "string literal not terminated")
653 break
654 }
655 s.next()
656 if ch == '"' {
657 break
658 }
659 if ch == '\\' {
660 s.scanEscape('"')
661 }
662 }
663
664 return string(s.src[offs:s.offset])
665 }
666
// stripCR returns a copy of b with carriage returns removed. If comment
// is set, b is taken to be a /*-style comment and a '\r' sitting between
// a '*' and a '/' in the comment body is kept, so that removing it does
// not create a "*/" that was not in the original text.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for j, ch := range b {
		if ch == '\r' {
			// The '*' must come after the opening "/*" to count.
			keep := comment &&
				len(out) > len("/*") &&
				out[len(out)-1] == '*' &&
				j+1 < len(b) && b[j+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, ch)
	}
	return out
}
683
684 func (s *Scanner) scanRawString() string {
685
686 offs := s.offset - 1
687
688 hasCR := false
689 for {
690 ch := s.ch
691 if ch < 0 {
692 s.error(offs, "raw string literal not terminated")
693 break
694 }
695 s.next()
696 if ch == '`' {
697 break
698 }
699 if ch == '\r' {
700 hasCR = true
701 }
702 }
703
704 lit := s.src[offs:s.offset]
705 if hasCR {
706 lit = stripCR(lit, false)
707 }
708
709 return string(lit)
710 }
711
712 func (s *Scanner) skipWhitespace() {
713 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
714 s.next()
715 }
716 }
717
718
719
720
721
722
723
724 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
725 if s.ch == '=' {
726 s.next()
727 return tok1
728 }
729 return tok0
730 }
731
732 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
733 if s.ch == '=' {
734 s.next()
735 return tok1
736 }
737 if s.ch == ch2 {
738 s.next()
739 return tok2
740 }
741 return tok0
742 }
743
744 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
745 if s.ch == '=' {
746 s.next()
747 return tok1
748 }
749 if s.ch == ch2 {
750 s.next()
751 if s.ch == '=' {
752 s.next()
753 return tok3
754 }
755 return tok2
756 }
757 return tok0
758 }
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// For identifiers, keywords, numbers, rune and string literals, and
// comments, lit is the token text. A token.SEMICOLON has the literal ";"
// if the semicolon was present in the source, and "\n" if it was inserted
// because of a newline or at EOF. A token.ILLEGAL has the offending
// character as its literal. For all other tokens lit is empty.
//
// Comments are only returned if the scanner was initialized with the
// ScanComments mode; otherwise they are skipped.
//
// Scan reports problems through the scanner's error handling (see
// ErrorCount) and keeps going, so a returned token may be malformed.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	if s.nlPos.IsValid() {
		// Return artificial ';' token after /*...*/ comment
		// containing newline, at position of first newline.
		pos, tok, lit = s.nlPos, token.SEMICOLON, "\n"
		s.nlPos = token.NoPos
		return
	}

	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// Only literals longer than one character are looked up as
			// keywords; a single letter is always an identifier.
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case eof:
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// We only reach here if s.insertSemi was set in the first
			// place; otherwise skipWhitespace would have skipped the
			// newline.
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by the number case above
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				comment, nlOffset := s.scanComment()
				if s.insertSemi && nlOffset != 0 {
					// For /*...*/ containing \n, return
					// COMMENT then artificial SEMICOLON.
					s.nlPos = s.file.Pos(nlOffset)
					s.insertSemi = false
				} else {
					insertSemi = s.insertSemi // preserve insertSemi info
				}
				if s.mode&ScanComments == 0 {
					// skip comment
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				// Report a dedicated error for the curly quotation
				// marks U+201C and U+201D.
				if ch == '“' || ch == '”' {
					s.errorf(s.file.Offset(pos), "curly quotation mark %q (use neutral %q)", ch, '"')
				} else {
					s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
				}
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
965
View as plain text