Source file src/cmd/compile/internal/syntax/source.go
1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // This file implements source, a buffered rune reader 6 // specialized for scanning Go code: Reading 7 // ASCII characters, maintaining current (line, col) 8 // position information, and recording of the most 9 // recently read source segment are highly optimized. 10 // This file is self-contained (go tool compile source.go 11 // compiles) and thus could be made into its own package. 12 13 package syntax 14 15 import ( 16 "io" 17 "unicode/utf8" 18 ) 19 20 // The source buffer is accessed using three indices b (begin), 21 // r (read), and e (end): 22 // 23 // - If b >= 0, it points to the beginning of a segment of most 24 // recently read characters (typically a Go literal). 25 // 26 // - r points to the byte immediately following the most recently 27 // read character ch, which starts at r-chw. 28 // 29 // - e points to the byte immediately following the last byte that 30 // was read into the buffer. 31 // 32 // The buffer content is terminated at buf[e] with the sentinel 33 // character utf8.RuneSelf. This makes it possible to test for 34 // the common case of ASCII characters with a single 'if' (see 35 // nextch method). 36 // 37 // +------ content in use -------+ 38 // v v 39 // buf [...read...|...segment...|ch|...unread...|s|...free...] 40 // ^ ^ ^ ^ 41 // | | | | 42 // b r-chw r e 43 // 44 // Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel 45 46 type source struct { 47 in io.Reader 48 errh func(line, col uint, msg string) 49 50 buf []byte // source buffer 51 ioerr error // pending I/O error, or nil 52 b, r, e int // buffer indices (see comment above) 53 line, col uint // source position of ch (0-based) 54 ch rune // most recently read character 55 chw int // width of ch 56 } 57 58 const sentinel = utf8.RuneSelf 59 60 func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) { 61 s.in = in 62 s.errh = errh 63 64 if s.buf == nil { 65 s.buf = make([]byte, nextSize(0)) 66 } 67 s.buf[0] = sentinel 68 s.ioerr = nil 69 s.b, s.r, s.e = -1, 0, 0 70 s.line, s.col = 0, 0 71 s.ch = ' ' 72 s.chw = 0 73 } 74 75 // starting points for line and column numbers 76 const linebase = 1 77 const colbase = 1 78 79 // pos returns the (line, col) source position of s.ch. 80 func (s *source) pos() (line, col uint) { 81 return linebase + s.line, colbase + s.col 82 } 83 84 // error reports the error msg at source position s.pos(). 85 func (s *source) error(msg string) { 86 line, col := s.pos() 87 s.errh(line, col, msg) 88 } 89 90 // start starts a new active source segment (including s.ch). 91 // As long as stop has not been called, the active segment's 92 // bytes (excluding s.ch) may be retrieved by calling segment. 93 func (s *source) start() { s.b = s.r - s.chw } 94 func (s *source) stop() { s.b = -1 } 95 func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] } 96 97 // rewind rewinds the scanner's read position and character s.ch 98 // to the start of the currently active segment, which must not 99 // contain any newlines (otherwise position information will be 100 // incorrect). Currently, rewind is only needed for handling the 101 // source sequence ".."; it must not be called outside an active 102 // segment. 103 func (s *source) rewind() { 104 // ok to verify precondition - rewind is rarely called 105 if s.b < 0 { 106 panic("no active segment") 107 } 108 s.col -= uint(s.r - s.b) 109 s.r = s.b 110 s.nextch() 111 } 112 113 func (s *source) nextch() { 114 redo: 115 s.col += uint(s.chw) 116 if s.ch == '\n' { 117 s.line++ 118 s.col = 0 119 } 120 121 // fast common case: at least one ASCII character 122 if s.ch = rune(s.buf[s.r]); s.ch < sentinel { 123 s.r++ 124 s.chw = 1 125 if s.ch == 0 { 126 s.error("invalid NUL character") 127 goto redo 128 } 129 return 130 } 131 132 // slower general case: add more bytes to buffer if we don't have a full rune 133 for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil { 134 s.fill() 135 } 136 137 // EOF 138 if s.r == s.e { 139 if s.ioerr != io.EOF { 140 // ensure we never start with a '/' (e.g., rooted path) in the error message 141 s.error("I/O error: " + s.ioerr.Error()) 142 s.ioerr = nil 143 } 144 s.ch = -1 145 s.chw = 0 146 return 147 } 148 149 s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e]) 150 s.r += s.chw 151 152 if s.ch == utf8.RuneError && s.chw == 1 { 153 s.error("invalid UTF-8 encoding") 154 goto redo 155 } 156 157 // BOM's are only allowed as the first character in a file 158 const BOM = 0xfeff 159 if s.ch == BOM { 160 if s.line > 0 || s.col > 0 { 161 s.error("invalid BOM in the middle of the file") 162 } 163 goto redo 164 } 165 } 166 167 // fill reads more source bytes into s.buf. 168 // It returns with at least one more byte in the buffer, or with s.ioerr != nil. 169 func (s *source) fill() { 170 // determine content to preserve 171 b := s.r 172 if s.b >= 0 { 173 b = s.b 174 s.b = 0 // after buffer has grown or content has been moved down 175 } 176 content := s.buf[b:s.e] 177 178 // grow buffer or move content down 179 if len(content)*2 > len(s.buf) { 180 s.buf = make([]byte, nextSize(len(s.buf))) 181 copy(s.buf, content) 182 } else if b > 0 { 183 copy(s.buf, content) 184 } 185 s.r -= b 186 s.e -= b 187 188 // read more data: try a limited number of times 189 for i := 0; i < 10; i++ { 190 var n int 191 n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel 192 if n < 0 { 193 panic("negative read") // incorrect underlying io.Reader implementation 194 } 195 if n > 0 || s.ioerr != nil { 196 s.e += n 197 s.buf[s.e] = sentinel 198 return 199 } 200 // n == 0 201 } 202 203 s.buf[s.e] = sentinel 204 s.ioerr = io.ErrNoProgress 205 } 206 207 // nextSize returns the next bigger size for a buffer of a given size. 208 func nextSize(size int) int { 209 const min = 4 << 10 // 4K: minimum buffer size 210 const max = 1 << 20 // 1M: maximum buffer size which is still doubled 211 if size < min { 212 return min 213 } 214 if size <= max { 215 return size << 1 216 } 217 return size + max 218 } 219