1
2
3
4
5
6
7
// Package utf8 provides constants and functions for working with UTF-8
// encoded text: decoding and encoding runes, counting runes, and validating
// byte sequences and code points.
package utf8
9
10
11
12
13
14
// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // U+FFFD, returned by the decoders for invalid input and written by the encoders for invalid runes
	RuneSelf  = 0x80         // bytes below RuneSelf are decoded as themselves, with width 1
	MaxRune   = '\U0010FFFF' // largest rune accepted by ValidRune, RuneLen and the encoders
	UTFMax    = 4            // longest encoding handled here, in bytes
)
21
22
// Inclusive bounds of the surrogate range. ValidRune rejects these code
// points, RuneLen reports -1 for them, and the encoders write RuneError in
// their place.
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)
27
const (
	// Leading-bit patterns. tx tags a continuation byte; t2, t3 and t4 are
	// OR-ed into the first byte of 2-, 3- and 4-byte encodings.
	t1 = 0b00000000
	tx = 0b10000000
	t2 = 0b11000000
	t3 = 0b11100000
	t4 = 0b11110000
	t5 = 0b11111000

	// Payload masks. maskx selects the data bits of a continuation byte;
	// mask2, mask3 and mask4 select the data bits of the first byte of a
	// 2-, 3- and 4-byte sequence.
	maskx = 0b00111111
	mask2 = 0b00011111
	mask3 = 0b00001111
	mask4 = 0b00000111

	// Largest rune that fits in a 1-, 2- and 3-byte encoding.
	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// Default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// Codes stored in the first table. The high nibble is an index into
	// acceptRanges, or F for the two one-byte cases; the low three bits are
	// the sequence length. For the F cases, bit 0 tells invalid (1) apart
	// from ASCII (0).
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
63
// The three bytes of the UTF-8 encoding of RuneError, precomputed for the
// fallback branch of the encoders.
const (
	runeErrorByte0 = t3 | (RuneError >> 12)
	runeErrorByte1 = tx | (RuneError>>6)&maskx
	runeErrorByte2 = tx | RuneError&maskx
)
69
70
// first maps every possible first byte of a sequence to one of the codes
// as, xx or s1-s7: the low three bits give the sequence length and the high
// nibble selects the acceptRanges entry that constrains the second byte.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
91
92
93
// acceptRange is the inclusive range [lo, hi] of values permitted for the
// second byte of a multi-byte sequence.
type acceptRange struct {
	lo, hi uint8 // lowest and highest accepted value
}
98
99
// acceptRanges is indexed by the high nibble of a first-table code and gives
// the values permitted for the second byte of the sequence. It has 16 slots
// so that any nibble is a valid index, but only entries 0-4 are populated;
// the rest keep the zero value.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb}, // default: any continuation byte
	1: {0xA0, hicb}, // first byte 0xE0 (s2)
	2: {locb, 0x9F}, // first byte 0xED (s4)
	3: {0x90, hicb}, // first byte 0xF0 (s5)
	4: {locb, 0x8F}, // first byte 0xF4 (s7)
}
107
108
109
110 func FullRune(p []byte) bool {
111 n := len(p)
112 if n == 0 {
113 return false
114 }
115 x := first[p[0]]
116 if n >= int(x&7) {
117 return true
118 }
119
120 accept := acceptRanges[x>>4]
121 if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
122 return true
123 } else if n > 2 && (p[2] < locb || hicb < p[2]) {
124 return true
125 }
126 return false
127 }
128
129
130 func FullRuneInString(s string) bool {
131 n := len(s)
132 if n == 0 {
133 return false
134 }
135 x := first[s[0]]
136 if n >= int(x&7) {
137 return true
138 }
139
140 accept := acceptRanges[x>>4]
141 if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
142 return true
143 } else if n > 2 && (s[2] < locb || hicb < s[2]) {
144 return true
145 }
146 return false
147 }
148
149
150
151
152
153
154
155
156
// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
// its width in bytes. A first byte below RuneSelf is returned as-is with
// width 1; every other input, including an empty p, is handed to
// decodeRuneSlow, which returns (RuneError, 0) for empty input and
// (RuneError, 1) for an invalid encoding.
func DecodeRune(p []byte) (r rune, size int) {
	// ASCII fast path. The loop body runs at most once: it inspects only
	// p[0] and is skipped entirely when p is empty.
	// NOTE(review): the unusual loop shape is presumably what keeps this
	// function cheap enough to inline — confirm before restructuring.
	for _, b := range p {
		if b < RuneSelf {
			return rune(b), 1
		}
		break
	}
	r, size = decodeRuneSlow(p)
	return
}
170
171 func decodeRuneSlow(p []byte) (r rune, size int) {
172 n := len(p)
173 if n < 1 {
174 return RuneError, 0
175 }
176 p0 := p[0]
177 x := first[p0]
178 if x >= as {
179
180
181
182 mask := rune(x) << 31 >> 31
183 return rune(p[0])&^mask | RuneError&mask, 1
184 }
185 sz := int(x & 7)
186 accept := acceptRanges[x>>4]
187 if n < sz {
188 return RuneError, 1
189 }
190 b1 := p[1]
191 if b1 < accept.lo || accept.hi < b1 {
192 return RuneError, 1
193 }
194 if sz <= 2 {
195 return rune(p0&mask2)<<6 | rune(b1&maskx), 2
196 }
197 b2 := p[2]
198 if b2 < locb || hicb < b2 {
199 return RuneError, 1
200 }
201 if sz <= 3 {
202 return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
203 }
204 b3 := p[3]
205 if b3 < locb || hicb < b3 {
206 return RuneError, 1
207 }
208 return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
209 }
210
211
212
213
214
215
216
217
218
// DecodeRuneInString is like DecodeRune but its input is a string. A first
// byte below RuneSelf is returned as-is with width 1; every other input,
// including the empty string, is handed to decodeRuneInStringSlow, which
// returns (RuneError, 0) for empty input and (RuneError, 1) for an invalid
// encoding.
func DecodeRuneInString(s string) (r rune, size int) {
	// ASCII fast path.
	// NOTE(review): the if/else with a trailing bare return is presumably
	// shaped this way to keep the function cheap enough to inline — confirm
	// before simplifying.
	if s != "" && s[0] < RuneSelf {
		return rune(s[0]), 1
	} else {
		r, size = decodeRuneInStringSlow(s)
	}
	return
}
230
231 func decodeRuneInStringSlow(s string) (rune, int) {
232 n := len(s)
233 if n < 1 {
234 return RuneError, 0
235 }
236 s0 := s[0]
237 x := first[s0]
238 if x >= as {
239
240
241
242 mask := rune(x) << 31 >> 31
243 return rune(s[0])&^mask | RuneError&mask, 1
244 }
245 sz := int(x & 7)
246 accept := acceptRanges[x>>4]
247 if n < sz {
248 return RuneError, 1
249 }
250 s1 := s[1]
251 if s1 < accept.lo || accept.hi < s1 {
252 return RuneError, 1
253 }
254 if sz <= 2 {
255 return rune(s0&mask2)<<6 | rune(s1&maskx), 2
256 }
257 s2 := s[2]
258 if s2 < locb || hicb < s2 {
259 return RuneError, 1
260 }
261 if sz <= 3 {
262 return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
263 }
264 s3 := s[3]
265 if s3 < locb || hicb < s3 {
266 return RuneError, 1
267 }
268 return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
269 }
270
271
272
273
274
275
276
277
278
279 func DecodeLastRune(p []byte) (r rune, size int) {
280 end := len(p)
281 if end == 0 {
282 return RuneError, 0
283 }
284 start := end - 1
285 r = rune(p[start])
286 if r < RuneSelf {
287 return r, 1
288 }
289
290
291
292 lim := max(end-UTFMax, 0)
293 for start--; start >= lim; start-- {
294 if RuneStart(p[start]) {
295 break
296 }
297 }
298 if start < 0 {
299 start = 0
300 }
301 r, size = DecodeRune(p[start:end])
302 if start+size != end {
303 return RuneError, 1
304 }
305 return r, size
306 }
307
308
309
310
311
312
313
314
315
316 func DecodeLastRuneInString(s string) (r rune, size int) {
317 end := len(s)
318 if end == 0 {
319 return RuneError, 0
320 }
321 start := end - 1
322 r = rune(s[start])
323 if r < RuneSelf {
324 return r, 1
325 }
326
327
328
329 lim := max(end-UTFMax, 0)
330 for start--; start >= lim; start-- {
331 if RuneStart(s[start]) {
332 break
333 }
334 }
335 if start < 0 {
336 start = 0
337 }
338 r, size = DecodeRuneInString(s[start:end])
339 if start+size != end {
340 return RuneError, 1
341 }
342 return r, size
343 }
344
345
346
347 func RuneLen(r rune) int {
348 switch {
349 case r < 0:
350 return -1
351 case r <= rune1Max:
352 return 1
353 case r <= rune2Max:
354 return 2
355 case surrogateMin <= r && r <= surrogateMax:
356 return -1
357 case r <= rune3Max:
358 return 3
359 case r <= MaxRune:
360 return 4
361 }
362 return -1
363 }
364
365
366
367
368 func EncodeRune(p []byte, r rune) int {
369
370 if uint32(r) <= rune1Max {
371 p[0] = byte(r)
372 return 1
373 }
374 return encodeRuneNonASCII(p, r)
375 }
376
377 func encodeRuneNonASCII(p []byte, r rune) int {
378
379 switch i := uint32(r); {
380 case i <= rune2Max:
381 _ = p[1]
382 p[0] = t2 | byte(r>>6)
383 p[1] = tx | byte(r)&maskx
384 return 2
385 case i < surrogateMin, surrogateMax < i && i <= rune3Max:
386 _ = p[2]
387 p[0] = t3 | byte(r>>12)
388 p[1] = tx | byte(r>>6)&maskx
389 p[2] = tx | byte(r)&maskx
390 return 3
391 case i > rune3Max && i <= MaxRune:
392 _ = p[3]
393 p[0] = t4 | byte(r>>18)
394 p[1] = tx | byte(r>>12)&maskx
395 p[2] = tx | byte(r>>6)&maskx
396 p[3] = tx | byte(r)&maskx
397 return 4
398 default:
399 _ = p[2]
400 p[0] = runeErrorByte0
401 p[1] = runeErrorByte1
402 p[2] = runeErrorByte2
403 return 3
404 }
405 }
406
407
408
409
410 func AppendRune(p []byte, r rune) []byte {
411
412 if uint32(r) <= rune1Max {
413 return append(p, byte(r))
414 }
415 return appendRuneNonASCII(p, r)
416 }
417
418 func appendRuneNonASCII(p []byte, r rune) []byte {
419
420 switch i := uint32(r); {
421 case i <= rune2Max:
422 return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
423 case i < surrogateMin, surrogateMax < i && i <= rune3Max:
424 return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
425 case i > rune3Max && i <= MaxRune:
426 return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
427 default:
428 return append(p, runeErrorByte0, runeErrorByte1, runeErrorByte2)
429 }
430 }
431
432
433
434 func RuneCount(p []byte) int {
435 np := len(p)
436 var n int
437 for ; n < np; n++ {
438 if c := p[n]; c >= RuneSelf {
439
440 return n + RuneCountInString(string(p[n:]))
441 }
442 }
443 return n
444 }
445
446
// RuneCountInString returns the number of runes in s. Each byte that is not
// part of a valid encoding counts as one rune.
func RuneCountInString(s string) (n int) {
	// Ranging over a string advances one rune per iteration and steps a
	// single byte past invalid input.
	total := 0
	for range s {
		total++
	}
	return total
}
453
454
455
456
// RuneStart reports whether b could be the first byte of an encoded rune,
// i.e. whether it is anything other than a continuation byte (top two bits
// 10).
func RuneStart(b byte) bool {
	return b>>6 != 0b10
}
458
// ptrSize is the size of a uintptr in bytes: the shift count is 1 when
// uintptr is 64 bits wide (giving 8) and 0 otherwise (giving 4).
const ptrSize = 4 << (^uintptr(0) >> 63)

// hiBits has the top bit of each of its ptrSize bytes set. A word ANDed
// with it is zero exactly when every byte of the word is below 0x80.
const hiBits = 0x8080808080808080 >> (64 - 8*ptrSize)
461
// word packs the first ptrSize bytes of s into a uintptr, with s[0] in the
// least significant byte. It panics if s holds fewer than ptrSize bytes.
// NOTE(review): the explicit byte-by-byte OR is presumably written this way
// so the compiler can merge it into a single load — confirm before
// rewriting as a loop.
func word[T string | []byte](s T) uintptr {
	if ptrSize == 4 {
		return uintptr(s[0]) | uintptr(s[1])<<8 | uintptr(s[2])<<16 | uintptr(s[3])<<24
	}
	return uintptr(uint64(s[0]) | uint64(s[1])<<8 | uint64(s[2])<<16 | uint64(s[3])<<24 | uint64(s[4])<<32 | uint64(s[5])<<40 | uint64(s[6])<<48 | uint64(s[7])<<56)
}
468
469
// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
func Valid(p []byte) bool {
	// Clamp p's capacity to its length.
	// NOTE(review): presumably this spares the generated code from tracking
	// the capacity across the re-slicing below — benchmark before removing.
	p = p[:len(p):len(p)]

	for len(p) > 0 {
		p0 := p[0]
		if p0 < RuneSelf {
			p = p[1:]
			// ASCII fast path: skip one word, then two, then four at a
			// time for as long as no byte in the loaded words has its high
			// bit set. The length tests use > rather than >=, so each skip
			// leaves at least one byte for the outer loop to examine.
			if len(p) > ptrSize && word(p)&hiBits == 0 {
				p = p[ptrSize:]
				if len(p) > 2*ptrSize && (word(p)|word(p[ptrSize:]))&hiBits == 0 {
					p = p[2*ptrSize:]
					for len(p) > 4*ptrSize && ((word(p)|word(p[ptrSize:]))|(word(p[2*ptrSize:])|word(p[3*ptrSize:])))&hiBits == 0 {
						p = p[4*ptrSize:]
					}
				}
			}
			continue
		}
		// Multi-byte sequence: the table gives the expected length and the
		// range allowed for the second byte; later bytes must lie in
		// [locb, hicb].
		x := first[p0]
		size := int(x & 7)
		accept := acceptRanges[x>>4]
		switch size {
		case 2:
			if len(p) < 2 || p[1] < accept.lo || accept.hi < p[1] {
				return false
			}
			p = p[2:]
		case 3:
			if len(p) < 3 || p[1] < accept.lo || accept.hi < p[1] || p[2] < locb || hicb < p[2] {
				return false
			}
			p = p[3:]
		case 4:
			if len(p) < 4 || p[1] < accept.lo || accept.hi < p[1] || p[2] < locb || hicb < p[2] || p[3] < locb || hicb < p[3] {
				return false
			}
			p = p[4:]
		default:
			// size 1 with p0 >= RuneSelf means the table entry is xx:
			// a byte that cannot start a sequence.
			return false
		}
	}
	return true
}
520
521
// ValidString reports whether s consists entirely of valid UTF-8-encoded
// runes.
func ValidString(s string) bool {
	for len(s) > 0 {
		s0 := s[0]
		if s0 < RuneSelf {
			s = s[1:]
			// ASCII fast path: skip one word, then two, then four at a
			// time for as long as no byte in the loaded words has its high
			// bit set. The length tests use > rather than >=, so each skip
			// leaves at least one byte for the outer loop to examine.
			if len(s) > ptrSize && word(s)&hiBits == 0 {
				s = s[ptrSize:]
				if len(s) > 2*ptrSize && (word(s)|word(s[ptrSize:]))&hiBits == 0 {
					s = s[2*ptrSize:]
					for len(s) > 4*ptrSize && ((word(s)|word(s[ptrSize:]))|(word(s[2*ptrSize:])|word(s[3*ptrSize:])))&hiBits == 0 {
						s = s[4*ptrSize:]
					}
				}
			}
			continue
		}
		// Multi-byte sequence: the table gives the expected length and the
		// range allowed for the second byte; later bytes must lie in
		// [locb, hicb].
		x := first[s0]
		size := int(x & 7)
		accept := acceptRanges[x>>4]
		switch size {
		case 2:
			if len(s) < 2 || s[1] < accept.lo || accept.hi < s[1] {
				return false
			}
			s = s[2:]
		case 3:
			if len(s) < 3 || s[1] < accept.lo || accept.hi < s[1] || s[2] < locb || hicb < s[2] {
				return false
			}
			s = s[3:]
		case 4:
			if len(s) < 4 || s[1] < accept.lo || accept.hi < s[1] || s[2] < locb || hicb < s[2] || s[3] < locb || hicb < s[3] {
				return false
			}
			s = s[4:]
		default:
			// size 1 with s0 >= RuneSelf means the table entry is xx:
			// a byte that cannot start a sequence.
			return false
		}
	}
	return true
}
567
568
569
570 func ValidRune(r rune) bool {
571 switch {
572 case 0 <= r && r < surrogateMin:
573 return true
574 case surrogateMax < r && r <= MaxRune:
575 return true
576 }
577 return false
578 }
579
View as plain text