1
2
3
4
5 package utf8_test
6
7 import (
8 "bytes"
9 "strings"
10 "testing"
11 "unicode"
12 . "unicode/utf8"
13 )
14
15
// init validates, at package load time, that the constants utf8 redefines
// still agree with their counterparts in package unicode. It panics with a
// message naming the first constant that disagrees.
func init() {
	switch {
	case MaxRune != unicode.MaxRune:
		panic("utf8.MaxRune is wrong")
	case RuneError != unicode.ReplacementChar:
		panic("utf8.RuneError is wrong")
	}
}
24
25
// TestConstants reports an error for each constant that utf8 redefines from
// package unicode (MaxRune, RuneError) whose value differs from the original.
func TestConstants(t *testing.T) {
	if got, want := MaxRune, unicode.MaxRune; got != want {
		t.Errorf("utf8.MaxRune is wrong: %x should be %x", got, want)
	}
	if got, want := RuneError, unicode.ReplacementChar; got != want {
		t.Errorf("utf8.RuneError is wrong: %x should be %x", got, want)
	}
}
34
// Utf8Map pairs a rune with the byte sequence the tests associate with it.
type Utf8Map struct {
	r   rune
	str string
}

// utf8map lists runes together with their UTF-8 encodings. It covers one-,
// two-, three- and four-byte encodings, including values on either side of
// each width boundary, the values bracketing the surrogate range, and U+FFFD.
var utf8map = []Utf8Map{
	{r: 0x0000, str: "\x00"},
	{r: 0x0001, str: "\x01"},
	{r: 0x007e, str: "\x7e"},
	{r: 0x007f, str: "\x7f"},
	{r: 0x0080, str: "\xc2\x80"},
	{r: 0x0081, str: "\xc2\x81"},
	{r: 0x00bf, str: "\xc2\xbf"},
	{r: 0x00c0, str: "\xc3\x80"},
	{r: 0x00c1, str: "\xc3\x81"},
	{r: 0x00c8, str: "\xc3\x88"},
	{r: 0x00d0, str: "\xc3\x90"},
	{r: 0x00e0, str: "\xc3\xa0"},
	{r: 0x00f0, str: "\xc3\xb0"},
	{r: 0x00f8, str: "\xc3\xb8"},
	{r: 0x00ff, str: "\xc3\xbf"},
	{r: 0x0100, str: "\xc4\x80"},
	{r: 0x07ff, str: "\xdf\xbf"},
	{r: 0x0400, str: "\xd0\x80"},
	{r: 0x0800, str: "\xe0\xa0\x80"},
	{r: 0x0801, str: "\xe0\xa0\x81"},
	{r: 0x1000, str: "\xe1\x80\x80"},
	{r: 0xd000, str: "\xed\x80\x80"},
	{r: 0xd7ff, str: "\xed\x9f\xbf"}, // last code point before surrogate half.
	{r: 0xe000, str: "\xee\x80\x80"}, // first code point after surrogate half.
	{r: 0xfffe, str: "\xef\xbf\xbe"},
	{r: 0xffff, str: "\xef\xbf\xbf"},
	{r: 0x10000, str: "\xf0\x90\x80\x80"},
	{r: 0x10001, str: "\xf0\x90\x80\x81"},
	{r: 0x40000, str: "\xf1\x80\x80\x80"},
	{r: 0x10fffe, str: "\xf4\x8f\xbf\xbe"},
	{r: 0x10ffff, str: "\xf4\x8f\xbf\xbf"},
	{r: 0xFFFD, str: "\xef\xbf\xbd"},
}

// surrogateMap lists the first and last surrogate code points with the
// three-byte sequences that would encode them. TestDecodeSurrogateRune
// expects the decoders to reject these sequences.
var surrogateMap = []Utf8Map{
	{r: 0xd800, str: "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
	{r: 0xdfff, str: "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
}
79
// testStrings are the base strings that TestSequencing combines with each
// utf8map entry and that TestRuntimeConversion checks directly: the empty
// string, plain ASCII, three-byte runes only, text mixing one-, two- and
// three-byte runes (once and repeated three times), and four 0x80 bytes.
var testStrings = []string{
	"",
	"abcd",
	"☺☻☹",
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
	"\x80\x80\x80\x80",
}
88
89 func TestFullRune(t *testing.T) {
90 for _, m := range utf8map {
91 b := []byte(m.str)
92 if !FullRune(b) {
93 t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
94 }
95 s := m.str
96 if !FullRuneInString(s) {
97 t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
98 }
99 b1 := b[0 : len(b)-1]
100 if FullRune(b1) {
101 t.Errorf("FullRune(%q) = true, want false", b1)
102 }
103 s1 := string(b1)
104 if FullRuneInString(s1) {
105 t.Errorf("FullRune(%q) = true, want false", s1)
106 }
107 }
108 for _, s := range []string{"\xc0", "\xc1"} {
109 b := []byte(s)
110 if !FullRune(b) {
111 t.Errorf("FullRune(%q) = false, want true", s)
112 }
113 if !FullRuneInString(s) {
114 t.Errorf("FullRuneInString(%q) = false, want true", s)
115 }
116 }
117 }
118
119 func TestEncodeRune(t *testing.T) {
120 for _, m := range utf8map {
121 b := []byte(m.str)
122 var buf [10]byte
123 n := EncodeRune(buf[0:], m.r)
124 b1 := buf[0:n]
125 if !bytes.Equal(b, b1) {
126 t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
127 }
128 }
129 }
130
131 func TestAppendRune(t *testing.T) {
132 for _, m := range utf8map {
133 if buf := AppendRune(nil, m.r); string(buf) != m.str {
134 t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
135 }
136 if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
137 t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
138 }
139 }
140 }
141
142 func TestDecodeRune(t *testing.T) {
143 for _, m := range utf8map {
144 b := []byte(m.str)
145 r, size := DecodeRune(b)
146 if r != m.r || size != len(b) {
147 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
148 }
149 s := m.str
150 r, size = DecodeRuneInString(s)
151 if r != m.r || size != len(b) {
152 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
153 }
154
155
156 r, size = DecodeRune(b[0:cap(b)])
157 if r != m.r || size != len(b) {
158 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
159 }
160 s = m.str + "\x00"
161 r, size = DecodeRuneInString(s)
162 if r != m.r || size != len(b) {
163 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
164 }
165
166
167 wantsize := 1
168 if wantsize >= len(b) {
169 wantsize = 0
170 }
171 r, size = DecodeRune(b[0 : len(b)-1])
172 if r != RuneError || size != wantsize {
173 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
174 }
175 s = m.str[0 : len(m.str)-1]
176 r, size = DecodeRuneInString(s)
177 if r != RuneError || size != wantsize {
178 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
179 }
180
181
182 if len(b) == 1 {
183 b[0] = 0x80
184 } else {
185 b[len(b)-1] = 0x7F
186 }
187 r, size = DecodeRune(b)
188 if r != RuneError || size != 1 {
189 t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
190 }
191 s = string(b)
192 r, size = DecodeRuneInString(s)
193 if r != RuneError || size != 1 {
194 t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
195 }
196
197 }
198 }
199
200 func TestDecodeSurrogateRune(t *testing.T) {
201 for _, m := range surrogateMap {
202 b := []byte(m.str)
203 r, size := DecodeRune(b)
204 if r != RuneError || size != 1 {
205 t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
206 }
207 s := m.str
208 r, size = DecodeRuneInString(s)
209 if r != RuneError || size != 1 {
210 t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
211 }
212 }
213 }
214
215
216
217 func TestSequencing(t *testing.T) {
218 for _, ts := range testStrings {
219 for _, m := range utf8map {
220 for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
221 testSequence(t, s)
222 }
223 }
224 }
225 }
226
// runtimeRuneCount returns the number of runes in s, computed by converting s
// to []rune and taking the length of the result.
// NOTE(review): the expression is presumably kept in exactly this form so
// that the compiler's handling of len([]rune(s)) is what gets exercised;
// confirm before restructuring it.
func runtimeRuneCount(s string) int {
	return len([]rune(s))
}
230
231
232
233
234
235 func TestRuntimeConversion(t *testing.T) {
236 for _, ts := range testStrings {
237 count := RuneCountInString(ts)
238 if n := runtimeRuneCount(ts); n != count {
239 t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, n, count)
240 break
241 }
242
243 runes := []rune(ts)
244 if n := len(runes); n != count {
245 t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, n, count)
246 break
247 }
248 i := 0
249 for _, r := range ts {
250 if r != runes[i] {
251 t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
252 }
253 i++
254 }
255 }
256 }
257
// invalidSequenceTests holds four-byte strings whose leading bytes do not
// form a well-formed UTF-8 sequence. TestDecodeInvalidSequence expects
// decoding the start of each one to yield RuneError.
var invalidSequenceTests = []string{
	// Encodings of the surrogate code points U+D800 and U+DFFF.
	"\xed\xa0\x80\x80",
	"\xed\xbf\xbf\x80",

	// First byte is a continuation byte, not a lead byte.
	"\x91\x80\x80\x80",

	// Two-byte leads 0xC2 and 0xDF followed by a byte outside 0x80-0xBF.
	"\xC2\x7F\x80\x80",
	"\xC2\xC0\x80\x80",
	"\xDF\x7F\x80\x80",
	"\xDF\xC0\x80\x80",

	// Lead 0xE0: second byte outside 0xA0-0xBF, or third byte outside 0x80-0xBF.
	"\xE0\x9F\xBF\x80",
	"\xE0\xA0\x7F\x80",
	"\xE0\xBF\xC0\x80",
	"\xE0\xC0\x80\x80",

	// Lead 0xE1: second or third byte outside 0x80-0xBF.
	"\xE1\x7F\xBF\x80",
	"\xE1\x80\x7F\x80",
	"\xE1\xBF\xC0\x80",
	"\xE1\xC0\x80\x80",

	// Lead 0xED: second byte outside 0x80-0x9F, or third byte outside 0x80-0xBF.
	"\xED\x7F\xBF\x80",
	"\xED\x80\x7F\x80",
	"\xED\x9F\xC0\x80",
	"\xED\xA0\x80\x80",

	// Lead 0xF0: second byte outside 0x90-0xBF, or a later byte outside 0x80-0xBF.
	"\xF0\x8F\xBF\xBF",
	"\xF0\x90\x7F\xBF",
	"\xF0\x90\x80\x7F",
	"\xF0\xBF\xBF\xC0",
	"\xF0\xBF\xC0\x80",
	"\xF0\xC0\x80\x80",

	// Lead 0xF1: any following byte outside 0x80-0xBF.
	"\xF1\x7F\xBF\xBF",
	"\xF1\x80\x7F\xBF",
	"\xF1\x80\x80\x7F",
	"\xF1\xBF\xBF\xC0",
	"\xF1\xBF\xC0\x80",
	"\xF1\xC0\x80\x80",

	// Lead 0xF4: second byte outside 0x80-0x8F, or a later byte outside 0x80-0xBF.
	"\xF4\x7F\xBF\xBF",
	"\xF4\x80\x7F\xBF",
	"\xF4\x80\x80\x7F",
	"\xF4\x8F\xBF\xC0",
	"\xF4\x8F\xC0\x80",
	"\xF4\x90\x80\x80",
}
313
// runtimeDecodeRune returns the first rune produced by ranging over s, or -1
// when s is empty. It relies on the language's range loop rather than on
// this package's decoders.
func runtimeDecodeRune(s string) rune {
	first := rune(-1)
	for _, r := range s {
		first = r
		break
	}
	return first
}
320
321 func TestDecodeInvalidSequence(t *testing.T) {
322 for _, s := range invalidSequenceTests {
323 r1, _ := DecodeRune([]byte(s))
324 if want := RuneError; r1 != want {
325 t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
326 return
327 }
328 r2, _ := DecodeRuneInString(s)
329 if want := RuneError; r2 != want {
330 t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want)
331 return
332 }
333 if r1 != r2 {
334 t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2)
335 return
336 }
337 r3 := runtimeDecodeRune(s)
338 if r2 != r3 {
339 t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
340 return
341 }
342 }
343 }
344
// testSequence checks that decoding s forwards with DecodeRune and
// DecodeRuneInString, and backwards with DecodeLastRune and
// DecodeLastRuneInString, visits the same runes at the same byte offsets as a
// range loop over s. It reports the first discrepancy on t and returns.
func testSequence(t *testing.T, s string) {
	// mark records a rune and the byte offset at which the range loop saw it.
	type mark struct {
		offset int
		r      rune
	}
	b := []byte(s)
	marks := make([]mark, 0, len(s))

	// Forward pass: follow the range loop, advancing our own cursor by the
	// width the decoders report.
	next := 0
	for i, r := range s {
		if next != i {
			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, next, i)
			return
		}
		marks = append(marks, mark{offset: i, r: r})

		r1, size1 := DecodeRune(b[i:])
		if r1 != r {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], r1, r)
			return
		}
		r2, size2 := DecodeRuneInString(s[i:])
		if r2 != r {
			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], r2, r)
			return
		}
		if size1 != size2 {
			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
			return
		}
		next += size1
	}

	// Backward pass: peel runes off the end and compare them with the marks
	// recorded above, last to first.
	end := len(s)
	for k := len(marks) - 1; end > 0; k-- {
		r1, size1 := DecodeLastRune(b[:end])
		r2, size2 := DecodeLastRuneInString(s[:end])
		if size1 != size2 {
			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, end, size1, size2)
			return
		}
		want := marks[k]
		if r1 != want.r {
			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, end, r1, want.r)
			return
		}
		if r2 != want.r {
			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, end, r2, want.r)
			return
		}
		end -= size1
		if end != want.offset {
			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, end, want.offset)
			return
		}
	}
	if end != 0 {
		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, end)
	}
}
404
405
// TestNegativeRune checks that EncodeRune writes the same bytes for the rune
// -1 as it does for RuneError.
func TestNegativeRune(t *testing.T) {
	var wantBuf, gotBuf [UTFMax]byte
	want := wantBuf[:EncodeRune(wantBuf[:], RuneError)]
	got := gotBuf[:EncodeRune(gotBuf[:], -1)]
	if !bytes.Equal(got, want) {
		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", got, want)
	}
}
415
// RuneCountTest pairs an input string with the rune count expected for it.
type RuneCountTest struct {
	in  string
	out int
}

// runecounttests is the table used by TestRuneCount. The last three entries
// contain byte sequences that are not complete encodings.
var runecounttests = []RuneCountTest{
	{in: "abcd", out: 4},
	{in: "☺☻☹", out: 3},
	{in: "1,2,3,4", out: 7},
	{in: "\xe2\x00", out: 2},
	{in: "\xe2\x80", out: 2},
	{in: "a\xe2\x80", out: 3},
}
429
430 func TestRuneCount(t *testing.T) {
431 for _, tt := range runecounttests {
432 if out := RuneCountInString(tt.in); out != tt.out {
433 t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
434 }
435 if out := RuneCount([]byte(tt.in)); out != tt.out {
436 t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
437 }
438 }
439 }
440
// RuneLenTest pairs a rune with the value expected from RuneLen for it.
type RuneLenTest struct {
	r    rune
	size int
}

// runelentests is the table used by TestRuneLen. A size of -1 marks runes
// for which RuneLen is expected to report that no encoding exists.
var runelentests = []RuneLenTest{
	{r: 0, size: 1},
	{r: 'e', size: 1},
	{r: 'é', size: 2},
	{r: '☺', size: 3},
	{r: RuneError, size: 3},
	{r: MaxRune, size: 4},
	{r: 0xD800, size: -1},
	{r: 0xDFFF, size: -1},
	{r: MaxRune + 1, size: -1},
	{r: -1, size: -1},
}
458
459 func TestRuneLen(t *testing.T) {
460 for _, tt := range runelentests {
461 if size := RuneLen(tt.r); size != tt.size {
462 t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
463 }
464 }
465 }
466
// ValidTest pairs an input string with whether it is expected to be valid
// UTF-8.
type ValidTest struct {
	in  string
	out bool
}

// validTests is the table used by TestValid. It mixes well-formed text with
// truncated sequences, bytes that cannot appear in UTF-8, values above
// U+10FFFF, an overlong encoding, and encoded surrogates.
var validTests = []ValidTest{
	{in: "", out: true},
	{in: "a", out: true},
	{in: "abc", out: true},
	{in: "Ж", out: true},
	{in: "ЖЖ", out: true},
	{in: "брэд-ЛГТМ", out: true},
	{in: "☺☻☹", out: true},
	{in: "aa\xe2", out: false},
	{in: string([]byte{66, 250}), out: false},
	{in: string([]byte{66, 250, 67}), out: false},
	{in: "a\uFFFDb", out: true},
	{in: "\xF4\x8F\xBF\xBF", out: true},      // U+10FFFF
	{in: "\xF4\x90\x80\x80", out: false},     // U+10FFFF+1; out of range
	{in: "\xF7\xBF\xBF\xBF", out: false},     // 0x1FFFFF; out of range
	{in: "\xFB\xBF\xBF\xBF\xBF", out: false}, // 0x3FFFFFF; out of range
	{in: "\xc0\x80", out: false},             // U+0000 encoded in two bytes: incorrect
	{in: "\xed\xa0\x80", out: false},         // U+D800 high surrogate (sic)
	{in: "\xed\xbf\xbf", out: false},         // U+DFFF low surrogate (sic)
}
492
493 func TestValid(t *testing.T) {
494 for _, tt := range validTests {
495 if Valid([]byte(tt.in)) != tt.out {
496 t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
497 }
498 if ValidString(tt.in) != tt.out {
499 t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
500 }
501 }
502 }
503
// ValidRuneTest pairs a rune with whether ValidRune is expected to accept it.
type ValidRuneTest struct {
	r  rune
	ok bool
}

// validrunetests is the table used by TestValidRune. It includes the values
// on both sides of the surrogate range, MaxRune and its successor, and -1.
var validrunetests = []ValidRuneTest{
	{r: 0, ok: true},
	{r: 'e', ok: true},
	{r: 'é', ok: true},
	{r: '☺', ok: true},
	{r: RuneError, ok: true},
	{r: MaxRune, ok: true},
	{r: 0xD7FF, ok: true},
	{r: 0xD800, ok: false},
	{r: 0xDFFF, ok: false},
	{r: 0xE000, ok: true},
	{r: MaxRune + 1, ok: false},
	{r: -1, ok: false},
}
523
524 func TestValidRune(t *testing.T) {
525 for _, tt := range validrunetests {
526 if ok := ValidRune(tt.r); ok != tt.ok {
527 t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
528 }
529 }
530 }
531
// BenchmarkRuneCountTenASCIIChars measures RuneCount on a byte slice holding
// ten ASCII digits.
func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
	const digits = "0123456789"
	data := []byte(digits)
	for n := 0; n < b.N; n++ {
		RuneCount(data)
	}
}
538
// BenchmarkRuneCountTenJapaneseChars measures RuneCount on a byte slice
// holding ten Japanese characters.
func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
	const japanese = "日本語日本語日本語日"
	data := []byte(japanese)
	for n := 0; n < b.N; n++ {
		RuneCount(data)
	}
}
545
// BenchmarkRuneCountInStringTenASCIIChars measures RuneCountInString on a
// string of ten ASCII digits.
func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
	const digits = "0123456789"
	for n := 0; n < b.N; n++ {
		RuneCountInString(digits)
	}
}
551
// BenchmarkRuneCountInStringTenJapaneseChars measures RuneCountInString on a
// string of ten Japanese characters.
func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
	const japanese = "日本語日本語日本語日"
	for n := 0; n < b.N; n++ {
		RuneCountInString(japanese)
	}
}
557
// ascii100000 is the ten ASCII digits repeated 10000 times, giving a string
// of 100000 bytes for the large-input benchmarks.
var ascii100000 = strings.Repeat("0123456789", 10000)
559
// BenchmarkValidTenASCIIChars measures Valid on a byte slice holding ten
// ASCII digits.
func BenchmarkValidTenASCIIChars(b *testing.B) {
	const digits = "0123456789"
	data := []byte(digits)
	for n := 0; n < b.N; n++ {
		Valid(data)
	}
}
566
567 func BenchmarkValid100KASCIIChars(b *testing.B) {
568 s := []byte(ascii100000)
569 for i := 0; i < b.N; i++ {
570 Valid(s)
571 }
572 }
573
// BenchmarkValidTenJapaneseChars measures Valid on a byte slice holding ten
// Japanese characters.
func BenchmarkValidTenJapaneseChars(b *testing.B) {
	const japanese = "日本語日本語日本語日"
	data := []byte(japanese)
	for n := 0; n < b.N; n++ {
		Valid(data)
	}
}
580 func BenchmarkValidLongMostlyASCII(b *testing.B) {
581 longMostlyASCII := []byte(longStringMostlyASCII)
582 for i := 0; i < b.N; i++ {
583 Valid(longMostlyASCII)
584 }
585 }
586
587 func BenchmarkValidLongJapanese(b *testing.B) {
588 longJapanese := []byte(longStringJapanese)
589 for i := 0; i < b.N; i++ {
590 Valid(longJapanese)
591 }
592 }
593
// BenchmarkValidStringTenASCIIChars measures ValidString on a string of ten
// ASCII digits.
func BenchmarkValidStringTenASCIIChars(b *testing.B) {
	const digits = "0123456789"
	for n := 0; n < b.N; n++ {
		ValidString(digits)
	}
}
599
600 func BenchmarkValidString100KASCIIChars(b *testing.B) {
601 for i := 0; i < b.N; i++ {
602 ValidString(ascii100000)
603 }
604 }
605
// BenchmarkValidStringTenJapaneseChars measures ValidString on a string of
// ten Japanese characters.
func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
	const japanese = "日本語日本語日本語日"
	for n := 0; n < b.N; n++ {
		ValidString(japanese)
	}
}
611
612 func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
613 for i := 0; i < b.N; i++ {
614 ValidString(longStringMostlyASCII)
615 }
616 }
617
618 func BenchmarkValidStringLongJapanese(b *testing.B) {
619 for i := 0; i < b.N; i++ {
620 ValidString(longStringJapanese)
621 }
622 }
623
// Long inputs for the Valid and ValidString benchmarks, filled in by init.
var (
	// longStringMostlyASCII is built from ten-digit ASCII chunks, with a
	// chunk of Japanese text in place of every hundredth chunk, until it
	// is at least 100000 bytes long.
	longStringMostlyASCII string
	// longStringJapanese is a ten-character Japanese phrase repeated as
	// many whole times as fit in 100000 bytes.
	longStringJapanese string
)

// init builds the long benchmark strings once, at package load time.
func init() {
	const (
		japanese = "日本語日本語日本語日"
		digits   = "0123456789"
		target   = 100_000
	)
	var sb strings.Builder
	for i := 0; sb.Len() < target; i++ {
		chunk := digits
		if i%100 == 0 {
			chunk = japanese
		}
		sb.WriteString(chunk)
	}
	longStringMostlyASCII = sb.String()
	longStringJapanese = strings.Repeat(japanese, target/len(japanese))
}
640
// BenchmarkEncodeASCIIRune measures EncodeRune writing the one-byte rune 'a'
// into a UTFMax-sized buffer.
func BenchmarkEncodeASCIIRune(b *testing.B) {
	const r = 'a'
	dst := make([]byte, UTFMax)
	for n := 0; n < b.N; n++ {
		EncodeRune(dst, r)
	}
}
647
// BenchmarkEncodeJapaneseRune measures EncodeRune writing the three-byte rune
// '本' into a UTFMax-sized buffer.
func BenchmarkEncodeJapaneseRune(b *testing.B) {
	const r = '本'
	dst := make([]byte, UTFMax)
	for n := 0; n < b.N; n++ {
		EncodeRune(dst, r)
	}
}
654
// BenchmarkAppendASCIIRune measures AppendRune appending the one-byte rune
// 'a' to an empty slice that already has UTFMax bytes of capacity.
func BenchmarkAppendASCIIRune(b *testing.B) {
	const r = 'a'
	backing := make([]byte, UTFMax)
	for n := 0; n < b.N; n++ {
		AppendRune(backing[:0], r)
	}
}
661
// BenchmarkAppendJapaneseRune measures AppendRune appending the three-byte
// rune '本' to an empty slice that already has UTFMax bytes of capacity.
func BenchmarkAppendJapaneseRune(b *testing.B) {
	const r = '本'
	backing := make([]byte, UTFMax)
	for n := 0; n < b.N; n++ {
		AppendRune(backing[:0], r)
	}
}
668
// BenchmarkDecodeASCIIRune measures DecodeRune on a slice holding the single
// byte 'a'.
func BenchmarkDecodeASCIIRune(b *testing.B) {
	src := []byte("a")
	for n := 0; n < b.N; n++ {
		DecodeRune(src)
	}
}
675
// BenchmarkDecodeJapaneseRune measures DecodeRune on the encoding of the
// single rune '本'.
func BenchmarkDecodeJapaneseRune(b *testing.B) {
	const text = "本"
	src := []byte(text)
	for n := 0; n < b.N; n++ {
		DecodeRune(src)
	}
}
682
683
684
// boolSink receives the result of the call measured in BenchmarkFullRune.
// NOTE(review): presumably this keeps the compiler from discarding the
// benchmarked call as dead code; confirm before removing.
var boolSink bool
686
687 func BenchmarkFullRune(b *testing.B) {
688 benchmarks := []struct {
689 name string
690 data []byte
691 }{
692 {"ASCII", []byte("a")},
693 {"Incomplete", []byte("\xf0\x90\x80")},
694 {"Japanese", []byte("本")},
695 }
696 for _, bm := range benchmarks {
697 b.Run(bm.name, func(b *testing.B) {
698 for i := 0; i < b.N; i++ {
699 boolSink = FullRune(bm.data)
700 }
701 })
702 }
703 }
704
View as plain text