1
2
3
4
5 package suffixarray
6
7 import (
8 "bytes"
9 "fmt"
10 "io/fs"
11 "math/rand"
12 "os"
13 "path/filepath"
14 "regexp"
15 "slices"
16 "sort"
17 "strings"
18 "testing"
19 )
20
// testCase describes one indexing scenario: a text to index and the
// patterns that are searched for in it.
type testCase struct {
	name, source string   // label used in failure messages; text that gets indexed
	patterns     []string // looked up literally, and also as regexps when they compile
}
26
27 var testCases = []testCase{
28 {
29 "empty string",
30 "",
31 []string{
32 "",
33 "foo",
34 "(foo)",
35 ".*",
36 "a*",
37 },
38 },
39
40 {
41 "all a's",
42 "aaaaaaaaaa",
43 []string{
44 "",
45 "a",
46 "aa",
47 "aaa",
48 "aaaa",
49 "aaaaa",
50 "aaaaaa",
51 "aaaaaaa",
52 "aaaaaaaa",
53 "aaaaaaaaa",
54 "aaaaaaaaaa",
55 "aaaaaaaaaaa",
56 ".",
57 ".*",
58 "a+",
59 "aa+",
60 "aaaa[b]?",
61 "aaa*",
62 },
63 },
64
65 {
66 "abc",
67 "abc",
68 []string{
69 "a",
70 "b",
71 "c",
72 "ab",
73 "bc",
74 "abc",
75 "a.c",
76 "a(b|c)",
77 "abc?",
78 },
79 },
80
81 {
82 "barbara*3",
83 "barbarabarbarabarbara",
84 []string{
85 "a",
86 "bar",
87 "rab",
88 "arab",
89 "barbar",
90 "bara?bar",
91 },
92 },
93
94 {
95 "typing drill",
96 "Now is the time for all good men to come to the aid of their country.",
97 []string{
98 "Now",
99 "the time",
100 "to come the aid",
101 "is the time for all good men to come to the aid of their",
102 "to (come|the)?",
103 },
104 },
105
106 {
107 "godoc simulation",
108 "package main\n\nimport(\n \"rand\"\n ",
109 []string{},
110 },
111 }
112
113
// find returns the byte offsets of the occurrences of s in src, in ascending
// order, reporting at most n of them; a negative n means no limit.
// Overlapping occurrences are included because each search resumes one byte
// after the previous hit. An empty s or n == 0 yields a nil result.
func find(src, s string, n int) []int {
	if s == "" || n == 0 {
		return nil
	}
	var hits []int
	for from := 0; n < 0 || len(hits) < n; {
		off := strings.Index(src[from:], s)
		if off < 0 {
			break
		}
		pos := from + off
		hits = append(hits, pos)
		from = pos + 1 // s is non-empty, so pos+1 <= len(src)
	}
	return hits
}
129
130 func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) {
131 res := x.Lookup([]byte(s), n)
132 exp := find(tc.source, s, n)
133
134
135 if len(res) != len(exp) {
136 t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res))
137 }
138
139
140
141
142
143
144
145 slices.Sort(res)
146 for i, r := range res {
147 if r < 0 || len(tc.source) <= r {
148 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source))
149 } else if !strings.HasPrefix(tc.source[r:], s) {
150 t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r)
151 }
152 if i > 0 && res[i-1] == r {
153 t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r)
154 }
155 }
156
157 if n < 0 {
158
159 for i, r := range res {
160 e := exp[i]
161 if r != e {
162 t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r)
163 }
164 }
165 }
166 }
167
168 func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) {
169 res := x.FindAllIndex(rx, n)
170 exp := rx.FindAllStringIndex(tc.source, n)
171
172
173 if len(res) != len(exp) {
174 t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res))
175 }
176
177
178
179
180
181
182
183 for i, r := range res {
184 if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] {
185 t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1])
186 } else if !rx.MatchString(tc.source[r[0]:r[1]]) {
187 t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1])
188 }
189 }
190
191 if n < 0 {
192
193 for i, r := range res {
194 e := exp[i]
195 if r[0] != e[0] || r[1] != e[1] {
196 t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]",
197 tc.name, rx, i, e[0], e[1], r[0], r[1])
198 }
199 }
200 }
201 }
202
203 func testLookups(t *testing.T, tc *testCase, x *Index, n int) {
204 for _, pat := range tc.patterns {
205 testLookup(t, tc, x, pat, n)
206 if rx, err := regexp.Compile(pat); err == nil {
207 testFindAllIndex(t, tc, x, rx, n)
208 }
209 }
210 }
211
212
213 type index Index
214
215 func (x *index) Len() int { return x.sa.len() }
216 func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
217 func (x *index) Swap(i, j int) {
218 if x.sa.int32 != nil {
219 x.sa.int32[i], x.sa.int32[j] = x.sa.int32[j], x.sa.int32[i]
220 } else {
221 x.sa.int64[i], x.sa.int64[j] = x.sa.int64[j], x.sa.int64[i]
222 }
223 }
224
225 func (x *index) at(i int) []byte {
226 return x.data[x.sa.get(i):]
227 }
228
229 func testConstruction(t *testing.T, tc *testCase, x *Index) {
230 if !sort.IsSorted((*index)(x)) {
231 t.Errorf("failed testConstruction %s", tc.name)
232 }
233 }
234
235 func equal(x, y *Index) bool {
236 if !bytes.Equal(x.data, y.data) {
237 return false
238 }
239 if x.sa.len() != y.sa.len() {
240 return false
241 }
242 n := x.sa.len()
243 for i := 0; i < n; i++ {
244 if x.sa.get(i) != y.sa.get(i) {
245 return false
246 }
247 }
248 return true
249 }
250
251
252 func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
253 var buf bytes.Buffer
254 if err := x.Write(&buf); err != nil {
255 t.Errorf("failed writing index %s (%s)", tc.name, err)
256 }
257 size := buf.Len()
258 var y Index
259 if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
260 t.Errorf("failed reading index %s (%s)", tc.name, err)
261 }
262 if !equal(x, &y) {
263 t.Errorf("restored index doesn't match saved index %s", tc.name)
264 }
265
266 old := maxData32
267 defer func() {
268 maxData32 = old
269 }()
270
271 y = Index{}
272 maxData32 = realMaxData32
273 if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
274 t.Errorf("failed reading index %s (%s)", tc.name, err)
275 }
276 if !equal(x, &y) {
277 t.Errorf("restored index doesn't match saved index %s", tc.name)
278 }
279
280
281 y = Index{}
282 maxData32 = -1
283 if err := y.Read(bytes.NewReader(buf.Bytes())); err != nil {
284 t.Errorf("failed reading index %s (%s)", tc.name, err)
285 }
286 if !equal(x, &y) {
287 t.Errorf("restored index doesn't match saved index %s", tc.name)
288 }
289
290 return size
291 }
292
293 func testIndex(t *testing.T) {
294 for _, tc := range testCases {
295 x := New([]byte(tc.source))
296 testConstruction(t, &tc, x)
297 testSaveRestore(t, &tc, x)
298 testLookups(t, &tc, x, 0)
299 testLookups(t, &tc, x, 1)
300 testLookups(t, &tc, x, 10)
301 testLookups(t, &tc, x, 2e9)
302 testLookups(t, &tc, x, -1)
303 }
304 }
305
// TestIndex32 runs testIndex without touching maxData32, i.e. under
// whatever configuration is in effect when the test starts.
// NOTE(review): presumably that is the default 32-bit setting
// (maxData32 == realMaxData32) — confirm against the package defaults.
func TestIndex32(t *testing.T) {
	testIndex(t)
}
309
310 func TestIndex64(t *testing.T) {
311 maxData32 = -1
312 defer func() {
313 maxData32 = realMaxData32
314 }()
315 testIndex(t)
316 }
317
318 func TestNew32(t *testing.T) {
319 test(t, func(x []byte) []int {
320 sa := make([]int32, len(x))
321 text_32(x, sa)
322 out := make([]int, len(sa))
323 for i, v := range sa {
324 out[i] = int(v)
325 }
326 return out
327 })
328 }
329
330 func TestNew64(t *testing.T) {
331 test(t, func(x []byte) []int {
332 sa := make([]int64, len(x))
333 text_64(x, sa)
334 out := make([]int, len(sa))
335 for i, v := range sa {
336 out[i] = int(v)
337 }
338 return out
339 })
340 }
341
342
343
344 func test(t *testing.T, build func([]byte) []int) {
345 t.Run("ababab...", func(t *testing.T) {
346
347
348
349 size := 100000
350 if testing.Short() {
351 size = 10000
352 }
353 x := make([]byte, size)
354 for i := range x {
355 x[i] = "ab"[i%2]
356 }
357 testSA(t, x, build)
358 })
359
360 t.Run("forcealloc", func(t *testing.T) {
361
362
363
364
365
366
367
368
369
370
371
372 x := make([]byte, 100000, 100001)
373 lo := byte(1)
374 hi := byte(255)
375 for i := range x {
376 if i%2 == 0 {
377 x[i] = lo
378 } else {
379 x[i] = hi
380 hi--
381 if hi <= lo {
382 lo++
383 if lo == 0 {
384 lo = 1
385 }
386 hi = 255
387 }
388 }
389 }
390 x[:cap(x)][len(x)] = 0
391 testSA(t, x, build)
392 })
393
394 t.Run("exhaustive2", func(t *testing.T) {
395
396
397 x := make([]byte, 30)
398 numFail := 0
399 for n := 0; n <= 21; n++ {
400 if n > 12 && testing.Short() {
401 break
402 }
403 x[n] = 0
404 testRec(t, x[:n], 0, 2, &numFail, build)
405 }
406 })
407
408 t.Run("exhaustive3", func(t *testing.T) {
409
410
411 x := make([]byte, 30)
412 numFail := 0
413 for n := 0; n <= 14; n++ {
414 if n > 8 && testing.Short() {
415 break
416 }
417 x[n] = 0
418 testRec(t, x[:n], 0, 3, &numFail, build)
419 }
420 })
421 }
422
423
424
425 func testRec(t *testing.T, x []byte, i, max int, numFail *int, build func([]byte) []int) {
426 if i < len(x) {
427 for x[i] = 1; x[i] <= byte(max); x[i]++ {
428 testRec(t, x, i+1, max, numFail, build)
429 }
430 return
431 }
432
433 if !testSA(t, x, build) {
434 *numFail++
435 if *numFail >= 10 {
436 t.Errorf("stopping after %d failures", *numFail)
437 t.FailNow()
438 }
439 }
440 }
441
442
443
// testSA reports whether build(x) is a valid suffix array for x, recording
// a failure on t and returning false when it is not.
//
// A valid result has exactly len(x) entries, each within [0, len(x)), with
// the suffixes they reference in strictly increasing order. Strict ordering
// also rules out duplicate entries. If build panics, the input is logged
// before the panic is propagated.
func testSA(t *testing.T, x []byte, build func([]byte) []int) bool {
	defer func() {
		if e := recover(); e != nil {
			t.Logf("build %v", x)
			panic(e)
		}
	}()
	sa := build(x)
	if len(sa) != len(x) {
		t.Errorf("build %v: len(sa) = %d, want %d", x, len(sa), len(x))
		return false
	}

	// Range-check every entry before comparing neighbours. Doing this only
	// inside the pairwise loop would skip the check entirely for a
	// one-element suffix array.
	for _, p := range sa {
		if p < 0 || p >= len(x) {
			// %v rather than %s: x is frequently binary data.
			t.Errorf("build %v: sa out of range: %v\n", x, sa)
			return false
		}
	}

	for i := 0; i+1 < len(sa); i++ {
		if bytes.Compare(x[sa[i]:], x[sa[i+1]:]) >= 0 {
			t.Errorf("build %v -> %v\nsa[%d:] = %d,%d out of order", x, sa, i, sa[i], sa[i+1])
			return false
		}
	}

	return true
}
469
// benchdata is the 1e6-byte input benchmarkNew uses when random is false;
// nothing in this file writes to it, so it stays all zero.
var benchdata = make([]byte, 1e6)

// benchrand is the 1e6-byte input benchmarkNew uses when random is true;
// benchmarkNew fills it with random bytes while its first byte is still zero.
var benchrand = make([]byte, 1e6)
474
475
476
477
478 func benchmarkNew(b *testing.B, random bool) {
479 b.ReportAllocs()
480 b.StopTimer()
481 data := benchdata
482 if random {
483 data = benchrand
484 if data[0] == 0 {
485 for i := range data {
486 data[i] = byte(rand.Intn(256))
487 }
488 }
489 }
490 b.StartTimer()
491 b.SetBytes(int64(len(data)))
492 for i := 0; i < b.N; i++ {
493 New(data)
494 }
495 }
496
// makeText returns the benchmark corpus selected by name:
//
//	"opticks"  the contents of ../../testdata/Isaac.Newton-Opticks.txt
//	"go"       the concatenation of every .go file found under ../..
//	"zero"     50e6 zero bytes
//	"rand"     50e6 random bytes
//
// Any other name yields (nil, nil). Errors reported to the directory walk
// for individual entries are skipped; a failure to read a .go file aborts
// the walk and is returned.
func makeText(name string) ([]byte, error) {
	const synthetic = 50e6 // size of the generated corpora

	switch name {
	case "opticks":
		text, err := os.ReadFile("../../testdata/Isaac.Newton-Opticks.txt")
		if err != nil {
			return nil, err
		}
		return text, nil

	case "go":
		var src []byte
		visit := func(path string, d fs.DirEntry, walkErr error) error {
			if walkErr != nil || d.IsDir() || !strings.HasSuffix(path, ".go") {
				return nil
			}
			file, err := os.ReadFile(path)
			if err != nil {
				return err
			}
			src = append(src, file...)
			return nil
		}
		if err := filepath.WalkDir("../..", visit); err != nil {
			return nil, err
		}
		return src, nil

	case "zero":
		return make([]byte, synthetic), nil

	case "rand":
		noise := make([]byte, synthetic)
		for i := range noise {
			noise[i] = byte(rand.Intn(256))
		}
		return noise, nil
	}
	return nil, nil
}
530
531 func setBits(bits int) (cleanup func()) {
532 if bits == 32 {
533 maxData32 = realMaxData32
534 } else {
535 maxData32 = -1
536 }
537 return func() {
538 maxData32 = realMaxData32
539 }
540 }
541
542 func BenchmarkNew(b *testing.B) {
543 for _, text := range []string{"opticks", "go", "zero", "rand"} {
544 b.Run("text="+text, func(b *testing.B) {
545 data, err := makeText(text)
546 if err != nil {
547 b.Fatal(err)
548 }
549 if testing.Short() && len(data) > 5e6 {
550 data = data[:5e6]
551 }
552 for _, size := range []int{100e3, 500e3, 1e6, 5e6, 10e6, 50e6} {
553 if len(data) < size {
554 continue
555 }
556 data := data[:size]
557 name := fmt.Sprintf("%dK", size/1e3)
558 if size >= 1e6 {
559 name = fmt.Sprintf("%dM", size/1e6)
560 }
561 b.Run("size="+name, func(b *testing.B) {
562 for _, bits := range []int{32, 64} {
563 if ^uint(0) == 0xffffffff && bits == 64 {
564 continue
565 }
566 b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
567 cleanup := setBits(bits)
568 defer cleanup()
569
570 b.SetBytes(int64(len(data)))
571 b.ReportAllocs()
572 for i := 0; i < b.N; i++ {
573 New(data)
574 }
575 })
576 }
577 })
578 }
579 })
580 }
581 }
582
583 func BenchmarkSaveRestore(b *testing.B) {
584 r := rand.New(rand.NewSource(0x5a77a1))
585 data := make([]byte, 1<<20)
586 for i := range data {
587 data[i] = byte(r.Intn(256))
588 }
589 for _, bits := range []int{32, 64} {
590 if ^uint(0) == 0xffffffff && bits == 64 {
591 continue
592 }
593 b.Run(fmt.Sprintf("bits=%d", bits), func(b *testing.B) {
594 cleanup := setBits(bits)
595 defer cleanup()
596
597 b.StopTimer()
598 x := New(data)
599 size := testSaveRestore(nil, nil, x)
600 buf := bytes.NewBuffer(make([]byte, size))
601 b.SetBytes(int64(size))
602 b.StartTimer()
603 b.ReportAllocs()
604 for i := 0; i < b.N; i++ {
605 buf.Reset()
606 if err := x.Write(buf); err != nil {
607 b.Fatal(err)
608 }
609 var y Index
610 if err := y.Read(buf); err != nil {
611 b.Fatal(err)
612 }
613 }
614 })
615 }
616 }
617
View as plain text