Source file src/index/suffixarray/sais2.go
1 // Copyright 2019 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Code generated by go generate; DO NOT EDIT. 6 7 package suffixarray 8 9 func text_64(text []byte, sa []int64) { 10 if int(int64(len(text))) != len(text) || len(text) != len(sa) { 11 panic("suffixarray: misuse of text_64") 12 } 13 sais_8_64(text, 256, sa, make([]int64, 2*256)) 14 } 15 16 func sais_8_64(text []byte, textMax int, sa, tmp []int64) { 17 if len(sa) != len(text) || len(tmp) < textMax { 18 panic("suffixarray: misuse of sais_8_64") 19 } 20 21 // Trivial base cases. Sorting 0 or 1 things is easy. 22 if len(text) == 0 { 23 return 24 } 25 if len(text) == 1 { 26 sa[0] = 0 27 return 28 } 29 30 // Establish slices indexed by text character 31 // holding character frequency and bucket-sort offsets. 32 // If there's only enough tmp for one slice, 33 // we make it the bucket offsets and recompute 34 // the character frequency each time we need it. 35 var freq, bucket []int64 36 if len(tmp) >= 2*textMax { 37 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] 38 freq[0] = -1 // mark as uninitialized 39 } else { 40 freq, bucket = nil, tmp[:textMax] 41 } 42 43 // The SAIS algorithm. 44 // Each of these calls makes one scan through sa. 45 // See the individual functions for documentation 46 // about each's role in the algorithm. 47 numLMS := placeLMS_8_64(text, sa, freq, bucket) 48 if numLMS <= 1 { 49 // 0 or 1 items are already sorted. Do nothing. 50 } else { 51 induceSubL_8_64(text, sa, freq, bucket) 52 induceSubS_8_64(text, sa, freq, bucket) 53 length_8_64(text, sa, numLMS) 54 maxID := assignID_8_64(text, sa, numLMS) 55 if maxID < numLMS { 56 map_64(sa, numLMS) 57 recurse_64(sa, tmp, numLMS, maxID) 58 unmap_8_64(text, sa, numLMS) 59 } else { 60 // If maxID == numLMS, then each LMS-substring 61 // is unique, so the relative ordering of two LMS-suffixes 62 // is determined by just the leading LMS-substring. 63 // That is, the LMS-suffix sort order matches the 64 // (simpler) LMS-substring sort order. 65 // Copy the original LMS-substring order into the 66 // suffix array destination. 67 copy(sa, sa[len(sa)-numLMS:]) 68 } 69 expand_8_64(text, freq, bucket, sa, numLMS) 70 } 71 induceL_8_64(text, sa, freq, bucket) 72 induceS_8_64(text, sa, freq, bucket) 73 74 // Mark for caller that we overwrote tmp. 75 tmp[0] = -1 76 } 77 78 func sais_32(text []int32, textMax int, sa, tmp []int32) { 79 if len(sa) != len(text) || len(tmp) < textMax { 80 panic("suffixarray: misuse of sais_32") 81 } 82 83 // Trivial base cases. Sorting 0 or 1 things is easy. 84 if len(text) == 0 { 85 return 86 } 87 if len(text) == 1 { 88 sa[0] = 0 89 return 90 } 91 92 // Establish slices indexed by text character 93 // holding character frequency and bucket-sort offsets. 94 // If there's only enough tmp for one slice, 95 // we make it the bucket offsets and recompute 96 // the character frequency each time we need it. 97 var freq, bucket []int32 98 if len(tmp) >= 2*textMax { 99 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] 100 freq[0] = -1 // mark as uninitialized 101 } else { 102 freq, bucket = nil, tmp[:textMax] 103 } 104 105 // The SAIS algorithm. 106 // Each of these calls makes one scan through sa. 107 // See the individual functions for documentation 108 // about each's role in the algorithm. 109 numLMS := placeLMS_32(text, sa, freq, bucket) 110 if numLMS <= 1 { 111 // 0 or 1 items are already sorted. Do nothing. 112 } else { 113 induceSubL_32(text, sa, freq, bucket) 114 induceSubS_32(text, sa, freq, bucket) 115 length_32(text, sa, numLMS) 116 maxID := assignID_32(text, sa, numLMS) 117 if maxID < numLMS { 118 map_32(sa, numLMS) 119 recurse_32(sa, tmp, numLMS, maxID) 120 unmap_32(text, sa, numLMS) 121 } else { 122 // If maxID == numLMS, then each LMS-substring 123 // is unique, so the relative ordering of two LMS-suffixes 124 // is determined by just the leading LMS-substring. 125 // That is, the LMS-suffix sort order matches the 126 // (simpler) LMS-substring sort order. 127 // Copy the original LMS-substring order into the 128 // suffix array destination. 129 copy(sa, sa[len(sa)-numLMS:]) 130 } 131 expand_32(text, freq, bucket, sa, numLMS) 132 } 133 induceL_32(text, sa, freq, bucket) 134 induceS_32(text, sa, freq, bucket) 135 136 // Mark for caller that we overwrote tmp. 137 tmp[0] = -1 138 } 139 140 func sais_64(text []int64, textMax int, sa, tmp []int64) { 141 if len(sa) != len(text) || len(tmp) < textMax { 142 panic("suffixarray: misuse of sais_64") 143 } 144 145 // Trivial base cases. Sorting 0 or 1 things is easy. 146 if len(text) == 0 { 147 return 148 } 149 if len(text) == 1 { 150 sa[0] = 0 151 return 152 } 153 154 // Establish slices indexed by text character 155 // holding character frequency and bucket-sort offsets. 156 // If there's only enough tmp for one slice, 157 // we make it the bucket offsets and recompute 158 // the character frequency each time we need it. 159 var freq, bucket []int64 160 if len(tmp) >= 2*textMax { 161 freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] 162 freq[0] = -1 // mark as uninitialized 163 } else { 164 freq, bucket = nil, tmp[:textMax] 165 } 166 167 // The SAIS algorithm. 168 // Each of these calls makes one scan through sa. 169 // See the individual functions for documentation 170 // about each's role in the algorithm. 171 numLMS := placeLMS_64(text, sa, freq, bucket) 172 if numLMS <= 1 { 173 // 0 or 1 items are already sorted. Do nothing. 174 } else { 175 induceSubL_64(text, sa, freq, bucket) 176 induceSubS_64(text, sa, freq, bucket) 177 length_64(text, sa, numLMS) 178 maxID := assignID_64(text, sa, numLMS) 179 if maxID < numLMS { 180 map_64(sa, numLMS) 181 recurse_64(sa, tmp, numLMS, maxID) 182 unmap_64(text, sa, numLMS) 183 } else { 184 // If maxID == numLMS, then each LMS-substring 185 // is unique, so the relative ordering of two LMS-suffixes 186 // is determined by just the leading LMS-substring. 187 // That is, the LMS-suffix sort order matches the 188 // (simpler) LMS-substring sort order. 189 // Copy the original LMS-substring order into the 190 // suffix array destination. 191 copy(sa, sa[len(sa)-numLMS:]) 192 } 193 expand_64(text, freq, bucket, sa, numLMS) 194 } 195 induceL_64(text, sa, freq, bucket) 196 induceS_64(text, sa, freq, bucket) 197 198 // Mark for caller that we overwrote tmp. 199 tmp[0] = -1 200 } 201 202 func freq_8_64(text []byte, freq, bucket []int64) []int64 { 203 if freq != nil && freq[0] >= 0 { 204 return freq // already computed 205 } 206 if freq == nil { 207 freq = bucket 208 } 209 210 freq = freq[:256] // eliminate bounds check for freq[c] below 211 clear(freq) 212 for _, c := range text { 213 freq[c]++ 214 } 215 return freq 216 } 217 218 func freq_32(text []int32, freq, bucket []int32) []int32 { 219 if freq != nil && freq[0] >= 0 { 220 return freq // already computed 221 } 222 if freq == nil { 223 freq = bucket 224 } 225 226 clear(freq) 227 for _, c := range text { 228 freq[c]++ 229 } 230 return freq 231 } 232 233 func freq_64(text []int64, freq, bucket []int64) []int64 { 234 if freq != nil && freq[0] >= 0 { 235 return freq // already computed 236 } 237 if freq == nil { 238 freq = bucket 239 } 240 241 clear(freq) 242 for _, c := range text { 243 freq[c]++ 244 } 245 return freq 246 } 247 248 func bucketMin_8_64(text []byte, freq, bucket []int64) { 249 freq = freq_8_64(text, freq, bucket) 250 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below 251 bucket = bucket[:256] // eliminate bounds check for bucket[i] below 252 total := int64(0) 253 for i, n := range freq { 254 bucket[i] = total 255 total += n 256 } 257 } 258 259 func bucketMin_32(text []int32, freq, bucket []int32) { 260 freq = freq_32(text, freq, bucket) 261 total := int32(0) 262 for i, n := range freq { 263 bucket[i] = total 264 total += n 265 } 266 } 267 268 func bucketMin_64(text []int64, freq, bucket []int64) { 269 freq = freq_64(text, freq, bucket) 270 total := int64(0) 271 for i, n := range freq { 272 bucket[i] = total 273 total += n 274 } 275 } 276 277 func bucketMax_8_64(text []byte, freq, bucket []int64) { 278 freq = freq_8_64(text, freq, bucket) 279 freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below 280 bucket = bucket[:256] // eliminate bounds check for bucket[i] below 281 total := int64(0) 282 for i, n := range freq { 283 total += n 284 bucket[i] = total 285 } 286 } 287 288 func bucketMax_32(text []int32, freq, bucket []int32) { 289 freq = freq_32(text, freq, bucket) 290 total := int32(0) 291 for i, n := range freq { 292 total += n 293 bucket[i] = total 294 } 295 } 296 297 func bucketMax_64(text []int64, freq, bucket []int64) { 298 freq = freq_64(text, freq, bucket) 299 total := int64(0) 300 for i, n := range freq { 301 total += n 302 bucket[i] = total 303 } 304 } 305 306 func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int { 307 bucketMax_8_64(text, freq, bucket) 308 309 numLMS := 0 310 lastB := int64(-1) 311 bucket = bucket[:256] // eliminate bounds check for bucket[c1] below 312 313 // The next stanza of code (until the blank line) loop backward 314 // over text, stopping to execute a code body at each position i 315 // such that text[i] is an L-character and text[i+1] is an S-character. 316 // That is, i+1 is the position of the start of an LMS-substring. 317 // These could be hoisted out into a function with a callback, 318 // but at a significant speed cost. Instead, we just write these 319 // seven lines a few times in this source file. The copies below 320 // refer back to the pattern established by this original as the 321 // "LMS-substring iterator". 322 // 323 // In every scan through the text, c0, c1 are successive characters of text. 324 // In this backward scan, c0 == text[i] and c1 == text[i+1]. 325 // By scanning backward, we can keep track of whether the current 326 // position is type-S or type-L according to the usual definition: 327 // 328 // - position len(text) is type S with text[len(text)] == -1 (the sentinel) 329 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. 330 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. 331 // 332 // The backward scan lets us maintain the current type, 333 // update it when we see c0 != c1, and otherwise leave it alone. 334 // We want to identify all S positions with a preceding L. 335 // Position len(text) is one such position by definition, but we have 336 // nowhere to write it down, so we eliminate it by untruthfully 337 // setting isTypeS = false at the start of the loop. 338 c0, c1, isTypeS := byte(0), byte(0), false 339 for i := len(text) - 1; i >= 0; i-- { 340 c0, c1 = text[i], c0 341 if c0 < c1 { 342 isTypeS = true 343 } else if c0 > c1 && isTypeS { 344 isTypeS = false 345 346 // Bucket the index i+1 for the start of an LMS-substring. 347 b := bucket[c1] - 1 348 bucket[c1] = b 349 sa[b] = int64(i + 1) 350 lastB = b 351 numLMS++ 352 } 353 } 354 355 // We recorded the LMS-substring starts but really want the ends. 356 // Luckily, with two differences, the start indexes and the end indexes are the same. 357 // The first difference is that the rightmost LMS-substring's end index is len(text), 358 // so the caller must pretend that sa[-1] == len(text), as noted above. 359 // The second difference is that the first leftmost LMS-substring start index 360 // does not end an earlier LMS-substring, so as an optimization we can omit 361 // that leftmost LMS-substring start index (the last one we wrote). 362 // 363 // Exception: if numLMS <= 1, the caller is not going to bother with 364 // the recursion at all and will treat the result as containing LMS-substring starts. 365 // In that case, we don't remove the final entry. 366 if numLMS > 1 { 367 sa[lastB] = 0 368 } 369 return numLMS 370 } 371 372 func placeLMS_32(text []int32, sa, freq, bucket []int32) int { 373 bucketMax_32(text, freq, bucket) 374 375 numLMS := 0 376 lastB := int32(-1) 377 378 // The next stanza of code (until the blank line) loop backward 379 // over text, stopping to execute a code body at each position i 380 // such that text[i] is an L-character and text[i+1] is an S-character. 381 // That is, i+1 is the position of the start of an LMS-substring. 382 // These could be hoisted out into a function with a callback, 383 // but at a significant speed cost. Instead, we just write these 384 // seven lines a few times in this source file. The copies below 385 // refer back to the pattern established by this original as the 386 // "LMS-substring iterator". 387 // 388 // In every scan through the text, c0, c1 are successive characters of text. 389 // In this backward scan, c0 == text[i] and c1 == text[i+1]. 390 // By scanning backward, we can keep track of whether the current 391 // position is type-S or type-L according to the usual definition: 392 // 393 // - position len(text) is type S with text[len(text)] == -1 (the sentinel) 394 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. 395 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. 396 // 397 // The backward scan lets us maintain the current type, 398 // update it when we see c0 != c1, and otherwise leave it alone. 399 // We want to identify all S positions with a preceding L. 400 // Position len(text) is one such position by definition, but we have 401 // nowhere to write it down, so we eliminate it by untruthfully 402 // setting isTypeS = false at the start of the loop. 403 c0, c1, isTypeS := int32(0), int32(0), false 404 for i := len(text) - 1; i >= 0; i-- { 405 c0, c1 = text[i], c0 406 if c0 < c1 { 407 isTypeS = true 408 } else if c0 > c1 && isTypeS { 409 isTypeS = false 410 411 // Bucket the index i+1 for the start of an LMS-substring. 412 b := bucket[c1] - 1 413 bucket[c1] = b 414 sa[b] = int32(i + 1) 415 lastB = b 416 numLMS++ 417 } 418 } 419 420 // We recorded the LMS-substring starts but really want the ends. 421 // Luckily, with two differences, the start indexes and the end indexes are the same. 422 // The first difference is that the rightmost LMS-substring's end index is len(text), 423 // so the caller must pretend that sa[-1] == len(text), as noted above. 424 // The second difference is that the first leftmost LMS-substring start index 425 // does not end an earlier LMS-substring, so as an optimization we can omit 426 // that leftmost LMS-substring start index (the last one we wrote). 427 // 428 // Exception: if numLMS <= 1, the caller is not going to bother with 429 // the recursion at all and will treat the result as containing LMS-substring starts. 430 // In that case, we don't remove the final entry. 431 if numLMS > 1 { 432 sa[lastB] = 0 433 } 434 return numLMS 435 } 436 437 func placeLMS_64(text []int64, sa, freq, bucket []int64) int { 438 bucketMax_64(text, freq, bucket) 439 440 numLMS := 0 441 lastB := int64(-1) 442 443 // The next stanza of code (until the blank line) loop backward 444 // over text, stopping to execute a code body at each position i 445 // such that text[i] is an L-character and text[i+1] is an S-character. 446 // That is, i+1 is the position of the start of an LMS-substring. 447 // These could be hoisted out into a function with a callback, 448 // but at a significant speed cost. Instead, we just write these 449 // seven lines a few times in this source file. The copies below 450 // refer back to the pattern established by this original as the 451 // "LMS-substring iterator". 452 // 453 // In every scan through the text, c0, c1 are successive characters of text. 454 // In this backward scan, c0 == text[i] and c1 == text[i+1]. 455 // By scanning backward, we can keep track of whether the current 456 // position is type-S or type-L according to the usual definition: 457 // 458 // - position len(text) is type S with text[len(text)] == -1 (the sentinel) 459 // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. 460 // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. 461 // 462 // The backward scan lets us maintain the current type, 463 // update it when we see c0 != c1, and otherwise leave it alone. 464 // We want to identify all S positions with a preceding L. 465 // Position len(text) is one such position by definition, but we have 466 // nowhere to write it down, so we eliminate it by untruthfully 467 // setting isTypeS = false at the start of the loop. 468 c0, c1, isTypeS := int64(0), int64(0), false 469 for i := len(text) - 1; i >= 0; i-- { 470 c0, c1 = text[i], c0 471 if c0 < c1 { 472 isTypeS = true 473 } else if c0 > c1 && isTypeS { 474 isTypeS = false 475 476 // Bucket the index i+1 for the start of an LMS-substring. 477 b := bucket[c1] - 1 478 bucket[c1] = b 479 sa[b] = int64(i + 1) 480 lastB = b 481 numLMS++ 482 } 483 } 484 485 // We recorded the LMS-substring starts but really want the ends. 486 // Luckily, with two differences, the start indexes and the end indexes are the same. 487 // The first difference is that the rightmost LMS-substring's end index is len(text), 488 // so the caller must pretend that sa[-1] == len(text), as noted above. 489 // The second difference is that the first leftmost LMS-substring start index 490 // does not end an earlier LMS-substring, so as an optimization we can omit 491 // that leftmost LMS-substring start index (the last one we wrote). 492 // 493 // Exception: if numLMS <= 1, the caller is not going to bother with 494 // the recursion at all and will treat the result as containing LMS-substring starts. 495 // In that case, we don't remove the final entry. 496 if numLMS > 1 { 497 sa[lastB] = 0 498 } 499 return numLMS 500 } 501 502 func induceSubL_8_64(text []byte, sa, freq, bucket []int64) { 503 // Initialize positions for left side of character buckets. 504 bucketMin_8_64(text, freq, bucket) 505 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 506 507 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly 508 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 509 // Because j-1 is type L, inserting it into sa now will sort it correctly. 510 // But we want to distinguish a j-1 with j-2 of type L from type S. 511 // We can process the former but want to leave the latter for the caller. 512 // We record the difference by negating j-1 if it is preceded by type S. 513 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 514 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have 515 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 516 // and so on, in sorted but not necessarily adjacent order, until it finds 517 // one preceded by an index of type S, at which point it must stop. 518 // 519 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 520 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing 521 // only the indexes of the leftmost L-type indexes for each LMS-substring. 522 // 523 // The suffix array sa therefore serves simultaneously as input, output, 524 // and a miraculously well-tailored work queue. 525 526 // placeLMS_8_64 left out the implicit entry sa[-1] == len(text), 527 // corresponding to the identified type-L index len(text)-1. 528 // Process it before the left-to-right scan of sa proper. 529 // See body in loop for commentary. 530 k := len(text) - 1 531 c0, c1 := text[k-1], text[k] 532 if c0 < c1 { 533 k = -k 534 } 535 536 // Cache recently used bucket index: 537 // we're processing suffixes in sorted order 538 // and accessing buckets indexed by the 539 // byte before the sorted order, which still 540 // has very good locality. 541 // Invariant: b is cached, possibly dirty copy of bucket[cB]. 542 cB := c1 543 b := bucket[cB] 544 sa[b] = int64(k) 545 b++ 546 547 for i := 0; i < len(sa); i++ { 548 j := int(sa[i]) 549 if j == 0 { 550 // Skip empty entry. 551 continue 552 } 553 if j < 0 { 554 // Leave discovered type-S index for caller. 555 sa[i] = int64(-j) 556 continue 557 } 558 sa[i] = 0 559 560 // Index j was on work queue, meaning k := j-1 is L-type, 561 // so we can now place k correctly into sa. 562 // If k-1 is L-type, queue k for processing later in this loop. 563 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 564 k := j - 1 565 c0, c1 := text[k-1], text[k] 566 if c0 < c1 { 567 k = -k 568 } 569 570 if cB != c1 { 571 bucket[cB] = b 572 cB = c1 573 b = bucket[cB] 574 } 575 sa[b] = int64(k) 576 b++ 577 } 578 } 579 580 func induceSubL_32(text []int32, sa, freq, bucket []int32) { 581 // Initialize positions for left side of character buckets. 582 bucketMin_32(text, freq, bucket) 583 584 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly 585 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 586 // Because j-1 is type L, inserting it into sa now will sort it correctly. 587 // But we want to distinguish a j-1 with j-2 of type L from type S. 588 // We can process the former but want to leave the latter for the caller. 589 // We record the difference by negating j-1 if it is preceded by type S. 590 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 591 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have 592 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 593 // and so on, in sorted but not necessarily adjacent order, until it finds 594 // one preceded by an index of type S, at which point it must stop. 595 // 596 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 597 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing 598 // only the indexes of the leftmost L-type indexes for each LMS-substring. 599 // 600 // The suffix array sa therefore serves simultaneously as input, output, 601 // and a miraculously well-tailored work queue. 602 603 // placeLMS_32 left out the implicit entry sa[-1] == len(text), 604 // corresponding to the identified type-L index len(text)-1. 605 // Process it before the left-to-right scan of sa proper. 606 // See body in loop for commentary. 607 k := len(text) - 1 608 c0, c1 := text[k-1], text[k] 609 if c0 < c1 { 610 k = -k 611 } 612 613 // Cache recently used bucket index: 614 // we're processing suffixes in sorted order 615 // and accessing buckets indexed by the 616 // int32 before the sorted order, which still 617 // has very good locality. 618 // Invariant: b is cached, possibly dirty copy of bucket[cB]. 619 cB := c1 620 b := bucket[cB] 621 sa[b] = int32(k) 622 b++ 623 624 for i := 0; i < len(sa); i++ { 625 j := int(sa[i]) 626 if j == 0 { 627 // Skip empty entry. 628 continue 629 } 630 if j < 0 { 631 // Leave discovered type-S index for caller. 632 sa[i] = int32(-j) 633 continue 634 } 635 sa[i] = 0 636 637 // Index j was on work queue, meaning k := j-1 is L-type, 638 // so we can now place k correctly into sa. 639 // If k-1 is L-type, queue k for processing later in this loop. 640 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 641 k := j - 1 642 c0, c1 := text[k-1], text[k] 643 if c0 < c1 { 644 k = -k 645 } 646 647 if cB != c1 { 648 bucket[cB] = b 649 cB = c1 650 b = bucket[cB] 651 } 652 sa[b] = int32(k) 653 b++ 654 } 655 } 656 657 func induceSubL_64(text []int64, sa, freq, bucket []int64) { 658 // Initialize positions for left side of character buckets. 659 bucketMin_64(text, freq, bucket) 660 661 // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly 662 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. 663 // Because j-1 is type L, inserting it into sa now will sort it correctly. 664 // But we want to distinguish a j-1 with j-2 of type L from type S. 665 // We can process the former but want to leave the latter for the caller. 666 // We record the difference by negating j-1 if it is preceded by type S. 667 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 668 // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have 669 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 670 // and so on, in sorted but not necessarily adjacent order, until it finds 671 // one preceded by an index of type S, at which point it must stop. 672 // 673 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 674 // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing 675 // only the indexes of the leftmost L-type indexes for each LMS-substring. 676 // 677 // The suffix array sa therefore serves simultaneously as input, output, 678 // and a miraculously well-tailored work queue. 679 680 // placeLMS_64 left out the implicit entry sa[-1] == len(text), 681 // corresponding to the identified type-L index len(text)-1. 682 // Process it before the left-to-right scan of sa proper. 683 // See body in loop for commentary. 684 k := len(text) - 1 685 c0, c1 := text[k-1], text[k] 686 if c0 < c1 { 687 k = -k 688 } 689 690 // Cache recently used bucket index: 691 // we're processing suffixes in sorted order 692 // and accessing buckets indexed by the 693 // int64 before the sorted order, which still 694 // has very good locality. 695 // Invariant: b is cached, possibly dirty copy of bucket[cB]. 696 cB := c1 697 b := bucket[cB] 698 sa[b] = int64(k) 699 b++ 700 701 for i := 0; i < len(sa); i++ { 702 j := int(sa[i]) 703 if j == 0 { 704 // Skip empty entry. 705 continue 706 } 707 if j < 0 { 708 // Leave discovered type-S index for caller. 709 sa[i] = int64(-j) 710 continue 711 } 712 sa[i] = 0 713 714 // Index j was on work queue, meaning k := j-1 is L-type, 715 // so we can now place k correctly into sa. 716 // If k-1 is L-type, queue k for processing later in this loop. 717 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 718 k := j - 1 719 c0, c1 := text[k-1], text[k] 720 if c0 < c1 { 721 k = -k 722 } 723 724 if cB != c1 { 725 bucket[cB] = b 726 cB = c1 727 b = bucket[cB] 728 } 729 sa[b] = int64(k) 730 b++ 731 } 732 } 733 734 func induceSubS_8_64(text []byte, sa, freq, bucket []int64) { 735 // Initialize positions for right side of character buckets. 736 bucketMax_8_64(text, freq, bucket) 737 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 738 739 // Analogous to induceSubL_8_64 above, 740 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly 741 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. 742 // Because j-1 is type S, inserting it into sa now will sort it correctly. 743 // But we want to distinguish a j-1 with j-2 of type S from type L. 744 // We can process the former but want to leave the latter for the caller. 745 // We record the difference by negating j-1 if it is preceded by type L. 746 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 747 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have 748 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 749 // and so on, in sorted but not necessarily adjacent order, until it finds 750 // one preceded by an index of type L, at which point it must stop. 751 // That index (preceded by one of type L) is an LMS-substring start. 752 // 753 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 754 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, 755 // so that the loop finishes with the top of sa containing exactly 756 // the LMS-substring start indexes, sorted by LMS-substring. 757 758 // Cache recently used bucket index: 759 cB := byte(0) 760 b := bucket[cB] 761 762 top := len(sa) 763 for i := len(sa) - 1; i >= 0; i-- { 764 j := int(sa[i]) 765 if j == 0 { 766 // Skip empty entry. 767 continue 768 } 769 sa[i] = 0 770 if j < 0 { 771 // Leave discovered LMS-substring start index for caller. 772 top-- 773 sa[top] = int64(-j) 774 continue 775 } 776 777 // Index j was on work queue, meaning k := j-1 is S-type, 778 // so we can now place k correctly into sa. 779 // If k-1 is S-type, queue k for processing later in this loop. 780 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. 781 k := j - 1 782 c1 := text[k] 783 c0 := text[k-1] 784 if c0 > c1 { 785 k = -k 786 } 787 788 if cB != c1 { 789 bucket[cB] = b 790 cB = c1 791 b = bucket[cB] 792 } 793 b-- 794 sa[b] = int64(k) 795 } 796 } 797 798 func induceSubS_32(text []int32, sa, freq, bucket []int32) { 799 // Initialize positions for right side of character buckets. 800 bucketMax_32(text, freq, bucket) 801 802 // Analogous to induceSubL_32 above, 803 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly 804 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. 805 // Because j-1 is type S, inserting it into sa now will sort it correctly. 806 // But we want to distinguish a j-1 with j-2 of type S from type L. 807 // We can process the former but want to leave the latter for the caller. 808 // We record the difference by negating j-1 if it is preceded by type L. 809 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 810 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have 811 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 812 // and so on, in sorted but not necessarily adjacent order, until it finds 813 // one preceded by an index of type L, at which point it must stop. 814 // That index (preceded by one of type L) is an LMS-substring start. 815 // 816 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 817 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, 818 // so that the loop finishes with the top of sa containing exactly 819 // the LMS-substring start indexes, sorted by LMS-substring. 820 821 // Cache recently used bucket index: 822 cB := int32(0) 823 b := bucket[cB] 824 825 top := len(sa) 826 for i := len(sa) - 1; i >= 0; i-- { 827 j := int(sa[i]) 828 if j == 0 { 829 // Skip empty entry. 830 continue 831 } 832 sa[i] = 0 833 if j < 0 { 834 // Leave discovered LMS-substring start index for caller. 835 top-- 836 sa[top] = int32(-j) 837 continue 838 } 839 840 // Index j was on work queue, meaning k := j-1 is S-type, 841 // so we can now place k correctly into sa. 842 // If k-1 is S-type, queue k for processing later in this loop. 843 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. 844 k := j - 1 845 c1 := text[k] 846 c0 := text[k-1] 847 if c0 > c1 { 848 k = -k 849 } 850 851 if cB != c1 { 852 bucket[cB] = b 853 cB = c1 854 b = bucket[cB] 855 } 856 b-- 857 sa[b] = int32(k) 858 } 859 } 860 861 func induceSubS_64(text []int64, sa, freq, bucket []int64) { 862 // Initialize positions for right side of character buckets. 863 bucketMax_64(text, freq, bucket) 864 865 // Analogous to induceSubL_64 above, 866 // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly 867 // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. 868 // Because j-1 is type S, inserting it into sa now will sort it correctly. 869 // But we want to distinguish a j-1 with j-2 of type S from type L. 870 // We can process the former but want to leave the latter for the caller. 871 // We record the difference by negating j-1 if it is preceded by type L. 872 // Either way, the insertion (into the text[j-1] bucket) is guaranteed to 873 // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have 874 // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, 875 // and so on, in sorted but not necessarily adjacent order, until it finds 876 // one preceded by an index of type L, at which point it must stop. 877 // That index (preceded by one of type L) is an LMS-substring start. 878 // 879 // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, 880 // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, 881 // so that the loop finishes with the top of sa containing exactly 882 // the LMS-substring start indexes, sorted by LMS-substring. 883 884 // Cache recently used bucket index: 885 cB := int64(0) 886 b := bucket[cB] 887 888 top := len(sa) 889 for i := len(sa) - 1; i >= 0; i-- { 890 j := int(sa[i]) 891 if j == 0 { 892 // Skip empty entry. 893 continue 894 } 895 sa[i] = 0 896 if j < 0 { 897 // Leave discovered LMS-substring start index for caller. 898 top-- 899 sa[top] = int64(-j) 900 continue 901 } 902 903 // Index j was on work queue, meaning k := j-1 is S-type, 904 // so we can now place k correctly into sa. 905 // If k-1 is S-type, queue k for processing later in this loop. 906 // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. 907 k := j - 1 908 c1 := text[k] 909 c0 := text[k-1] 910 if c0 > c1 { 911 k = -k 912 } 913 914 if cB != c1 { 915 bucket[cB] = b 916 cB = c1 917 b = bucket[cB] 918 } 919 b-- 920 sa[b] = int64(k) 921 } 922 } 923 924 func length_8_64(text []byte, sa []int64, numLMS int) { 925 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) 926 927 // The encoding of N text bytes into a “length” word 928 // adds 1 to each byte, packs them into the bottom 929 // N*8 bits of a word, and then bitwise inverts the result. 930 // That is, the text sequence A B C (hex 41 42 43) 931 // encodes as ^uint64(0x42_43_44). 932 // LMS-substrings can never start or end with 0xFF. 933 // Adding 1 ensures the encoded byte sequence never 934 // starts or ends with 0x00, so that present bytes can be 935 // distinguished from zero-padding in the top bits, 936 // so the length need not be separately encoded. 937 // Inverting the bytes increases the chance that a 938 // 4-byte encoding will still be ≥ len(text). 939 // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F) 940 // then the high bit of the inversion will be set, 941 // making it clearly not a valid length (it would be a negative one). 942 // 943 // cx holds the pre-inverted encoding (the packed incremented bytes). 944 cx := uint64(0) // byte-only 945 946 // This stanza (until the blank line) is the "LMS-substring iterator", 947 // described in placeLMS_8_64 above, with one line added to maintain cx. 948 c0, c1, isTypeS := byte(0), byte(0), false 949 for i := len(text) - 1; i >= 0; i-- { 950 c0, c1 = text[i], c0 951 cx = cx<<8 | uint64(c1+1) // byte-only 952 if c0 < c1 { 953 isTypeS = true 954 } else if c0 > c1 && isTypeS { 955 isTypeS = false 956 957 // Index j = i+1 is the start of an LMS-substring. 958 // Compute length or encoded text to store in sa[j/2]. 959 j := i + 1 960 var code int64 961 if end == 0 { 962 code = 0 963 } else { 964 code = int64(end - j) 965 if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only 966 code = int64(^cx) // byte-only 967 } // byte-only 968 } 969 sa[j>>1] = code 970 end = j + 1 971 cx = uint64(c1 + 1) // byte-only 972 } 973 } 974 } 975 976 func length_32(text []int32, sa []int32, numLMS int) { 977 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) 978 979 // The encoding of N text int32s into a “length” word 980 // adds 1 to each int32, packs them into the bottom 981 // N*8 bits of a word, and then bitwise inverts the result. 982 // That is, the text sequence A B C (hex 41 42 43) 983 // encodes as ^uint32(0x42_43_44). 984 // LMS-substrings can never start or end with 0xFF. 985 // Adding 1 ensures the encoded int32 sequence never 986 // starts or ends with 0x00, so that present int32s can be 987 // distinguished from zero-padding in the top bits, 988 // so the length need not be separately encoded. 989 // Inverting the int32s increases the chance that a 990 // 4-int32 encoding will still be ≥ len(text). 991 // In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F) 992 // then the high bit of the inversion will be set, 993 // making it clearly not a valid length (it would be a negative one). 994 // 995 // cx holds the pre-inverted encoding (the packed incremented int32s). 996 997 // This stanza (until the blank line) is the "LMS-substring iterator", 998 // described in placeLMS_32 above, with one line added to maintain cx. 999 c0, c1, isTypeS := int32(0), int32(0), false 1000 for i := len(text) - 1; i >= 0; i-- { 1001 c0, c1 = text[i], c0 1002 if c0 < c1 { 1003 isTypeS = true 1004 } else if c0 > c1 && isTypeS { 1005 isTypeS = false 1006 1007 // Index j = i+1 is the start of an LMS-substring. 1008 // Compute length or encoded text to store in sa[j/2]. 1009 j := i + 1 1010 var code int32 1011 if end == 0 { 1012 code = 0 1013 } else { 1014 code = int32(end - j) 1015 } 1016 sa[j>>1] = code 1017 end = j + 1 1018 } 1019 } 1020 } 1021 1022 func length_64(text []int64, sa []int64, numLMS int) { 1023 end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) 1024 1025 // The encoding of N text int64s into a “length” word 1026 // adds 1 to each int64, packs them into the bottom 1027 // N*8 bits of a word, and then bitwise inverts the result. 1028 // That is, the text sequence A B C (hex 41 42 43) 1029 // encodes as ^uint64(0x42_43_44). 1030 // LMS-substrings can never start or end with 0xFF. 1031 // Adding 1 ensures the encoded int64 sequence never 1032 // starts or ends with 0x00, so that present int64s can be 1033 // distinguished from zero-padding in the top bits, 1034 // so the length need not be separately encoded. 1035 // Inverting the int64s increases the chance that a 1036 // 4-int64 encoding will still be ≥ len(text). 1037 // In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F) 1038 // then the high bit of the inversion will be set, 1039 // making it clearly not a valid length (it would be a negative one). 1040 // 1041 // cx holds the pre-inverted encoding (the packed incremented int64s). 1042 1043 // This stanza (until the blank line) is the "LMS-substring iterator", 1044 // described in placeLMS_64 above, with one line added to maintain cx. 1045 c0, c1, isTypeS := int64(0), int64(0), false 1046 for i := len(text) - 1; i >= 0; i-- { 1047 c0, c1 = text[i], c0 1048 if c0 < c1 { 1049 isTypeS = true 1050 } else if c0 > c1 && isTypeS { 1051 isTypeS = false 1052 1053 // Index j = i+1 is the start of an LMS-substring. 1054 // Compute length or encoded text to store in sa[j/2]. 1055 j := i + 1 1056 var code int64 1057 if end == 0 { 1058 code = 0 1059 } else { 1060 code = int64(end - j) 1061 } 1062 sa[j>>1] = code 1063 end = j + 1 1064 } 1065 } 1066 } 1067 1068 func assignID_8_64(text []byte, sa []int64, numLMS int) int { 1069 id := 0 1070 lastLen := int64(-1) // impossible 1071 lastPos := int64(0) 1072 for _, j := range sa[len(sa)-numLMS:] { 1073 // Is the LMS-substring at index j new, or is it the same as the last one we saw? 1074 n := sa[j/2] 1075 if n != lastLen { 1076 goto New 1077 } 1078 if uint64(n) >= uint64(len(text)) { 1079 // “Length” is really encoded full text, and they match. 1080 goto Same 1081 } 1082 { 1083 // Compare actual texts. 1084 n := int(n) 1085 this := text[j:][:n] 1086 last := text[lastPos:][:n] 1087 for i := 0; i < n; i++ { 1088 if this[i] != last[i] { 1089 goto New 1090 } 1091 } 1092 goto Same 1093 } 1094 New: 1095 id++ 1096 lastPos = j 1097 lastLen = n 1098 Same: 1099 sa[j/2] = int64(id) 1100 } 1101 return id 1102 } 1103 1104 func assignID_32(text []int32, sa []int32, numLMS int) int { 1105 id := 0 1106 lastLen := int32(-1) // impossible 1107 lastPos := int32(0) 1108 for _, j := range sa[len(sa)-numLMS:] { 1109 // Is the LMS-substring at index j new, or is it the same as the last one we saw? 1110 n := sa[j/2] 1111 if n != lastLen { 1112 goto New 1113 } 1114 if uint32(n) >= uint32(len(text)) { 1115 // “Length” is really encoded full text, and they match. 1116 goto Same 1117 } 1118 { 1119 // Compare actual texts. 1120 n := int(n) 1121 this := text[j:][:n] 1122 last := text[lastPos:][:n] 1123 for i := 0; i < n; i++ { 1124 if this[i] != last[i] { 1125 goto New 1126 } 1127 } 1128 goto Same 1129 } 1130 New: 1131 id++ 1132 lastPos = j 1133 lastLen = n 1134 Same: 1135 sa[j/2] = int32(id) 1136 } 1137 return id 1138 } 1139 1140 func assignID_64(text []int64, sa []int64, numLMS int) int { 1141 id := 0 1142 lastLen := int64(-1) // impossible 1143 lastPos := int64(0) 1144 for _, j := range sa[len(sa)-numLMS:] { 1145 // Is the LMS-substring at index j new, or is it the same as the last one we saw? 1146 n := sa[j/2] 1147 if n != lastLen { 1148 goto New 1149 } 1150 if uint64(n) >= uint64(len(text)) { 1151 // “Length” is really encoded full text, and they match. 1152 goto Same 1153 } 1154 { 1155 // Compare actual texts. 1156 n := int(n) 1157 this := text[j:][:n] 1158 last := text[lastPos:][:n] 1159 for i := 0; i < n; i++ { 1160 if this[i] != last[i] { 1161 goto New 1162 } 1163 } 1164 goto Same 1165 } 1166 New: 1167 id++ 1168 lastPos = j 1169 lastLen = n 1170 Same: 1171 sa[j/2] = int64(id) 1172 } 1173 return id 1174 } 1175 1176 func map_64(sa []int64, numLMS int) { 1177 w := len(sa) 1178 for i := len(sa) / 2; i >= 0; i-- { 1179 j := sa[i] 1180 if j > 0 { 1181 w-- 1182 sa[w] = j - 1 1183 } 1184 } 1185 } 1186 1187 func recurse_64(sa, oldTmp []int64, numLMS, maxID int) { 1188 dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:] 1189 1190 // Set up temporary space for recursive call. 1191 // We must pass sais_64 a tmp buffer with at least maxID entries. 1192 // 1193 // The subproblem is guaranteed to have length at most len(sa)/2, 1194 // so that sa can hold both the subproblem and its suffix array. 1195 // Nearly all the time, however, the subproblem has length < len(sa)/3, 1196 // in which case there is a subproblem-sized middle of sa that 1197 // we can reuse for temporary space (saTmp). 1198 // When recurse_64 is called from sais_8_64, oldTmp is length 512 1199 // (from text_64), and saTmp will typically be much larger, so we'll use saTmp. 1200 // When deeper recursions come back to recurse_64, now oldTmp is 1201 // the saTmp from the top-most recursion, it is typically larger than 1202 // the current saTmp (because the current sa gets smaller and smaller 1203 // as the recursion gets deeper), and we keep reusing that top-most 1204 // large saTmp instead of the offered smaller ones. 1205 // 1206 // Why is the subproblem length so often just under len(sa)/3? 1207 // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation. 1208 // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern 1209 // in the input, perfect alternation of larger and smaller input bytes. 1210 // Real text doesn't do that. If each L-type index is randomly followed 1211 // by either an L-type or S-type index, then half the substrings will 1212 // be of the form SLS, but the other half will be longer. Of that half, 1213 // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on. 1214 // Not counting the final S in each (which overlaps the first S in the next), 1215 // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3. 1216 // The space we need is further reduced by the fact that many of the 1217 // short patterns like SLS will often be the same character sequences 1218 // repeated throughout the text, reducing maxID relative to numLMS. 1219 // 1220 // For short inputs, the averages may not run in our favor, but then we 1221 // can often fall back to using the length-512 tmp available in the 1222 // top-most call. (Also a short allocation would not be a big deal.) 1223 // 1224 // For pathological inputs, we fall back to allocating a new tmp of length 1225 // max(maxID, numLMS/2). This level of the recursion needs maxID, 1226 // and all deeper levels of the recursion will need no more than numLMS/2, 1227 // so this one allocation is guaranteed to suffice for the entire stack 1228 // of recursive calls. 1229 tmp := oldTmp 1230 if len(tmp) < len(saTmp) { 1231 tmp = saTmp 1232 } 1233 if len(tmp) < numLMS { 1234 // TestSAIS/forcealloc reaches this code. 1235 n := maxID 1236 if n < numLMS/2 { 1237 n = numLMS / 2 1238 } 1239 tmp = make([]int64, n) 1240 } 1241 1242 // sais_64 requires that the caller arrange to clear dst, 1243 // because in general the caller may know dst is 1244 // freshly-allocated and already cleared. But this one is not. 1245 clear(dst) 1246 sais_64(text, maxID, dst, tmp) 1247 } 1248 1249 func unmap_8_64(text []byte, sa []int64, numLMS int) { 1250 unmap := sa[len(sa)-numLMS:] 1251 j := len(unmap) 1252 1253 // "LMS-substring iterator" (see placeLMS_8_64 above). 1254 c0, c1, isTypeS := byte(0), byte(0), false 1255 for i := len(text) - 1; i >= 0; i-- { 1256 c0, c1 = text[i], c0 1257 if c0 < c1 { 1258 isTypeS = true 1259 } else if c0 > c1 && isTypeS { 1260 isTypeS = false 1261 1262 // Populate inverse map. 1263 j-- 1264 unmap[j] = int64(i + 1) 1265 } 1266 } 1267 1268 // Apply inverse map to subproblem suffix array. 1269 sa = sa[:numLMS] 1270 for i := 0; i < len(sa); i++ { 1271 sa[i] = unmap[sa[i]] 1272 } 1273 } 1274 1275 func unmap_32(text []int32, sa []int32, numLMS int) { 1276 unmap := sa[len(sa)-numLMS:] 1277 j := len(unmap) 1278 1279 // "LMS-substring iterator" (see placeLMS_32 above). 1280 c0, c1, isTypeS := int32(0), int32(0), false 1281 for i := len(text) - 1; i >= 0; i-- { 1282 c0, c1 = text[i], c0 1283 if c0 < c1 { 1284 isTypeS = true 1285 } else if c0 > c1 && isTypeS { 1286 isTypeS = false 1287 1288 // Populate inverse map. 1289 j-- 1290 unmap[j] = int32(i + 1) 1291 } 1292 } 1293 1294 // Apply inverse map to subproblem suffix array. 1295 sa = sa[:numLMS] 1296 for i := 0; i < len(sa); i++ { 1297 sa[i] = unmap[sa[i]] 1298 } 1299 } 1300 1301 func unmap_64(text []int64, sa []int64, numLMS int) { 1302 unmap := sa[len(sa)-numLMS:] 1303 j := len(unmap) 1304 1305 // "LMS-substring iterator" (see placeLMS_64 above). 1306 c0, c1, isTypeS := int64(0), int64(0), false 1307 for i := len(text) - 1; i >= 0; i-- { 1308 c0, c1 = text[i], c0 1309 if c0 < c1 { 1310 isTypeS = true 1311 } else if c0 > c1 && isTypeS { 1312 isTypeS = false 1313 1314 // Populate inverse map. 1315 j-- 1316 unmap[j] = int64(i + 1) 1317 } 1318 } 1319 1320 // Apply inverse map to subproblem suffix array. 1321 sa = sa[:numLMS] 1322 for i := 0; i < len(sa); i++ { 1323 sa[i] = unmap[sa[i]] 1324 } 1325 } 1326 1327 func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) { 1328 bucketMax_8_64(text, freq, bucket) 1329 bucket = bucket[:256] // eliminate bound check for bucket[c] below 1330 1331 // Loop backward through sa, always tracking 1332 // the next index to populate from sa[:numLMS]. 1333 // When we get to one, populate it. 1334 // Zero the rest of the slots; they have dead values in them. 1335 x := numLMS - 1 1336 saX := sa[x] 1337 c := text[saX] 1338 b := bucket[c] - 1 1339 bucket[c] = b 1340 1341 for i := len(sa) - 1; i >= 0; i-- { 1342 if i != int(b) { 1343 sa[i] = 0 1344 continue 1345 } 1346 sa[i] = saX 1347 1348 // Load next entry to put down (if any). 1349 if x > 0 { 1350 x-- 1351 saX = sa[x] // TODO bounds check 1352 c = text[saX] 1353 b = bucket[c] - 1 1354 bucket[c] = b 1355 } 1356 } 1357 } 1358 1359 func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) { 1360 bucketMax_32(text, freq, bucket) 1361 1362 // Loop backward through sa, always tracking 1363 // the next index to populate from sa[:numLMS]. 1364 // When we get to one, populate it. 1365 // Zero the rest of the slots; they have dead values in them. 1366 x := numLMS - 1 1367 saX := sa[x] 1368 c := text[saX] 1369 b := bucket[c] - 1 1370 bucket[c] = b 1371 1372 for i := len(sa) - 1; i >= 0; i-- { 1373 if i != int(b) { 1374 sa[i] = 0 1375 continue 1376 } 1377 sa[i] = saX 1378 1379 // Load next entry to put down (if any). 1380 if x > 0 { 1381 x-- 1382 saX = sa[x] // TODO bounds check 1383 c = text[saX] 1384 b = bucket[c] - 1 1385 bucket[c] = b 1386 } 1387 } 1388 } 1389 1390 func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) { 1391 bucketMax_64(text, freq, bucket) 1392 1393 // Loop backward through sa, always tracking 1394 // the next index to populate from sa[:numLMS]. 1395 // When we get to one, populate it. 1396 // Zero the rest of the slots; they have dead values in them. 1397 x := numLMS - 1 1398 saX := sa[x] 1399 c := text[saX] 1400 b := bucket[c] - 1 1401 bucket[c] = b 1402 1403 for i := len(sa) - 1; i >= 0; i-- { 1404 if i != int(b) { 1405 sa[i] = 0 1406 continue 1407 } 1408 sa[i] = saX 1409 1410 // Load next entry to put down (if any). 1411 if x > 0 { 1412 x-- 1413 saX = sa[x] // TODO bounds check 1414 c = text[saX] 1415 b = bucket[c] - 1 1416 bucket[c] = b 1417 } 1418 } 1419 } 1420 1421 func induceL_8_64(text []byte, sa, freq, bucket []int64) { 1422 // Initialize positions for left side of character buckets. 1423 bucketMin_8_64(text, freq, bucket) 1424 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 1425 1426 // This scan is similar to the one in induceSubL_8_64 above. 1427 // That one arranges to clear all but the leftmost L-type indexes. 1428 // This scan leaves all the L-type indexes and the original S-type 1429 // indexes, but it negates the positive leftmost L-type indexes 1430 // (the ones that induceS_8_64 needs to process). 1431 1432 // expand_8_64 left out the implicit entry sa[-1] == len(text), 1433 // corresponding to the identified type-L index len(text)-1. 1434 // Process it before the left-to-right scan of sa proper. 1435 // See body in loop for commentary. 1436 k := len(text) - 1 1437 c0, c1 := text[k-1], text[k] 1438 if c0 < c1 { 1439 k = -k 1440 } 1441 1442 // Cache recently used bucket index. 1443 cB := c1 1444 b := bucket[cB] 1445 sa[b] = int64(k) 1446 b++ 1447 1448 for i := 0; i < len(sa); i++ { 1449 j := int(sa[i]) 1450 if j <= 0 { 1451 // Skip empty or negated entry (including negated zero). 1452 continue 1453 } 1454 1455 // Index j was on work queue, meaning k := j-1 is L-type, 1456 // so we can now place k correctly into sa. 1457 // If k-1 is L-type, queue k for processing later in this loop. 1458 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 1459 // If k is zero, k-1 doesn't exist, so we only need to leave it 1460 // for the caller. The caller can't tell the difference between 1461 // an empty slot and a non-empty zero, but there's no need 1462 // to distinguish them anyway: the final suffix array will end up 1463 // with one zero somewhere, and that will be a real zero. 1464 k := j - 1 1465 c1 := text[k] 1466 if k > 0 { 1467 if c0 := text[k-1]; c0 < c1 { 1468 k = -k 1469 } 1470 } 1471 1472 if cB != c1 { 1473 bucket[cB] = b 1474 cB = c1 1475 b = bucket[cB] 1476 } 1477 sa[b] = int64(k) 1478 b++ 1479 } 1480 } 1481 1482 func induceL_32(text []int32, sa, freq, bucket []int32) { 1483 // Initialize positions for left side of character buckets. 1484 bucketMin_32(text, freq, bucket) 1485 1486 // This scan is similar to the one in induceSubL_32 above. 1487 // That one arranges to clear all but the leftmost L-type indexes. 1488 // This scan leaves all the L-type indexes and the original S-type 1489 // indexes, but it negates the positive leftmost L-type indexes 1490 // (the ones that induceS_32 needs to process). 1491 1492 // expand_32 left out the implicit entry sa[-1] == len(text), 1493 // corresponding to the identified type-L index len(text)-1. 1494 // Process it before the left-to-right scan of sa proper. 1495 // See body in loop for commentary. 1496 k := len(text) - 1 1497 c0, c1 := text[k-1], text[k] 1498 if c0 < c1 { 1499 k = -k 1500 } 1501 1502 // Cache recently used bucket index. 1503 cB := c1 1504 b := bucket[cB] 1505 sa[b] = int32(k) 1506 b++ 1507 1508 for i := 0; i < len(sa); i++ { 1509 j := int(sa[i]) 1510 if j <= 0 { 1511 // Skip empty or negated entry (including negated zero). 1512 continue 1513 } 1514 1515 // Index j was on work queue, meaning k := j-1 is L-type, 1516 // so we can now place k correctly into sa. 1517 // If k-1 is L-type, queue k for processing later in this loop. 1518 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 1519 // If k is zero, k-1 doesn't exist, so we only need to leave it 1520 // for the caller. The caller can't tell the difference between 1521 // an empty slot and a non-empty zero, but there's no need 1522 // to distinguish them anyway: the final suffix array will end up 1523 // with one zero somewhere, and that will be a real zero. 1524 k := j - 1 1525 c1 := text[k] 1526 if k > 0 { 1527 if c0 := text[k-1]; c0 < c1 { 1528 k = -k 1529 } 1530 } 1531 1532 if cB != c1 { 1533 bucket[cB] = b 1534 cB = c1 1535 b = bucket[cB] 1536 } 1537 sa[b] = int32(k) 1538 b++ 1539 } 1540 } 1541 1542 func induceL_64(text []int64, sa, freq, bucket []int64) { 1543 // Initialize positions for left side of character buckets. 1544 bucketMin_64(text, freq, bucket) 1545 1546 // This scan is similar to the one in induceSubL_64 above. 1547 // That one arranges to clear all but the leftmost L-type indexes. 1548 // This scan leaves all the L-type indexes and the original S-type 1549 // indexes, but it negates the positive leftmost L-type indexes 1550 // (the ones that induceS_64 needs to process). 1551 1552 // expand_64 left out the implicit entry sa[-1] == len(text), 1553 // corresponding to the identified type-L index len(text)-1. 1554 // Process it before the left-to-right scan of sa proper. 1555 // See body in loop for commentary. 1556 k := len(text) - 1 1557 c0, c1 := text[k-1], text[k] 1558 if c0 < c1 { 1559 k = -k 1560 } 1561 1562 // Cache recently used bucket index. 1563 cB := c1 1564 b := bucket[cB] 1565 sa[b] = int64(k) 1566 b++ 1567 1568 for i := 0; i < len(sa); i++ { 1569 j := int(sa[i]) 1570 if j <= 0 { 1571 // Skip empty or negated entry (including negated zero). 1572 continue 1573 } 1574 1575 // Index j was on work queue, meaning k := j-1 is L-type, 1576 // so we can now place k correctly into sa. 1577 // If k-1 is L-type, queue k for processing later in this loop. 1578 // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. 1579 // If k is zero, k-1 doesn't exist, so we only need to leave it 1580 // for the caller. The caller can't tell the difference between 1581 // an empty slot and a non-empty zero, but there's no need 1582 // to distinguish them anyway: the final suffix array will end up 1583 // with one zero somewhere, and that will be a real zero. 1584 k := j - 1 1585 c1 := text[k] 1586 if k > 0 { 1587 if c0 := text[k-1]; c0 < c1 { 1588 k = -k 1589 } 1590 } 1591 1592 if cB != c1 { 1593 bucket[cB] = b 1594 cB = c1 1595 b = bucket[cB] 1596 } 1597 sa[b] = int64(k) 1598 b++ 1599 } 1600 } 1601 1602 func induceS_8_64(text []byte, sa, freq, bucket []int64) { 1603 // Initialize positions for right side of character buckets. 1604 bucketMax_8_64(text, freq, bucket) 1605 bucket = bucket[:256] // eliminate bounds check for bucket[cB] below 1606 1607 cB := byte(0) 1608 b := bucket[cB] 1609 1610 for i := len(sa) - 1; i >= 0; i-- { 1611 j := int(sa[i]) 1612 if j >= 0 { 1613 // Skip non-flagged entry. 1614 // (This loop can't see an empty entry; 0 means the real zero index.) 1615 continue 1616 } 1617 1618 // Negative j is a work queue entry; rewrite to positive j for final suffix array. 1619 j = -j 1620 sa[i] = int64(j) 1621 1622 // Index j was on work queue (encoded as -j but now decoded), 1623 // meaning k := j-1 is L-type, 1624 // so we can now place k correctly into sa. 1625 // If k-1 is S-type, queue -k for processing later in this loop. 1626 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. 1627 // If k is zero, k-1 doesn't exist, so we only need to leave it 1628 // for the caller. 1629 k := j - 1 1630 c1 := text[k] 1631 if k > 0 { 1632 if c0 := text[k-1]; c0 <= c1 { 1633 k = -k 1634 } 1635 } 1636 1637 if cB != c1 { 1638 bucket[cB] = b 1639 cB = c1 1640 b = bucket[cB] 1641 } 1642 b-- 1643 sa[b] = int64(k) 1644 } 1645 } 1646 1647 func induceS_32(text []int32, sa, freq, bucket []int32) { 1648 // Initialize positions for right side of character buckets. 1649 bucketMax_32(text, freq, bucket) 1650 1651 cB := int32(0) 1652 b := bucket[cB] 1653 1654 for i := len(sa) - 1; i >= 0; i-- { 1655 j := int(sa[i]) 1656 if j >= 0 { 1657 // Skip non-flagged entry. 1658 // (This loop can't see an empty entry; 0 means the real zero index.) 1659 continue 1660 } 1661 1662 // Negative j is a work queue entry; rewrite to positive j for final suffix array. 1663 j = -j 1664 sa[i] = int32(j) 1665 1666 // Index j was on work queue (encoded as -j but now decoded), 1667 // meaning k := j-1 is L-type, 1668 // so we can now place k correctly into sa. 1669 // If k-1 is S-type, queue -k for processing later in this loop. 1670 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. 1671 // If k is zero, k-1 doesn't exist, so we only need to leave it 1672 // for the caller. 1673 k := j - 1 1674 c1 := text[k] 1675 if k > 0 { 1676 if c0 := text[k-1]; c0 <= c1 { 1677 k = -k 1678 } 1679 } 1680 1681 if cB != c1 { 1682 bucket[cB] = b 1683 cB = c1 1684 b = bucket[cB] 1685 } 1686 b-- 1687 sa[b] = int32(k) 1688 } 1689 } 1690 1691 func induceS_64(text []int64, sa, freq, bucket []int64) { 1692 // Initialize positions for right side of character buckets. 1693 bucketMax_64(text, freq, bucket) 1694 1695 cB := int64(0) 1696 b := bucket[cB] 1697 1698 for i := len(sa) - 1; i >= 0; i-- { 1699 j := int(sa[i]) 1700 if j >= 0 { 1701 // Skip non-flagged entry. 1702 // (This loop can't see an empty entry; 0 means the real zero index.) 1703 continue 1704 } 1705 1706 // Negative j is a work queue entry; rewrite to positive j for final suffix array. 1707 j = -j 1708 sa[i] = int64(j) 1709 1710 // Index j was on work queue (encoded as -j but now decoded), 1711 // meaning k := j-1 is L-type, 1712 // so we can now place k correctly into sa. 1713 // If k-1 is S-type, queue -k for processing later in this loop. 1714 // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. 1715 // If k is zero, k-1 doesn't exist, so we only need to leave it 1716 // for the caller. 1717 k := j - 1 1718 c1 := text[k] 1719 if k > 0 { 1720 if c0 := text[k-1]; c0 <= c1 { 1721 k = -k 1722 } 1723 } 1724 1725 if cB != c1 { 1726 bucket[cB] = b 1727 cB = c1 1728 b = bucket[cB] 1729 } 1730 b-- 1731 sa[b] = int64(k) 1732 } 1733 } 1734