sais2.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Code generated by go generate; DO NOT EDIT.
     6  
     7  package suffixarray
     8  
     9  func text_64(text []byte, sa []int64) {
    10  	if int(int64(len(text))) != len(text) || len(text) != len(sa) {
    11  		panic("suffixarray: misuse of text_64")
    12  	}
    13  	sais_8_64(text, 256, sa, make([]int64, 2*256))
    14  }
    15  
    16  func sais_8_64(text []byte, textMax int, sa, tmp []int64) {
    17  	if len(sa) != len(text) || len(tmp) < textMax {
    18  		panic("suffixarray: misuse of sais_8_64")
    19  	}
    20  
    21  	// Trivial base cases. Sorting 0 or 1 things is easy.
    22  	if len(text) == 0 {
    23  		return
    24  	}
    25  	if len(text) == 1 {
    26  		sa[0] = 0
    27  		return
    28  	}
    29  
    30  	// Establish slices indexed by text character
    31  	// holding character frequency and bucket-sort offsets.
    32  	// If there's only enough tmp for one slice,
    33  	// we make it the bucket offsets and recompute
    34  	// the character frequency each time we need it.
    35  	var freq, bucket []int64
    36  	if len(tmp) >= 2*textMax {
    37  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
    38  		freq[0] = -1 // mark as uninitialized
    39  	} else {
    40  		freq, bucket = nil, tmp[:textMax]
    41  	}
    42  
    43  	// The SAIS algorithm.
    44  	// Each of these calls makes one scan through sa.
    45  	// See the individual functions for documentation
    46  	// about each's role in the algorithm.
    47  	numLMS := placeLMS_8_64(text, sa, freq, bucket)
    48  	if numLMS <= 1 {
    49  		// 0 or 1 items are already sorted. Do nothing.
    50  	} else {
    51  		induceSubL_8_64(text, sa, freq, bucket)
    52  		induceSubS_8_64(text, sa, freq, bucket)
    53  		length_8_64(text, sa, numLMS)
    54  		maxID := assignID_8_64(text, sa, numLMS)
    55  		if maxID < numLMS {
    56  			map_64(sa, numLMS)
    57  			recurse_64(sa, tmp, numLMS, maxID)
    58  			unmap_8_64(text, sa, numLMS)
    59  		} else {
    60  			// If maxID == numLMS, then each LMS-substring
    61  			// is unique, so the relative ordering of two LMS-suffixes
    62  			// is determined by just the leading LMS-substring.
    63  			// That is, the LMS-suffix sort order matches the
    64  			// (simpler) LMS-substring sort order.
    65  			// Copy the original LMS-substring order into the
    66  			// suffix array destination.
    67  			copy(sa, sa[len(sa)-numLMS:])
    68  		}
    69  		expand_8_64(text, freq, bucket, sa, numLMS)
    70  	}
    71  	induceL_8_64(text, sa, freq, bucket)
    72  	induceS_8_64(text, sa, freq, bucket)
    73  
    74  	// Mark for caller that we overwrote tmp.
    75  	tmp[0] = -1
    76  }
    77  
    78  func sais_32(text []int32, textMax int, sa, tmp []int32) {
    79  	if len(sa) != len(text) || len(tmp) < textMax {
    80  		panic("suffixarray: misuse of sais_32")
    81  	}
    82  
    83  	// Trivial base cases. Sorting 0 or 1 things is easy.
    84  	if len(text) == 0 {
    85  		return
    86  	}
    87  	if len(text) == 1 {
    88  		sa[0] = 0
    89  		return
    90  	}
    91  
    92  	// Establish slices indexed by text character
    93  	// holding character frequency and bucket-sort offsets.
    94  	// If there's only enough tmp for one slice,
    95  	// we make it the bucket offsets and recompute
    96  	// the character frequency each time we need it.
    97  	var freq, bucket []int32
    98  	if len(tmp) >= 2*textMax {
    99  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   100  		freq[0] = -1 // mark as uninitialized
   101  	} else {
   102  		freq, bucket = nil, tmp[:textMax]
   103  	}
   104  
   105  	// The SAIS algorithm.
   106  	// Each of these calls makes one scan through sa.
   107  	// See the individual functions for documentation
   108  	// about each's role in the algorithm.
   109  	numLMS := placeLMS_32(text, sa, freq, bucket)
   110  	if numLMS <= 1 {
   111  		// 0 or 1 items are already sorted. Do nothing.
   112  	} else {
   113  		induceSubL_32(text, sa, freq, bucket)
   114  		induceSubS_32(text, sa, freq, bucket)
   115  		length_32(text, sa, numLMS)
   116  		maxID := assignID_32(text, sa, numLMS)
   117  		if maxID < numLMS {
   118  			map_32(sa, numLMS)
   119  			recurse_32(sa, tmp, numLMS, maxID)
   120  			unmap_32(text, sa, numLMS)
   121  		} else {
   122  			// If maxID == numLMS, then each LMS-substring
   123  			// is unique, so the relative ordering of two LMS-suffixes
   124  			// is determined by just the leading LMS-substring.
   125  			// That is, the LMS-suffix sort order matches the
   126  			// (simpler) LMS-substring sort order.
   127  			// Copy the original LMS-substring order into the
   128  			// suffix array destination.
   129  			copy(sa, sa[len(sa)-numLMS:])
   130  		}
   131  		expand_32(text, freq, bucket, sa, numLMS)
   132  	}
   133  	induceL_32(text, sa, freq, bucket)
   134  	induceS_32(text, sa, freq, bucket)
   135  
   136  	// Mark for caller that we overwrote tmp.
   137  	tmp[0] = -1
   138  }
   139  
   140  func sais_64(text []int64, textMax int, sa, tmp []int64) {
   141  	if len(sa) != len(text) || len(tmp) < textMax {
   142  		panic("suffixarray: misuse of sais_64")
   143  	}
   144  
   145  	// Trivial base cases. Sorting 0 or 1 things is easy.
   146  	if len(text) == 0 {
   147  		return
   148  	}
   149  	if len(text) == 1 {
   150  		sa[0] = 0
   151  		return
   152  	}
   153  
   154  	// Establish slices indexed by text character
   155  	// holding character frequency and bucket-sort offsets.
   156  	// If there's only enough tmp for one slice,
   157  	// we make it the bucket offsets and recompute
   158  	// the character frequency each time we need it.
   159  	var freq, bucket []int64
   160  	if len(tmp) >= 2*textMax {
   161  		freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   162  		freq[0] = -1 // mark as uninitialized
   163  	} else {
   164  		freq, bucket = nil, tmp[:textMax]
   165  	}
   166  
   167  	// The SAIS algorithm.
   168  	// Each of these calls makes one scan through sa.
   169  	// See the individual functions for documentation
   170  	// about each's role in the algorithm.
   171  	numLMS := placeLMS_64(text, sa, freq, bucket)
   172  	if numLMS <= 1 {
   173  		// 0 or 1 items are already sorted. Do nothing.
   174  	} else {
   175  		induceSubL_64(text, sa, freq, bucket)
   176  		induceSubS_64(text, sa, freq, bucket)
   177  		length_64(text, sa, numLMS)
   178  		maxID := assignID_64(text, sa, numLMS)
   179  		if maxID < numLMS {
   180  			map_64(sa, numLMS)
   181  			recurse_64(sa, tmp, numLMS, maxID)
   182  			unmap_64(text, sa, numLMS)
   183  		} else {
   184  			// If maxID == numLMS, then each LMS-substring
   185  			// is unique, so the relative ordering of two LMS-suffixes
   186  			// is determined by just the leading LMS-substring.
   187  			// That is, the LMS-suffix sort order matches the
   188  			// (simpler) LMS-substring sort order.
   189  			// Copy the original LMS-substring order into the
   190  			// suffix array destination.
   191  			copy(sa, sa[len(sa)-numLMS:])
   192  		}
   193  		expand_64(text, freq, bucket, sa, numLMS)
   194  	}
   195  	induceL_64(text, sa, freq, bucket)
   196  	induceS_64(text, sa, freq, bucket)
   197  
   198  	// Mark for caller that we overwrote tmp.
   199  	tmp[0] = -1
   200  }
   201  
   202  func freq_8_64(text []byte, freq, bucket []int64) []int64 {
   203  	if freq != nil && freq[0] >= 0 {
   204  		return freq // already computed
   205  	}
   206  	if freq == nil {
   207  		freq = bucket
   208  	}
   209  
   210  	freq = freq[:256] // eliminate bounds check for freq[c] below
   211  	clear(freq)
   212  	for _, c := range text {
   213  		freq[c]++
   214  	}
   215  	return freq
   216  }
   217  
   218  func freq_32(text []int32, freq, bucket []int32) []int32 {
   219  	if freq != nil && freq[0] >= 0 {
   220  		return freq // already computed
   221  	}
   222  	if freq == nil {
   223  		freq = bucket
   224  	}
   225  
   226  	clear(freq)
   227  	for _, c := range text {
   228  		freq[c]++
   229  	}
   230  	return freq
   231  }
   232  
   233  func freq_64(text []int64, freq, bucket []int64) []int64 {
   234  	if freq != nil && freq[0] >= 0 {
   235  		return freq // already computed
   236  	}
   237  	if freq == nil {
   238  		freq = bucket
   239  	}
   240  
   241  	clear(freq)
   242  	for _, c := range text {
   243  		freq[c]++
   244  	}
   245  	return freq
   246  }
   247  
   248  func bucketMin_8_64(text []byte, freq, bucket []int64) {
   249  	freq = freq_8_64(text, freq, bucket)
   250  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   251  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   252  	total := int64(0)
   253  	for i, n := range freq {
   254  		bucket[i] = total
   255  		total += n
   256  	}
   257  }
   258  
   259  func bucketMin_32(text []int32, freq, bucket []int32) {
   260  	freq = freq_32(text, freq, bucket)
   261  	total := int32(0)
   262  	for i, n := range freq {
   263  		bucket[i] = total
   264  		total += n
   265  	}
   266  }
   267  
   268  func bucketMin_64(text []int64, freq, bucket []int64) {
   269  	freq = freq_64(text, freq, bucket)
   270  	total := int64(0)
   271  	for i, n := range freq {
   272  		bucket[i] = total
   273  		total += n
   274  	}
   275  }
   276  
   277  func bucketMax_8_64(text []byte, freq, bucket []int64) {
   278  	freq = freq_8_64(text, freq, bucket)
   279  	freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   280  	bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   281  	total := int64(0)
   282  	for i, n := range freq {
   283  		total += n
   284  		bucket[i] = total
   285  	}
   286  }
   287  
   288  func bucketMax_32(text []int32, freq, bucket []int32) {
   289  	freq = freq_32(text, freq, bucket)
   290  	total := int32(0)
   291  	for i, n := range freq {
   292  		total += n
   293  		bucket[i] = total
   294  	}
   295  }
   296  
   297  func bucketMax_64(text []int64, freq, bucket []int64) {
   298  	freq = freq_64(text, freq, bucket)
   299  	total := int64(0)
   300  	for i, n := range freq {
   301  		total += n
   302  		bucket[i] = total
   303  	}
   304  }
   305  
   306  func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int {
   307  	bucketMax_8_64(text, freq, bucket)
   308  
   309  	numLMS := 0
   310  	lastB := int64(-1)
   311  	bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
   312  
   313  	// The next stanza of code (until the blank line) loop backward
   314  	// over text, stopping to execute a code body at each position i
   315  	// such that text[i] is an L-character and text[i+1] is an S-character.
   316  	// That is, i+1 is the position of the start of an LMS-substring.
   317  	// These could be hoisted out into a function with a callback,
   318  	// but at a significant speed cost. Instead, we just write these
   319  	// seven lines a few times in this source file. The copies below
   320  	// refer back to the pattern established by this original as the
   321  	// "LMS-substring iterator".
   322  	//
   323  	// In every scan through the text, c0, c1 are successive characters of text.
   324  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
   325  	// By scanning backward, we can keep track of whether the current
   326  	// position is type-S or type-L according to the usual definition:
   327  	//
   328  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   329  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   330  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   331  	//
   332  	// The backward scan lets us maintain the current type,
   333  	// update it when we see c0 != c1, and otherwise leave it alone.
   334  	// We want to identify all S positions with a preceding L.
   335  	// Position len(text) is one such position by definition, but we have
   336  	// nowhere to write it down, so we eliminate it by untruthfully
   337  	// setting isTypeS = false at the start of the loop.
   338  	c0, c1, isTypeS := byte(0), byte(0), false
   339  	for i := len(text) - 1; i >= 0; i-- {
   340  		c0, c1 = text[i], c0
   341  		if c0 < c1 {
   342  			isTypeS = true
   343  		} else if c0 > c1 && isTypeS {
   344  			isTypeS = false
   345  
   346  			// Bucket the index i+1 for the start of an LMS-substring.
   347  			b := bucket[c1] - 1
   348  			bucket[c1] = b
   349  			sa[b] = int64(i + 1)
   350  			lastB = b
   351  			numLMS++
   352  		}
   353  	}
   354  
   355  	// We recorded the LMS-substring starts but really want the ends.
   356  	// Luckily, with two differences, the start indexes and the end indexes are the same.
   357  	// The first difference is that the rightmost LMS-substring's end index is len(text),
   358  	// so the caller must pretend that sa[-1] == len(text), as noted above.
   359  	// The second difference is that the first leftmost LMS-substring start index
   360  	// does not end an earlier LMS-substring, so as an optimization we can omit
   361  	// that leftmost LMS-substring start index (the last one we wrote).
   362  	//
   363  	// Exception: if numLMS <= 1, the caller is not going to bother with
   364  	// the recursion at all and will treat the result as containing LMS-substring starts.
   365  	// In that case, we don't remove the final entry.
   366  	if numLMS > 1 {
   367  		sa[lastB] = 0
   368  	}
   369  	return numLMS
   370  }
   371  
   372  func placeLMS_32(text []int32, sa, freq, bucket []int32) int {
   373  	bucketMax_32(text, freq, bucket)
   374  
   375  	numLMS := 0
   376  	lastB := int32(-1)
   377  
   378  	// The next stanza of code (until the blank line) loop backward
   379  	// over text, stopping to execute a code body at each position i
   380  	// such that text[i] is an L-character and text[i+1] is an S-character.
   381  	// That is, i+1 is the position of the start of an LMS-substring.
   382  	// These could be hoisted out into a function with a callback,
   383  	// but at a significant speed cost. Instead, we just write these
   384  	// seven lines a few times in this source file. The copies below
   385  	// refer back to the pattern established by this original as the
   386  	// "LMS-substring iterator".
   387  	//
   388  	// In every scan through the text, c0, c1 are successive characters of text.
   389  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
   390  	// By scanning backward, we can keep track of whether the current
   391  	// position is type-S or type-L according to the usual definition:
   392  	//
   393  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   394  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   395  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   396  	//
   397  	// The backward scan lets us maintain the current type,
   398  	// update it when we see c0 != c1, and otherwise leave it alone.
   399  	// We want to identify all S positions with a preceding L.
   400  	// Position len(text) is one such position by definition, but we have
   401  	// nowhere to write it down, so we eliminate it by untruthfully
   402  	// setting isTypeS = false at the start of the loop.
   403  	c0, c1, isTypeS := int32(0), int32(0), false
   404  	for i := len(text) - 1; i >= 0; i-- {
   405  		c0, c1 = text[i], c0
   406  		if c0 < c1 {
   407  			isTypeS = true
   408  		} else if c0 > c1 && isTypeS {
   409  			isTypeS = false
   410  
   411  			// Bucket the index i+1 for the start of an LMS-substring.
   412  			b := bucket[c1] - 1
   413  			bucket[c1] = b
   414  			sa[b] = int32(i + 1)
   415  			lastB = b
   416  			numLMS++
   417  		}
   418  	}
   419  
   420  	// We recorded the LMS-substring starts but really want the ends.
   421  	// Luckily, with two differences, the start indexes and the end indexes are the same.
   422  	// The first difference is that the rightmost LMS-substring's end index is len(text),
   423  	// so the caller must pretend that sa[-1] == len(text), as noted above.
   424  	// The second difference is that the first leftmost LMS-substring start index
   425  	// does not end an earlier LMS-substring, so as an optimization we can omit
   426  	// that leftmost LMS-substring start index (the last one we wrote).
   427  	//
   428  	// Exception: if numLMS <= 1, the caller is not going to bother with
   429  	// the recursion at all and will treat the result as containing LMS-substring starts.
   430  	// In that case, we don't remove the final entry.
   431  	if numLMS > 1 {
   432  		sa[lastB] = 0
   433  	}
   434  	return numLMS
   435  }
   436  
   437  func placeLMS_64(text []int64, sa, freq, bucket []int64) int {
   438  	bucketMax_64(text, freq, bucket)
   439  
   440  	numLMS := 0
   441  	lastB := int64(-1)
   442  
   443  	// The next stanza of code (until the blank line) loop backward
   444  	// over text, stopping to execute a code body at each position i
   445  	// such that text[i] is an L-character and text[i+1] is an S-character.
   446  	// That is, i+1 is the position of the start of an LMS-substring.
   447  	// These could be hoisted out into a function with a callback,
   448  	// but at a significant speed cost. Instead, we just write these
   449  	// seven lines a few times in this source file. The copies below
   450  	// refer back to the pattern established by this original as the
   451  	// "LMS-substring iterator".
   452  	//
   453  	// In every scan through the text, c0, c1 are successive characters of text.
   454  	// In this backward scan, c0 == text[i] and c1 == text[i+1].
   455  	// By scanning backward, we can keep track of whether the current
   456  	// position is type-S or type-L according to the usual definition:
   457  	//
   458  	//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   459  	//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   460  	//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   461  	//
   462  	// The backward scan lets us maintain the current type,
   463  	// update it when we see c0 != c1, and otherwise leave it alone.
   464  	// We want to identify all S positions with a preceding L.
   465  	// Position len(text) is one such position by definition, but we have
   466  	// nowhere to write it down, so we eliminate it by untruthfully
   467  	// setting isTypeS = false at the start of the loop.
   468  	c0, c1, isTypeS := int64(0), int64(0), false
   469  	for i := len(text) - 1; i >= 0; i-- {
   470  		c0, c1 = text[i], c0
   471  		if c0 < c1 {
   472  			isTypeS = true
   473  		} else if c0 > c1 && isTypeS {
   474  			isTypeS = false
   475  
   476  			// Bucket the index i+1 for the start of an LMS-substring.
   477  			b := bucket[c1] - 1
   478  			bucket[c1] = b
   479  			sa[b] = int64(i + 1)
   480  			lastB = b
   481  			numLMS++
   482  		}
   483  	}
   484  
   485  	// We recorded the LMS-substring starts but really want the ends.
   486  	// Luckily, with two differences, the start indexes and the end indexes are the same.
   487  	// The first difference is that the rightmost LMS-substring's end index is len(text),
   488  	// so the caller must pretend that sa[-1] == len(text), as noted above.
   489  	// The second difference is that the first leftmost LMS-substring start index
   490  	// does not end an earlier LMS-substring, so as an optimization we can omit
   491  	// that leftmost LMS-substring start index (the last one we wrote).
   492  	//
   493  	// Exception: if numLMS <= 1, the caller is not going to bother with
   494  	// the recursion at all and will treat the result as containing LMS-substring starts.
   495  	// In that case, we don't remove the final entry.
   496  	if numLMS > 1 {
   497  		sa[lastB] = 0
   498  	}
   499  	return numLMS
   500  }
   501  
   502  func induceSubL_8_64(text []byte, sa, freq, bucket []int64) {
   503  	// Initialize positions for left side of character buckets.
   504  	bucketMin_8_64(text, freq, bucket)
   505  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   506  
   507  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   508  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   509  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
   510  	// But we want to distinguish a j-1 with j-2 of type L from type S.
   511  	// We can process the former but want to leave the latter for the caller.
   512  	// We record the difference by negating j-1 if it is preceded by type S.
   513  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   514  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   515  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   516  	// and so on, in sorted but not necessarily adjacent order, until it finds
   517  	// one preceded by an index of type S, at which point it must stop.
   518  	//
   519  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   520  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   521  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
   522  	//
   523  	// The suffix array sa therefore serves simultaneously as input, output,
   524  	// and a miraculously well-tailored work queue.
   525  
   526  	// placeLMS_8_64 left out the implicit entry sa[-1] == len(text),
   527  	// corresponding to the identified type-L index len(text)-1.
   528  	// Process it before the left-to-right scan of sa proper.
   529  	// See body in loop for commentary.
   530  	k := len(text) - 1
   531  	c0, c1 := text[k-1], text[k]
   532  	if c0 < c1 {
   533  		k = -k
   534  	}
   535  
   536  	// Cache recently used bucket index:
   537  	// we're processing suffixes in sorted order
   538  	// and accessing buckets indexed by the
   539  	// byte before the sorted order, which still
   540  	// has very good locality.
   541  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
   542  	cB := c1
   543  	b := bucket[cB]
   544  	sa[b] = int64(k)
   545  	b++
   546  
   547  	for i := 0; i < len(sa); i++ {
   548  		j := int(sa[i])
   549  		if j == 0 {
   550  			// Skip empty entry.
   551  			continue
   552  		}
   553  		if j < 0 {
   554  			// Leave discovered type-S index for caller.
   555  			sa[i] = int64(-j)
   556  			continue
   557  		}
   558  		sa[i] = 0
   559  
   560  		// Index j was on work queue, meaning k := j-1 is L-type,
   561  		// so we can now place k correctly into sa.
   562  		// If k-1 is L-type, queue k for processing later in this loop.
   563  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   564  		k := j - 1
   565  		c0, c1 := text[k-1], text[k]
   566  		if c0 < c1 {
   567  			k = -k
   568  		}
   569  
   570  		if cB != c1 {
   571  			bucket[cB] = b
   572  			cB = c1
   573  			b = bucket[cB]
   574  		}
   575  		sa[b] = int64(k)
   576  		b++
   577  	}
   578  }
   579  
   580  func induceSubL_32(text []int32, sa, freq, bucket []int32) {
   581  	// Initialize positions for left side of character buckets.
   582  	bucketMin_32(text, freq, bucket)
   583  
   584  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   585  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   586  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
   587  	// But we want to distinguish a j-1 with j-2 of type L from type S.
   588  	// We can process the former but want to leave the latter for the caller.
   589  	// We record the difference by negating j-1 if it is preceded by type S.
   590  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   591  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   592  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   593  	// and so on, in sorted but not necessarily adjacent order, until it finds
   594  	// one preceded by an index of type S, at which point it must stop.
   595  	//
   596  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   597  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   598  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
   599  	//
   600  	// The suffix array sa therefore serves simultaneously as input, output,
   601  	// and a miraculously well-tailored work queue.
   602  
   603  	// placeLMS_32 left out the implicit entry sa[-1] == len(text),
   604  	// corresponding to the identified type-L index len(text)-1.
   605  	// Process it before the left-to-right scan of sa proper.
   606  	// See body in loop for commentary.
   607  	k := len(text) - 1
   608  	c0, c1 := text[k-1], text[k]
   609  	if c0 < c1 {
   610  		k = -k
   611  	}
   612  
   613  	// Cache recently used bucket index:
   614  	// we're processing suffixes in sorted order
   615  	// and accessing buckets indexed by the
   616  	// int32 before the sorted order, which still
   617  	// has very good locality.
   618  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
   619  	cB := c1
   620  	b := bucket[cB]
   621  	sa[b] = int32(k)
   622  	b++
   623  
   624  	for i := 0; i < len(sa); i++ {
   625  		j := int(sa[i])
   626  		if j == 0 {
   627  			// Skip empty entry.
   628  			continue
   629  		}
   630  		if j < 0 {
   631  			// Leave discovered type-S index for caller.
   632  			sa[i] = int32(-j)
   633  			continue
   634  		}
   635  		sa[i] = 0
   636  
   637  		// Index j was on work queue, meaning k := j-1 is L-type,
   638  		// so we can now place k correctly into sa.
   639  		// If k-1 is L-type, queue k for processing later in this loop.
   640  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   641  		k := j - 1
   642  		c0, c1 := text[k-1], text[k]
   643  		if c0 < c1 {
   644  			k = -k
   645  		}
   646  
   647  		if cB != c1 {
   648  			bucket[cB] = b
   649  			cB = c1
   650  			b = bucket[cB]
   651  		}
   652  		sa[b] = int32(k)
   653  		b++
   654  	}
   655  }
   656  
   657  func induceSubL_64(text []int64, sa, freq, bucket []int64) {
   658  	// Initialize positions for left side of character buckets.
   659  	bucketMin_64(text, freq, bucket)
   660  
   661  	// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   662  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   663  	// Because j-1 is type L, inserting it into sa now will sort it correctly.
   664  	// But we want to distinguish a j-1 with j-2 of type L from type S.
   665  	// We can process the former but want to leave the latter for the caller.
   666  	// We record the difference by negating j-1 if it is preceded by type S.
   667  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   668  	// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   669  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   670  	// and so on, in sorted but not necessarily adjacent order, until it finds
   671  	// one preceded by an index of type S, at which point it must stop.
   672  	//
   673  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   674  	// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   675  	// only the indexes of the leftmost L-type indexes for each LMS-substring.
   676  	//
   677  	// The suffix array sa therefore serves simultaneously as input, output,
   678  	// and a miraculously well-tailored work queue.
   679  
   680  	// placeLMS_64 left out the implicit entry sa[-1] == len(text),
   681  	// corresponding to the identified type-L index len(text)-1.
   682  	// Process it before the left-to-right scan of sa proper.
   683  	// See body in loop for commentary.
   684  	k := len(text) - 1
   685  	c0, c1 := text[k-1], text[k]
   686  	if c0 < c1 {
   687  		k = -k
   688  	}
   689  
   690  	// Cache recently used bucket index:
   691  	// we're processing suffixes in sorted order
   692  	// and accessing buckets indexed by the
   693  	// int64 before the sorted order, which still
   694  	// has very good locality.
   695  	// Invariant: b is cached, possibly dirty copy of bucket[cB].
   696  	cB := c1
   697  	b := bucket[cB]
   698  	sa[b] = int64(k)
   699  	b++
   700  
   701  	for i := 0; i < len(sa); i++ {
   702  		j := int(sa[i])
   703  		if j == 0 {
   704  			// Skip empty entry.
   705  			continue
   706  		}
   707  		if j < 0 {
   708  			// Leave discovered type-S index for caller.
   709  			sa[i] = int64(-j)
   710  			continue
   711  		}
   712  		sa[i] = 0
   713  
   714  		// Index j was on work queue, meaning k := j-1 is L-type,
   715  		// so we can now place k correctly into sa.
   716  		// If k-1 is L-type, queue k for processing later in this loop.
   717  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   718  		k := j - 1
   719  		c0, c1 := text[k-1], text[k]
   720  		if c0 < c1 {
   721  			k = -k
   722  		}
   723  
   724  		if cB != c1 {
   725  			bucket[cB] = b
   726  			cB = c1
   727  			b = bucket[cB]
   728  		}
   729  		sa[b] = int64(k)
   730  		b++
   731  	}
   732  }
   733  
   734  func induceSubS_8_64(text []byte, sa, freq, bucket []int64) {
   735  	// Initialize positions for right side of character buckets.
   736  	bucketMax_8_64(text, freq, bucket)
   737  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   738  
   739  	// Analogous to induceSubL_8_64 above,
   740  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   741  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   742  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
   743  	// But we want to distinguish a j-1 with j-2 of type S from type L.
   744  	// We can process the former but want to leave the latter for the caller.
   745  	// We record the difference by negating j-1 if it is preceded by type L.
   746  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   747  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   748  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   749  	// and so on, in sorted but not necessarily adjacent order, until it finds
   750  	// one preceded by an index of type L, at which point it must stop.
   751  	// That index (preceded by one of type L) is an LMS-substring start.
   752  	//
   753  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   754  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   755  	// so that the loop finishes with the top of sa containing exactly
   756  	// the LMS-substring start indexes, sorted by LMS-substring.
   757  
   758  	// Cache recently used bucket index:
   759  	cB := byte(0)
   760  	b := bucket[cB]
   761  
   762  	top := len(sa)
   763  	for i := len(sa) - 1; i >= 0; i-- {
   764  		j := int(sa[i])
   765  		if j == 0 {
   766  			// Skip empty entry.
   767  			continue
   768  		}
   769  		sa[i] = 0
   770  		if j < 0 {
   771  			// Leave discovered LMS-substring start index for caller.
   772  			top--
   773  			sa[top] = int64(-j)
   774  			continue
   775  		}
   776  
   777  		// Index j was on work queue, meaning k := j-1 is S-type,
   778  		// so we can now place k correctly into sa.
   779  		// If k-1 is S-type, queue k for processing later in this loop.
   780  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   781  		k := j - 1
   782  		c1 := text[k]
   783  		c0 := text[k-1]
   784  		if c0 > c1 {
   785  			k = -k
   786  		}
   787  
   788  		if cB != c1 {
   789  			bucket[cB] = b
   790  			cB = c1
   791  			b = bucket[cB]
   792  		}
   793  		b--
   794  		sa[b] = int64(k)
   795  	}
   796  }
   797  
   798  func induceSubS_32(text []int32, sa, freq, bucket []int32) {
   799  	// Initialize positions for right side of character buckets.
   800  	bucketMax_32(text, freq, bucket)
   801  
   802  	// Analogous to induceSubL_32 above,
   803  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   804  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   805  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
   806  	// But we want to distinguish a j-1 with j-2 of type S from type L.
   807  	// We can process the former but want to leave the latter for the caller.
   808  	// We record the difference by negating j-1 if it is preceded by type L.
   809  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   810  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   811  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   812  	// and so on, in sorted but not necessarily adjacent order, until it finds
   813  	// one preceded by an index of type L, at which point it must stop.
   814  	// That index (preceded by one of type L) is an LMS-substring start.
   815  	//
   816  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   817  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   818  	// so that the loop finishes with the top of sa containing exactly
   819  	// the LMS-substring start indexes, sorted by LMS-substring.
   820  
   821  	// Cache recently used bucket index:
   822  	cB := int32(0)
   823  	b := bucket[cB]
   824  
   825  	top := len(sa)
   826  	for i := len(sa) - 1; i >= 0; i-- {
   827  		j := int(sa[i])
   828  		if j == 0 {
   829  			// Skip empty entry.
   830  			continue
   831  		}
   832  		sa[i] = 0
   833  		if j < 0 {
   834  			// Leave discovered LMS-substring start index for caller.
   835  			top--
   836  			sa[top] = int32(-j)
   837  			continue
   838  		}
   839  
   840  		// Index j was on work queue, meaning k := j-1 is S-type,
   841  		// so we can now place k correctly into sa.
   842  		// If k-1 is S-type, queue k for processing later in this loop.
   843  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   844  		k := j - 1
   845  		c1 := text[k]
   846  		c0 := text[k-1]
   847  		if c0 > c1 {
   848  			k = -k
   849  		}
   850  
   851  		if cB != c1 {
   852  			bucket[cB] = b
   853  			cB = c1
   854  			b = bucket[cB]
   855  		}
   856  		b--
   857  		sa[b] = int32(k)
   858  	}
   859  }
   860  
   861  func induceSubS_64(text []int64, sa, freq, bucket []int64) {
   862  	// Initialize positions for right side of character buckets.
   863  	bucketMax_64(text, freq, bucket)
   864  
   865  	// Analogous to induceSubL_64 above,
   866  	// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   867  	// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   868  	// Because j-1 is type S, inserting it into sa now will sort it correctly.
   869  	// But we want to distinguish a j-1 with j-2 of type S from type L.
   870  	// We can process the former but want to leave the latter for the caller.
   871  	// We record the difference by negating j-1 if it is preceded by type L.
   872  	// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   873  	// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   874  	// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   875  	// and so on, in sorted but not necessarily adjacent order, until it finds
   876  	// one preceded by an index of type L, at which point it must stop.
   877  	// That index (preceded by one of type L) is an LMS-substring start.
   878  	//
   879  	// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   880  	// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   881  	// so that the loop finishes with the top of sa containing exactly
   882  	// the LMS-substring start indexes, sorted by LMS-substring.
   883  
   884  	// Cache recently used bucket index:
   885  	cB := int64(0)
   886  	b := bucket[cB]
   887  
   888  	top := len(sa)
   889  	for i := len(sa) - 1; i >= 0; i-- {
   890  		j := int(sa[i])
   891  		if j == 0 {
   892  			// Skip empty entry.
   893  			continue
   894  		}
   895  		sa[i] = 0
   896  		if j < 0 {
   897  			// Leave discovered LMS-substring start index for caller.
   898  			top--
   899  			sa[top] = int64(-j)
   900  			continue
   901  		}
   902  
   903  		// Index j was on work queue, meaning k := j-1 is S-type,
   904  		// so we can now place k correctly into sa.
   905  		// If k-1 is S-type, queue k for processing later in this loop.
   906  		// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   907  		k := j - 1
   908  		c1 := text[k]
   909  		c0 := text[k-1]
   910  		if c0 > c1 {
   911  			k = -k
   912  		}
   913  
   914  		if cB != c1 {
   915  			bucket[cB] = b
   916  			cB = c1
   917  			b = bucket[cB]
   918  		}
   919  		b--
   920  		sa[b] = int64(k)
   921  	}
   922  }
   923  
   924  func length_8_64(text []byte, sa []int64, numLMS int) {
   925  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   926  
   927  	// The encoding of N text bytes into a “length” word
   928  	// adds 1 to each byte, packs them into the bottom
   929  	// N*8 bits of a word, and then bitwise inverts the result.
   930  	// That is, the text sequence A B C (hex 41 42 43)
   931  	// encodes as ^uint64(0x42_43_44).
   932  	// LMS-substrings can never start or end with 0xFF.
   933  	// Adding 1 ensures the encoded byte sequence never
   934  	// starts or ends with 0x00, so that present bytes can be
   935  	// distinguished from zero-padding in the top bits,
   936  	// so the length need not be separately encoded.
   937  	// Inverting the bytes increases the chance that a
   938  	// 4-byte encoding will still be ≥ len(text).
   939  	// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
   940  	// then the high bit of the inversion will be set,
   941  	// making it clearly not a valid length (it would be a negative one).
   942  	//
   943  	// cx holds the pre-inverted encoding (the packed incremented bytes).
   944  	cx := uint64(0) // byte-only
   945  
   946  	// This stanza (until the blank line) is the "LMS-substring iterator",
   947  	// described in placeLMS_8_64 above, with one line added to maintain cx.
   948  	c0, c1, isTypeS := byte(0), byte(0), false
   949  	for i := len(text) - 1; i >= 0; i-- {
   950  		c0, c1 = text[i], c0
   951  		cx = cx<<8 | uint64(c1+1) // byte-only
   952  		if c0 < c1 {
   953  			isTypeS = true
   954  		} else if c0 > c1 && isTypeS {
   955  			isTypeS = false
   956  
   957  			// Index j = i+1 is the start of an LMS-substring.
   958  			// Compute length or encoded text to store in sa[j/2].
   959  			j := i + 1
   960  			var code int64
   961  			if end == 0 {
   962  				code = 0
   963  			} else {
   964  				code = int64(end - j)
   965  				if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only
   966  					code = int64(^cx) // byte-only
   967  				} // byte-only
   968  			}
   969  			sa[j>>1] = code
   970  			end = j + 1
   971  			cx = uint64(c1 + 1) // byte-only
   972  		}
   973  	}
   974  }
   975  
   976  func length_32(text []int32, sa []int32, numLMS int) {
   977  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   978  
   979  	// The encoding of N text int32s into a “length” word
   980  	// adds 1 to each int32, packs them into the bottom
   981  	// N*8 bits of a word, and then bitwise inverts the result.
   982  	// That is, the text sequence A B C (hex 41 42 43)
   983  	// encodes as ^uint32(0x42_43_44).
   984  	// LMS-substrings can never start or end with 0xFF.
   985  	// Adding 1 ensures the encoded int32 sequence never
   986  	// starts or ends with 0x00, so that present int32s can be
   987  	// distinguished from zero-padding in the top bits,
   988  	// so the length need not be separately encoded.
   989  	// Inverting the int32s increases the chance that a
   990  	// 4-int32 encoding will still be ≥ len(text).
   991  	// In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F)
   992  	// then the high bit of the inversion will be set,
   993  	// making it clearly not a valid length (it would be a negative one).
   994  	//
   995  	// cx holds the pre-inverted encoding (the packed incremented int32s).
   996  
   997  	// This stanza (until the blank line) is the "LMS-substring iterator",
   998  	// described in placeLMS_32 above, with one line added to maintain cx.
   999  	c0, c1, isTypeS := int32(0), int32(0), false
  1000  	for i := len(text) - 1; i >= 0; i-- {
  1001  		c0, c1 = text[i], c0
  1002  		if c0 < c1 {
  1003  			isTypeS = true
  1004  		} else if c0 > c1 && isTypeS {
  1005  			isTypeS = false
  1006  
  1007  			// Index j = i+1 is the start of an LMS-substring.
  1008  			// Compute length or encoded text to store in sa[j/2].
  1009  			j := i + 1
  1010  			var code int32
  1011  			if end == 0 {
  1012  				code = 0
  1013  			} else {
  1014  				code = int32(end - j)
  1015  			}
  1016  			sa[j>>1] = code
  1017  			end = j + 1
  1018  		}
  1019  	}
  1020  }
  1021  
  1022  func length_64(text []int64, sa []int64, numLMS int) {
  1023  	end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
  1024  
  1025  	// The encoding of N text int64s into a “length” word
  1026  	// adds 1 to each int64, packs them into the bottom
  1027  	// N*8 bits of a word, and then bitwise inverts the result.
  1028  	// That is, the text sequence A B C (hex 41 42 43)
  1029  	// encodes as ^uint64(0x42_43_44).
  1030  	// LMS-substrings can never start or end with 0xFF.
  1031  	// Adding 1 ensures the encoded int64 sequence never
  1032  	// starts or ends with 0x00, so that present int64s can be
  1033  	// distinguished from zero-padding in the top bits,
  1034  	// so the length need not be separately encoded.
  1035  	// Inverting the int64s increases the chance that a
  1036  	// 4-int64 encoding will still be ≥ len(text).
  1037  	// In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F)
  1038  	// then the high bit of the inversion will be set,
  1039  	// making it clearly not a valid length (it would be a negative one).
  1040  	//
  1041  	// cx holds the pre-inverted encoding (the packed incremented int64s).
  1042  
  1043  	// This stanza (until the blank line) is the "LMS-substring iterator",
  1044  	// described in placeLMS_64 above, with one line added to maintain cx.
  1045  	c0, c1, isTypeS := int64(0), int64(0), false
  1046  	for i := len(text) - 1; i >= 0; i-- {
  1047  		c0, c1 = text[i], c0
  1048  		if c0 < c1 {
  1049  			isTypeS = true
  1050  		} else if c0 > c1 && isTypeS {
  1051  			isTypeS = false
  1052  
  1053  			// Index j = i+1 is the start of an LMS-substring.
  1054  			// Compute length or encoded text to store in sa[j/2].
  1055  			j := i + 1
  1056  			var code int64
  1057  			if end == 0 {
  1058  				code = 0
  1059  			} else {
  1060  				code = int64(end - j)
  1061  			}
  1062  			sa[j>>1] = code
  1063  			end = j + 1
  1064  		}
  1065  	}
  1066  }
  1067  
  1068  func assignID_8_64(text []byte, sa []int64, numLMS int) int {
  1069  	id := 0
  1070  	lastLen := int64(-1) // impossible
  1071  	lastPos := int64(0)
  1072  	for _, j := range sa[len(sa)-numLMS:] {
  1073  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1074  		n := sa[j/2]
  1075  		if n != lastLen {
  1076  			goto New
  1077  		}
  1078  		if uint64(n) >= uint64(len(text)) {
  1079  			// “Length” is really encoded full text, and they match.
  1080  			goto Same
  1081  		}
  1082  		{
  1083  			// Compare actual texts.
  1084  			n := int(n)
  1085  			this := text[j:][:n]
  1086  			last := text[lastPos:][:n]
  1087  			for i := 0; i < n; i++ {
  1088  				if this[i] != last[i] {
  1089  					goto New
  1090  				}
  1091  			}
  1092  			goto Same
  1093  		}
  1094  	New:
  1095  		id++
  1096  		lastPos = j
  1097  		lastLen = n
  1098  	Same:
  1099  		sa[j/2] = int64(id)
  1100  	}
  1101  	return id
  1102  }
  1103  
  1104  func assignID_32(text []int32, sa []int32, numLMS int) int {
  1105  	id := 0
  1106  	lastLen := int32(-1) // impossible
  1107  	lastPos := int32(0)
  1108  	for _, j := range sa[len(sa)-numLMS:] {
  1109  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1110  		n := sa[j/2]
  1111  		if n != lastLen {
  1112  			goto New
  1113  		}
  1114  		if uint32(n) >= uint32(len(text)) {
  1115  			// “Length” is really encoded full text, and they match.
  1116  			goto Same
  1117  		}
  1118  		{
  1119  			// Compare actual texts.
  1120  			n := int(n)
  1121  			this := text[j:][:n]
  1122  			last := text[lastPos:][:n]
  1123  			for i := 0; i < n; i++ {
  1124  				if this[i] != last[i] {
  1125  					goto New
  1126  				}
  1127  			}
  1128  			goto Same
  1129  		}
  1130  	New:
  1131  		id++
  1132  		lastPos = j
  1133  		lastLen = n
  1134  	Same:
  1135  		sa[j/2] = int32(id)
  1136  	}
  1137  	return id
  1138  }
  1139  
  1140  func assignID_64(text []int64, sa []int64, numLMS int) int {
  1141  	id := 0
  1142  	lastLen := int64(-1) // impossible
  1143  	lastPos := int64(0)
  1144  	for _, j := range sa[len(sa)-numLMS:] {
  1145  		// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1146  		n := sa[j/2]
  1147  		if n != lastLen {
  1148  			goto New
  1149  		}
  1150  		if uint64(n) >= uint64(len(text)) {
  1151  			// “Length” is really encoded full text, and they match.
  1152  			goto Same
  1153  		}
  1154  		{
  1155  			// Compare actual texts.
  1156  			n := int(n)
  1157  			this := text[j:][:n]
  1158  			last := text[lastPos:][:n]
  1159  			for i := 0; i < n; i++ {
  1160  				if this[i] != last[i] {
  1161  					goto New
  1162  				}
  1163  			}
  1164  			goto Same
  1165  		}
  1166  	New:
  1167  		id++
  1168  		lastPos = j
  1169  		lastLen = n
  1170  	Same:
  1171  		sa[j/2] = int64(id)
  1172  	}
  1173  	return id
  1174  }
  1175  
  1176  func map_64(sa []int64, numLMS int) {
  1177  	w := len(sa)
  1178  	for i := len(sa) / 2; i >= 0; i-- {
  1179  		j := sa[i]
  1180  		if j > 0 {
  1181  			w--
  1182  			sa[w] = j - 1
  1183  		}
  1184  	}
  1185  }
  1186  
  1187  func recurse_64(sa, oldTmp []int64, numLMS, maxID int) {
  1188  	dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
  1189  
  1190  	// Set up temporary space for recursive call.
  1191  	// We must pass sais_64 a tmp buffer with at least maxID entries.
  1192  	//
  1193  	// The subproblem is guaranteed to have length at most len(sa)/2,
  1194  	// so that sa can hold both the subproblem and its suffix array.
  1195  	// Nearly all the time, however, the subproblem has length < len(sa)/3,
  1196  	// in which case there is a subproblem-sized middle of sa that
  1197  	// we can reuse for temporary space (saTmp).
  1198  	// When recurse_64 is called from sais_8_64, oldTmp is length 512
  1199  	// (from text_64), and saTmp will typically be much larger, so we'll use saTmp.
  1200  	// When deeper recursions come back to recurse_64, now oldTmp is
  1201  	// the saTmp from the top-most recursion, it is typically larger than
  1202  	// the current saTmp (because the current sa gets smaller and smaller
  1203  	// as the recursion gets deeper), and we keep reusing that top-most
  1204  	// large saTmp instead of the offered smaller ones.
  1205  	//
  1206  	// Why is the subproblem length so often just under len(sa)/3?
  1207  	// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
  1208  	// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
  1209  	// in the input, perfect alternation of larger and smaller input bytes.
  1210  	// Real text doesn't do that. If each L-type index is randomly followed
  1211  	// by either an L-type or S-type index, then half the substrings will
  1212  	// be of the form SLS, but the other half will be longer. Of that half,
  1213  	// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
  1214  	// Not counting the final S in each (which overlaps the first S in the next),
  1215  	// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
  1216  	// The space we need is further reduced by the fact that many of the
  1217  	// short patterns like SLS will often be the same character sequences
  1218  	// repeated throughout the text, reducing maxID relative to numLMS.
  1219  	//
  1220  	// For short inputs, the averages may not run in our favor, but then we
  1221  	// can often fall back to using the length-512 tmp available in the
  1222  	// top-most call. (Also a short allocation would not be a big deal.)
  1223  	//
  1224  	// For pathological inputs, we fall back to allocating a new tmp of length
  1225  	// max(maxID, numLMS/2). This level of the recursion needs maxID,
  1226  	// and all deeper levels of the recursion will need no more than numLMS/2,
  1227  	// so this one allocation is guaranteed to suffice for the entire stack
  1228  	// of recursive calls.
  1229  	tmp := oldTmp
  1230  	if len(tmp) < len(saTmp) {
  1231  		tmp = saTmp
  1232  	}
  1233  	if len(tmp) < numLMS {
  1234  		// TestSAIS/forcealloc reaches this code.
  1235  		n := maxID
  1236  		if n < numLMS/2 {
  1237  			n = numLMS / 2
  1238  		}
  1239  		tmp = make([]int64, n)
  1240  	}
  1241  
  1242  	// sais_64 requires that the caller arrange to clear dst,
  1243  	// because in general the caller may know dst is
  1244  	// freshly-allocated and already cleared. But this one is not.
  1245  	clear(dst)
  1246  	sais_64(text, maxID, dst, tmp)
  1247  }
  1248  
  1249  func unmap_8_64(text []byte, sa []int64, numLMS int) {
  1250  	unmap := sa[len(sa)-numLMS:]
  1251  	j := len(unmap)
  1252  
  1253  	// "LMS-substring iterator" (see placeLMS_8_64 above).
  1254  	c0, c1, isTypeS := byte(0), byte(0), false
  1255  	for i := len(text) - 1; i >= 0; i-- {
  1256  		c0, c1 = text[i], c0
  1257  		if c0 < c1 {
  1258  			isTypeS = true
  1259  		} else if c0 > c1 && isTypeS {
  1260  			isTypeS = false
  1261  
  1262  			// Populate inverse map.
  1263  			j--
  1264  			unmap[j] = int64(i + 1)
  1265  		}
  1266  	}
  1267  
  1268  	// Apply inverse map to subproblem suffix array.
  1269  	sa = sa[:numLMS]
  1270  	for i := 0; i < len(sa); i++ {
  1271  		sa[i] = unmap[sa[i]]
  1272  	}
  1273  }
  1274  
  1275  func unmap_32(text []int32, sa []int32, numLMS int) {
  1276  	unmap := sa[len(sa)-numLMS:]
  1277  	j := len(unmap)
  1278  
  1279  	// "LMS-substring iterator" (see placeLMS_32 above).
  1280  	c0, c1, isTypeS := int32(0), int32(0), false
  1281  	for i := len(text) - 1; i >= 0; i-- {
  1282  		c0, c1 = text[i], c0
  1283  		if c0 < c1 {
  1284  			isTypeS = true
  1285  		} else if c0 > c1 && isTypeS {
  1286  			isTypeS = false
  1287  
  1288  			// Populate inverse map.
  1289  			j--
  1290  			unmap[j] = int32(i + 1)
  1291  		}
  1292  	}
  1293  
  1294  	// Apply inverse map to subproblem suffix array.
  1295  	sa = sa[:numLMS]
  1296  	for i := 0; i < len(sa); i++ {
  1297  		sa[i] = unmap[sa[i]]
  1298  	}
  1299  }
  1300  
  1301  func unmap_64(text []int64, sa []int64, numLMS int) {
  1302  	unmap := sa[len(sa)-numLMS:]
  1303  	j := len(unmap)
  1304  
  1305  	// "LMS-substring iterator" (see placeLMS_64 above).
  1306  	c0, c1, isTypeS := int64(0), int64(0), false
  1307  	for i := len(text) - 1; i >= 0; i-- {
  1308  		c0, c1 = text[i], c0
  1309  		if c0 < c1 {
  1310  			isTypeS = true
  1311  		} else if c0 > c1 && isTypeS {
  1312  			isTypeS = false
  1313  
  1314  			// Populate inverse map.
  1315  			j--
  1316  			unmap[j] = int64(i + 1)
  1317  		}
  1318  	}
  1319  
  1320  	// Apply inverse map to subproblem suffix array.
  1321  	sa = sa[:numLMS]
  1322  	for i := 0; i < len(sa); i++ {
  1323  		sa[i] = unmap[sa[i]]
  1324  	}
  1325  }
  1326  
  1327  func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) {
  1328  	bucketMax_8_64(text, freq, bucket)
  1329  	bucket = bucket[:256] // eliminate bound check for bucket[c] below
  1330  
  1331  	// Loop backward through sa, always tracking
  1332  	// the next index to populate from sa[:numLMS].
  1333  	// When we get to one, populate it.
  1334  	// Zero the rest of the slots; they have dead values in them.
  1335  	x := numLMS - 1
  1336  	saX := sa[x]
  1337  	c := text[saX]
  1338  	b := bucket[c] - 1
  1339  	bucket[c] = b
  1340  
  1341  	for i := len(sa) - 1; i >= 0; i-- {
  1342  		if i != int(b) {
  1343  			sa[i] = 0
  1344  			continue
  1345  		}
  1346  		sa[i] = saX
  1347  
  1348  		// Load next entry to put down (if any).
  1349  		if x > 0 {
  1350  			x--
  1351  			saX = sa[x] // TODO bounds check
  1352  			c = text[saX]
  1353  			b = bucket[c] - 1
  1354  			bucket[c] = b
  1355  		}
  1356  	}
  1357  }
  1358  
  1359  func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) {
  1360  	bucketMax_32(text, freq, bucket)
  1361  
  1362  	// Loop backward through sa, always tracking
  1363  	// the next index to populate from sa[:numLMS].
  1364  	// When we get to one, populate it.
  1365  	// Zero the rest of the slots; they have dead values in them.
  1366  	x := numLMS - 1
  1367  	saX := sa[x]
  1368  	c := text[saX]
  1369  	b := bucket[c] - 1
  1370  	bucket[c] = b
  1371  
  1372  	for i := len(sa) - 1; i >= 0; i-- {
  1373  		if i != int(b) {
  1374  			sa[i] = 0
  1375  			continue
  1376  		}
  1377  		sa[i] = saX
  1378  
  1379  		// Load next entry to put down (if any).
  1380  		if x > 0 {
  1381  			x--
  1382  			saX = sa[x] // TODO bounds check
  1383  			c = text[saX]
  1384  			b = bucket[c] - 1
  1385  			bucket[c] = b
  1386  		}
  1387  	}
  1388  }
  1389  
  1390  func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) {
  1391  	bucketMax_64(text, freq, bucket)
  1392  
  1393  	// Loop backward through sa, always tracking
  1394  	// the next index to populate from sa[:numLMS].
  1395  	// When we get to one, populate it.
  1396  	// Zero the rest of the slots; they have dead values in them.
  1397  	x := numLMS - 1
  1398  	saX := sa[x]
  1399  	c := text[saX]
  1400  	b := bucket[c] - 1
  1401  	bucket[c] = b
  1402  
  1403  	for i := len(sa) - 1; i >= 0; i-- {
  1404  		if i != int(b) {
  1405  			sa[i] = 0
  1406  			continue
  1407  		}
  1408  		sa[i] = saX
  1409  
  1410  		// Load next entry to put down (if any).
  1411  		if x > 0 {
  1412  			x--
  1413  			saX = sa[x] // TODO bounds check
  1414  			c = text[saX]
  1415  			b = bucket[c] - 1
  1416  			bucket[c] = b
  1417  		}
  1418  	}
  1419  }
  1420  
  1421  func induceL_8_64(text []byte, sa, freq, bucket []int64) {
  1422  	// Initialize positions for left side of character buckets.
  1423  	bucketMin_8_64(text, freq, bucket)
  1424  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
  1425  
  1426  	// This scan is similar to the one in induceSubL_8_64 above.
  1427  	// That one arranges to clear all but the leftmost L-type indexes.
  1428  	// This scan leaves all the L-type indexes and the original S-type
  1429  	// indexes, but it negates the positive leftmost L-type indexes
  1430  	// (the ones that induceS_8_64 needs to process).
  1431  
  1432  	// expand_8_64 left out the implicit entry sa[-1] == len(text),
  1433  	// corresponding to the identified type-L index len(text)-1.
  1434  	// Process it before the left-to-right scan of sa proper.
  1435  	// See body in loop for commentary.
  1436  	k := len(text) - 1
  1437  	c0, c1 := text[k-1], text[k]
  1438  	if c0 < c1 {
  1439  		k = -k
  1440  	}
  1441  
  1442  	// Cache recently used bucket index.
  1443  	cB := c1
  1444  	b := bucket[cB]
  1445  	sa[b] = int64(k)
  1446  	b++
  1447  
  1448  	for i := 0; i < len(sa); i++ {
  1449  		j := int(sa[i])
  1450  		if j <= 0 {
  1451  			// Skip empty or negated entry (including negated zero).
  1452  			continue
  1453  		}
  1454  
  1455  		// Index j was on work queue, meaning k := j-1 is L-type,
  1456  		// so we can now place k correctly into sa.
  1457  		// If k-1 is L-type, queue k for processing later in this loop.
  1458  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1459  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1460  		// for the caller. The caller can't tell the difference between
  1461  		// an empty slot and a non-empty zero, but there's no need
  1462  		// to distinguish them anyway: the final suffix array will end up
  1463  		// with one zero somewhere, and that will be a real zero.
  1464  		k := j - 1
  1465  		c1 := text[k]
  1466  		if k > 0 {
  1467  			if c0 := text[k-1]; c0 < c1 {
  1468  				k = -k
  1469  			}
  1470  		}
  1471  
  1472  		if cB != c1 {
  1473  			bucket[cB] = b
  1474  			cB = c1
  1475  			b = bucket[cB]
  1476  		}
  1477  		sa[b] = int64(k)
  1478  		b++
  1479  	}
  1480  }
  1481  
  1482  func induceL_32(text []int32, sa, freq, bucket []int32) {
  1483  	// Initialize positions for left side of character buckets.
  1484  	bucketMin_32(text, freq, bucket)
  1485  
  1486  	// This scan is similar to the one in induceSubL_32 above.
  1487  	// That one arranges to clear all but the leftmost L-type indexes.
  1488  	// This scan leaves all the L-type indexes and the original S-type
  1489  	// indexes, but it negates the positive leftmost L-type indexes
  1490  	// (the ones that induceS_32 needs to process).
  1491  
  1492  	// expand_32 left out the implicit entry sa[-1] == len(text),
  1493  	// corresponding to the identified type-L index len(text)-1.
  1494  	// Process it before the left-to-right scan of sa proper.
  1495  	// See body in loop for commentary.
  1496  	k := len(text) - 1
  1497  	c0, c1 := text[k-1], text[k]
  1498  	if c0 < c1 {
  1499  		k = -k
  1500  	}
  1501  
  1502  	// Cache recently used bucket index.
  1503  	cB := c1
  1504  	b := bucket[cB]
  1505  	sa[b] = int32(k)
  1506  	b++
  1507  
  1508  	for i := 0; i < len(sa); i++ {
  1509  		j := int(sa[i])
  1510  		if j <= 0 {
  1511  			// Skip empty or negated entry (including negated zero).
  1512  			continue
  1513  		}
  1514  
  1515  		// Index j was on work queue, meaning k := j-1 is L-type,
  1516  		// so we can now place k correctly into sa.
  1517  		// If k-1 is L-type, queue k for processing later in this loop.
  1518  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1519  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1520  		// for the caller. The caller can't tell the difference between
  1521  		// an empty slot and a non-empty zero, but there's no need
  1522  		// to distinguish them anyway: the final suffix array will end up
  1523  		// with one zero somewhere, and that will be a real zero.
  1524  		k := j - 1
  1525  		c1 := text[k]
  1526  		if k > 0 {
  1527  			if c0 := text[k-1]; c0 < c1 {
  1528  				k = -k
  1529  			}
  1530  		}
  1531  
  1532  		if cB != c1 {
  1533  			bucket[cB] = b
  1534  			cB = c1
  1535  			b = bucket[cB]
  1536  		}
  1537  		sa[b] = int32(k)
  1538  		b++
  1539  	}
  1540  }
  1541  
  1542  func induceL_64(text []int64, sa, freq, bucket []int64) {
  1543  	// Initialize positions for left side of character buckets.
  1544  	bucketMin_64(text, freq, bucket)
  1545  
  1546  	// This scan is similar to the one in induceSubL_64 above.
  1547  	// That one arranges to clear all but the leftmost L-type indexes.
  1548  	// This scan leaves all the L-type indexes and the original S-type
  1549  	// indexes, but it negates the positive leftmost L-type indexes
  1550  	// (the ones that induceS_64 needs to process).
  1551  
  1552  	// expand_64 left out the implicit entry sa[-1] == len(text),
  1553  	// corresponding to the identified type-L index len(text)-1.
  1554  	// Process it before the left-to-right scan of sa proper.
  1555  	// See body in loop for commentary.
  1556  	k := len(text) - 1
  1557  	c0, c1 := text[k-1], text[k]
  1558  	if c0 < c1 {
  1559  		k = -k
  1560  	}
  1561  
  1562  	// Cache recently used bucket index.
  1563  	cB := c1
  1564  	b := bucket[cB]
  1565  	sa[b] = int64(k)
  1566  	b++
  1567  
  1568  	for i := 0; i < len(sa); i++ {
  1569  		j := int(sa[i])
  1570  		if j <= 0 {
  1571  			// Skip empty or negated entry (including negated zero).
  1572  			continue
  1573  		}
  1574  
  1575  		// Index j was on work queue, meaning k := j-1 is L-type,
  1576  		// so we can now place k correctly into sa.
  1577  		// If k-1 is L-type, queue k for processing later in this loop.
  1578  		// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1579  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1580  		// for the caller. The caller can't tell the difference between
  1581  		// an empty slot and a non-empty zero, but there's no need
  1582  		// to distinguish them anyway: the final suffix array will end up
  1583  		// with one zero somewhere, and that will be a real zero.
  1584  		k := j - 1
  1585  		c1 := text[k]
  1586  		if k > 0 {
  1587  			if c0 := text[k-1]; c0 < c1 {
  1588  				k = -k
  1589  			}
  1590  		}
  1591  
  1592  		if cB != c1 {
  1593  			bucket[cB] = b
  1594  			cB = c1
  1595  			b = bucket[cB]
  1596  		}
  1597  		sa[b] = int64(k)
  1598  		b++
  1599  	}
  1600  }
  1601  
  1602  func induceS_8_64(text []byte, sa, freq, bucket []int64) {
  1603  	// Initialize positions for right side of character buckets.
  1604  	bucketMax_8_64(text, freq, bucket)
  1605  	bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
  1606  
  1607  	cB := byte(0)
  1608  	b := bucket[cB]
  1609  
  1610  	for i := len(sa) - 1; i >= 0; i-- {
  1611  		j := int(sa[i])
  1612  		if j >= 0 {
  1613  			// Skip non-flagged entry.
  1614  			// (This loop can't see an empty entry; 0 means the real zero index.)
  1615  			continue
  1616  		}
  1617  
  1618  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1619  		j = -j
  1620  		sa[i] = int64(j)
  1621  
  1622  		// Index j was on work queue (encoded as -j but now decoded),
  1623  		// meaning k := j-1 is L-type,
  1624  		// so we can now place k correctly into sa.
  1625  		// If k-1 is S-type, queue -k for processing later in this loop.
  1626  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1627  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1628  		// for the caller.
  1629  		k := j - 1
  1630  		c1 := text[k]
  1631  		if k > 0 {
  1632  			if c0 := text[k-1]; c0 <= c1 {
  1633  				k = -k
  1634  			}
  1635  		}
  1636  
  1637  		if cB != c1 {
  1638  			bucket[cB] = b
  1639  			cB = c1
  1640  			b = bucket[cB]
  1641  		}
  1642  		b--
  1643  		sa[b] = int64(k)
  1644  	}
  1645  }
  1646  
  1647  func induceS_32(text []int32, sa, freq, bucket []int32) {
  1648  	// Initialize positions for right side of character buckets.
  1649  	bucketMax_32(text, freq, bucket)
  1650  
  1651  	cB := int32(0)
  1652  	b := bucket[cB]
  1653  
  1654  	for i := len(sa) - 1; i >= 0; i-- {
  1655  		j := int(sa[i])
  1656  		if j >= 0 {
  1657  			// Skip non-flagged entry.
  1658  			// (This loop can't see an empty entry; 0 means the real zero index.)
  1659  			continue
  1660  		}
  1661  
  1662  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1663  		j = -j
  1664  		sa[i] = int32(j)
  1665  
  1666  		// Index j was on work queue (encoded as -j but now decoded),
  1667  		// meaning k := j-1 is L-type,
  1668  		// so we can now place k correctly into sa.
  1669  		// If k-1 is S-type, queue -k for processing later in this loop.
  1670  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1671  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1672  		// for the caller.
  1673  		k := j - 1
  1674  		c1 := text[k]
  1675  		if k > 0 {
  1676  			if c0 := text[k-1]; c0 <= c1 {
  1677  				k = -k
  1678  			}
  1679  		}
  1680  
  1681  		if cB != c1 {
  1682  			bucket[cB] = b
  1683  			cB = c1
  1684  			b = bucket[cB]
  1685  		}
  1686  		b--
  1687  		sa[b] = int32(k)
  1688  	}
  1689  }
  1690  
  1691  func induceS_64(text []int64, sa, freq, bucket []int64) {
  1692  	// Initialize positions for right side of character buckets.
  1693  	bucketMax_64(text, freq, bucket)
  1694  
  1695  	cB := int64(0)
  1696  	b := bucket[cB]
  1697  
  1698  	for i := len(sa) - 1; i >= 0; i-- {
  1699  		j := int(sa[i])
  1700  		if j >= 0 {
  1701  			// Skip non-flagged entry.
  1702  			// (This loop can't see an empty entry; 0 means the real zero index.)
  1703  			continue
  1704  		}
  1705  
  1706  		// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1707  		j = -j
  1708  		sa[i] = int64(j)
  1709  
  1710  		// Index j was on work queue (encoded as -j but now decoded),
  1711  		// meaning k := j-1 is L-type,
  1712  		// so we can now place k correctly into sa.
  1713  		// If k-1 is S-type, queue -k for processing later in this loop.
  1714  		// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1715  		// If k is zero, k-1 doesn't exist, so we only need to leave it
  1716  		// for the caller.
  1717  		k := j - 1
  1718  		c1 := text[k]
  1719  		if k > 0 {
  1720  			if c0 := text[k-1]; c0 <= c1 {
  1721  				k = -k
  1722  			}
  1723  		}
  1724  
  1725  		if cB != c1 {
  1726  			bucket[cB] = b
  1727  			cB = c1
  1728  			b = bucket[cB]
  1729  		}
  1730  		b--
  1731  		sa[b] = int64(k)
  1732  	}
  1733  }
  1734
View as plain text