1 // Copyright 2021 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This is an implementation based on the s390x
6 // implementation.
7
8 // Find a separator with 2 <= len <= 32 within a string.
9 // Separators with lengths of 2, 3 or 4 are handled
10 // specially.
11
12 // This works on power8 and above. The loads and
13 // compares are done in big endian order
14 // since that allows the use of VCLZD, and allows
15 // the same implementation to work on big and little
16 // endian platforms with minimal conditional changes.
17
18 // NOTE: There is a power9 implementation that
19 // improves performance by 10-15% on little
20 // endian for some of the benchmarks.
21 // The index2to16 loop is unrolled by 4 on ppc64le/power9.
22 // Work is still needed for a big endian
23 // implementation on power9.
24
25 //go:build ppc64 || ppc64le
26
27 #include "go_asm.h"
28 #include "textflag.h"
29
30 // byteswap<> is the 16-byte VPERM control vector needed to swap
31 // LXVD2X loads to the correct byte order to work on POWER8.
// Big endian builds (GOARCH_ppc64) and all other builds get different
// constants. indexbody loads this vector into SWAP before any
// VLOADSWAP is executed.
32
33 #ifdef GOARCH_ppc64
34 DATA byteswap<>+0(SB)/8, $0x0001020304050607
35 DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
36 #else
37 DATA byteswap<>+0(SB)/8, $0x0706050403020100
38 DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
39 #endif
40
41 // Load bytes in big endian order. Address
42 // alignment does not need checking.
// VLOADSWAP(base, index, vreg, vsreg) loads 16 bytes from base+index
// into vsreg with LXVD2X and then permutes vreg with itself through
// SWAP. Every use in this file passes the same V register for vreg and
// vsreg, so the permute is applied to the bytes that were just loaded.
// SWAP must already hold byteswap<>.
43 #define VLOADSWAP(base, index, vreg, vsreg) \
44 LXVD2X (base)(index), vsreg; \
45 VPERM vreg, vreg, SWAP, vreg
46
47 GLOBL byteswap<>+0(SB), RODATA, $16
48
// Index is the entry point used when the searched data is a byte array.
// The separator pointer and length arrive in R6 and R7 and are moved to
// R5 and R6, the layout documented for IndexString and expected by the
// search bodies. R6 must be copied to R5 before it is overwritten.
// On ppc64le the PPC64HasPOWER9 flag byte selects indexbodyp9 when it
// equals 1; in every other case control goes to indexbody.
// Both branches are tail jumps: the selected body returns the result in R3.
49 TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
50 // R3 = byte array pointer
51 // R4 = length
52 MOVD R6, R5 // R5 = separator pointer
53 MOVD R7, R6 // R6 = separator length
54
55 #ifdef GOARCH_ppc64le
56 MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag byte
57 CMP R7, $1
58 BNE power8 // Flag not set: use the POWER8 body
59 BR indexbodyp9<>(SB)
60 #endif
61 power8:
62 BR indexbody<>(SB)
63
// IndexString is the entry point used when the searched data is a string.
// The arguments are already in the registers the search bodies expect,
// so no register shuffling is needed before dispatching.
// On ppc64le the PPC64HasPOWER9 flag byte selects indexbodyp9 when it
// equals 1; in every other case control goes to indexbody.
// Both branches are tail jumps: the selected body returns the result in R3.
64 TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
65 // R3 = string
66 // R4 = length
67 // R5 = separator pointer
68 // R6 = separator length
69
70 #ifdef GOARCH_ppc64le
71 MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 // R7 = HasPOWER9 flag byte
72 CMP R7, $1
73 BNE power8 // Flag not set: use the POWER8 body
74 BR indexbodyp9<>(SB)
75
76 #endif
77 power8:
78 BR indexbody<>(SB)
79
// indexbody is the POWER8 search body reached from Index and IndexString.
// It returns in R3 the byte index of the first occurrence of sep in s,
// or -1 when there is none (including len(sep) == 0 and len(sep) > len(s)).
// Separators longer than 32 bytes are not implemented and crash on purpose.
// Register roles used by the search bodies:
80 // s: string we are searching
81 // sep: string to search for
82 // R3=&s[0], R4=len(s)
83 // R5=&sep[0], R6=len(sep)
84 // R14=len(sep)<<56 for LXVLL (indexbodyp9 built with GOPPC64_power10 only)
// R3=result on return (index where sep found, or -1)
85 // R7=working addr of string
86 // R16=index value 16
87 // R17=index value 17
88 // R18=index value 18
89 // R19=index value 1
90 // R26=LASTBYTE of string
91 // R27=LASTSTR last start byte to compare with sep
92 // R8, R9 scratch
93 // V0=sep left justified zero fill
94 // CR4=sep length >= 16
95
96 #define SEPMASK V17
97 #define LASTBYTE R26
98 #define LASTSTR R27
99 #define ONES V20
100 #define SWAP V21
101 #define SWAP_ VS53
102 TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
103 CMP R6, R4 // Compare lengths
104 BGT notfound // If sep len is > string, notfound
105 ADD R4, R3, LASTBYTE // find last byte addr
106 SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
107 CMP R6, $0 // Check sep len
108 BEQ notfound // sep len 0 -- not found
109 MOVD R3, R7 // Copy of string addr
110 MOVD $16, R16 // Index value 16
111 MOVD $17, R17 // Index value 17
112 MOVD $18, R18 // Index value 18
113 MOVD $1, R19 // Index value 1
114 MOVD $byteswap<>+00(SB), R8
115 VSPLTISB $0xFF, ONES // splat all 1s
116 LXVD2X (R8)(R0), SWAP_ // Set up swap string (loaded via SWAP_, used as SWAP by VLOADSWAP)
117
118 CMP R6, $16, CR4 // CR4 for len(sep) >= 16
119 VOR ONES, ONES, SEPMASK // Set up full SEPMASK
120 BGE CR4, loadge16 // Load for len(sep) >= 16
121 SUB R6, R16, R9 // 16-len of sep
122 SLD $3, R9 // Set up for VSLO
123 MTVSRD R9, V9 // Set up for VSLO
124 VSLDOI $8, V9, V9, V9 // Set up for VSLO
125 VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
126
127 loadge16:
128 ANDCC $15, R5, R9 // Find byte offset of sep
129 ADD R9, R6, R10 // Add sep len
130 CMP R10, $16 // Check if sep len+offset > 16
131 BGT sepcross16 // Sep crosses 16 byte boundary
132
// sep fits inside one aligned 16 byte container: load the container
// and shift sep to the left edge of V0.
133 RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
134 VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
135 SLD $3, R9 // Set up shift count for VSLO
136 MTVSRD R9, V8 // Set up shift count for VSLO
137 VSLDOI $8, V8, V8, V8
138 VSLO V0, V8, V0 // Shift by start byte
139
140 VAND V0, SEPMASK, V0 // Mask separator (< 16)
141 BR index2plus
142
143 sepcross16:
144 VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0
145
146 VAND V0, SEPMASK, V0 // mask out separator
147 BLE CR4, index2to16 // len(sep) <= 16 (CR4 set by the compare against 16 above)
148 BR index17plus // Handle sep > 16
149
150 index2plus:
151 CMP R6, $2 // Check length of sep
152 BNE index3plus // If not 2, check for 3
153 ADD $16, R7, R9 // Check if next 16 bytes past last
154 CMP R9, LASTBYTE // compare with last
155 BGE index2to16 // 2 <= len(string) <= 16
156 MOVD $0xff00, R21 // Mask for later
157 MTVSRD R21, V25 // Move to Vreg
158 VSPLTH $3, V25, V31 // Splat mask
159 VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
160 VSPLTISB $0, V10 // Clear V10
161
162 // First case: 2 byte separator
163 // V1: 2 byte separator splatted
164 // V2: 16 bytes at addr
165 // V3: 16 bytes at addr+1
166 // Compare 2 byte separator at start
167 // and at start+1. Use VSEL to combine
168 // those results to find the first
169 // matching start byte, returning
170 // that value when found. Loop as
171 // long as len(string) > 16
172 index2loop2:
173 VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
174
175 index2loop:
176 VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
177 VCMPEQUH V1, V2, V5 // Search for sep
178 VCMPEQUH V1, V3, V6 // Search for sep offset by 1
179 VSEL V6, V5, V31, V7 // merge even and odd indices
180 VCLZD V7, V18 // find index of first match
181 MFVSRD V18, R25 // get first value
182 CMP R25, $64 // Found if < 64
183 BLT foundR25 // Return byte index where found
184 VSLDOI $8, V18, V18, V18 // Adjust 2nd value
185 MFVSRD V18, R25 // get second value
186 CMP R25, $64 // Found if < 64
187 ADD $64, R25 // Update byte offset
188 BLT foundR25 // Return value
189 ADD $16, R7 // R7+=16 Update string pointer
190 ADD $17, R7, R9 // R9=R7+17 since loop unrolled
191 CMP R9, LASTBYTE // Compare addr+17 against last byte
192 BLT index2loop2 // If < last, continue loop
193 CMP R7, LASTBYTE // Compare addr+16 against last byte
194 BLT index2to16 // If < 16 handle specially
195 VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
196 VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
197 BR index2loop
198
199 index3plus:
200 CMP R6, $3 // Check if sep == 3
201 BNE index4plus // If not check larger
202 ADD $19, R7, R9 // Find bytes for use in this loop
203 CMP R9, LASTBYTE // Compare against last byte
204 BGE index2to16 // Remaining string 2<=len<=16
205 MOVD $0xff00, R21 // Set up mask for upcoming loop
206 MTVSRD R21, V25 // Move mask to Vreg
207 VSPLTH $3, V25, V31 // Splat mask
208 VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
209 VSPLTB $2, V0, V8 // Splat 3rd byte of sep
210
211 // Loop to process 3 byte separator.
212 // string[0:16] is in V2
213 // string[2:18] is loaded into V3, then reduced to string[16:18] left justified
214 // sep[0:2] splatted in V1
215 // sep[2] splatted in V8
216 // Load vectors at string, string+1
217 // and string+2. Compare string, string+1
218 // against first 2 bytes of separator
219 // splatted, and string+2 against 3rd
220 // byte splatted. Merge the results with
221 // VSEL to find the first byte of a match.
222
223 // Special handling for last 16 bytes if the
224 // string fits in 16 byte multiple.
225 index3loop2:
226 MOVD $2, R21 // Set up index for 2
227 VSPLTISB $0, V10 // Clear V10
228 VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
229 VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
230
231 index3loop:
232 VLOADSWAP(R7, R0, V2, V2) // Load with correct order
233 VSLDOI $1, V2, V3, V4 // string[1:17]
234 VSLDOI $2, V2, V3, V9 // string[2:18]
235 VCMPEQUH V1, V2, V5 // compare hw even indices
236 VCMPEQUH V1, V4, V6 // compare hw odd indices
237 VCMPEQUB V8, V9, V10 // compare 3rd to last byte
238 VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
239 VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
240 VCLZD V7, V18 // Find first nonzero indexes
241 MFVSRD V18, R25 // Move 1st doubleword
242 CMP R25, $64 // If < 64 found
243 BLT foundR25 // Return matching index
244 VSLDOI $8, V18, V18, V18 // Move value
245 MFVSRD V18, R25 // Move 2nd doubleword
246 CMP R25, $64 // If < 64 found
247 ADD $64, R25 // Update byte index
248 BLT foundR25 // Return matching index
249 ADD $16, R7 // R7+=16 string ptr
250 ADD $19, R7, R9 // Number of string bytes for loop
251 CMP R9, LASTBYTE // Compare against last byte of string
252 BLT index3loop2 // If within, continue this loop
253 CMP R7, LASTSTR // Compare against last start byte
254 BLT index2to16 // Process remainder
255 VSPLTISB $0, V3 // Special case for last 16 bytes
256 BR index3loop // Continue this loop
257
258 // Loop to process 4 byte separator
259 // string[0:16] in V2
260 // string[3:19] in V3
261 // sep[0:4] splatted in V1
262 // Set up vectors with strings at offsets
263 // 0, 1, 2, 3 and compare against the 4 byte
264 // separator also splatted. Use VSEL with the
265 // compare results to find the first byte where
266 // a separator match is found.
267 index4plus:
268 CMP R6, $4 // Check if 4 byte separator
269 BNE index5plus // If not next higher
270 ADD $20, R7, R9 // Check string size to load
271 CMP R9, LASTBYTE // Verify string length
272 BGE index2to16 // If not large enough, process remaining
273 MOVD $2, R15 // Set up index
274
275 // Set up masks for use with VSEL
276 MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
277 SLD $24, R21
278 MTVSRD R21, V10
279 VSPLTW $1, V10, V29
280 VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
281 MOVD $0xffff, R21
282 SLD $16, R21
283 MTVSRD R21, V10
284 VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000...
285 VSPLTW $0, V0, V1 // Splat 1st word of separator
286
287 index4loop:
288 VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
289
290 next4:
291 VSPLTISB $0, V10 // Clear
292 MOVD $3, R9 // Number of bytes beyond 16
293 VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3
294 VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
295 VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
296 VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
297 VSLDOI $3, V2, V3, V10 // V10=(V2:V3)<<3
298 VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
299 VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
300 VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
301 VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
302 VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
303 VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
304 VSEL V14, V13, V31, V7 // final merge
305 VCLZD V7, V18 // Find first index for each half
306 MFVSRD V18, R25 // Isolate value
307 CMP R25, $64 // If < 64, found
308 BLT foundR25 // Return found index
309 VSLDOI $8, V18, V18, V18 // Move for MFVSRD
310 MFVSRD V18, R25 // Isolate other value
311 CMP R25, $64 // If < 64, found
312 ADD $64, R25 // Update index for high doubleword
313 BLT foundR25 // Return found index
314 ADD $16, R7 // R7+=16 for next string
315 ADD $20, R7, R9 // R7+20 for all bytes to load
316 CMP R9, LASTBYTE // Past end? Maybe check for extra?
317 BLT index4loop // If not, continue loop
318 CMP R7, LASTSTR // Check remainder
319 BLE index2to16 // Process remainder
320 BR notfound // Not found
321
322 index5plus:
323 CMP R6, $16 // Check for sep > 16
324 BGT index17plus // Handle large sep
325
326 // Assumption is that the separator is smaller than the string at this point
327 index2to16:
328 CMP R7, LASTSTR // Compare last start byte
329 BGT notfound // last takes len(sep) into account
330
331 ADD $16, R7, R9 // Check for last byte of string
332 CMP R9, LASTBYTE
333 BGT index2to16tail
334
335 // At least 16 bytes of string left
336 // Mask the number of bytes in sep
337 index2to16loop:
338 VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
339
340 compare:
341 VAND V1, SEPMASK, V2 // Mask out sep size
342 VCMPEQUBCC V0, V2, V3 // Compare masked string
343 BLT CR6, found // All equal
344 ADD $1, R7 // Update ptr to next byte
345 CMP R7, LASTSTR // Still less than last start byte
346 BGT notfound // Not found
347 ADD $16, R7, R9 // Verify remaining bytes
348 CMP R9, LASTBYTE // At least 16
349 BLT index2to16loop // Try again
350
351 // Less than 16 bytes remaining in string
352 // Separator >= 2
353 index2to16tail:
354 ADD R3, R4, R9 // End of string
355 SUB R7, R9, R9 // Number of bytes left
356 ANDCC $15, R7, R10 // 16 byte offset
357 ADD R10, R9, R11 // offset + len
358 CMP R11, $16 // >= 16?
359 BLE short // Does not cross 16 bytes
360 VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
361 BR index2to16next // Continue on
362
// The remaining bytes sit inside one aligned 16 byte container: load
// the container and shift the string bytes to the left edge of V1.
363 short:
364 RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
365 VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
366 SLD $3, R10 // Set up shift
367 MTVSRD R10, V8 // Set up shift
368 VSLDOI $8, V8, V8, V8
369 VSLO V1, V8, V1 // Shift by start byte
370 VSPLTISB $0, V25 // Clear for later use
371
372 index2to16next:
373 VAND V1, SEPMASK, V2 // Just compare size of sep
374 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
375 BLT CR6, found // Found
376 ADD $1, R7 // Not found, try next partial string
377 CMP R7, LASTSTR // Check for end of string
378 BGT notfound // If at end, then not found
379 VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
380 BR index2to16next // Check the next partial string
381
382 index17plus:
383 CMP R6, $32 // Check if 16 < len(sep) <= 32
384 BGT index33plus
385 SUB $16, R6, R9 // Extra > 16
386 SLD $56, R9, R10 // Shift to use in VSLO
387 MTVSRD R10, V9 // Set up for VSLO
388 VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
389 VSLO V1, V9, V1 // Shift left
390 VSPLTISB $0xff, V7 // Splat 1s
391 VSPLTISB $0, V27 // Splat 0
392
393 index17to32loop:
394 VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
395
396 next17:
397 VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3
398 VSLO V3, V9, V3 // Shift left
399 VCMPEQUB V0, V2, V4 // Compare first 16 bytes
400 VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
401 VAND V4, V5, V6 // Check if both equal
402 VCMPEQUBCC V6, V7, V8 // All equal?
403 BLT CR6, found // Yes
404 ADD $1, R7 // On to next byte
405 CMP R7, LASTSTR // Check if last start byte
406 BGT notfound // If too high, not found
407 BR index17to32loop // Continue
408
409 notfound:
410 MOVD $-1, R3 // Return -1 if not found
411 RET
412
413 index33plus:
414 MOVD $0, (R0) // Case not implemented
415 RET // Crash before return
416
// foundR25: R25 holds the bit offset of the match within the 16 bytes
// at R7 (leading-zero count from VCLZD, plus 64 for the second doubleword).
417 foundR25:
418 SRD $3, R25 // Convert from bits to bytes
419 ADD R25, R7 // Add to current string address
420 SUB R3, R7 // Subtract start of string
421 MOVD R7, R3 // Return byte where found
422 RET
423
// found: R7 is the address of the match.
424 found:
425 SUB R3, R7 // Return byte where found
426 MOVD R7, R3
427 RET
428
// indexbodyp9 is the search body selected by Index and IndexString on
// ppc64le when the PPC64HasPOWER9 flag is set. It follows the same
// structure as indexbody with these differences:
//   - strings are loaded with LXVB16X instead of VLOADSWAP;
//   - MFVSRLD and MTVSRWS replace the VSLDOI/MFVSRD and MTVSRD/VSPLTW pairs;
//   - the index2to16 loop tests 4 start positions per iteration;
//   - when built with GOPPC64_power10, LXVLL loads limited to len(sep)
//     bytes (length in R14) are used instead of masking with SEPMASK.
// Register use on entry and the result in R3 are the same as indexbody:
// R3=&s[0], R4=len(s), R5=&sep[0], R6=len(sep); returns index or -1 in R3.
429 TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
430 CMP R6, R4 // Compare lengths
431 BGT notfound // If sep len is > string, notfound
432 ADD R4, R3, LASTBYTE // find last byte addr
433 SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
434 CMP R6, $0 // Check sep len
435 BEQ notfound // sep len 0 -- not found
436 MOVD R3, R7 // Copy of string addr
437 #ifndef GOPPC64_power10
438 MOVD $16, R16 // Index value 16
439 MOVD $17, R17 // Index value 17
440 MOVD $18, R18 // Index value 18
441 VSPLTISB $0xFF, ONES // splat all 1s
442 VOR ONES, ONES, SEPMASK // Set up full SEPMASK
443 #else
444 SLD $56, R6, R14 // Set up separator length for LXVLL
445 #endif
446 MOVD $1, R19 // Index value 1
447 CMP R6, $16, CR4 // CR4 for len(sep) >= 16
448 BGE CR4, loadge16 // Load for len(sep) >= 16
449 #ifndef GOPPC64_power10
450 SUB R6, R16, R9 // 16-len of sep
451 SLD $3, R9 // Set up for VSLO
452 MTVSRD R9, V9 // Set up for VSLO
453 VSLDOI $8, V9, V9, V9 // Set up for VSLO
454 VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
455 #endif
456 loadge16:
457 ANDCC $15, R5, R9 // Find byte offset of sep
458 ADD R9, R6, R10 // Add sep len
459 CMP R10, $16 // Check if sep len+offset > 16
460 BGT sepcross16 // Sep crosses 16 byte boundary
461 #ifdef GOPPC64_power10
462 LXVLL R5, R14, V0 // Load separator
463 #else
464 RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
465 LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
466 SLD $3, R9 // Set up shift count for VSLO
467 MTVSRD R9, V8 // Set up shift count for VSLO
468 VSLDOI $8, V8, V8, V8
469 VSLO V0, V8, V0 // Shift by start byte
470 VAND V0, SEPMASK, V0 // Mask separator (< 16)
471 #endif
472 BR index2plus
473 sepcross16:
474 #ifdef GOPPC64_power10
475 LXVLL R5, R14, V0 // Load separator
476 #else
477 LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
478 VAND V0, SEPMASK, V0 // mask out separator
479 #endif
480 BLE CR4, index2to16 // len(sep) <= 16 (CR4 set by the compare against 16 above)
481 BR index17plus // Handle sep > 16
482
483 index2plus:
484 CMP R6, $2 // Check length of sep
485 BNE index3plus // If not 2, check for 3
486 ADD $16, R7, R9 // Check if next 16 bytes past last
487 CMP R9, LASTBYTE // compare with last
488 BGE index2to16 // 2 <= len(string) <= 16
489 MOVD $0xff00, R21 // Mask for later
490 MTVSRD R21, V25 // Move to Vreg
491 VSPLTH $3, V25, V31 // Splat mask
492 VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
493 VSPLTISB $0, V10 // Clear V10
494
495 // First case: 2 byte separator
496 // V1: 2 byte separator splatted
497 // V2: 16 bytes at addr
498 // V3: 16 bytes at addr+1
499 // Compare 2 byte separator at start
500 // and at start+1. Use VSEL to combine
501 // those results to find the first
502 // matching start byte, returning
503 // that value when found. Loop as
504 // long as len(string) > 16
505 index2loop2:
506 LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3
507
508 index2loop:
509 LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
510 VCMPEQUH V1, V2, V5 // Search for sep
511 VCMPEQUH V1, V3, V6 // Search for sep offset by 1
512 VSEL V6, V5, V31, V7 // merge even and odd indices
513 VCLZD V7, V18 // find index of first match
514 MFVSRD V18, R25 // get first value
515 CMP R25, $64 // Found if < 64
516 BLT foundR25 // Return byte index where found
517
518 MFVSRLD V18, R25 // get second value
519 CMP R25, $64 // Found if < 64
520 ADD $64, R25 // Update byte offset
521 BLT foundR25 // Return value
522 ADD $16, R7 // R7+=16 Update string pointer
523 ADD $17, R7, R9 // R9=R7+17 since loop unrolled
524 CMP R9, LASTBYTE // Compare addr+17 against last byte
525 BLT index2loop2 // If < last, continue loop
526 CMP R7, LASTBYTE // Compare addr+16 against last byte
527 BLT index2to16 // If < 16 handle specially
528 LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3
529 VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
530 BR index2loop
531
532 index3plus:
533 CMP R6, $3 // Check if sep == 3
534 BNE index4plus // If not check larger
535 ADD $19, R7, R9 // Find bytes for use in this loop
536 CMP R9, LASTBYTE // Compare against last byte
537 BGE index2to16 // Remaining string 2<=len<=16
538 MOVD $0xff00, R21 // Set up mask for upcoming loop
539 MTVSRD R21, V25 // Move mask to Vreg
540 VSPLTH $3, V25, V31 // Splat mask
541 VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
542 VSPLTB $2, V0, V8 // Splat 3rd byte of sep
543
544 // Loop to process 3 byte separator.
545 // string[0:16] is in V2
546 // string[2:18] is loaded into V3, then reduced to string[16:18] left justified
547 // sep[0:2] splatted in V1
548 // sep[2] splatted in V8
549 // Load vectors at string, string+1
550 // and string+2. Compare string, string+1
551 // against first 2 bytes of separator
552 // splatted, and string+2 against 3rd
553 // byte splatted. Merge the results with
554 // VSEL to find the first byte of a match.
555
556 // Special handling for last 16 bytes if the
557 // string fits in 16 byte multiple.
558 index3loop2:
559 MOVD $2, R21 // Set up index for 2
560 VSPLTISB $0, V10 // Clear V10
561 LXVB16X (R7)(R21), V3 // Load 16 bytes @R7+2 into V3
562 VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
563
564 index3loop:
565 LXVB16X (R7)(R0), V2 // Load 16 bytes @R7
566 VSLDOI $1, V2, V3, V4 // string[1:17]
567 VSLDOI $2, V2, V3, V9 // string[2:18]
568 VCMPEQUH V1, V2, V5 // compare hw even indices
569 VCMPEQUH V1, V4, V6 // compare hw odd indices
570 VCMPEQUB V8, V9, V10 // compare 3rd to last byte
571 VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
572 VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
573 VCLZD V7, V18 // Find first nonzero indexes
574 MFVSRD V18, R25 // Move 1st doubleword
575 CMP R25, $64 // If < 64 found
576 BLT foundR25 // Return matching index
577
578 MFVSRLD V18, R25 // Move 2nd doubleword
579 CMP R25, $64 // If < 64 found
580 ADD $64, R25 // Update byte index
581 BLT foundR25 // Return matching index
582 ADD $16, R7 // R7+=16 string ptr
583 ADD $19, R7, R9 // Number of string bytes for loop
584 CMP R9, LASTBYTE // Compare against last byte of string
585 BLT index3loop2 // If within, continue this loop
586 CMP R7, LASTSTR // Compare against last start byte
587 BLT index2to16 // Process remainder
588 VSPLTISB $0, V3 // Special case for last 16 bytes
589 BR index3loop // Continue this loop
590
591 // Loop to process 4 byte separator
592 // string[0:16] in V2
593 // string[3:19] in V3
594 // sep[0:4] splatted in V1
595 // Set up vectors with strings at offsets
596 // 0, 1, 2, 3 and compare against the 4 byte
597 // separator also splatted. Use VSEL with the
598 // compare results to find the first byte where
599 // a separator match is found.
600 index4plus:
601 CMP R6, $4 // Check if 4 byte separator
602 BNE index5plus // If not next higher
603 ADD $20, R7, R9 // Check string size to load
604 CMP R9, LASTBYTE // Verify string length
605 BGE index2to16 // If not large enough, process remaining
606
607 // Set up masks for use with VSEL
608 MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
609 SLD $24, R21
610 MTVSRWS R21, V29
611
612 VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
613 MOVD $0xffff, R21
614 SLD $16, R21
615 MTVSRWS R21, V31
616
617 VSPLTW $0, V0, V1 // Splat 1st word of separator
618
619 index4loop:
620 LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
621
622 next4:
623 VSPLTISB $0, V10 // Clear
624 MOVD $3, R9 // Number of bytes beyond 16
625 LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+3 into V3
626 VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
627 VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
628 VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
629 VSLDOI $3, V2, V3, V10 // V10=(V2:V3)<<3
630 VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
631 VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
632 VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
633 VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
634 VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
635 VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
636 VSEL V14, V13, V31, V7 // final merge
637 VCLZD V7, V18 // Find first index for each half
638 MFVSRD V18, R25 // Isolate value
639 CMP R25, $64 // If < 64, found
640 BLT foundR25 // Return found index
641
642 MFVSRLD V18, R25 // Isolate other value
643 CMP R25, $64 // If < 64, found
644 ADD $64, R25 // Update index for high doubleword
645 BLT foundR25 // Return found index
646 ADD $16, R7 // R7+=16 for next string
647 ADD $20, R7, R9 // R7+20 for all bytes to load
648 CMP R9, LASTBYTE // Past end? Maybe check for extra?
649 BLT index4loop // If not, continue loop
650 CMP R7, LASTSTR // Check remainder
651 BLE index2to16 // Process remainder
652 BR notfound // Not found
653
654 index5plus:
655 CMP R6, $16 // Check for sep > 16
656 BGT index17plus // Handle large sep
657
658 // Assumption is that the separator is smaller than the string at this point
659 index2to16:
660 CMP R7, LASTSTR // Compare last start byte
661 BGT notfound // last takes len(sep) into account
662
663 ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes
664 CMP R9, LASTBYTE
665 // The BGT below consumes the compare above: fewer than
666 // 19 bytes of string left means go straight to the tail code.
667 VSPLTISB $0, V10 // Clear
668 BGT index2to16tail
669
670 #ifdef GOPPC64_power10
671 ADD $3,R7, R17 // Base+3
672 ADD $2,R7, R8 // Base+2
673 ADD $1,R7, R10 // Base+1
674 #else
675 MOVD $3, R17 // Number of bytes beyond 16
676 #endif
677 PCALIGN $16
678
// Unrolled loop: each iteration tests start positions R7, R7+1, R7+2
// and R7+3 against sep (V0). found2/found3/found4 add the position
// offset back onto R7 before returning.
679 index2to16loop:
680
681 #ifdef GOPPC64_power10
682 LXVLL R7, R14, V8 // Load next 16 bytes of string from Base
683 LXVLL R10, R14, V9 // Load next 16 bytes of string from Base+1
684 LXVLL R8, R14, V11 // Load next 16 bytes of string from Base+2
685 LXVLL R17,R14, V12 // Load next 16 bytes of string from Base+3
686 #else
687 LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
688 LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
689
690 VSLDOI $13, V5, V10, V2 // Shift left last 3 bytes
691 VSLDOI $1, V1, V2, V3 // V3=(V1:V2)<<1
692 VSLDOI $2, V1, V2, V4 // V4=(V1:V2)<<2
693 VAND V1, SEPMASK, V8 // Mask out sep size 0th index
694 VAND V3, SEPMASK, V9 // Mask out sep size 1st index
695 VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
696 VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
697 #endif
698 VCMPEQUBCC V0, V8, V8 // compare masked string
699 BLT CR6, found // All equal while comparing 0th index
700 VCMPEQUBCC V0, V9, V9 // compare masked string
701 BLT CR6, found2 // All equal while comparing 1st index
702 VCMPEQUBCC V0, V11, V11 // compare masked string
703 BLT CR6, found3 // All equal while comparing 2nd index
704 VCMPEQUBCC V0, V12, V12 // compare masked string
705 BLT CR6, found4 // All equal while comparing 3rd index
706
707 ADD $4, R7 // Update ptr to next 4 bytes
708 #ifdef GOPPC64_power10
709 ADD $4, R17 // Update ptr to next 4 bytes
710 ADD $4, R8 // Update ptr to next 4 bytes
711 ADD $4, R10 // Update ptr to next 4 bytes
712 #endif
713 CMP R7, LASTSTR // Still less than last start byte
714 BGT notfound // Not found
715 ADD $19, R7, R9 // Verify remaining bytes
716 CMP R9, LASTBYTE // length of string at least 19
717 BLE index2to16loop // Try again, else do post processing and jump to index2to16next
718 PCALIGN $32
719 // <19 bytes left, post process the remaining string
720 index2to16tail:
721 #ifdef GOPPC64_power10
722 index2to16next_p10:
723 LXVLL R7,R14, V1 // Load len(sep) bytes @R7 into V1 (length taken from R14)
724 VCMPEQUBCC V1, V0, V3 // Compare sep and partial string
725 BLT CR6, found // Found
726 ADD $1, R7 // Not found, try next partial string
727 CMP R7, LASTSTR // Check for end of string
728 BLE index2to16next_p10 // Still a valid start byte, try again
729 BR notfound // Past the last start byte, not found
730 #else
731 ADD R3, R4, R9 // End of string
732 SUB R7, R9, R9 // Number of bytes left
733 ANDCC $15, R7, R10 // 16 byte offset
734 ADD R10, R9, R11 // offset + len
735 CMP R11, $16 // >= 16?
736 BLE short // Does not cross 16 bytes
737 LXVB16X (R7)(R0), V1 // Load 16 bytes @R7 into V1
738 CMP R9, $16 // Post-processing of unrolled loop
739 BLE index2to16next // continue to index2to16next if <= 16 bytes
740 SUB R16, R9, R10 // R9 should be 18 or 17 hence R10 is 1 or 2
741 LXVB16X (R7)(R10), V9
742 CMP R10, $1 // string length is 17, compare 1 more byte
743 BNE extra2 // string length is 18, compare 2 more bytes
744 VSLDOI $15, V9, V10, V25
745 VAND V1, SEPMASK, V2 // Just compare size of sep
746 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
747 BLT CR6, found // Found
748 ADD $1, R7 // Not found, try next partial string
749 CMP R7, LASTSTR // Check for end of string
750 BGT notfound // If at end, then not found
751 VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
752 BR index2to16next // go to remainder loop
753 extra2:
754 VSLDOI $14, V9, V10, V25
755 VAND V1, SEPMASK, V2 // Just compare size of sep
756 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
757 BLT CR6, found // Found
758 ADD $1, R7 // Not found, try next partial string
759 CMP R7, LASTSTR // Check for end of string
760 BGT notfound // If at end, then not found
761 VOR V1, V1, V4 // save remaining string
762 VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte
763 VAND V1, SEPMASK, V2 // Just compare size of sep
764 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
765 BLT CR6, found // Found
766 ADD $1, R7 // Not found, try next partial string
767 CMP R7, LASTSTR // Check for end of string
768 BGT notfound // If at end, then not found
769 VSLDOI $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte
770 BR index2to16next // Check the remaining partial string in index2to16next
771
// The remaining bytes sit inside one aligned 16 byte container: load
// the container and shift the string bytes to the left edge of V1.
772 short:
773 RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
774 LXVB16X (R9)(R0), V1 // Load 16 bytes @R9 into V1
775 SLD $3, R10 // Set up shift
776 MTVSRD R10, V8 // Set up shift
777 VSLDOI $8, V8, V8, V8
778 VSLO V1, V8, V1 // Shift by start byte
779 PCALIGN $16
780 index2to16next:
781 VAND V1, SEPMASK, V2 // Just compare size of sep
782 VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
783 BLT CR6, found // Found
784 ADD $1, R7 // Not found, try next partial string
785 CMP R7, LASTSTR // Check for end of string
786 BGT notfound // If at end, then not found
787 VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
788 BR index2to16next // Check the next partial string
789 #endif // Tail processing if GOPPC64!=power10
790
791 index17plus:
792 CMP R6, $32 // Check if 16 < len(sep) <= 32
793 BGT index33plus
794 SUB $16, R6, R9 // Extra > 16
795 SLD $56, R9, R10 // Shift to use in VSLO
796 MTVSRD R10, V9 // Set up for VSLO
797 LXVB16X (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1
798 VSLO V1, V9, V1 // Shift left
799 VSPLTISB $0xff, V7 // Splat 1s
800 VSPLTISB $0, V27 // Splat 0
801
802 index17to32loop:
803 LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
804
805 next17:
806 LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+R9 into V3
807 VSLO V3, V9, V3 // Shift left
808 VCMPEQUB V0, V2, V4 // Compare first 16 bytes
809 VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
810 VAND V4, V5, V6 // Check if both equal
811 VCMPEQUBCC V6, V7, V8 // All equal?
812 BLT CR6, found // Yes
813 ADD $1, R7 // On to next byte
814 CMP R7, LASTSTR // Check if last start byte
815 BGT notfound // If too high, not found
816 BR index17to32loop // Continue
817
818 notfound:
819 MOVD $-1, R3 // Return -1 if not found
820 RET
821
822 index33plus:
823 MOVD $0, (R0) // Case not implemented
824 RET // Crash before return
825
// foundR25: R25 holds the bit offset of the match within the 16 bytes
// at R7 (leading-zero count from VCLZD, plus 64 for the second doubleword).
826 foundR25:
827 SRD $3, R25 // Convert from bits to bytes
828 ADD R25, R7 // Add to current string address
829 SUB R3, R7 // Subtract start of string
830 MOVD R7, R3 // Return byte where found
831 RET
// found4/found3/found2 fall through into each other so that R7 is
// advanced by 3, 2 or 1 before the common return sequence at found.
832 found4:
833 ADD $1, R7 // found from unrolled loop at index 3
834 found3:
835 ADD $1, R7 // found from unrolled loop at index 2
836 found2:
837 ADD $1, R7 // found from unrolled loop at index 1
838 found: // found at index 0
839 SUB R3, R7 // Return byte where found
840 MOVD R7, R3
841 RET
842
View as plain text