// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD R6, R5 // R5 = byte
	BR indexbytebody<>(SB)

TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	BR indexbytebody<>(SB)

#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif
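
// Note: the constant above lists the bit indices 0, 8, 16, ..., 120, selecting
// bit 0 of each of the 16 bytes in a vector register. VBPERMQ uses it to gather
// the per-byte results of VCMPEQUB into a single 16-bit mask, ordered per
// endianness so that earlier string bytes map to higher mask bits; see vfound.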

// Some operations are endian specific, choose the correct opcode based on GOARCH.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX MOVDBR
#define _LWBEX MOVWBR
#define _LHBEX MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX MOVD
#define _LWBEX MOVW
#define _LHBEX MOVH
#define _VCZBEBB VCLZLSBB
#endif
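
// The _L*BEX loads fetch in big-endian order, keeping the first string byte in
// the most significant position, so CNTLZD/CNTLZW applied to a CMPB result
// yields the offset of the first matching byte (see cmp8 and below).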

// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMPU R4,$32

#ifndef GOPPC64_power9
	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
	MOVD $indexbytevbperm<>+00(SB),R16
	LXVD2X (R16),V0 // Load the VBPERMQ index constant
#endif

	MTVRD R5,V1
	VSPLTB $7,V1,V1 // Replicate byte across V1

	BLT cmp16 // Jump to the small string case if it's <32 bytes.

	CMP R4,$64,CR1
	MOVD $16,R11
	MOVD R3,R8
	BLT CR1,cmp32 // Special case for length 32 - 63
	MOVD $32,R12
	MOVD $48,R6

	RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63
	ADD R3,R9,R9 // R9 = &s[len &^ 63]
	ANDCC $63,R4 // (len &= 63) cmp 0.

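	// Main loop: scan 64 bytes per iteration with four unaligned 16-byte
	// vector loads. VCMPEQUBCC sets CR6 such that BNE CR6 branches when at
	// least one byte in the just-loaded chunk equals the target byte.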
	PCALIGN $16
loop64:
	LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0]
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8, jump out

	LXVD2X (R11)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat1 // Match found at R8+16 bytes, jump out

	LXVD2X (R12)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat2 // Match found at R8+32 bytes, jump out

	LXVD2X (R6)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat3 // Match found at R8+48 bytes, jump out

	ADD $64,R8
	CMPU R8,R9,CR1
	BNE CR1,loop64 // R8 != &s[len &^ 63]?

	PCALIGN $32
	BEQ notfound // Is tail length 0? CR0 is set before entering loop64.

	CMP R4,$32 // Tail length >= 32, use cmp32 path.
	CMP R4,$16,CR1
	BGE cmp32

	ADD R8,R4,R9
	ADD $-16,R9
	BLE CR1,cmp64_tail_gt0

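	// The remaining tail is checked with one or two more 16-byte loads. The
	// final load starts at &s[len-16], so it may overlap bytes already
	// checked; that is harmless since only the first match is reported.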
cmp64_tail_gt16: // Tail length 17 - 31
	LXVD2X (R0)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0

cmp64_tail_gt0: // Tail length 1 - 16
	MOVD R9,R8
	LXVD2X (R0)(R9),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0

	BR notfound

cmp32: // Length 32 - 63

	// Bytes 0 - 15
	LXVD2X (R0)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0

	// Bytes 16 - 31
	LXVD2X (R8)(R11),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat1 // Match found at R8+16 bytes, jump out

	BEQ notfound // Is the length exactly 32? (CR0 holds len cmp 32 from before entering cmp32)
	CMP R4,$48

	ADD R4,R8,R9 // Compute &s[len(s)-16]
	ADD $32,R8,R8
	ADD $-16,R9,R9
	ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8
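	// The ISEL avoids a branch: when len(s) <= 48, at most 16 bytes remain,
	// so skip straight to the final (possibly overlapping) 16-byte check.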

	// Bytes 32 - 47, or the final 16 bytes when len(s) <= 48
	LXVD2X (R0)(R8),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8, jump out

	BLE notfound // len(s) <= 48, nothing left to check.

	// The final 16 bytes of s, when len(s) > 48
	MOVD R9,R8 // R9 holds the final check.
	LXVD2X (R0)(R9),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8, jump out

	BR notfound

// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif

// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
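// For example, on the pre-power9 path a match at byte i of the chunk loaded
// from R8+16 gives CNTLZW = 16+i (the 16-bit mask sits in the low half of the
// word), so R3 = (R8-R3) + 16 + ADJUST_FOR_CNTLZW + (16+i) = (R8-R3) + 16 + i.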
foundat3:
	SUB R3,R8,R3
	ADD $48+ADJUST_FOR_CNTLZW,R3
	BR vfound
foundat2:
	SUB R3,R8,R3
	ADD $32+ADJUST_FOR_CNTLZW,R3
	BR vfound
foundat1:
	SUB R3,R8,R3
	ADD $16+ADJUST_FOR_CNTLZW,R3
	BR vfound
foundat0:
	SUB R3,R8,R3
	ADD $0+ADJUST_FOR_CNTLZW,R3
vfound:
	// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
	VBPERMQ V6,V0,V6
	MFVRD V6,R4
	CNTLZW R4,R4
#else
#ifdef GOARCH_ppc64le
	// Put the value back into LE ordering by swapping doublewords.
	XXPERMDI V6,V6,$2,V6
#endif
	_VCZBEBB V6,R4
#endif
	ADD R3,R4,R3
	RET

cmp16: // Length 16 - 31
	CMPU R4,$16
	ADD R4,R3,R9
	BLT cmp8

	ADD $-16,R9,R9 // &s[len(s)-16]

	// Bytes 0 - 15
	LXVD2X (R0)(R3),V2
	VCMPEQUBCC V2,V1,V6
	MOVD R3,R8
	BNE CR6,foundat0 // Match found at R8, jump out

	BEQ notfound // Is the length exactly 16?

	// The final 16 bytes, &s[len(s)-16] through &s[len(s)-1]
	MOVD R9,R8 // R9 holds the final check.
	LXVD2X (R0)(R9),V2
	VCMPEQUBCC V2,V1,V6
	BNE CR6,foundat0 // Match found at R8, jump out

	BR notfound

cmp8: // Length 8 - 15
#ifdef GOPPC64_power10
	// Load all the bytes into a single VSR in BE order.
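	// LXVLL takes the byte count in the most significant byte of the length
	// register, hence the shift left by 56.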
	SLD $56,R4,R5
	LXVLL R3,R5,V2
	// Compare and count the number of leading bytes which don't match.
	VCMPEQUB V2,V1,V6
	VCLZLSBB V6,R3
	// If the count is greater than or equal to the number of bytes, no match was found.
	CMPU R3,R4
	MOVD $-1,R5
	// Otherwise, the count is the index of the first match.
	ISEL CR0LT,R3,R5,R3
	RET
#else
	RLDIMI $8,R5,$48,R5 // Replicate the byte across the register.
	RLDIMI $16,R5,$32,R5
	RLDIMI $32,R5,$0,R5
	CMPU R4,$8
	BLT cmp4
	MOVD $-8,R11
	ADD $-8,R4,R4

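	// Load the first and last 8 bytes (they may overlap) in big-endian order.
	// CMPB sets each result byte to 0xFF where the corresponding bytes match,
	// so CNTLZD/8 is the index of the first matching byte within each word.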
	_LDBEX (R0)(R3),R10
	_LDBEX (R11)(R9),R11
	CMPB R10,R5,R10
	CMPB R11,R5,R11
	CMPU R10,$0
	CMPU R11,$0,CR1
	CNTLZD R10,R10
	CNTLZD R11,R11
	SRD $3,R10,R3
	SRD $3,R11,R11
	BNE found

	ADD R4,R11,R4
	MOVD $-1,R3
	ISEL CR1EQ,R3,R4,R3
	RET

cmp4: // Length 4 - 7
	CMPU R4,$4
	BLT cmp2
	MOVD $-4,R11
	ADD $-4,R4,R4

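	// Same overlapping-load approach as above, on 4-byte words. CNTLZW only
	// examines the low 32 bits, so it returns 32 when a word has no match.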
	_LWBEX (R0)(R3),R10
	_LWBEX (R11)(R9),R11
	CMPB R10,R5,R10
	CMPB R11,R5,R11
	CNTLZW R10,R10
	CNTLZW R11,R11
	CMPU R10,$32
	CMPU R11,$32,CR1
	SRD $3,R10,R3
	SRD $3,R11,R11
	BNE found

	ADD R4,R11,R4
	MOVD $-1,R3
	ISEL CR1EQ,R3,R4,R3
	RET

cmp2: // Length 2 - 3
	CMPU R4,$2
	BLT cmp1

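	// SLDCC shifts the two comparison result bytes to the top of the register
	// (discarding anything above the loaded halfword) and sets CR0, so the
	// branch is taken only when one of the two bytes matched.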
	_LHBEX (R0)(R3),R10
	CMPB R10,R5,R10
	SLDCC $48,R10,R10
	CNTLZD R10,R10
	SRD $3,R10,R3
	BNE found

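	// A final byte remains to be checked exactly when the remaining length is
	// odd; even lengths (0 or 2) have nothing left to check here.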
cmp1: // Length 1
	MOVD $-1,R3
	ANDCC $1,R4,R31
	BEQ found

	MOVBZ -1(R9),R10
	CMPB R10,R5,R10
	ANDCC $1,R10
	ADD $-1,R4
	ISEL CR0EQ,R3,R4,R3

found:
	RET
#endif

notfound:
	MOVD $-1,R3
	RET