// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

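// func Index(a, b []byte) int
// Returns the byte index of the first occurrence of b in a, or -1 if b is
// not present (see the fail/success blocks of indexbody below).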
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ a_base+0(FP), DI
	MOVQ a_len+8(FP), DX
	MOVQ b_base+24(FP), R8
	MOVQ b_len+32(FP), AX
	MOVQ DI, R10
	LEAQ ret+48(FP), R11
	JMP indexbody<>(SB)

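// func IndexString(a, b string) int
// String version of Index; it shares indexbody below.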
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ a_base+0(FP), DI
	MOVQ a_len+8(FP), DX
	MOVQ b_base+16(FP), R8
	MOVQ b_len+24(FP), AX
	MOVQ DI, R10
	LEAQ ret+32(FP), R11
	JMP indexbody<>(SB)

// AX: length of the string we are searching for (the needle)
// DX: length of the string we are searching in (the haystack)
// DI: pointer to the string we are searching in
// R8: pointer to the string we are searching for
// R10: copy of DI (start of the haystack), used to compute the result index
// R11: address where the return value is stored
// Note: we want the lengths in DX and AX because PCMPESTRI implicitly consumes them.
TEXT indexbody<>(SB),NOSPLIT,$0
	CMPQ AX, DX
	JA fail
	CMPQ DX, $16
	JAE sse42
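	// Haystack is shorter than 16 bytes, so the PCMPESTRI path (which
	// loads 16 bytes of haystack per compare) cannot be used. Fall through
	// to brute-force loops specialized by needle length. The sse42 block
	// below also jumps back here when SSE4.2 is unavailable or the 16-byte
	// needle load could fault.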
no_sse42:
	CMPQ AX, $2
	JA _3_or_more
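	// Needle of at most 2 bytes (in practice exactly 2; shorter needles
	// are handled before this code is reached): load it into R8 and
	// compare a 2-byte window at every haystack position. DX is set to
	// one past the last valid starting address.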
	MOVW (R8), R8
	LEAQ -1(DI)(DX*1), DX
	PCALIGN $16
loop2:
	MOVW (DI), SI
	CMPW SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	CMPQ AX, $3
	JA _4_or_more
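	// 3-byte needle: R8 holds bytes 0-1 and BX holds bytes 1-2. A match
	// requires both overlapping 2-byte compares to succeed.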
	MOVW 1(R8), BX
	MOVW (R8), R8
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,R8
	JZ partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	CMPQ AX, $4
	JA _5_or_more
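	// 4-byte needle: a single 4-byte compare per haystack position.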
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	CMPQ AX, $7
	JA _8_or_more
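	// Needle of 5 to 7 bytes: compare the first 4 bytes (R8) and the last
	// 4 bytes (BX) of the needle. The two 4-byte windows overlap, so
	// together they cover every byte of the needle.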
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVL -4(R8)(AX*1), BX
	MOVL (R8), R8
loop5to7:
	MOVL (DI), SI
	CMPL SI,R8
	JZ partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	CMPQ AX, $8
	JA _9_or_more
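	// 8-byte needle: a single 8-byte compare per haystack position.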
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	CMPQ AX, $15
	JA _16_or_more
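	// Needle of 9 to 15 bytes: compare the first 8 bytes (R8) and the
	// last 8 bytes (BX) of the needle; the overlapping windows cover the
	// whole needle.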
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(R8)(AX*1), BX
	MOVQ (R8), R8
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	CMPQ AX, $16
	JA _17_or_more
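	// Needle of exactly 16 bytes: one SSE compare per position. A
	// PMOVMSKB result of 0xffff means all 16 bytes matched.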
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	CMPQ AX, $31
	JA _32_or_more
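	// Needle of 17 to 31 bytes: compare the first 16 bytes (X1) and the
	// last 16 bytes (X0) of the needle with overlapping SSE compares.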
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(R8)(AX*1), X0
	MOVOU (R8), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ SI, $0xffff
	JE partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can only get here when AVX2 is enabled and the cutoff for indexShortStr
// is set to 63, so there is no need to check cpuid.
_32_or_more:
	CMPQ AX, $32
	JA _33_to_63
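	// Needle of exactly 32 bytes: one AVX2 compare per position. A
	// VPMOVMSKB result of 0xffffffff means all 32 bytes matched.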
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
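// Needle of 33 to 63 bytes: compare the first 32 bytes (Y1) and the last
// 32 bytes (Y0) of the needle with overlapping AVX2 compares.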
_33_to_63:
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(R8)(AX*1), Y0
	VMOVDQU (R8), Y1
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL SI, $0xffffffff
	JE partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
fail_avx2:
	VZEROUPPER
fail:
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
#ifndef hasSSE42
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE no_sse42
#endif
	CMPQ AX, $12
	// PCMPESTRI is slower than a normal compare, so using it only makes
	// sense if we advance 4+ bytes per compare. This cutoff was determined
	// experimentally and is roughly the same on Nehalem (the first CPU
	// with SSE4.2) and Haswell.
	JAE _9_or_more
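	// We are about to load 16 bytes from the needle even though it is
	// shorter than that. Bail out to the generic loops if the load could
	// cross into the next 4KB page (i.e. R8+16 lands within the first 16
	// bytes of a page), since the bytes past the needle might be unmapped.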
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), SI
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
	PCALIGN $16
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00),
	// equal ordered, i.e. substring search (bits 2,3 are 11),
	// result is not masked or inverted (bits 4,5 are 00),
	// and the index corresponds to the first matching byte (bit 6 is 0).
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means a partial match at the end of the string,
	// otherwise sep starts at offset CX within the 16-byte window at (DI).
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
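	// Tail: fewer than 16 bytes remain past DI, so run one final compare
	// over the last full 16-byte window of the haystack.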
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI
success:
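	// Convert the match pointer to an index from the start of the
	// haystack (saved in R10) and store it at the result address.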
	SUBQ R10, DI
	MOVQ DI, (R11)
	RET
