1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !plan9
6
7 #include "go_asm.h"
8 #include "textflag.h"
9
// ·IndexByte sets up the register arguments expected by indexbytebody<>
// from its frame and jumps there. It reads b_base+0(FP), b_len+8(FP) and
// c+24(FP); the address of the result slot ret+32(FP) is passed in R8 so
// the shared body can store the result itself before returning.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	LEAQ	ret+32(FP), R8		// R8 = address to put result
	MOVB	c+24(FP), AL		// AL = byte sought
	MOVQ	b_len+8(FP), BX		// BX = data len
	MOVQ	b_base+0(FP), SI	// SI = data
	JMP	indexbytebody<>(SB)
16
// ·IndexByteString sets up the register arguments expected by
// indexbytebody<> from its frame and jumps there. It reads s_base+0(FP),
// s_len+8(FP) and c+16(FP); the address of the result slot ret+24(FP) is
// passed in R8 so the shared body can store the result itself before
// returning.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	LEAQ	ret+24(FP), R8		// R8 = address to put result
	MOVB	c+16(FP), AL		// AL = byte sought
	MOVQ	s_len+8(FP), BX		// BX = data len
	MOVQ	s_base+0(FP), SI	// SI = data
	JMP	indexbytebody<>(SB)
23
// indexbytebody<> is the search routine shared by ·IndexByte and
// ·IndexByteString. It writes the index of the first byte of the data that
// equals the byte sought to (R8), or -1 if there is no such byte, and
// returns.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Dispatch on length:
//   BX < 16        -> small (single 16-byte load, page-boundary aware)
//   16 <= BX <= 32 -> sse   (16-byte chunks)
//   BX > 32        -> avx2  (32-byte chunks), or sse if AVX2 is unavailable
//
// Registers written: AX, CX, DX, DI, R11, X0, X1, Y1-Y3 and flags.
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	// DI is the address of the chunk currently being examined.
	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	// BSFL sets ZF when its source is zero, so JNZ means "a bit was set".
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Keep looping while the chunk at DI starts before the final chunk.
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	// Empty input: nothing to search.
	TESTQ	BX, BX
	JEQ	failure

	// Check if we'll load across a page boundary.
	// Bits 4-11 of SI+16 are all zero only when SI+16 lies within the
	// first 16 bytes past a 4096-byte boundary, i.e. when a 16-byte load
	// starting at SI could reach past that boundary.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	// The load covered 16 bytes but only the first BX belong to the data.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that END at the end of the data, so the load does
	// not extend past SI+BX. The data occupies the top BX bytes of X1;
	// the bytes below it precede the data and must be ignored.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	// Shift left by BX then right by 16: this discards the mask bits of
	// the bytes preceding the data and leaves bit i set iff data[i] matched.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Reached only with BX > 32 and DI = SI.
	// Unless the build defines hasAVX2, test the CPU feature flag here and
	// fall back to the SSE path when AVX2 is not available.
#ifndef hasAVX2
	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1	// Y1 = byte sought in every byte lane

	PCALIGN $32
avx2_loop:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3	// Y3 = per-byte match mask for this chunk
	VPTEST Y3, Y3		// ZF is clear iff some byte matched
	JNZ avx2success
	ADDQ $32, DI
	CMPQ DI, R11
	// DI and R11 are addresses, so compare them unsigned (JB), matching
	// the SSE loop above.
	JB avx2_loop
	// Search the last 32-byte chunk; like the SSE tail, it may overlap
	// with chunks already searched.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER	// Clear the upper halves of the Y registers before returning.
	MOVQ $-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds its match mask.
avx2success:
	VPMOVMSKB Y3, DX	// One mask bit per byte of the chunk.
	BSFL DX, DX		// DX = index of first match within the chunk.
	SUBQ SI, DI		// DI = offset of chunk within data.
	ADDQ DI, DX		// DX = index of match within data.
	MOVQ DX, (R8)
	VZEROUPPER	// Clear the upper halves of the Y registers before returning.
	RET
155
View as plain text