// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"

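// func Compare(a, b []byte) int
// Returns -1 if a < b, 0 if a == b, +1 if a > b (lexicographic byte comparison).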
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = a_cap  (unused)
	// DI = b_base (want in DI)
	// SI = b_len  (want in DX)
	// R8 = b_cap  (unused)
	MOVQ	SI, DX
	MOVQ	AX, SI
	JMP	cmpbody<>(SB)

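// func cmpstring(a, b string) int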
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
	// AX = a_base (want in SI)
	// BX = a_len  (want in BX)
	// CX = b_base (want in DI)
	// DI = b_len  (want in DX)
	MOVQ	AX, SI
	MOVQ	DI, DX
	MOVQ	CX, DI
	JMP	cmpbody<>(SB)

// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = output (-1/0/1)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8	// R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

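	// Dispatch on the number of bytes to compare: 8-63 bytes use the
	// 16-byte SSE loop below, 64 bytes or more use big_loop, or
	// big_loop_avx2 when the CPU supports AVX2.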
	CMPQ	R8, $63
	JBE	loop
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop
#else
	JMP	big_loop_avx2
#endif
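// Compare 16 bytes per iteration. PCMPEQB/PMOVMSKB produce a 16-bit
// mask with a 1 bit for every byte that matches; XORing with 0xffff
// flips it, so a nonzero result means at least one byte differs.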
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

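// diff64/diff48/diff32 advance SI and DI to the start of the 16-byte
// chunk in which a difference was found, then fall through to diff16,
// which locates the exact differing byte.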
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// AX = 1 if a's byte is above b's (unsigned), else 0
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	RET

// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
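	// Compare the last 8 bytes of the remainder. R8 can be as small as 1
	// here, so this load may reach back over bytes that were already
	// compared; that is safe because alen and blen are both at least 8,
	// and re-checking bytes known to be equal is harmless.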
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
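	// The loads are little-endian, so the first differing byte in memory
	// sits in the least significant bytes of AX and CX. BSWAPQ reverses
	// the byte order so that byte becomes the most significant one,
	// letting BSRQ find it as the highest differing bit.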
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// -bits left (== 64 - bits left mod 64)
	JEQ	allsame

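	// Load the remaining 0-7 bytes of each operand into a register,
	// left-justified: the wanted bytes end up in the high bytes with
	// zeros below. An 8-byte load reads more than we need, so if the
	// pointer is too close to the end of a page (its low address byte is
	// above 0xf8) we instead load the 8 bytes ending at the last wanted
	// byte, which stays within the same page, and shift the extra low
	// bytes out before shifting the result back up.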
	// load bytes of a into high bytes of SI
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// load bytes of b into high bytes of DI
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX	// 1/0 => +1/-1
	RET

allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
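	// Result is 2*AX + CX - 1:
	//   alen > blen:  AX=1, CX=0 -> +1
	//   alen == blen: AX=0, CX=1 ->  0
	//   alen < blen:  AX=0, CX=0 -> -1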
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET

// this works for >= 64 bytes of data.
#ifndef hasAVX2
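// Compare 64 bytes per iteration as four 16-byte SSE chunks; used when
// AVX2 is not available.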
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop
	JMP	big_loop
#endif

// Compare 64 bytes per loop iteration.
// Loop is unrolled and uses AVX2.
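// Each VPCMPEQB/VPMOVMSKB pair yields a 32-bit mask of matching bytes;
// XORL with 0xffffffff flips it so a nonzero value marks a difference.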
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
	VZEROUPPER
	JMP	diff16

// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
	VZEROUPPER
	JMP	diff48

// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP	loop
