// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"

TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// X10 = a_base
	// X11 = a_len
	// X12 = a_cap (unused)
	// X13 = b_base (want in X12)
	// X14 = b_len (want in X13)
	// X15 = b_cap (unused)
	MOV	X13, X12
	MOV	X14, X13
	JMP	compare<>(SB)

TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// X10 = a_base
	// X11 = a_len
	// X12 = b_base
	// X13 = b_len
	JMP	compare<>(SB)

// On entry:
// X10 points to start of a
// X11 length of a
// X12 points to start of b
// X13 length of b
// return value in X10 (-1/0/1)
TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
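	// a and b point at the same memory - only the lengths can differ.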
	BEQ	X10, X12, cmp_len

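	// X5 = min(a_len, b_len). Inputs shorter than 16 bytes are compared
	// byte by byte.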
	MIN	X11, X13, X5
	BEQZ	X5, cmp_len

	MOV	$16, X6
	BLT	X5, X6, check8_unaligned

#ifndef hasV
	MOVB	internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X6
	BEQZ	X6, compare_scalar
#endif

	// Use vector if not 8 byte aligned.
	OR	X10, X12, X6
	AND	$7, X6
	BNEZ	X6, vector_loop

	// Use scalar if 8 byte aligned and <= 128 bytes.
	SUB	$128, X5, X6
	BLEZ	X6, compare_scalar_aligned

	PCALIGN	$16
vector_loop:
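	// VSETVLI sets X6 to the number of bytes handled this iteration and
	// VFIRSTM returns the index of the first mismatching byte, or -1 if
	// the two chunks are equal.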
	VSETVLI	X5, E8, M8, TA, MA, X6
	VLE8V	(X10), V8
	VLE8V	(X12), V16
	VMSNEVV	V8, V16, V0
	VFIRSTM	V0, X7
	BGEZ	X7, vector_not_eq
	ADD	X6, X10
	ADD	X6, X12
	SUB	X6, X5
	BNEZ	X5, vector_loop
	JMP	cmp_len

vector_not_eq:
	// Load first differing bytes in X8/X9.
	ADD	X7, X10
	ADD	X7, X12
	MOVBU	(X10), X8
	MOVBU	(X12), X9
	JMP	cmp

compare_scalar:
	MOV	$32, X6
	BLT	X5, X6, check8_unaligned

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND	$7, X10, X7
	AND	$7, X12, X8
	BNE	X7, X8, check8_unaligned
	BEQZ	X7, compare32

	// Check one byte at a time until we reach 8 byte alignment.
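	// X7 = 8 - (a_base & 7), the number of bytes needed to reach 8 byte
	// alignment; remove them from the remaining count in X5.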
	SUB	X7, X0, X7
	ADD	$8, X7, X7
	SUB	X7, X5, X5
align:
	SUB	$1, X7
	MOVBU	0(X10), X8
	MOVBU	0(X12), X9
	BNE	X8, X9, cmp
	ADD	$1, X10
	ADD	$1, X12
	BNEZ	X7, align

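// Both pointers are now 8 byte aligned - compare 32 bytes per iteration
// using four 8 byte loads.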
compare_scalar_aligned:
	MOV	$32, X6
	BLT	X5, X6, check16
compare32:
	MOV	0(X10), X15
	MOV	0(X12), X16
	MOV	8(X10), X17
	MOV	8(X12), X18
	BNE	X15, X16, cmp8a
	BNE	X17, X18, cmp8b
	MOV	16(X10), X15
	MOV	16(X12), X16
	MOV	24(X10), X17
	MOV	24(X12), X18
	BNE	X15, X16, cmp8a
	BNE	X17, X18, cmp8b
	ADD	$32, X10
	ADD	$32, X12
	SUB	$32, X5
	BGE	X5, X6, compare32
	BEQZ	X5, cmp_len

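// Fewer than 32 bytes remain - compare a 16 byte chunk if possible before
// falling back to byte-wise comparison.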
check16:
	MOV	$16, X6
	BLT	X5, X6, check8_unaligned
compare16:
	MOV	0(X10), X15
	MOV	0(X12), X16
	MOV	8(X10), X17
	MOV	8(X12), X18
	BNE	X15, X16, cmp8a
	BNE	X17, X18, cmp8b
	ADD	$16, X10
	ADD	$16, X12
	SUB	$16, X5
	BEQZ	X5, cmp_len

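// Byte-at-a-time comparison for short or unaligned inputs, eight bytes per
// iteration.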
check8_unaligned:
	MOV	$8, X6
	BLT	X5, X6, check4_unaligned
compare8_unaligned:
	MOVBU	0(X10), X8
	MOVBU	1(X10), X15
	MOVBU	2(X10), X17
	MOVBU	3(X10), X19
	MOVBU	4(X10), X21
	MOVBU	5(X10), X23
	MOVBU	6(X10), X25
	MOVBU	7(X10), X29
	MOVBU	0(X12), X9
	MOVBU	1(X12), X16
	MOVBU	2(X12), X18
	MOVBU	3(X12), X20
	MOVBU	4(X12), X22
	MOVBU	5(X12), X24
	MOVBU	6(X12), X28
	MOVBU	7(X12), X30
	BNE	X8, X9, cmp1a
	BNE	X15, X16, cmp1b
	BNE	X17, X18, cmp1c
	BNE	X19, X20, cmp1d
	BNE	X21, X22, cmp1e
	BNE	X23, X24, cmp1f
	BNE	X25, X28, cmp1g
	BNE	X29, X30, cmp1h
	ADD	$8, X10
	ADD	$8, X12
	SUB	$8, X5
	BGE	X5, X6, compare8_unaligned
	BEQZ	X5, cmp_len

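// Fewer than 8 bytes remain - compare 4 bytes if possible, then finish one
// byte at a time.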
check4_unaligned:
	MOV	$4, X6
	BLT	X5, X6, compare1
compare4_unaligned:
	MOVBU	0(X10), X8
	MOVBU	1(X10), X15
	MOVBU	2(X10), X17
	MOVBU	3(X10), X19
	MOVBU	0(X12), X9
	MOVBU	1(X12), X16
	MOVBU	2(X12), X18
	MOVBU	3(X12), X20
	BNE	X8, X9, cmp1a
	BNE	X15, X16, cmp1b
	BNE	X17, X18, cmp1c
	BNE	X19, X20, cmp1d
	ADD	$4, X10
	ADD	$4, X12
	SUB	$4, X5
	BGE	X5, X6, compare4_unaligned

compare1:
	BEQZ	X5, cmp_len
	MOVBU	0(X10), X8
	MOVBU	0(X12), X9
	BNE	X8, X9, cmp
	ADD	$1, X10
	ADD	$1, X12
	SUB	$1, X5
	JMP	compare1

// Compare 8 bytes of memory in X15/X16 that are known to differ.
cmp8a:
	MOV	X15, X17
	MOV	X16, X18

// Compare 8 bytes of memory in X17/X18 that are known to differ.
cmp8b:
	MOV	$0xff, X19
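// Mask out one byte at a time, starting with the least significant byte
// (the byte at the lowest address on little-endian RISC-V), until the
// differing byte is found.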
cmp8_loop:
	AND	X17, X19, X8
	AND	X18, X19, X9
	BNE	X8, X9, cmp
	SLLI	$8, X19
	JMP	cmp8_loop

cmp1a:
	SLTU	X9, X8, X5
	SLTU	X8, X9, X6
	JMP	cmp_ret
cmp1b:
	SLTU	X16, X15, X5
	SLTU	X15, X16, X6
	JMP	cmp_ret
cmp1c:
	SLTU	X18, X17, X5
	SLTU	X17, X18, X6
	JMP	cmp_ret
cmp1d:
	SLTU	X20, X19, X5
	SLTU	X19, X20, X6
	JMP	cmp_ret
cmp1e:
	SLTU	X22, X21, X5
	SLTU	X21, X22, X6
	JMP	cmp_ret
cmp1f:
	SLTU	X24, X23, X5
	SLTU	X23, X24, X6
	JMP	cmp_ret
cmp1g:
	SLTU	X28, X25, X5
	SLTU	X25, X28, X6
	JMP	cmp_ret
cmp1h:
	SLTU	X30, X29, X5
	SLTU	X29, X30, X6
	JMP	cmp_ret

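// All compared bytes are equal - the result is determined by the lengths.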
cmp_len:
	MOV	X11, X8
	MOV	X13, X9
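// X8/X9 hold either the first differing bytes or the two lengths.
// X5 = 1 if a sorts before b, X6 = 1 if b sorts before a, giving
// X6 - X5 as the -1/0/1 result.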
cmp:
	SLTU	X9, X8, X5
	SLTU	X8, X9, X6
cmp_ret:
	SUB	X5, X6, X10
	RET