Text file
src/math/big/arith_riscv64.s
1 // Copyright 2020 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !math_big_pure_go && riscv64
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
// addVV adds the vectors x and y limb by limb, propagating the carry:
//
//	z[i] = x[i] + y[i] + carry	for 0 <= i < z_len
//
// The carry out of the last limb (0 or 1) is returned in c; an empty z
// yields c = 0. Only z_len is consulted: x and y are read for exactly
// z_len limbs. The Go prototype is declared outside this file
// (presumably func addVV(z, x, y []Word) (c Word) -- confirm).
//
// Register roles:
//	X5, X6, X7	cursors into x, y and z
//	X30		limbs still to process
//	X28		constant 4, the unroll width
//	X29		running carry
TEXT ·addVV(SB),NOSPLIT,$0
	MOV	z+0(FP), X7
	MOV	z_len+8(FP), X30
	MOV	x+24(FP), X5
	MOV	y+48(FP), X6

	MOV	$0, X29			// carry in = 0
	MOV	$4, X28

	BEQZ	X30, finish		// empty vector: return carry 0
	BLTU	X30, X28, by1		// fewer than 4 limbs: tail loop only

by4:
	// Read four limbs of each operand before any store is issued.
	MOV	0(X5), X8
	MOV	8(X5), X11
	MOV	16(X5), X14
	MOV	24(X5), X17
	MOV	0(X6), X9
	MOV	8(X6), X12
	MOV	16(X6), X15
	MOV	24(X6), X18

	// Limb 0.
	ADD	X8, X9, X21		// t = x[0] + y[0]
	SLTU	X8, X21, X22		// 1 if t < x[0] (the sum wrapped)
	ADD	X21, X29, X10		// z[0] = t + carry
	SLTU	X21, X10, X23		// 1 if z[0] < t (adding the carry wrapped)
	ADD	X22, X23, X29		// carry out; both wraps cannot happen together
	MOV	X10, 0(X7)

	// Limb 1.
	ADD	X11, X12, X24		// t = x[1] + y[1]
	SLTU	X11, X24, X25
	ADD	X24, X29, X13		// z[1] = t + carry
	SLTU	X24, X13, X26
	ADD	X25, X26, X29
	MOV	X13, 8(X7)

	// Limb 2.
	ADD	X14, X15, X21		// t = x[2] + y[2]
	SLTU	X14, X21, X22
	ADD	X21, X29, X16		// z[2] = t + carry
	SLTU	X21, X16, X23
	ADD	X22, X23, X29
	MOV	X16, 16(X7)

	// Limb 3.
	ADD	X17, X18, X21		// t = x[3] + y[3]
	SLTU	X17, X21, X22
	ADD	X21, X29, X19		// z[3] = t + carry
	SLTU	X21, X19, X23
	ADD	X22, X23, X29
	MOV	X19, 24(X7)

	// Step past the four limbs just handled (4 * 8 bytes).
	SUB	$4, X30
	ADD	$32, X5
	ADD	$32, X6
	ADD	$32, X7

	BGEU	X30, X28, by4		// at least 4 limbs left: go again
	BEQZ	X30, finish		// length was a multiple of 4

by1:
	// One limb per iteration for the remaining 1..3 limbs.
	MOV	0(X5), X10
	MOV	0(X6), X11

	ADD	X10, X11, X12		// t = x + y
	SLTU	X10, X12, X14		// 1 if t < x
	ADD	X12, X29, X13		// z = t + carry
	SLTU	X12, X13, X15		// 1 if z < t
	ADD	X14, X15, X29		// carry out
	MOV	X13, 0(X7)

	SUB	$1, X30
	ADD	$8, X5
	ADD	$8, X6
	ADD	$8, X7

	BNEZ	X30, by1

finish:
	MOV	X29, c+72(FP)		// hand back the final carry
	RET
93
// subVV subtracts the vector y from x limb by limb, propagating the
// borrow:
//
//	z[i] = x[i] - y[i] - borrow	for 0 <= i < z_len
//
// The borrow out of the last limb (0 or 1) is returned in c; an empty z
// yields c = 0. Only z_len is consulted: x and y are read for exactly
// z_len limbs. The Go prototype is declared outside this file
// (presumably func subVV(z, x, y []Word) (c Word) -- confirm).
//
// Register roles:
//	X5, X6, X7	cursors into x, y and z
//	X30		limbs still to process
//	X28		constant 4, the unroll width
//	X29		running borrow
TEXT ·subVV(SB),NOSPLIT,$0
	MOV	z+0(FP), X7
	MOV	z_len+8(FP), X30
	MOV	x+24(FP), X5
	MOV	y+48(FP), X6

	MOV	$0, X29			// borrow in = 0
	MOV	$4, X28

	BEQZ	X30, finish		// empty vector: return borrow 0
	BLTU	X30, X28, by1		// fewer than 4 limbs: tail loop only

by4:
	// Read four limbs of each operand before any store is issued.
	MOV	0(X5), X8
	MOV	8(X5), X11
	MOV	16(X5), X14
	MOV	24(X5), X17
	MOV	0(X6), X9
	MOV	8(X6), X12
	MOV	16(X6), X15
	MOV	24(X6), X18

	// Limb 0.
	SUB	X9, X8, X21		// d = x[0] - y[0]
	SLTU	X21, X8, X22		// 1 if x[0] < d (the difference wrapped)
	SUB	X29, X21, X10		// z[0] = d - borrow
	SLTU	X10, X21, X23		// 1 if d < z[0] (taking the borrow wrapped)
	ADD	X22, X23, X29		// borrow out; both wraps cannot happen together
	MOV	X10, 0(X7)

	// Limb 1.
	SUB	X12, X11, X24		// d = x[1] - y[1]
	SLTU	X24, X11, X25
	SUB	X29, X24, X13		// z[1] = d - borrow
	SLTU	X13, X24, X26
	ADD	X25, X26, X29
	MOV	X13, 8(X7)

	// Limb 2.
	SUB	X15, X14, X21		// d = x[2] - y[2]
	SLTU	X21, X14, X22
	SUB	X29, X21, X16		// z[2] = d - borrow
	SLTU	X16, X21, X23
	ADD	X22, X23, X29
	MOV	X16, 16(X7)

	// Limb 3.
	SUB	X18, X17, X21		// d = x[3] - y[3]
	SLTU	X21, X17, X22
	SUB	X29, X21, X19		// z[3] = d - borrow
	SLTU	X19, X21, X23
	ADD	X22, X23, X29
	MOV	X19, 24(X7)

	// Step past the four limbs just handled (4 * 8 bytes).
	SUB	$4, X30
	ADD	$32, X5
	ADD	$32, X6
	ADD	$32, X7

	BGEU	X30, X28, by4		// at least 4 limbs left: go again
	BEQZ	X30, finish		// length was a multiple of 4

by1:
	// One limb per iteration for the remaining 1..3 limbs.
	MOV	0(X5), X10
	MOV	0(X6), X11

	SUB	X11, X10, X12		// d = x - y
	SLTU	X12, X10, X14		// 1 if x < d
	SUB	X29, X12, X13		// z = d - borrow
	SLTU	X13, X12, X15		// 1 if d < z
	ADD	X14, X15, X29		// borrow out
	MOV	X13, 0(X7)

	SUB	$1, X30
	ADD	$8, X5
	ADD	$8, X6
	ADD	$8, X7

	BNEZ	X30, by1

finish:
	MOV	X29, c+72(FP)		// hand back the final borrow
	RET
175
// addVW adds the single word y to the vector x:
//
//	z = x + y	over z_len limbs
//
// y is used as the carry into limb 0; after that the carry is 0 or 1.
// The carry out of the last limb is returned in c. For an empty z the
// loops are skipped and c is y unchanged. The Go prototype is declared
// outside this file (presumably func addVW(z, x []Word, y Word) (c Word)
// -- confirm).
//
// Register roles:
//	X5, X7	cursors into x and z
//	X30	limbs still to process
//	X28	constant 4, the unroll width
//	X29	running carry (starts out as y)
TEXT ·addVW(SB),NOSPLIT,$0
	MOV	z+0(FP), X7
	MOV	z_len+8(FP), X30
	MOV	x+24(FP), X5
	MOV	y+48(FP), X29		// c = y, loaded directly into the carry register

	MOV	$4, X28

	BEQZ	X30, finish		// empty vector: return y as the carry
	BLTU	X30, X28, by1		// fewer than 4 limbs: tail loop only

by4:
	// Read four limbs of x before any store is issued.
	MOV	0(X5), X8
	MOV	8(X5), X11
	MOV	16(X5), X14
	MOV	24(X5), X17

	ADD	X8, X29, X10		// z[0] = x[0] + c
	SLTU	X8, X10, X29		// c = 1 if z[0] < x[0] (the sum wrapped)
	MOV	X10, 0(X7)

	ADD	X11, X29, X13		// z[1] = x[1] + c
	SLTU	X11, X13, X29
	MOV	X13, 8(X7)

	ADD	X14, X29, X16		// z[2] = x[2] + c
	SLTU	X14, X16, X29
	MOV	X16, 16(X7)

	ADD	X17, X29, X19		// z[3] = x[3] + c
	SLTU	X17, X19, X29
	MOV	X19, 24(X7)

	// Step past the four limbs just handled (4 * 8 bytes).
	SUB	$4, X30
	ADD	$32, X5
	ADD	$32, X7

	BGEU	X30, X28, by4		// at least 4 limbs left: go again
	BEQZ	X30, finish		// length was a multiple of 4

by1:
	// One limb per iteration for the remaining 1..3 limbs.
	MOV	0(X5), X10

	ADD	X10, X29, X12		// z = x + c
	SLTU	X10, X12, X29		// c = 1 if z < x
	MOV	X12, 0(X7)

	SUB	$1, X30
	ADD	$8, X5
	ADD	$8, X7

	BNEZ	X30, by1

finish:
	MOV	X29, c+56(FP)		// hand back the final carry
	RET
235
// subVW subtracts the single word y from the vector x:
//
//	z = x - y	over z_len limbs
//
// y is used as the borrow into limb 0; after that the borrow is 0 or 1.
// The borrow out of the last limb is returned in c. For an empty z the
// loops are skipped and c is y unchanged. The Go prototype is declared
// outside this file (presumably func subVW(z, x []Word, y Word) (c Word)
// -- confirm).
//
// Register roles:
//	X5, X7	cursors into x and z
//	X30	limbs still to process
//	X28	constant 4, the unroll width
//	X29	running borrow (starts out as y)
TEXT ·subVW(SB),NOSPLIT,$0
	MOV	z+0(FP), X7
	MOV	z_len+8(FP), X30
	MOV	x+24(FP), X5
	MOV	y+48(FP), X29		// b = y, loaded directly into the borrow register

	MOV	$4, X28

	BEQZ	X30, finish		// empty vector: return y as the borrow
	BLTU	X30, X28, by1		// fewer than 4 limbs: tail loop only

by4:
	// Read four limbs of x before any store is issued.
	MOV	0(X5), X8
	MOV	8(X5), X11
	MOV	16(X5), X14
	MOV	24(X5), X17

	SUB	X29, X8, X10		// z[0] = x[0] - b
	SLTU	X10, X8, X29		// b = 1 if x[0] < z[0] (the difference wrapped)
	MOV	X10, 0(X7)

	SUB	X29, X11, X13		// z[1] = x[1] - b
	SLTU	X13, X11, X29
	MOV	X13, 8(X7)

	SUB	X29, X14, X16		// z[2] = x[2] - b
	SLTU	X16, X14, X29
	MOV	X16, 16(X7)

	SUB	X29, X17, X19		// z[3] = x[3] - b
	SLTU	X19, X17, X29
	MOV	X19, 24(X7)

	// Step past the four limbs just handled (4 * 8 bytes).
	SUB	$4, X30
	ADD	$32, X5
	ADD	$32, X7

	BGEU	X30, X28, by4		// at least 4 limbs left: go again
	BEQZ	X30, finish		// length was a multiple of 4

by1:
	// One limb per iteration for the remaining 1..3 limbs.
	MOV	0(X5), X10

	SUB	X29, X10, X12		// z = x - b
	SLTU	X12, X10, X29		// b = 1 if x < z
	MOV	X12, 0(X7)

	SUB	$1, X30
	ADD	$8, X5
	ADD	$8, X7

	BNEZ	X30, by1

finish:
	MOV	X29, c+56(FP)		// hand back the final borrow
	RET
295
// shlVU has no riscv64-specific body in this file: it jumps straight to
// shlVU_g, which is defined elsewhere (presumably the portable Go
// implementation -- confirm). The routine declares no frame ($0) and
// does not read or write any argument before the jump.
TEXT ·shlVU(SB),NOSPLIT,$0
	JMP	·shlVU_g(SB)
298
// shrVU has no riscv64-specific body in this file: it jumps straight to
// shrVU_g, which is defined elsewhere (presumably the portable Go
// implementation -- confirm). The routine declares no frame ($0) and
// does not read or write any argument before the jump.
TEXT ·shrVU(SB),NOSPLIT,$0
	JMP	·shrVU_g(SB)
301
// mulAddVWW multiplies the vector x by the word y and adds the word r:
//
//	z = x * y + r	over z_len limbs
//
// r is used as the carry into limb 0. For each limb the low half of the
// 128-bit product plus the incoming carry is stored, and the high half
// plus the wrap bit of that addition becomes the next carry. The carry
// out of the last limb is returned in c; for an empty z the loops are
// skipped and c is r unchanged. The Go prototype is declared outside
// this file (presumably func mulAddVWW(z, x []Word, y, r Word) (c Word)
// -- confirm).
//
// Register roles:
//	X5, X7	cursors into x and z
//	X6	multiplier y
//	X30	limbs still to process
//	X28	constant 4, the unroll width
//	X29	running carry (starts out as r)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOV	z+0(FP), X7
	MOV	z_len+8(FP), X30
	MOV	x+24(FP), X5
	MOV	y+48(FP), X6
	MOV	r+56(FP), X29		// c = r

	MOV	$4, X28

	BEQZ	X30, finish		// empty vector: return r as the carry
	BLTU	X30, X28, by1		// fewer than 4 limbs: tail loop only

by4:
	// Read four limbs of x before any store is issued.
	MOV	0(X5), X8
	MOV	8(X5), X11
	MOV	16(X5), X14
	MOV	24(X5), X17

	// Limb 0.
	MULHU	X8, X6, X9		// hi = high 64 bits of x[0] * y
	MUL	X8, X6, X8		// lo = low 64 bits of x[0] * y
	ADD	X8, X29, X10		// z[0] = lo + c
	SLTU	X8, X10, X23		// 1 if z[0] < lo (the sum wrapped)
	ADD	X23, X9, X29		// c = hi + wrap
	MOV	X10, 0(X7)

	// Limb 1.
	MULHU	X11, X6, X12		// hi of x[1] * y
	MUL	X11, X6, X11		// lo of x[1] * y
	ADD	X11, X29, X13		// z[1] = lo + c
	SLTU	X11, X13, X23
	ADD	X23, X12, X29
	MOV	X13, 8(X7)

	// Limb 2.
	MULHU	X14, X6, X15		// hi of x[2] * y
	MUL	X14, X6, X14		// lo of x[2] * y
	ADD	X14, X29, X16		// z[2] = lo + c
	SLTU	X14, X16, X23
	ADD	X23, X15, X29
	MOV	X16, 16(X7)

	// Limb 3.
	MULHU	X17, X6, X18		// hi of x[3] * y
	MUL	X17, X6, X17		// lo of x[3] * y
	ADD	X17, X29, X19		// z[3] = lo + c
	SLTU	X17, X19, X23
	ADD	X23, X18, X29
	MOV	X19, 24(X7)

	// Step past the four limbs just handled (4 * 8 bytes).
	SUB	$4, X30
	ADD	$32, X5
	ADD	$32, X7

	BGEU	X30, X28, by4		// at least 4 limbs left: go again
	BEQZ	X30, finish		// length was a multiple of 4

by1:
	// One limb per iteration for the remaining 1..3 limbs.
	MOV	0(X5), X10

	MULHU	X10, X6, X12		// hi of x * y
	MUL	X10, X6, X10		// lo of x * y
	ADD	X10, X29, X13		// z = lo + c
	SLTU	X10, X13, X15		// 1 if z < lo
	ADD	X12, X15, X29		// c = hi + wrap
	MOV	X13, 0(X7)

	SUB	$1, X30
	ADD	$8, X5
	ADD	$8, X7

	BNEZ	X30, by1

finish:
	MOV	X29, c+64(FP)		// hand back the final carry
	RET
376
// addMulVVW multiplies the vector x by the word y and accumulates the
// result into z:
//
//	z = z + x * y	over z_len limbs
//
// For each limb the low half of the 128-bit product is added to the old
// z limb and then to the incoming carry; each of those two additions can
// wrap, and each wrap bit is folded into the high half, which becomes
// the next carry. The carry out of the last limb is returned in c; an
// empty z yields c = 0. The Go prototype is declared outside this file
// (presumably func addMulVVW(z, x []Word, y Word) (c Word) -- confirm).
//
// Register roles:
//	X5, X7	cursors into x and z
//	X6	multiplier y
//	X30	limbs still to process
//	X28	constant 4, the unroll width
//	X29	running carry
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOV	z+0(FP), X7
	MOV	z_len+8(FP), X30
	MOV	x+24(FP), X5
	MOV	y+48(FP), X6

	MOV	$0, X29			// carry in = 0
	MOV	$4, X28

	BEQZ	X30, finish		// empty vector: return carry 0
	BLTU	X30, X28, by1		// fewer than 4 limbs: tail loop only

by4:
	// Read four limbs of x and of the old z before any store is issued.
	MOV	0(X5), X8
	MOV	8(X5), X11
	MOV	16(X5), X14
	MOV	24(X5), X17
	MOV	0(X7), X10
	MOV	8(X7), X13
	MOV	16(X7), X16
	MOV	24(X7), X19

	// Limb 0.
	MULHU	X8, X6, X9		// hi = high 64 bits of x[0] * y
	MUL	X8, X6, X8		// lo = low 64 bits of x[0] * y
	ADD	X8, X10, X21		// t = lo + z[0]
	SLTU	X8, X21, X22		// 1 if t < lo (first wrap)
	ADD	X9, X22, X9		// hi += first wrap
	ADD	X21, X29, X10		// z[0] = t + c
	SLTU	X21, X10, X22		// 1 if z[0] < t (second wrap)
	ADD	X9, X22, X29		// c = hi + second wrap
	MOV	X10, 0(X7)

	// Limb 1.
	MULHU	X11, X6, X12		// hi of x[1] * y
	MUL	X11, X6, X11		// lo of x[1] * y
	ADD	X11, X13, X21		// t = lo + z[1]
	SLTU	X11, X21, X22
	ADD	X12, X22, X12
	ADD	X21, X29, X13		// z[1] = t + c
	SLTU	X21, X13, X22
	ADD	X12, X22, X29
	MOV	X13, 8(X7)

	// Limb 2.
	MULHU	X14, X6, X15		// hi of x[2] * y
	MUL	X14, X6, X14		// lo of x[2] * y
	ADD	X14, X16, X21		// t = lo + z[2]
	SLTU	X14, X21, X22
	ADD	X15, X22, X15
	ADD	X21, X29, X16		// z[2] = t + c
	SLTU	X21, X16, X22
	ADD	X15, X22, X29
	MOV	X16, 16(X7)

	// Limb 3.
	MULHU	X17, X6, X18		// hi of x[3] * y
	MUL	X17, X6, X17		// lo of x[3] * y
	ADD	X17, X19, X21		// t = lo + z[3]
	SLTU	X17, X21, X22
	ADD	X18, X22, X18
	ADD	X21, X29, X19		// z[3] = t + c
	SLTU	X21, X19, X22
	ADD	X18, X22, X29
	MOV	X19, 24(X7)

	// Step past the four limbs just handled (4 * 8 bytes).
	SUB	$4, X30
	ADD	$32, X5
	ADD	$32, X7

	BGEU	X30, X28, by4		// at least 4 limbs left: go again
	BEQZ	X30, finish		// length was a multiple of 4

by1:
	// One limb per iteration for the remaining 1..3 limbs.
	MOV	0(X5), X10
	MOV	0(X7), X11

	MULHU	X10, X6, X12		// hi of x * y
	MUL	X10, X6, X10		// lo of x * y
	ADD	X10, X11, X13		// t = lo + z
	SLTU	X10, X13, X15		// 1 if t < lo
	ADD	X12, X15, X12		// hi += first wrap
	ADD	X13, X29, X10		// z = t + c
	SLTU	X13, X10, X15		// 1 if z < t
	ADD	X12, X15, X29		// c = hi + second wrap
	MOV	X10, 0(X7)

	SUB	$1, X30
	ADD	$8, X5
	ADD	$8, X7

	BNEZ	X30, by1

finish:
	MOV	X29, c+56(FP)		// hand back the final carry
	RET
471
View as plain text