1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8
9 // func addMulVVW1024(z, x *uint, y uint) (c uint)
// Entry point for 1024-bit operands. It loads the word count 16
// (16 * 64 bits = 1024 bits) into X30 and jumps to addMulVVWx, which reads
// z, x and y and writes the result c through the FP argument slots.
10 TEXT ·addMulVVW1024(SB),$0-32
11 MOV $16, X30 // X30 = number of 64-bit words for addMulVVWx to process
12 JMP addMulVVWx(SB) // jump, not a call: addMulVVWx executes the RET
13
14 // func addMulVVW1536(z, x *uint, y uint) (c uint)
// Entry point for 1536-bit operands. It loads the word count 24
// (24 * 64 bits = 1536 bits) into X30 and jumps to addMulVVWx, which reads
// z, x and y and writes the result c through the FP argument slots.
15 TEXT ·addMulVVW1536(SB),$0-32
16 MOV $24, X30 // X30 = number of 64-bit words for addMulVVWx to process
17 JMP addMulVVWx(SB) // jump, not a call: addMulVVWx executes the RET
18
19 // func addMulVVW2048(z, x *uint, y uint) (c uint)
// Entry point for 2048-bit operands. It loads the word count 32
// (32 * 64 bits = 2048 bits) into X30 and jumps to addMulVVWx, which reads
// z, x and y and writes the result c through the FP argument slots.
20 TEXT ·addMulVVW2048(SB),$0-32
21 MOV $32, X30 // X30 = number of 64-bit words for addMulVVWx to process
22 JMP addMulVVWx(SB) // jump, not a call: addMulVVWx executes the RET
23
// addMulVVWx is the shared body of the addMulVVW1024/1536/2048 entry points,
// which jump here after loading the word count into X30. It computes
// z += x * y over X30 64-bit words, four words per loop iteration, and stores
// the final carry word in the result slot c+24(FP).
//
// X30 must be zero or a multiple of 4: the loop subtracts 4 per iteration and
// exits only when X30 reaches exactly zero.
//
// For each word i the loop forms the 128-bit value x[i]*y + z[i] + c. Its low
// word is stored back to z[i] and its high word becomes the next c. The high
// word cannot overflow because
// (2^64-1)*(2^64-1) + 2*(2^64-1) = 2^128 - 1.
//
// Register use:
//   X5       - pointer to the current four words of z
//   X7       - pointer to the current four words of x
//   X6       - multiplier y
//   X29      - running carry word c
//   X30      - words remaining
//   X21, X22 - scratch: partial sum and carry bit
24 TEXT addMulVVWx(SB),NOFRAME|NOSPLIT,$0
25 MOV z+0(FP), X5 // X5 = z
26 MOV x+8(FP), X7 // X7 = x
27 MOV y+16(FP), X6 // X6 = y
28 MOV $0, X29 // c = 0
29
30 BEQZ X30, done // nothing to process: return c = 0
31 loop:
32 MOV 0*8(X5), X10 // z[0]
33 MOV 1*8(X5), X13 // z[1]
34 MOV 2*8(X5), X16 // z[2]
35 MOV 3*8(X5), X19 // z[3]
36
37 MOV 0*8(X7), X8 // x[0]
38 MOV 1*8(X7), X11 // x[1]
39 MOV 2*8(X7), X14 // x[2]
40 MOV 3*8(X7), X17 // x[3]
41
42 MULHU X8, X6, X9 // z_hi[0] = high word of x[0] * y
43 MUL X8, X6, X8 // z_lo[0] = low word of x[0] * y
44 ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0]
45 SLTU X8, X21, X22 // carry bit: 1 if the sum wrapped (sum < addend)
46 ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0]
47 ADD X21, X29, X10 // z_lo[0] = x[0] * y + z[0] + c
48 SLTU X21, X10, X22 // carry bit: 1 if adding c wrapped
49 ADD X9, X22, X29 // next c
50
51 MULHU X11, X6, X12 // z_hi[1] = high word of x[1] * y
52 MUL X11, X6, X11 // z_lo[1] = low word of x[1] * y
53 ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1]
54 SLTU X11, X21, X22 // carry bit: 1 if the sum wrapped (sum < addend)
55 ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1]
56 ADD X21, X29, X13 // z_lo[1] = x[1] * y + z[1] + c
57 SLTU X21, X13, X22 // carry bit: 1 if adding c wrapped
58 ADD X12, X22, X29 // next c
59
60 MULHU X14, X6, X15 // z_hi[2] = high word of x[2] * y
61 MUL X14, X6, X14 // z_lo[2] = low word of x[2] * y
62 ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2]
63 SLTU X14, X21, X22 // carry bit: 1 if the sum wrapped (sum < addend)
64 ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2]
65 ADD X21, X29, X16 // z_lo[2] = x[2] * y + z[2] + c
66 SLTU X21, X16, X22 // carry bit: 1 if adding c wrapped
67 ADD X15, X22, X29 // next c
68
69 MULHU X17, X6, X18 // z_hi[3] = high word of x[3] * y
70 MUL X17, X6, X17 // z_lo[3] = low word of x[3] * y
71 ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3]
72 SLTU X17, X21, X22 // carry bit: 1 if the sum wrapped (sum < addend)
73 ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3]
74 ADD X21, X29, X19 // z_lo[3] = x[3] * y + z[3] + c
75 SLTU X21, X19, X22 // carry bit: 1 if adding c wrapped
76 ADD X18, X22, X29 // next c
77
78 MOV X10, 0*8(X5) // z[0]
79 MOV X13, 1*8(X5) // z[1]
80 MOV X16, 2*8(X5) // z[2]
81 MOV X19, 3*8(X5) // z[3]
82
83 ADD $32, X5 // advance z by four 8-byte words
84 ADD $32, X7 // advance x by four 8-byte words
85
86 SUB $4, X30 // four words consumed
87 BNEZ X30, loop // repeat until the word count reaches zero
88
89 done:
90 MOV X29, c+24(FP) // return the final carry word
91 RET
92
View as plain text