src/crypto/md5/md5block_riscv64.s
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// RISCV64 version of md5block.go
// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go

//go:build !purego

#include "textflag.h"

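// LOAD32U assembles a 32-bit little-endian value from four byte loads, which
// is safe regardless of the alignment of base+offset. In Go terms it is
// roughly (a sketch, with p standing for the bytes at base+offset):
//
//	dest = uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//
// LOAD64U uses it twice to place two consecutive 32-bit words in the low and
// high halves of a single 64-bit register.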
#define LOAD32U(base, offset, tmp, dest) \
	MOVBU (offset+0*1)(base), dest; \
	MOVBU (offset+1*1)(base), tmp; \
	SLL $8, tmp; \
	OR tmp, dest; \
	MOVBU (offset+2*1)(base), tmp; \
	SLL $16, tmp; \
	OR tmp, dest; \
	MOVBU (offset+3*1)(base), tmp; \
	SLL $24, tmp; \
	OR tmp, dest

#define LOAD64U(base, offset, tmp1, tmp2, dst) \
	LOAD32U(base, offset, tmp1, dst); \
	LOAD32U(base, offset+4, tmp1, tmp2); \
	SLL $32, tmp2; \
	OR tmp2, dst

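// Each ROUNDn macro performs a single MD5 step:
//
//	a = b + rotl32(a + fn(b, c, d) + x + const, shift)
//
// where fn is the boolean function for round n. The EVN variants take the
// message word x from the low 32 bits of the register, the ODD variants from
// the high 32 bits. Round 1 uses
//
//	F(b, c, d) = (b AND c) OR (NOT b AND d)
//
// computed below in the equivalent form d XOR (b AND (c XOR d)).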
#define ROUND1EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	XOR c, d, X23; \
	AND b, X23; \
	XOR d, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND1ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	XOR c, d, X23; \
	AND b, X23; \
	XOR d, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

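// Round 2 uses
//
//	G(b, c, d) = (b AND d) OR (c AND NOT d)
//
// computed below in the equivalent form c XOR (d AND (b XOR c)).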
#define ROUND2EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	XOR b, c, X23; \
	AND d, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND2ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	XOR b, c, X23; \
	AND d, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

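// Round 3 uses
//
//	H(b, c, d) = b XOR c XOR d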
#define ROUND3EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	XOR c, d, X23; \
	XOR b, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND3ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	XOR c, d, X23; \
	XOR b, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

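// Round 4 uses
//
//	I(b, c, d) = c XOR (b OR NOT d)
//
// which maps directly onto ORN followed by XOR.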
#define ROUND4EVN(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW x, a; \
	ADDW X23, a; \
	ORN d, b, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

#define ROUND4ODD(a, b, c, d, x, const, shift) \
	MOV $const, X23; \
	ADDW X23, a; \
	SRL $32, x, X23; \
	ADDW X23, a; \
	ORN d, b, X23; \
	XOR c, X23; \
	ADDW X23, a; \
	RORIW $(32-shift), a; \
	ADDW b, a

// Register use for the block function
//
// X5 - X12  : contain the sixteen 32-bit data items in the block we're
//             processing. Odd numbered values, e.g., x1 and x3, are stored
//             in the upper 32 bits of the register.
// X13 - X16 : a, b, c, d
// X17 - X20 : used to store the old values of a, b, c, d, i.e., aa, bb, cc,
//             dd. X17 and X18 are also used as temporary registers when
//             loading unaligned data.
// X22       : pointer to dig.s
// X23       : temporary register
// X28       : pointer to the first byte beyond the end of p
// X29       : pointer to the current 64-byte block of data, initially set to
//             &p[0]
// X30       : temporary register

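// The Go prototype implemented here (implied by the FP offsets and the
// $0-32 frame size: an 8-byte pointer plus a 24-byte slice header) is:
//
//	func block(dig *digest, p []byte)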
TEXT ·block(SB),NOSPLIT,$0-32
	MOV p+8(FP), X29
	MOV p_len+16(FP), X30
	SRL $6, X30
	SLL $6, X30	// round p_len down to a multiple of the 64-byte block size
	BEQZ X30, zero	// no full blocks to process

	ADD X29, X30, X28	// X28 points just past the last full block

	MOV dig+0(FP), X22
	MOVWU (0*4)(X22), X13 // a = s[0]
	MOVWU (1*4)(X22), X14 // b = s[1]
	MOVWU (2*4)(X22), X15 // c = s[2]
	MOVWU (3*4)(X22), X16 // d = s[3]

loop:

	// Load the 64 bytes of data in x0-x15 into eight 64-bit registers,
	// X5-X12. Different paths are taken to load the values depending on
	// whether the buffer is 8-byte aligned or not. We load all the values
	// up front here at the start of the loop to avoid multiple alignment
	// checks and to reduce code size. It takes 10 instructions to load an
	// unaligned 32-bit value and this value will be used 4 times in the
	// main body of the loop below.

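	// After the loads below each register holds two message words with the
	// even-numbered word in the low 32 bits: X5 = x1:x0, X6 = x3:x2,
	// X7 = x5:x4, X8 = x7:x6, X9 = x9:x8, X10 = x11:x10, X11 = x13:x12,
	// X12 = x15:x14. The ROUND*ODD macros shift the odd word down from the
	// upper half before using it.
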
	AND $7, X29, X30
	BEQZ X30, aligned

	LOAD64U(X29, 0, X17, X18, X5)
	LOAD64U(X29, 8, X17, X18, X6)
	LOAD64U(X29, 16, X17, X18, X7)
	LOAD64U(X29, 24, X17, X18, X8)
	LOAD64U(X29, 32, X17, X18, X9)
	LOAD64U(X29, 40, X17, X18, X10)
	LOAD64U(X29, 48, X17, X18, X11)
	LOAD64U(X29, 56, X17, X18, X12)
	JMP block_loaded

aligned:
	MOV (0*8)(X29), X5
	MOV (1*8)(X29), X6
	MOV (2*8)(X29), X7
	MOV (3*8)(X29), X8
	MOV (4*8)(X29), X9
	MOV (5*8)(X29), X10
	MOV (6*8)(X29), X11
	MOV (7*8)(X29), X12

block_loaded:
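	// Save the incoming state so it can be added back in after the 64
	// rounds: aa, bb, cc, dd = a, b, c, d.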
	MOV X13, X17
	MOV X14, X18
	MOV X15, X19
	MOV X16, X20

	// Some of the hex constants below are too large to fit into a
	// signed 32-bit value. The assembler will handle these
	// constants in a special way to ensure that they are
	// zero extended. Our algorithm is only interested in the
	// bottom 32 bits and doesn't care whether constants are
	// sign or zero extended when moved into 64-bit registers.
	// So we use signed constants instead of hex when bit 31 is
	// set so that all constants can be loaded by lui+addi.
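	// For example, the first round 1 constant 0xd76aa478 has bit 31 set,
	// so it is written as the signed value -680876936; both have the same
	// low 32 bits, which is all the ADDW instructions care about.
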
	ROUND1EVN(X13,X14,X15,X16,X5, -680876936, 7); // 0xd76aa478
	ROUND1ODD(X16,X13,X14,X15,X5, -389564586,12); // 0xe8c7b756
	ROUND1EVN(X15,X16,X13,X14,X6, 0x242070db,17); // 0x242070db
	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
	ROUND1EVN(X13,X14,X15,X16,X7, -176418897, 7); // 0xf57c0faf
	ROUND1ODD(X16,X13,X14,X15,X7, 0x4787c62a,12); // 0x4787c62a
	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
	ROUND1ODD(X14,X15,X16,X13,X8, -45705983,22); // 0xfd469501
	ROUND1EVN(X13,X14,X15,X16,X9, 0x698098d8, 7); // 0x698098d8
	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
	ROUND1EVN(X15,X16,X13,X14,X10, -42063,17); // 0xffff5bb1
	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
	ROUND1ODD(X16,X13,X14,X15,X11, -40341101,12); // 0xfd987193
	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821

	ROUND2ODD(X13,X14,X15,X16,X5, -165796510, 5); // f61e2562
	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // c040b340
	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 265e5a51
	ROUND2EVN(X14,X15,X16,X13,X5, -373897302,20); // e9b6c7aa
	ROUND2ODD(X13,X14,X15,X16,X7, -701558691, 5); // d62f105d
	ROUND2EVN(X16,X13,X14,X15,X10, 0x2441453, 9); // 2441453
	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // d8a1e681
	ROUND2EVN(X14,X15,X16,X13,X7, -405537848,20); // e7d3fbc8
	ROUND2ODD(X13,X14,X15,X16,X9, 0x21e1cde6, 5); // 21e1cde6
	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // c33707d6
	ROUND2ODD(X15,X16,X13,X14,X6, -187363961,14); // f4d50d87
	ROUND2EVN(X14,X15,X16,X13,X9, 0x455a14ed,20); // 455a14ed
	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // a9e3e905
	ROUND2EVN(X16,X13,X14,X15,X6, -51403784, 9); // fcefa3f8
	ROUND2ODD(X15,X16,X13,X14,X8, 0x676f02d9,14); // 676f02d9
	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 8d2a4c8a

	ROUND3ODD(X13,X14,X15,X16,X7, -378558, 4); // fffa3942
	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 8771f681
	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 6d9d6122
	ROUND3EVN(X14,X15,X16,X13,X12, -35309556,23); // fde5380c
	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // a4beea44
	ROUND3EVN(X16,X13,X14,X15,X7, 0x4bdecfa9,11); // 4bdecfa9
	ROUND3ODD(X15,X16,X13,X14,X8, -155497632,16); // f6bb4b60
	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // bebfbc70
	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 289b7ec6
	ROUND3EVN(X16,X13,X14,X15,X5, -358537222,11); // eaa127fa
	ROUND3ODD(X15,X16,X13,X14,X6, -722521979,16); // d4ef3085
	ROUND3EVN(X14,X15,X16,X13,X8, 0x4881d05,23); // 4881d05
	ROUND3ODD(X13,X14,X15,X16,X9, -640364487, 4); // d9d4d039
	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // e6db99e5
	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 1fa27cf8
	ROUND3EVN(X14,X15,X16,X13,X6, -995338651,23); // c4ac5665

	ROUND4EVN(X13,X14,X15,X16,X5, -198630844, 6); // f4292244
	ROUND4ODD(X16,X13,X14,X15,X8, 0x432aff97,10); // 432aff97
	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // ab9423a7
	ROUND4ODD(X14,X15,X16,X13,X7, -57434055,21); // fc93a039
	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 655b59c3
	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 8f0ccc92
	ROUND4EVN(X15,X16,X13,X14,X10, -1051523,15); // ffeff47d
	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 85845dd1
	ROUND4EVN(X13,X14,X15,X16,X9, 0x6fa87e4f, 6); // 6fa87e4f
	ROUND4ODD(X16,X13,X14,X15,X12, -30611744,10); // fe2ce6e0
	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // a3014314
	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 4e0811a1
	ROUND4EVN(X13,X14,X15,X16,X7, -145523070, 6); // f7537e82
	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // bd3af235
	ROUND4EVN(X15,X16,X13,X14,X6, 0x2ad7d2bb,15); // 2ad7d2bb
	ROUND4ODD(X14,X15,X16,X13,X9, -343485551,21); // eb86d391

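	// Add the saved state back in: a += aa, b += bb, c += cc, d += dd.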
	ADDW X17, X13
	ADDW X18, X14
	ADDW X19, X15
	ADDW X20, X16

	ADD $64, X29	// advance to the next 64-byte block
	BNE X28, X29, loop

	// Write the updated state back to dig.s.
	MOVW X13, (0*4)(X22)
	MOVW X14, (1*4)(X22)
	MOVW X15, (2*4)(X22)
	MOVW X16, (3*4)(X22)

zero:
	RET