src/crypto/sha256/sha256block_amd64.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8
9 // SHA256 block routine. See sha256block.go for Go equivalent.
10 //
11 // The algorithm is detailed in FIPS 180-4:
12 //
13 // https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
14
15 // The AVX2 version is described in an Intel white paper:
16 // "Fast SHA-256 Implementations on Intel Architecture Processors"
17 // To find it, go to http://www.intel.com/p/en_US/embedded
18 // and search for that title.
19 // AVX2 version by Intel, same algorithm as code in Linux kernel:
20 // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
21 // by
22 // James Guilford <james.guilford@intel.com>
23 // Kirk Yap <kirk.s.yap@intel.com>
24 // Tim Chen <tim.c.chen@linux.intel.com>
25
26 // Wt = Mt; for 0 <= t <= 15
27 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
28 //
29 // a = H0
30 // b = H1
31 // c = H2
32 // d = H3
33 // e = H4
34 // f = H5
35 // g = H6
36 // h = H7
37 //
38 // for t = 0 to 63 {
39 // T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
40 // T2 = BIGSIGMA0(a) + Maj(a,b,c)
41 // h = g
42 // g = f
43 // f = e
44 // e = d + T1
45 // d = c
46 // c = b
47 // b = a
48 // a = T1 + T2
49 // }
50 //
51 // H0 = a + H0
52 // H1 = b + H1
53 // H2 = c + H2
54 // H3 = d + H3
55 // H4 = e + H4
56 // H5 = f + H5
57 // H6 = g + H6
58 // H7 = h + H7
59
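// As a cross-check of the pseudocode above, one block can be sketched in plain
// Go roughly as follows. This is a simplified illustration, not the code in
// sha256block.go: blockGeneric, dig and k are placeholder names, and it assumes
// the encoding/binary and math/bits packages are imported.
//
//	func blockGeneric(dig *[8]uint32, p []byte, k *[64]uint32) {
//		var w [64]uint32
//		for t := 0; t < 16; t++ {
//			w[t] = binary.BigEndian.Uint32(p[t*4:]) // Wt = Mt
//		}
//		for t := 16; t < 64; t++ {
//			s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
//			s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
//			w[t] = s1 + w[t-7] + s0 + w[t-16]
//		}
//		a, b, c, d, e, f, g, h := dig[0], dig[1], dig[2], dig[3], dig[4], dig[5], dig[6], dig[7]
//		for t := 0; t < 64; t++ {
//			S1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
//			ch := (e & f) ^ (^e & g)
//			t1 := h + S1 + ch + k[t] + w[t]
//			S0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
//			maj := (a & b) ^ (a & c) ^ (b & c)
//			t2 := S0 + maj
//			h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//		}
//		dig[0] += a; dig[1] += b; dig[2] += c; dig[3] += d
//		dig[4] += e; dig[5] += f; dig[6] += g; dig[7] += h
//	}
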
60 // Wt = Mt; for 0 <= t <= 15
61 #define MSGSCHEDULE0(index) \
62 MOVL (index*4)(SI), AX; \
63 BSWAPL AX; \
64 MOVL AX, (index*4)(BP)
65
66 // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
67 // SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
68 // SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
69 #define MSGSCHEDULE1(index) \
70 MOVL ((index-2)*4)(BP), AX; \
71 MOVL AX, CX; \
72 RORL $17, AX; \
73 MOVL CX, DX; \
74 RORL $19, CX; \
75 SHRL $10, DX; \
76 MOVL ((index-15)*4)(BP), BX; \
77 XORL CX, AX; \
78 MOVL BX, CX; \
79 XORL DX, AX; \
80 RORL $7, BX; \
81 MOVL CX, DX; \
82 SHRL $3, DX; \
83 RORL $18, CX; \
84 ADDL ((index-7)*4)(BP), AX; \
85 XORL CX, BX; \
86 XORL DX, BX; \
87 ADDL ((index-16)*4)(BP), BX; \
88 ADDL BX, AX; \
89 MOVL AX, ((index)*4)(BP)
90
91 // Calculate T1 in AX - uses AX, CX and DX registers.
92 // h is also used as an accumulator. Wt is passed in AX.
93 // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
94 // BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
95 // Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
96 #define SHA256T1(const, e, f, g, h) \
97 ADDL AX, h; \
98 MOVL e, AX; \
99 ADDL $const, h; \
100 MOVL e, CX; \
101 RORL $6, AX; \
102 MOVL e, DX; \
103 RORL $11, CX; \
104 XORL CX, AX; \
105 MOVL e, CX; \
106 RORL $25, DX; \
107 ANDL f, CX; \
108 XORL AX, DX; \
109 MOVL e, AX; \
110 NOTL AX; \
111 ADDL DX, h; \
112 ANDL g, AX; \
113 XORL CX, AX; \
114 ADDL h, AX
115
116 // Calculate T2 in BX - uses BX, CX, DX and DI registers.
117 // T2 = BIGSIGMA0(a) + Maj(a, b, c)
118 // BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
119 // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
120 #define SHA256T2(a, b, c) \
121 MOVL a, DI; \
122 MOVL c, BX; \
123 RORL $2, DI; \
124 MOVL a, DX; \
125 ANDL b, BX; \
126 RORL $13, DX; \
127 MOVL a, CX; \
128 ANDL c, CX; \
129 XORL DX, DI; \
130 XORL CX, BX; \
131 MOVL a, DX; \
132 MOVL b, CX; \
133 RORL $22, DX; \
134 ANDL a, CX; \
135 XORL CX, BX; \
136 XORL DX, DI; \
137 ADDL DI, BX
138
139 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
140 // The values for e and a are stored in d and h, ready for rotation.
141 #define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
142 SHA256T1(const, e, f, g, h); \
143 SHA256T2(a, b, c); \
144 MOVL BX, h; \
145 ADDL AX, d; \
146 ADDL AX, h
147
148 #define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
149 MSGSCHEDULE0(index); \
150 SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
151
152 #define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
153 MSGSCHEDULE1(index); \
154 SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
155
156
157 // Definitions for AVX2 version
158
159 // addm (mem), reg
160 // Add reg to mem (mem += reg), then load the result back into reg
161 #define addm(P1, P2) \
162 ADDL P2, P1; \
163 MOVL P1, P2
164
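// A rough Go analogue of addm(mem, reg), used below to fold the working
// variables back into the digest; dig is a stand-in name for the digest words
// that CTX points at, not an identifier from this package:
//
//	dig[i] += reg // reg-to-mem add, result stays in the digest
//	reg = dig[i]  // reload the updated word into the register
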
165 #define XDWORD0 Y4
166 #define XDWORD1 Y5
167 #define XDWORD2 Y6
168 #define XDWORD3 Y7
169
170 #define XWORD0 X4
171 #define XWORD1 X5
172 #define XWORD2 X6
173 #define XWORD3 X7
174
175 #define XTMP0 Y0
176 #define XTMP1 Y1
177 #define XTMP2 Y2
178 #define XTMP3 Y3
179 #define XTMP4 Y8
180 #define XTMP5 Y11
181
182 #define XFER Y9
183
184 #define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
185 #define X_BYTE_FLIP_MASK X13
186
187 #define NUM_BYTES DX
188 #define INP DI
189
190 #define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
191
192 #define a AX
193 #define b BX
194 #define c CX
195 #define d R8
196 #define e DX
197 #define f R9
198 #define g R10
199 #define h R11
200
201 #define old_h R11
202
203 #define TBL BP
204
205 #define SRND SI // SRND is same register as CTX
206
207 #define T1 R12
208
209 #define y0 R13
210 #define y1 R14
211 #define y2 R15
212 #define y3 DI
213
214 // Offsets
215 #define XFER_SIZE 2*64*4
216 #define INP_END_SIZE 8
217 #define INP_SIZE 8
218
219 #define _XFER 0
220 #define _INP_END _XFER + XFER_SIZE
221 #define _INP _INP_END + INP_END_SIZE
222 #define STACK_SIZE _INP + INP_SIZE
223
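// Putting the numbers above together, the AVX2 stack frame looks roughly like
// this (byte offsets from SP; a sketch derived from the defines, not an
// authoritative map):
//
//	_XFER       0 .. 511   sixteen 32-byte slots of W+K values; the low 16 bytes
//	                       of each slot belong to the first block, the high 16
//	                       bytes to the second (consumed later by avx2_loop3)
//	_INP_END  512 .. 519   pointer to the last block of input
//	_INP      520 .. 527   pointer to the block currently being processed
//	STACK_SIZE = 528       (the TEXT directive below declares a $536 frame,
//	                       slightly larger than this)
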
224 #define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
225 ; \ // ############################# RND N + 0 ############################
226 MOVL a, y3; \ // y3 = a // MAJA
227 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
228 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
229 ; \
230 ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
231 ORL c, y3; \ // y3 = a|c // MAJA
232 VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
233 MOVL f, y2; \ // y2 = f // CH
234 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
235 ; \
236 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
237 XORL g, y2; \ // y2 = f^g // CH
238 VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16]
239 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
240 ; \
241 ANDL e, y2; \ // y2 = (f^g)&e // CH
242 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
243 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
244 ADDL h, d; \ // d = k + w + h + d // --
245 ; \
246 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
247 VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
248 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
249 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
250 ; \
251 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
252 VPSRLD $7, XTMP1, XTMP2; \
253 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
254 MOVL a, T1; \ // T1 = a // MAJB
255 ANDL c, T1; \ // T1 = a&c // MAJB
256 ; \
257 ADDL y0, y2; \ // y2 = S1 + CH // --
258 VPSLLD $(32-7), XTMP1, XTMP3; \
259 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
260 ADDL y1, h; \ // h = k + w + h + S0 // --
261 ; \
262 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
263 VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
264 ; \
265 VPSRLD $18, XTMP1, XTMP2; \
266 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
267 ADDL y3, h // h = t1 + S0 + MAJ // --
268
269 #define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
270 ; \ // ################################### RND N + 1 ############################
271 ; \
272 MOVL a, y3; \ // y3 = a // MAJA
273 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
274 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
275 ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
276 ORL c, y3; \ // y3 = a|c // MAJA
277 ; \
278 VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
279 MOVL f, y2; \ // y2 = f // CH
280 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
281 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
282 XORL g, y2; \ // y2 = f^g // CH
283 ; \
284 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
285 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
286 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
287 ANDL e, y2; \ // y2 = (f^g)&e // CH
288 ADDL h, d; \ // d = k + w + h + d // --
289 ; \
290 VPSLLD $(32-18), XTMP1, XTMP1; \
291 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
292 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
293 ; \
294 VPXOR XTMP1, XTMP3, XTMP3; \
295 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
296 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
297 ; \
298 VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
299 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
300 MOVL a, T1; \ // T1 = a // MAJB
301 ANDL c, T1; \ // T1 = a&c // MAJB
302 ADDL y0, y2; \ // y2 = S1 + CH // --
303 ; \
304 VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
305 VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
306 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
307 ADDL y1, h; \ // h = k + w + h + S0 // --
308 ; \
309 VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
310 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
311 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
312 ADDL y3, h; \ // h = t1 + S0 + MAJ // --
313 ; \
314 VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}
315
316 #define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
317 ; \ // ################################### RND N + 2 ############################
318 ; \
319 MOVL a, y3; \ // y3 = a // MAJA
320 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
321 ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
322 ; \
323 VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
324 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
325 ORL c, y3; \ // y3 = a|c // MAJA
326 MOVL f, y2; \ // y2 = f // CH
327 XORL g, y2; \ // y2 = f^g // CH
328 ; \
329 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
330 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
331 VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
332 ANDL e, y2; \ // y2 = (f^g)&e // CH
333 ; \
334 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
335 VPXOR XTMP3, XTMP2, XTMP2; \
336 ADDL h, d; \ // d = k + w + h + d // --
337 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
338 ; \
339 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
340 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
341 VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
342 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
343 ; \
344 VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
345 ; \
346 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
347 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
348 VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
349 ; \
350 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
351 MOVL a, T1; \ // T1 = a // MAJB
352 ANDL c, T1; \ // T1 = a&c // MAJB
353 ADDL y0, y2; \ // y2 = S1 + CH // --
354 VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
355 ; \
356 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
357 ADDL y1, h; \ // h = k + w + h + S0 // --
358 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
359 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
360 ; \
361 ADDL y3, h // h = t1 + S0 + MAJ // --
362
363 #define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
364 ; \ // ################################### RND N + 3 ############################
365 ; \
366 MOVL a, y3; \ // y3 = a // MAJA
367 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
368 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
369 ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
370 ORL c, y3; \ // y3 = a|c // MAJA
371 ; \
372 VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
373 MOVL f, y2; \ // y2 = f // CH
374 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
375 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
376 XORL g, y2; \ // y2 = f^g // CH
377 ; \
378 VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
379 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
380 ANDL e, y2; \ // y2 = (f^g)&e // CH
381 ADDL h, d; \ // d = k + w + h + d // --
382 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
383 ; \
384 VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
385 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
386 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
387 ; \
388 VPXOR XTMP3, XTMP2, XTMP2; \
389 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
390 ADDL y0, y2; \ // y2 = S1 + CH // --
391 ; \
392 VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
393 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
394 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
395 ; \
396 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
397 ; \
398 VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
399 ; \
400 VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
401 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
402 MOVL a, T1; \ // T1 = a // MAJB
403 ANDL c, T1; \ // T1 = a&c // MAJB
404 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
405 ; \
406 ADDL y1, h; \ // h = k + w + h + S0 // --
407 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
408 ADDL y3, h // h = t1 + S0 + MAJ // --
409
410 #define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
411 ; \ // ################################### RND N + 0 ###########################
412 MOVL f, y2; \ // y2 = f // CH
413 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
414 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
415 XORL g, y2; \ // y2 = f^g // CH
416 ; \
417 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
418 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
419 ANDL e, y2; \ // y2 = (f^g)&e // CH
420 ; \
421 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
422 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
423 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
424 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
425 MOVL a, y3; \ // y3 = a // MAJA
426 ; \
427 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
428 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
429 ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
430 ORL c, y3; \ // y3 = a|c // MAJA
431 ; \
432 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
433 MOVL a, T1; \ // T1 = a // MAJB
434 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
435 ANDL c, T1; \ // T1 = a&c // MAJB
436 ADDL y0, y2; \ // y2 = S1 + CH // --
437 ; \
438 ADDL h, d; \ // d = k + w + h + d // --
439 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
440 ADDL y1, h; \ // h = k + w + h + S0 // --
441 ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
442
443 #define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
444 ; \ // ################################### RND N + 1 ###########################
445 ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
446 MOVL f, y2; \ // y2 = f // CH
447 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
448 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
449 XORL g, y2; \ // y2 = f^g // CH
450 ; \
451 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
452 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
453 ANDL e, y2; \ // y2 = (f^g)&e // CH
454 ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
455 ; \
456 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
457 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
458 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
459 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
460 MOVL a, y3; \ // y3 = a // MAJA
461 ; \
462 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
463 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
464 ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
465 ORL c, y3; \ // y3 = a|c // MAJA
466 ; \
467 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
468 MOVL a, T1; \ // T1 = a // MAJB
469 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
470 ANDL c, T1; \ // T1 = a&c // MAJB
471 ADDL y0, y2; \ // y2 = S1 + CH // --
472 ; \
473 ADDL h, d; \ // d = k + w + h + d // --
474 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
475 ADDL y1, h; \ // h = k + w + h + S0 // --
476 ; \
477 ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
478
479 #define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
480 ; \ // ################################### RND N + 2 ##############################
481 ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
482 MOVL f, y2; \ // y2 = f // CH
483 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
484 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
485 XORL g, y2; \ // y2 = f^g // CH
486 ; \
487 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
488 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
489 ANDL e, y2; \ // y2 = (f^g)&e // CH
490 ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
491 ; \
492 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
493 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
494 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
495 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
496 MOVL a, y3; \ // y3 = a // MAJA
497 ; \
498 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
499 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
500 ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
501 ORL c, y3; \ // y3 = a|c // MAJA
502 ; \
503 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
504 MOVL a, T1; \ // T1 = a // MAJB
505 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
506 ANDL c, T1; \ // T1 = a&c // MAJB
507 ADDL y0, y2; \ // y2 = S1 + CH // --
508 ; \
509 ADDL h, d; \ // d = k + w + h + d // --
510 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
511 ADDL y1, h; \ // h = k + w + h + S0 // --
512 ; \
513 ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
514
515 #define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
516 ; \ // ################################### RND N + 3 ###########################
517 ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
518 MOVL f, y2; \ // y2 = f // CH
519 RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
520 RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
521 XORL g, y2; \ // y2 = f^g // CH
522 ; \
523 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
524 RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
525 ANDL e, y2; \ // y2 = (f^g)&e // CH
526 ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
527 ; \
528 XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
529 RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
530 XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
531 RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
532 MOVL a, y3; \ // y3 = a // MAJA
533 ; \
534 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
535 RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
536 ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
537 ORL c, y3; \ // y3 = a|c // MAJA
538 ; \
539 XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
540 MOVL a, T1; \ // T1 = a // MAJB
541 ANDL b, y3; \ // y3 = (a|c)&b // MAJA
542 ANDL c, T1; \ // T1 = a&c // MAJB
543 ADDL y0, y2; \ // y2 = S1 + CH // --
544 ; \
545 ADDL h, d; \ // d = k + w + h + d // --
546 ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
547 ADDL y1, h; \ // h = k + w + h + S0 // --
548 ; \
549 ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
550 ; \
551 ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
552 ; \
553 ADDL y3, h // h = t1 + S0 + MAJ // --
554
555 // Definitions for sha-ni version
556 //
557 // The sha-ni implementation uses the Intel(R) SHA extensions instructions SHA256RNDS2, SHA256MSG1 and SHA256MSG2.
558 // It also reuses portions of the flip_mask (half of it) and the K256 table (stride 32) from the AVX2 version.
559 //
560 // Reference
561 // S. Gulley, et al, "New Instructions Supporting the Secure Hash
562 // Algorithm on Intel® Architecture Processors", July 2013
563 // https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
564 //
565
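// One detail worth spelling out: the K256 table at the end of this file stores
// every group of four constants twice (once per 128-bit lane, for the AVX2
// code), so a four-round group occupies 32 bytes. That is why the sha-ni
// rounds index it as (c*32)(sha256Constants). A few worked offsets:
//
//	group 0  (rounds  0-3)  -> 0*32  = 0x000 : k1..k4
//	group 3  (rounds 12-15) -> 3*32  = 0x060 : k13..k16
//	group 15 (rounds 60-63) -> 15*32 = 0x1e0 : the final four constants
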
566 #define digestPtr DI // input/output, base pointer to digest hash vector H0, H1, ..., H7
567 #define dataPtr SI // input, base pointer to first input data block
568 #define numBytes DX // input, number of input bytes to be processed
569 #define sha256Constants AX // round contents from K256 table, indexed by round number x 32
570 #define msg X0 // input data
571 #define state0 X1 // round intermediates and outputs
572 #define state1 X2
573 #define m0 X3 // m0, m1,... m4 -- round message temps
574 #define m1 X4
575 #define m2 X5
576 #define m3 X6
577 #define m4 X7
578 #define shufMask X8 // input data endian conversion control mask
579 #define abefSave X9 // digest hash vector inter-block buffer abef
580 #define cdghSave X10 // digest hash vector inter-block buffer cdgh
581
582 #define nop(m,a) // nop instead of final SHA256MSG1 for first and last few rounds
583
584 #define sha256msg1(m,a) \ // final SHA256MSG1 for middle rounds that require it
585 SHA256MSG1 m, a
586
587 #define vmov(a,b) \ // msg copy for all but rounds 12-15
588 VMOVDQA a, b
589
590 #define vmovrev(a,b) \ // reverse copy for rounds 12-15
591 VMOVDQA b, a
592
593 // sha rounds 0 to 11
594 // identical with the exception of the final msg op
595 // which is replaced with a nop for rounds where it is not needed
596 // refer to Gulley, et al for more information
597 #define rounds0to11(m,a,c,sha256Msg1) \
598 VMOVDQU c*16(dataPtr), msg \
599 PSHUFB shufMask, msg \
600 VMOVDQA msg, m \
601 PADDD (c*32)(sha256Constants), msg \
602 SHA256RNDS2 msg, state0, state1 \
603 PSHUFD $0x0e, msg, msg \
604 SHA256RNDS2 msg, state1, state0 \
605 sha256Msg1 (m,a)
606
607 // sha rounds 12 to 59
608 // identical with the exception of the final msg op
609 // and the reverse copy (m, msg) in rounds 12-15, which is required
610 // after the last data load
611 // refer to Gulley, et al for more information
612 #define rounds12to59(m,c,a,t,sha256Msg1,movop) \
613 movop (m,msg) \
614 PADDD (c*32)(sha256Constants), msg \
615 SHA256RNDS2 msg, state0, state1 \
616 VMOVDQA m, m4 \
617 PALIGNR $4, a, m4 \
618 PADDD m4, t \
619 SHA256MSG2 m, t \
620 PSHUFD $0x0e, msg, msg \
621 SHA256RNDS2 msg, state1, state0 \
622 sha256Msg1 (m,a)
623
624 TEXT ·block(SB), 0, $536-32
625 CMPB ·useSHA(SB), $1
626 JE sha_ni
627 CMPB ·useAVX2(SB), $1
628 JE avx2
629
630 MOVQ p_base+8(FP), SI // SI = &p[0]
631 MOVQ p_len+16(FP), DX // DX = len(p)
632 SHRQ $6, DX // round the length down to a
633 SHLQ $6, DX // multiple of the block size (64 bytes)
634
635 LEAQ (SI)(DX*1), DI // DI = end of the last full block
636 MOVQ DI, 256(SP) // save end pointer above the W[64] scratch area
637 CMPQ SI, DI
638 JEQ end // no full blocks to process
639
640 MOVQ dig+0(FP), BP
641 MOVL (0*4)(BP), R8 // a = H0
642 MOVL (1*4)(BP), R9 // b = H1
643 MOVL (2*4)(BP), R10 // c = H2
644 MOVL (3*4)(BP), R11 // d = H3
645 MOVL (4*4)(BP), R12 // e = H4
646 MOVL (5*4)(BP), R13 // f = H5
647 MOVL (6*4)(BP), R14 // g = H6
648 MOVL (7*4)(BP), R15 // h = H7
649
650 loop:
651 MOVQ SP, BP // message schedule W[0..63] scratch lives at 0(SP)
652
653 SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
654 SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
655 SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
656 SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
657 SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
658 SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
659 SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
660 SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
661 SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
662 SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
663 SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
664 SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
665 SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
666 SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
667 SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
668 SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
669
670 SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
671 SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
672 SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
673 SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
674 SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
675 SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
676 SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
677 SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
678 SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
679 SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
680 SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
681 SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
682 SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
683 SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
684 SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
685 SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
686 SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
687 SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
688 SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
689 SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
690 SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
691 SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
692 SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
693 SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
694 SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
695 SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
696 SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
697 SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
698 SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
699 SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
700 SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
701 SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
702 SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
703 SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
704 SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
705 SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
706 SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
707 SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
708 SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
709 SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
710 SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
711 SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
712 SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
713 SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
714 SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
715 SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
716 SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
717 SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
718
719 MOVQ dig+0(FP), BP
720 ADDL (0*4)(BP), R8 // H0 = a + H0
721 MOVL R8, (0*4)(BP)
722 ADDL (1*4)(BP), R9 // H1 = b + H1
723 MOVL R9, (1*4)(BP)
724 ADDL (2*4)(BP), R10 // H2 = c + H2
725 MOVL R10, (2*4)(BP)
726 ADDL (3*4)(BP), R11 // H3 = d + H3
727 MOVL R11, (3*4)(BP)
728 ADDL (4*4)(BP), R12 // H4 = e + H4
729 MOVL R12, (4*4)(BP)
730 ADDL (5*4)(BP), R13 // H5 = f + H5
731 MOVL R13, (5*4)(BP)
732 ADDL (6*4)(BP), R14 // H6 = g + H6
733 MOVL R14, (6*4)(BP)
734 ADDL (7*4)(BP), R15 // H7 = h + H7
735 MOVL R15, (7*4)(BP)
736
737 ADDQ $64, SI
738 CMPQ SI, 256(SP)
739 JB loop
740
741 end:
742 RET
743
744 avx2:
745 MOVQ dig+0(FP), CTX // d.h[8]
746 MOVQ p_base+8(FP), INP
747 MOVQ p_len+16(FP), NUM_BYTES
748
749 LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
750 MOVQ NUM_BYTES, _INP_END(SP)
751
752 CMPQ NUM_BYTES, INP
753 JE avx2_only_one_block
754
755 // Load initial digest
756 MOVL 0(CTX), a // a = H0
757 MOVL 4(CTX), b // b = H1
758 MOVL 8(CTX), c // c = H2
759 MOVL 12(CTX), d // d = H3
760 MOVL 16(CTX), e // e = H4
761 MOVL 20(CTX), f // f = H5
762 MOVL 24(CTX), g // g = H6
763 MOVL 28(CTX), h // h = H7
764
765 avx2_loop0: // each iteration handles two blocks (2 x 512 bits): it schedules both, runs rounds for the first, then avx2_loop3 reuses the saved schedule for the second
766
767 VMOVDQU (0*32)(INP), XTMP0
768 VMOVDQU (1*32)(INP), XTMP1
769 VMOVDQU (2*32)(INP), XTMP2
770 VMOVDQU (3*32)(INP), XTMP3
771
772 VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
773
774 // Apply Byte Flip Mask: LE -> BE
775 VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
776 VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
777 VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
778 VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
779
780 // Transpose data into high/low parts: low 128-bit lanes hold the first block's words, high lanes the second block's
781 VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
782 VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
783 VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
784 VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
785
786 MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
787
788 avx2_last_block_enter:
789 ADDQ $64, INP
790 MOVQ INP, _INP(SP)
791 XORQ SRND, SRND
792
793 avx2_loop1: // for w0 - w47
794 // Do 4 rounds and scheduling
795 VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
796 VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
797 ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
798 ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
799 ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
800 ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
801
802 // Do 4 rounds and scheduling
803 VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
804 VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
805 ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
806 ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
807 ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
808 ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
809
810 // Do 4 rounds and scheduling
811 VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
812 VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
813 ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
814 ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
815 ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
816 ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
817
818 // Do 4 rounds and scheduling
819 VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
820 VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
821 ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
822 ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
823 ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
824 ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
825
826 ADDQ $4*32, SRND
827 CMPQ SRND, $3*4*32
828 JB avx2_loop1
829
830 avx2_loop2:
831 // w48 - w63 processed with no scheduling (last 16 rounds)
832 VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
833 VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
834 DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
835 DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
836 DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
837 DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
838
839 VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
840 VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
841 DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
842 DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
843 DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
844 DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
845
846 ADDQ $2*32, SRND
847
848 VMOVDQU XDWORD2, XDWORD0
849 VMOVDQU XDWORD3, XDWORD1
850
851 CMPQ SRND, $4*4*32
852 JB avx2_loop2
853
854 MOVQ dig+0(FP), CTX // d.h[8]
855 MOVQ _INP(SP), INP
856
857 addm( 0(CTX), a)
858 addm( 4(CTX), b)
859 addm( 8(CTX), c)
860 addm( 12(CTX), d)
861 addm( 16(CTX), e)
862 addm( 20(CTX), f)
863 addm( 24(CTX), g)
864 addm( 28(CTX), h)
865
866 CMPQ _INP_END(SP), INP
867 JB done_hash
868
869 XORQ SRND, SRND
870
871 avx2_loop3: // Do second block using previously scheduled results
872 DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
873 DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
874 DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
875 DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
876
877 DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
878 DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
879 DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
880 DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
881
882 ADDQ $2*32, SRND
883 CMPQ SRND, $4*4*32
884 JB avx2_loop3
885
886 MOVQ dig+0(FP), CTX // d.h[8]
887 MOVQ _INP(SP), INP
888 ADDQ $64, INP
889
890 addm( 0(CTX), a)
891 addm( 4(CTX), b)
892 addm( 8(CTX), c)
893 addm( 12(CTX), d)
894 addm( 16(CTX), e)
895 addm( 20(CTX), f)
896 addm( 24(CTX), g)
897 addm( 28(CTX), h)
898
899 CMPQ _INP_END(SP), INP
900 JA avx2_loop0
901 JB done_hash
902
903 avx2_do_last_block:
904
905 VMOVDQU 0(INP), XWORD0
906 VMOVDQU 16(INP), XWORD1
907 VMOVDQU 32(INP), XWORD2
908 VMOVDQU 48(INP), XWORD3
909
910 VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
911
912 VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
913 VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
914 VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
915 VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
916
917 MOVQ $K256<>(SB), TBL
918
919 JMP avx2_last_block_enter
920
921 avx2_only_one_block:
922 // Load initial digest
923 MOVL 0(CTX), a // a = H0
924 MOVL 4(CTX), b // b = H1
925 MOVL 8(CTX), c // c = H2
926 MOVL 12(CTX), d // d = H3
927 MOVL 16(CTX), e // e = H4
928 MOVL 20(CTX), f // f = H5
929 MOVL 24(CTX), g // g = H6
930 MOVL 28(CTX), h // h = H7
931
932 JMP avx2_do_last_block
933
934 done_hash:
935 VZEROUPPER
936 RET
937
938 sha_ni:
939 MOVQ dig+0(FP), digestPtr // init digest hash vector H0, H1,..., H7 pointer
940 MOVQ p_base+8(FP), dataPtr // init input data base pointer
941 MOVQ p_len+16(FP), numBytes // get number of input bytes to hash
942 SHRQ $6, numBytes // force modulo 64 input buffer length
943 SHLQ $6, numBytes
944 CMPQ numBytes, $0 // exit early for zero-length input buffer
945 JEQ done
946 ADDQ dataPtr, numBytes // point numBytes to end of input buffer
947 VMOVDQU (0*16)(digestPtr), state0 // load initial hash values and reorder
948 VMOVDQU (1*16)(digestPtr), state1 // DCBA, HGFE -> ABEF, CDGH
949 PSHUFD $0xb1, state0, state0 // CDAB
950 PSHUFD $0x1b, state1, state1 // EFGH
951 VMOVDQA state0, m4
952 PALIGNR $8, state1, state0 // ABEF
953 PBLENDW $0xf0, m4, state1 // CDGH
954 VMOVDQA flip_mask<>(SB), shufMask
955 LEAQ K256<>(SB), sha256Constants
956
957 roundLoop:
958 // save hash values for addition after rounds
959 VMOVDQA state0, abefSave
960 VMOVDQA state1, cdghSave
961
962 // do rounds 0-59
963 rounds0to11 (m0,-,0,nop) // 0-3
964 rounds0to11 (m1,m0,1,sha256msg1) // 4-7
965 rounds0to11 (m2,m1,2,sha256msg1) // 8-11
966 VMOVDQU (3*16)(dataPtr), msg
967 PSHUFB shufMask, msg
968 rounds12to59 (m3,3,m2,m0,sha256msg1,vmovrev) // 12-15
969 rounds12to59 (m0,4,m3,m1,sha256msg1,vmov) // 16-19
970 rounds12to59 (m1,5,m0,m2,sha256msg1,vmov) // 20-23
971 rounds12to59 (m2,6,m1,m3,sha256msg1,vmov) // 24-27
972 rounds12to59 (m3,7,m2,m0,sha256msg1,vmov) // 28-31
973 rounds12to59 (m0,8,m3,m1,sha256msg1,vmov) // 32-35
974 rounds12to59 (m1,9,m0,m2,sha256msg1,vmov) // 36-39
975 rounds12to59 (m2,10,m1,m3,sha256msg1,vmov) // 40-43
976 rounds12to59 (m3,11,m2,m0,sha256msg1,vmov) // 44-47
977 rounds12to59 (m0,12,m3,m1,sha256msg1,vmov) // 48-51
978 rounds12to59 (m1,13,m0,m2,nop,vmov) // 52-55
979 rounds12to59 (m2,14,m1,m3,nop,vmov) // 56-59
980
981 // do rounds 60-63
982 VMOVDQA m3, msg
983 PADDD (15*32)(sha256Constants), msg
984 SHA256RNDS2 msg, state0, state1
985 PSHUFD $0x0e, msg, msg
986 SHA256RNDS2 msg, state1, state0
987
988 // add current hash values with previously saved
989 PADDD abefSave, state0
990 PADDD cdghSave, state1
991
992 // advance data pointer; loop until buffer empty
993 ADDQ $64, dataPtr
994 CMPQ numBytes, dataPtr
995 JNE roundLoop
996
997 // write hash values back in the correct order
998 PSHUFD $0x1b, state0, state0 // FEBA
999 PSHUFD $0xb1, state1, state1 // DCHG
1000 VMOVDQA state0, m4
1001 PBLENDW $0xf0, state1, state0 // DCBA
1002 PALIGNR $8, m4, state1 // HGFE
1003 VMOVDQU state0, (0*16)(digestPtr)
1004 VMOVDQU state1, (1*16)(digestPtr)
1005
1006 done:
1007 RET
1008
1009 // shuffle byte order from LE to BE
1010 DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
1011 DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
1012 DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
1013 DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
1014 GLOBL flip_mask<>(SB), 8, $32
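// Worked example of the mask above: under PSHUFB, destination byte i takes
// source byte control[i]. The first qword 0x0405060700010203 is laid out in
// memory as the control bytes 03 02 01 00 07 06 05 04, so bytes 0-3 of each
// word are reversed (and likewise 4-7), turning little-endian lanes into the
// big-endian words SHA-256 operates on.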
1015
1016 // shuffle xBxA -> 00BA
1017 DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
1018 DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
1019 DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
1020 DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
1021 GLOBL shuff_00BA<>(SB), 8, $32
1022
1023 // shuffle xDxC -> DC00
1024 DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
1025 DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
1026 DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
1027 DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
1028 GLOBL shuff_DC00<>(SB), 8, $32
1029
1030 // Round specific constants
1031 DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
1032 DATA K256<>+0x04(SB)/4, $0x71374491 // k2
1033 DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
1034 DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
1035 DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
1036 DATA K256<>+0x14(SB)/4, $0x71374491 // k2
1037 DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
1038 DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
1039
1040 DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
1041 DATA K256<>+0x24(SB)/4, $0x59f111f1
1042 DATA K256<>+0x28(SB)/4, $0x923f82a4
1043 DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
1044 DATA K256<>+0x30(SB)/4, $0x3956c25b
1045 DATA K256<>+0x34(SB)/4, $0x59f111f1
1046 DATA K256<>+0x38(SB)/4, $0x923f82a4
1047 DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
1048
1049 DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
1050 DATA K256<>+0x44(SB)/4, $0x12835b01
1051 DATA K256<>+0x48(SB)/4, $0x243185be
1052 DATA K256<>+0x4c(SB)/4, $0x550c7dc3
1053 DATA K256<>+0x50(SB)/4, $0xd807aa98
1054 DATA K256<>+0x54(SB)/4, $0x12835b01
1055 DATA K256<>+0x58(SB)/4, $0x243185be
1056 DATA K256<>+0x5c(SB)/4, $0x550c7dc3
1057
1058 DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
1059 DATA K256<>+0x64(SB)/4, $0x80deb1fe
1060 DATA K256<>+0x68(SB)/4, $0x9bdc06a7
1061 DATA K256<>+0x6c(SB)/4, $0xc19bf174
1062 DATA K256<>+0x70(SB)/4, $0x72be5d74
1063 DATA K256<>+0x74(SB)/4, $0x80deb1fe
1064 DATA K256<>+0x78(SB)/4, $0x9bdc06a7
1065 DATA K256<>+0x7c(SB)/4, $0xc19bf174
1066
1067 DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
1068 DATA K256<>+0x84(SB)/4, $0xefbe4786
1069 DATA K256<>+0x88(SB)/4, $0x0fc19dc6
1070 DATA K256<>+0x8c(SB)/4, $0x240ca1cc
1071 DATA K256<>+0x90(SB)/4, $0xe49b69c1
1072 DATA K256<>+0x94(SB)/4, $0xefbe4786
1073 DATA K256<>+0x98(SB)/4, $0x0fc19dc6
1074 DATA K256<>+0x9c(SB)/4, $0x240ca1cc
1075
1076 DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
1077 DATA K256<>+0xa4(SB)/4, $0x4a7484aa
1078 DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
1079 DATA K256<>+0xac(SB)/4, $0x76f988da
1080 DATA K256<>+0xb0(SB)/4, $0x2de92c6f
1081 DATA K256<>+0xb4(SB)/4, $0x4a7484aa
1082 DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
1083 DATA K256<>+0xbc(SB)/4, $0x76f988da
1084
1085 DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
1086 DATA K256<>+0xc4(SB)/4, $0xa831c66d
1087 DATA K256<>+0xc8(SB)/4, $0xb00327c8
1088 DATA K256<>+0xcc(SB)/4, $0xbf597fc7
1089 DATA K256<>+0xd0(SB)/4, $0x983e5152
1090 DATA K256<>+0xd4(SB)/4, $0xa831c66d
1091 DATA K256<>+0xd8(SB)/4, $0xb00327c8
1092 DATA K256<>+0xdc(SB)/4, $0xbf597fc7
1093
1094 DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
1095 DATA K256<>+0xe4(SB)/4, $0xd5a79147
1096 DATA K256<>+0xe8(SB)/4, $0x06ca6351
1097 DATA K256<>+0xec(SB)/4, $0x14292967
1098 DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
1099 DATA K256<>+0xf4(SB)/4, $0xd5a79147
1100 DATA K256<>+0xf8(SB)/4, $0x06ca6351
1101 DATA K256<>+0xfc(SB)/4, $0x14292967
1102
1103 DATA K256<>+0x100(SB)/4, $0x27b70a85
1104 DATA K256<>+0x104(SB)/4, $0x2e1b2138
1105 DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
1106 DATA K256<>+0x10c(SB)/4, $0x53380d13
1107 DATA K256<>+0x110(SB)/4, $0x27b70a85
1108 DATA K256<>+0x114(SB)/4, $0x2e1b2138
1109 DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
1110 DATA K256<>+0x11c(SB)/4, $0x53380d13
1111
1112 DATA K256<>+0x120(SB)/4, $0x650a7354
1113 DATA K256<>+0x124(SB)/4, $0x766a0abb
1114 DATA K256<>+0x128(SB)/4, $0x81c2c92e
1115 DATA K256<>+0x12c(SB)/4, $0x92722c85
1116 DATA K256<>+0x130(SB)/4, $0x650a7354
1117 DATA K256<>+0x134(SB)/4, $0x766a0abb
1118 DATA K256<>+0x138(SB)/4, $0x81c2c92e
1119 DATA K256<>+0x13c(SB)/4, $0x92722c85
1120
1121 DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
1122 DATA K256<>+0x144(SB)/4, $0xa81a664b
1123 DATA K256<>+0x148(SB)/4, $0xc24b8b70
1124 DATA K256<>+0x14c(SB)/4, $0xc76c51a3
1125 DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
1126 DATA K256<>+0x154(SB)/4, $0xa81a664b
1127 DATA K256<>+0x158(SB)/4, $0xc24b8b70
1128 DATA K256<>+0x15c(SB)/4, $0xc76c51a3
1129
1130 DATA K256<>+0x160(SB)/4, $0xd192e819
1131 DATA K256<>+0x164(SB)/4, $0xd6990624
1132 DATA K256<>+0x168(SB)/4, $0xf40e3585
1133 DATA K256<>+0x16c(SB)/4, $0x106aa070
1134 DATA K256<>+0x170(SB)/4, $0xd192e819
1135 DATA K256<>+0x174(SB)/4, $0xd6990624
1136 DATA K256<>+0x178(SB)/4, $0xf40e3585
1137 DATA K256<>+0x17c(SB)/4, $0x106aa070
1138
1139 DATA K256<>+0x180(SB)/4, $0x19a4c116
1140 DATA K256<>+0x184(SB)/4, $0x1e376c08
1141 DATA K256<>+0x188(SB)/4, $0x2748774c
1142 DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
1143 DATA K256<>+0x190(SB)/4, $0x19a4c116
1144 DATA K256<>+0x194(SB)/4, $0x1e376c08
1145 DATA K256<>+0x198(SB)/4, $0x2748774c
1146 DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
1147
1148 DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
1149 DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
1150 DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
1151 DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
1152 DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
1153 DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
1154 DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
1155 DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
1156
1157 DATA K256<>+0x1c0(SB)/4, $0x748f82ee
1158 DATA K256<>+0x1c4(SB)/4, $0x78a5636f
1159 DATA K256<>+0x1c8(SB)/4, $0x84c87814
1160 DATA K256<>+0x1cc(SB)/4, $0x8cc70208
1161 DATA K256<>+0x1d0(SB)/4, $0x748f82ee
1162 DATA K256<>+0x1d4(SB)/4, $0x78a5636f
1163 DATA K256<>+0x1d8(SB)/4, $0x84c87814
1164 DATA K256<>+0x1dc(SB)/4, $0x8cc70208
1165
1166 DATA K256<>+0x1e0(SB)/4, $0x90befffa
1167 DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
1168 DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
1169 DATA K256<>+0x1ec(SB)/4, $0xc67178f2
1170 DATA K256<>+0x1f0(SB)/4, $0x90befffa
1171 DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
1172 DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
1173 DATA K256<>+0x1fc(SB)/4, $0xc67178f2
1174
1175 GLOBL K256<>(SB), (NOPTR + RODATA), $512
1176