package main

import (
    . "github.com/mmcloughlin/avo/build"
    . "github.com/mmcloughlin/avo/operand"
    . "github.com/mmcloughlin/avo/reg"
)

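// blockAVX2 generates the AVX2/BMI2 implementation of the SHA-256 block
// function. Blocks are processed two at a time where possible: the message
// schedule for the current block is computed in the low 128-bit lanes of the
// YMM registers while the schedule for the following block is produced in the
// high lanes and spilled to the _XFER stack area for a later pass. Inputs of
// exactly one block take the avx2_only_one_block path.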
func blockAVX2() {
    Implement("blockAVX2")
    AllocLocal(536)

    Load(Param("dig"), CTX)
    Load(Param("p").Base(), INP)
    Load(Param("p").Len(), NUM_BYTES)

    LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES)
    MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))

    CMPQ(NUM_BYTES, INP)
    JE(LabelRef("avx2_only_one_block"))

    Comment("Load initial digest")
    CTX := Mem{Base: CTX}
    MOVL(CTX.Offset(0), a)
    MOVL(CTX.Offset(4), b)
    MOVL(CTX.Offset(8), c)
    MOVL(CTX.Offset(12), d)
    MOVL(CTX.Offset(16), e)
    MOVL(CTX.Offset(20), f)
    MOVL(CTX.Offset(24), g)
    MOVL(CTX.Offset(28), h)

    avx2_loop0()
    avx2_last_block_enter()
    avx2_loop1()
    avx2_loop2()
    avx2_loop3()
    avx2_do_last_block()
    avx2_only_one_block()
    done_hash()
}

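// avx2_loop0 loads the next two 64-byte blocks, byte-swaps them to big-endian
// word order, and interleaves them so that XDWORD0-XDWORD3 hold the first
// block in the low 128-bit lanes and the second block in the high lanes. It
// also loads the address of the K256 constant table into TBL.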
func avx2_loop0() {
    Label("avx2_loop0")
    Comment("Each iteration of the outer loop processes two blocks (2 x 512 bits)")
    VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
    VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
    VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
    VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)

    flip_mask := flip_mask_DATA()

    VMOVDQU(flip_mask, BYTE_FLIP_MASK)

    Comment("Apply Byte Flip Mask: LE -> BE")
    VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
    VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
    VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
    VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)

    Comment("Transpose data into high/low parts")
    VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0)
    VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1)
    VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2)
    VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3)

    K256 := K256_DATA()
    LEAQ(K256, TBL)
}

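// avx2_last_block_enter advances INP past the block just scheduled, saves the
// updated pointer on the stack, and clears the round counter SRND.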
func avx2_last_block_enter() {
    Label("avx2_last_block_enter")
    ADDQ(Imm(64), INP)
    MOVQ(INP, Mem{Base: SP}.Offset(_INP))
    XORQ(SRND, SRND)
}

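// avx2_loop1 performs the first 48 rounds. Each iteration runs 16 rounds for
// the current block while computing the next 16 message-schedule words for
// both interleaved blocks; the schedule words plus round constants are staged
// in the _XFER area on the stack, with the second block's values in the upper
// 16 bytes of each 32-byte entry.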
func avx2_loop1() {
    Label("avx2_loop1")

    Comment("Do 4 rounds and scheduling")
    VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
    VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
    roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    Comment("Do 4 rounds and scheduling")
    VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
    VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
    roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

    Comment("Do 4 rounds and scheduling")
    VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(2*32), XDWORD2, XFER)
    VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
    roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

    Comment("Do 4 rounds and scheduling")
    VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(3*32), XDWORD3, XFER)
    VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
    roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

    ADDQ(Imm(4*32), SRND)
    CMPQ(SRND, U32(3*4*32))
    JB(LabelRef("avx2_loop1"))
}

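// avx2_loop2 runs the final 16 rounds of the first block (no further message
// scheduling is needed), folds the working registers back into the digest,
// and either finishes or falls through to process the second block in
// avx2_loop3.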
func avx2_loop2() {
    Label("avx2_loop2")
    VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
    VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
    doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
    doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
    doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
    doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)

    VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
    VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
    doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
    doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
    doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
    doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)

    ADDQ(Imm(2*32), SRND)

    VMOVDQU(XDWORD2, XDWORD0)
    VMOVDQU(XDWORD3, XDWORD1)

    CMPQ(SRND, U32(4*4*32))
    JB(LabelRef("avx2_loop2"))

    Load(Param("dig"), CTX)
    MOVQ(Mem{Base: SP}.Offset(_INP), INP)

    registers := []GPPhysical{a, b, c, d, e, f, g, h}
    for i, reg := range registers {
        addm(Mem{Base: CTX}.Offset(i*4), reg)
    }

    CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
    JB(LabelRef("done_hash"))

    XORQ(SRND, SRND)
}

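// avx2_loop3 runs all 64 rounds for the second block, reusing the
// schedule-plus-constant values stashed in the upper halves of the _XFER
// entries (offset +16), then updates the digest. If more than one block
// remains it jumps back to avx2_loop0; if exactly one block remains it falls
// through to avx2_do_last_block.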
func avx2_loop3() {
    Label("avx2_loop3")
    doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
    doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
    doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
    doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)

    doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
    doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
    doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
    doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)

    ADDQ(Imm(2*32), SRND)
    CMPQ(SRND, U32(4*4*32))
    JB(LabelRef("avx2_loop3"))

    Load(Param("dig"), CTX)
    MOVQ(Mem{Base: SP}.Offset(_INP), INP)
    ADDQ(Imm(64), INP)

    registers := []GPPhysical{a, b, c, d, e, f, g, h}
    for i, reg := range registers {
        addm(Mem{Base: CTX}.Offset(i*4), reg)
    }

    CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
    JA(LabelRef("avx2_loop0"))
    JB(LabelRef("done_hash"))
}

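// avx2_do_last_block loads the single remaining block with 128-bit moves,
// byte-swaps it using the XMM half of the flip mask, points TBL at the
// constant table, and jumps back into the main round loops via
// avx2_last_block_enter.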
func avx2_do_last_block() {
    Label("avx2_do_last_block")
    VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
    VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
    VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
    VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)

    flip_mask := flip_mask_DATA()
    VMOVDQU(flip_mask, BYTE_FLIP_MASK)

    VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
    VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
    VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
    VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)

    K256 := K256_DATA()
    LEAQ(K256, TBL)

    JMP(LabelRef("avx2_last_block_enter"))
}

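// avx2_only_one_block loads the initial digest for an input consisting of a
// single block and jumps to avx2_do_last_block.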
func avx2_only_one_block() {
    Label("avx2_only_one_block")
    registers := []GPPhysical{a, b, c, d, e, f, g, h}
    for i, reg := range registers {
        MOVL(Mem{Base: CTX}.Offset(i*4), reg)
    }
    JMP(LabelRef("avx2_do_last_block"))
}

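// done_hash clears the upper halves of the YMM registers (avoiding AVX-SSE
// transition penalties) and returns.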
func done_hash() {
    Label("done_hash")
    VZEROUPPER()
    RET()
}

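// addm adds register P2 into the memory word P1 and copies the sum back into
// P2, so both hold the updated value; it is used to fold the working state
// back into the digest.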
func addm(P1 Mem, P2 GPPhysical) {
    ADDL(P2, P1)
    MOVL(P1, P2)
}

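// Register and stack-frame assignments for the AVX2 implementation. Several
// general-purpose registers are deliberately shared (SRND with CTX,
// NUM_BYTES with e, y3 with INP): the conflicting values are never live at
// the same time, and CTX and INP are reloaded from the parameters or the
// stack whenever they are needed again.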
var (
    XDWORD0 VecPhysical = Y4
    XDWORD1             = Y5
    XDWORD2             = Y6
    XDWORD3             = Y7

    XWORD0 = X4
    XWORD1 = X5
    XWORD2 = X6
    XWORD3 = X7

    XTMP0 = Y0
    XTMP1 = Y1
    XTMP2 = Y2
    XTMP3 = Y3
    XTMP4 = Y8
    XTMP5 = Y11

    XFER = Y9

    BYTE_FLIP_MASK   = Y13
    X_BYTE_FLIP_MASK = X13

    NUM_BYTES GPPhysical = RDX
    INP                  = RDI

    CTX = RSI

    a = EAX
    b = EBX
    c = ECX
    d = R8L
    e = EDX
    f = R9L
    g = R10L
    h = R11L

    old_h = R11L

    TBL = RBP

    SRND = RSI

    T1 = R12L

    y0 = R13L
    y1 = R14L
    y2 = R15L
    y3 = EDI

    XFER_SIZE    = 2 * 64 * 4
    INP_END_SIZE = 8
    INP_SIZE     = 8

    _XFER      = 0
    _INP_END   = _XFER + XFER_SIZE
    _INP       = _INP_END + INP_END_SIZE
    STACK_SIZE = _INP + INP_SIZE
)

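// roundAndSchedN0 through roundAndSchedN3 each perform one SHA-256 round,
// with the working variables rotated one position per call, interleaved with
// one quarter of the sigma0/sigma1 message-schedule computation for the next
// four schedule words. The scalar round instructions and the vector schedule
// instructions are interleaved so the two kinds of work overlap.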
func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
    MOVL(a, y3)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)

    ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)
    VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0)
    MOVL(f, y2)
    RORXL(Imm(13), a, T1)

    XORL(y1, y0)
    XORL(g, y2)
    VPADDD(XDWORD0, XTMP0, XTMP0)
    RORXL(Imm(6), e, y1)

    ANDL(e, y2)
    XORL(y1, y0)
    RORXL(Imm(22), a, y1)
    ADDL(h, d)

    ANDL(b, y3)
    VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1)
    XORL(T1, y1)
    RORXL(Imm(2), a, T1)

    XORL(g, y2)
    VPSRLD(Imm(7), XTMP1, XTMP2)
    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(c, T1)

    ADDL(y0, y2)
    VPSLLD(Imm(32-7), XTMP1, XTMP3)
    ORL(T1, y3)
    ADDL(y1, h)

    ADDL(y2, d)
    VPOR(XTMP2, XTMP3, XTMP3)

    VPSRLD(Imm(18), XTMP1, XTMP2)
    ADDL(y2, h)
    ADDL(y3, h)
}

func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
    MOVL(a, y3)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)
    ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)

    VPSRLD(Imm(3), XTMP1, XTMP4)
    MOVL(f, y2)
    RORXL(Imm(13), a, T1)
    XORL(y1, y0)
    XORL(g, y2)

    RORXL(Imm(6), e, y1)
    XORL(y1, y0)
    RORXL(Imm(22), a, y1)
    ANDL(e, y2)
    ADDL(h, d)

    VPSLLD(Imm(32-18), XTMP1, XTMP1)
    ANDL(b, y3)
    XORL(T1, y1)

    VPXOR(XTMP1, XTMP3, XTMP3)
    RORXL(Imm(2), a, T1)
    XORL(g, y2)

    VPXOR(XTMP2, XTMP3, XTMP3)
    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(c, T1)
    ADDL(y0, y2)

    VPXOR(XTMP4, XTMP3, XTMP1)
    VPSHUFD(Imm(0xFA), XDWORD3, XTMP2)
    ORL(T1, y3)
    ADDL(y1, h)

    VPADDD(XTMP1, XTMP0, XTMP0)
    ADDL(y2, d)
    ADDL(y2, h)
    ADDL(y3, h)

    VPSRLD(Imm(10), XTMP2, XTMP4)
}

func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
    var shuff_00BA Mem = shuff_00BA_DATA()

    MOVL(a, y3)
    RORXL(Imm(25), e, y0)
    ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h)

    VPSRLQ(Imm(19), XTMP2, XTMP3)
    RORXL(Imm(11), e, y1)
    ORL(c, y3)
    MOVL(f, y2)
    XORL(g, y2)

    RORXL(Imm(13), a, T1)
    XORL(y1, y0)
    VPSRLQ(Imm(17), XTMP2, XTMP2)
    ANDL(e, y2)

    RORXL(Imm(6), e, y1)
    VPXOR(XTMP3, XTMP2, XTMP2)
    ADDL(h, d)
    ANDL(b, y3)

    XORL(y1, y0)
    RORXL(Imm(22), a, y1)
    VPXOR(XTMP2, XTMP4, XTMP4)
    XORL(g, y2)

    VPSHUFB(shuff_00BA, XTMP4, XTMP4)

    XORL(T1, y1)
    RORXL(Imm(2), a, T1)
    VPADDD(XTMP4, XTMP0, XTMP0)

    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(c, T1)
    ADDL(y0, y2)
    VPSHUFD(Imm(80), XTMP0, XTMP2)

    ORL(T1, y3)
    ADDL(y1, h)
    ADDL(y2, d)
    ADDL(y2, h)

    ADDL(y3, h)
}

func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
    var shuff_DC00 Mem = shuff_DC00_DATA()

    MOVL(a, y3)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)
    ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)

    VPSRLD(Imm(10), XTMP2, XTMP5)
    MOVL(f, y2)
    RORXL(Imm(13), a, T1)
    XORL(y1, y0)
    XORL(g, y2)

    VPSRLQ(Imm(19), XTMP2, XTMP3)
    RORXL(Imm(6), e, y1)
    ANDL(e, y2)
    ADDL(h, d)
    ANDL(b, y3)

    VPSRLQ(Imm(17), XTMP2, XTMP2)
    XORL(y1, y0)
    XORL(g, y2)

    VPXOR(XTMP3, XTMP2, XTMP2)
    RORXL(Imm(22), a, y1)
    ADDL(y0, y2)

    VPXOR(XTMP2, XTMP5, XTMP5)
    XORL(T1, y1)
    ADDL(y2, d)

    RORXL(Imm(2), a, T1)

    VPSHUFB(shuff_DC00, XTMP5, XTMP5)

    VPADDD(XTMP0, XTMP5, XDWORD0)
    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(c, T1)
    ORL(T1, y3)

    ADDL(y1, h)
    ADDL(y2, h)
    ADDL(y3, h)
}

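// doRoundN0 through doRoundN3 perform plain SHA-256 rounds with no message
// scheduling, reading the precomputed w+K values from the _XFER area. The
// old_h parameter names the h register of the previous round, whose final
// additions are deferred into the following round; it is unused by
// doRoundN0, and doRoundN3 completes its own h at the end of the group.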
func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
    MOVL(f, y2)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)
    XORL(g, y2)

    XORL(y1, y0)
    RORXL(Imm(6), e, y1)
    ANDL(e, y2)

    XORL(y1, y0)
    RORXL(Imm(13), a, T1)
    XORL(g, y2)
    RORXL(Imm(22), a, y1)
    MOVL(a, y3)

    XORL(T1, y1)
    RORXL(Imm(2), a, T1)
    ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)

    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(b, y3)
    ANDL(c, T1)
    ADDL(y0, y2)

    ADDL(h, d)
    ORL(T1, y3)
    ADDL(y1, h)
    ADDL(y2, d)
}

func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
    ADDL(y2, old_h)
    MOVL(f, y2)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)
    XORL(g, y2)

    XORL(y1, y0)
    RORXL(Imm(6), e, y1)
    ANDL(e, y2)
    ADDL(y3, old_h)

    XORL(y1, y0)
    RORXL(Imm(13), a, T1)
    XORL(g, y2)
    RORXL(Imm(22), a, y1)
    MOVL(a, y3)

    XORL(T1, y1)
    RORXL(Imm(2), a, T1)
    ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)

    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(b, y3)
    ANDL(c, T1)
    ADDL(y0, y2)

    ADDL(h, d)
    ORL(T1, y3)
    ADDL(y1, h)

    ADDL(y2, d)
}

func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
    ADDL(y2, old_h)
    MOVL(f, y2)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)
    XORL(g, y2)

    XORL(y1, y0)
    RORXL(Imm(6), e, y1)
    ANDL(e, y2)
    ADDL(y3, old_h)

    XORL(y1, y0)
    RORXL(Imm(13), a, T1)
    XORL(g, y2)
    RORXL(Imm(22), a, y1)
    MOVL(a, y3)

    XORL(T1, y1)
    RORXL(Imm(2), a, T1)
    ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)

    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(b, y3)
    ANDL(c, T1)
    ADDL(y0, y2)

    ADDL(h, d)
    ORL(T1, y3)
    ADDL(y1, h)

    ADDL(y2, d)
}

func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
    ADDL(y2, old_h)
    MOVL(f, y2)
    RORXL(Imm(25), e, y0)
    RORXL(Imm(11), e, y1)
    XORL(g, y2)

    XORL(y1, y0)
    RORXL(Imm(6), e, y1)
    ANDL(e, y2)
    ADDL(y3, old_h)

    XORL(y1, y0)
    RORXL(Imm(13), a, T1)
    XORL(g, y2)
    RORXL(Imm(22), a, y1)
    MOVL(a, y3)

    XORL(T1, y1)
    RORXL(Imm(2), a, T1)
    ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h)
    ORL(c, y3)

    XORL(T1, y1)
    MOVL(a, T1)
    ANDL(b, y3)
    ANDL(c, T1)
    ADDL(y0, y2)

    ADDL(h, d)
    ORL(T1, y3)
    ADDL(y1, h)

    ADDL(y2, d)

    ADDL(y2, h)

    ADDL(y3, h)
}

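// Pointers to the read-only data sections, memoized so each GLOBL is emitted
// only once even though the helpers below are called from several places.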
var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem

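// flip_mask_DATA returns the VPSHUFB mask that byte-swaps each 32-bit word,
// converting the little-endian input to the big-endian order SHA-256
// requires.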
func flip_mask_DATA() Mem {
    if flip_maskPtr != nil {
        return *flip_maskPtr
    }

    flip_mask := GLOBL("flip_mask", RODATA)
    flip_maskPtr = &flip_mask

    DATA(0x00, U64(0x0405060700010203))
    DATA(0x08, U64(0x0c0d0e0f08090a0b))
    DATA(0x10, U64(0x0405060700010203))
    DATA(0x18, U64(0x0c0d0e0f08090a0b))
    return flip_mask
}

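// shuff_00BA_DATA returns a VPSHUFB mask that moves dwords 0 and 2 of the
// source into the low two dword positions of each lane and zeroes the rest
// (0xFF index bytes produce zero bytes), packing the freshly computed
// schedule words into the low half of each lane.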
func shuff_00BA_DATA() Mem {
    if shuff_00BAPtr != nil {
        return *shuff_00BAPtr
    }

    shuff_00BA := GLOBL("shuff_00BA", RODATA)
    shuff_00BAPtr = &shuff_00BA

    DATA(0x00, U64(0x0b0a090803020100))
    DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
    DATA(0x10, U64(0x0b0a090803020100))
    DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
    return shuff_00BA
}

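// shuff_DC00_DATA returns the complementary VPSHUFB mask that places the two
// schedule words into the high two dword positions of each lane and zeroes
// the low half.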
func shuff_DC00_DATA() Mem {
    if shuff_DC00Ptr != nil {
        return *shuff_DC00Ptr
    }

    shuff_DC00 := GLOBL("shuff_DC00", RODATA)
    shuff_DC00Ptr = &shuff_DC00

    DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
    DATA(0x08, U64(0x0b0a090803020100))
    DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
    DATA(0x18, U64(0x0b0a090803020100))
    return shuff_DC00
}

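// K256_DATA returns the SHA-256 round constants laid out for the two-block
// AVX2 schedule: each group of four 32-bit constants is written twice, once
// for each 128-bit lane.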
func K256_DATA() Mem {
    if K256Ptr != nil {
        return *K256Ptr
    }

    K256 := GLOBL("K256", NOPTR+RODATA)
    K256Ptr = &K256

    offset_idx := 0

    for i := 0; i < len(_K); i += 4 {
        DATA((offset_idx+0)*4, U32(_K[i+0]))
        DATA((offset_idx+1)*4, U32(_K[i+1]))
        DATA((offset_idx+2)*4, U32(_K[i+2]))
        DATA((offset_idx+3)*4, U32(_K[i+3]))

        DATA((offset_idx+4)*4, U32(_K[i+0]))
        DATA((offset_idx+5)*4, U32(_K[i+1]))
        DATA((offset_idx+6)*4, U32(_K[i+2]))
        DATA((offset_idx+7)*4, U32(_K[i+3]))
        offset_idx += 8
    }
    return K256
}