1
2
3
4
5
6
7
8
9
10
11
12 package main
13
14 import (
15 . "github.com/mmcloughlin/avo/build"
16 "github.com/mmcloughlin/avo/ir"
17 . "github.com/mmcloughlin/avo/operand"
18 . "github.com/mmcloughlin/avo/reg"
19 )
20
21
22
23 var (
24 B0 VecPhysical = X0
25 B1 = X1
26 B2 = X2
27 B3 = X3
28 B4 = X4
29 B5 = X5
30 B6 = X6
31 B7 = X7
32
33 ACC0 VecPhysical = X8
34 ACC1 = X9
35 ACCM = X10
36
37 T0 VecPhysical = X11
38 T1 = X12
39 T2 = X13
40 POLY = X14
41 BSWAP = X15
42 )
43
44 func main() {
45 Package("crypto/aes")
46 ConstraintExpr("!purego")
47
48 gcmAesFinish()
49 gcmAesInit()
50 gcmAesData()
51 gcmAesEnc()
52 gcmAesDec()
53
54 Generate()
55 }
56
57 func gcmAesFinish() {
58 Implement("gcmAesFinish")
59 Attributes(NOSPLIT)
60 AllocLocal(0)
61
62 var (
63 pTbl GPPhysical = RDI
64 tMsk = RSI
65 tPtr = RDX
66 plen = RAX
67 dlen = RCX
68 )
69
70 Load(Param("productTable"), pTbl)
71 Load(Param("tagMask"), tMsk)
72 Load(Param("T"), tPtr)
73 Load(Param("pLen"), plen)
74 Load(Param("dLen"), dlen)
75
76 MOVOU(Mem{Base: tPtr}, ACC0)
77 MOVOU(Mem{Base: tMsk}, T2)
78
79 bswapMask := bswapMask_DATA()
80 gcmPoly := gcmPoly_DATA()
81 MOVOU(bswapMask, BSWAP)
82 MOVOU(gcmPoly, POLY)
83
84 SHLQ(Imm(3), plen)
85 SHLQ(Imm(3), dlen)
86
87 MOVQ(plen, B0)
88 PINSRQ(Imm(1), dlen, B0)
89
90 PXOR(ACC0, B0)
91
92 MOVOU(Mem{Base: pTbl}.Offset(16*14), ACC0)
93 MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
94 MOVOU(ACC0, ACC1)
95
96 PCLMULQDQ(Imm(0x00), B0, ACC0)
97 PCLMULQDQ(Imm(0x11), B0, ACC1)
98 PSHUFD(Imm(78), B0, T0)
99 PXOR(B0, T0)
100 PCLMULQDQ(Imm(0x00), T0, ACCM)
101
102 PXOR(ACC0, ACCM)
103 PXOR(ACC1, ACCM)
104 MOVOU(ACCM, T0)
105 PSRLDQ(Imm(8), ACCM)
106 PSLLDQ(Imm(8), T0)
107 PXOR(ACCM, ACC1)
108 PXOR(T0, ACC0)
109
110 MOVOU(POLY, T0)
111 PCLMULQDQ(Imm(0x01), ACC0, T0)
112 PSHUFD(Imm(78), ACC0, ACC0)
113 PXOR(T0, ACC0)
114
115 MOVOU(POLY, T0)
116 PCLMULQDQ(Imm(0x01), ACC0, T0)
117 PSHUFD(Imm(78), ACC0, ACC0)
118 PXOR(T0, ACC0)
119
120 PXOR(ACC1, ACC0)
121
122 PSHUFB(BSWAP, ACC0)
123 PXOR(T2, ACC0)
124 MOVOU(ACC0, Mem{Base: tPtr})
125
126 RET()
127 }
128
129 func gcmAesInit() {
130 Implement("gcmAesInit")
131 Attributes(NOSPLIT)
132 AllocLocal(0)
133
134 var (
135 dst GPPhysical = RDI
136 KS = RSI
137 NR = RDX
138 )
139
140 Load(Param("productTable"), dst)
141 Load(Param("ks").Base(), KS)
142 Load(Param("ks").Len(), NR)
143
144 SHRQ(Imm(2), NR)
145 DECQ(NR)
146
147 bswapMask := bswapMask_DATA()
148 gcmPoly := gcmPoly_DATA()
149 MOVOU(bswapMask, BSWAP)
150 MOVOU(gcmPoly, POLY)
151
152 Comment("Encrypt block 0, with the AES key to generate the hash key H")
153 MOVOU(Mem{Base: KS}.Offset(16*0), B0)
154 MOVOU(Mem{Base: KS}.Offset(16*1), T0)
155 AESENC(T0, B0)
156 MOVOU(Mem{Base: KS}.Offset(16*2), T0)
157 AESENC(T0, B0)
158 MOVOU(Mem{Base: KS}.Offset(16*3), T0)
159 AESENC(T0, B0)
160 MOVOU(Mem{Base: KS}.Offset(16*4), T0)
161 AESENC(T0, B0)
162 MOVOU(Mem{Base: KS}.Offset(16*5), T0)
163 AESENC(T0, B0)
164 MOVOU(Mem{Base: KS}.Offset(16*6), T0)
165 AESENC(T0, B0)
166 MOVOU(Mem{Base: KS}.Offset(16*7), T0)
167 AESENC(T0, B0)
168 MOVOU(Mem{Base: KS}.Offset(16*8), T0)
169 AESENC(T0, B0)
170 MOVOU(Mem{Base: KS}.Offset(16*9), T0)
171 AESENC(T0, B0)
172 MOVOU(Mem{Base: KS}.Offset(16*10), T0)
173 CMPQ(NR, Imm(12))
174 JB(LabelRef("initEncLast"))
175 AESENC(T0, B0)
176 MOVOU(Mem{Base: KS}.Offset(16*11), T0)
177 AESENC(T0, B0)
178 MOVOU(Mem{Base: KS}.Offset(16*12), T0)
179 JE(LabelRef("initEncLast"))
180 AESENC(T0, B0)
181 MOVOU(Mem{Base: KS}.Offset(16*13), T0)
182 AESENC(T0, B0)
183 MOVOU(Mem{Base: KS}.Offset(16*14), T0)
184
185 initEncLast(dst)
186 initLoop(dst)
187
188 RET()
189 }
190
191 func initEncLast(dst GPPhysical) {
192 Label("initEncLast")
193 AESENCLAST(T0, B0)
194
195 PSHUFB(BSWAP, B0)
196 Comment("H * 2")
197 PSHUFD(Imm(0xff), B0, T0)
198 MOVOU(B0, T1)
199 PSRAL(Imm(31), T0)
200 PAND(POLY, T0)
201 PSRLL(Imm(31), T1)
202 PSLLDQ(Imm(4), T1)
203 PSLLL(Imm(1), B0)
204 PXOR(T0, B0)
205 PXOR(T1, B0)
206 Comment("Karatsuba pre-computations")
207 MOVOU(B0, Mem{Base: dst}.Offset(16*14))
208 PSHUFD(Imm(78), B0, B1)
209 PXOR(B0, B1)
210 MOVOU(B1, Mem{Base: dst}.Offset(16*15))
211
212 MOVOU(B0, B2)
213 MOVOU(B1, B3)
214 Comment("Now prepare powers of H and pre-computations for them")
215 MOVQ(U32(7), RAX)
216 }
217
218 func initLoop(dst GPPhysical) {
219 Label("initLoop")
220 MOVOU(B2, T0)
221 MOVOU(B2, T1)
222 MOVOU(B3, T2)
223 PCLMULQDQ(Imm(0x00), B0, T0)
224 PCLMULQDQ(Imm(0x11), B0, T1)
225 PCLMULQDQ(Imm(0x00), B1, T2)
226
227 PXOR(T0, T2)
228 PXOR(T1, T2)
229 MOVOU(T2, B4)
230 PSLLDQ(Imm(8), B4)
231 PSRLDQ(Imm(8), T2)
232 PXOR(B4, T0)
233 PXOR(T2, T1)
234
235 MOVOU(POLY, B2)
236 PCLMULQDQ(Imm(0x01), T0, B2)
237 PSHUFD(Imm(78), T0, T0)
238 PXOR(B2, T0)
239 MOVOU(POLY, B2)
240 PCLMULQDQ(Imm(0x01), T0, B2)
241 PSHUFD(Imm(78), T0, T0)
242 PXOR(T0, B2)
243 PXOR(T1, B2)
244
245 MOVOU(B2, Mem{Base: dst}.Offset(16*12))
246 PSHUFD(Imm(78), B2, B3)
247 PXOR(B2, B3)
248 MOVOU(B3, Mem{Base: dst}.Offset(16*13))
249
250 DECQ(RAX)
251 LEAQ(Mem{Base: dst}.Offset(-16*2), dst)
252 JNE(LabelRef("initLoop"))
253 }
254
255 func gcmAesData() {
256 Implement("gcmAesData")
257 Attributes(NOSPLIT)
258 AllocLocal(0)
259
260 var (
261 pTbl GPPhysical = RDI
262 aut = RSI
263 tPtr = RCX
264 autLen = RDX
265 )
266
267 Load(Param("productTable"), pTbl)
268 Load(Param("data").Base(), aut)
269 Load(Param("data").Len(), autLen)
270 Load(Param("T"), tPtr)
271
272 bswapMask := bswapMask_DATA()
273 gcmPoly := gcmPoly_DATA()
274 PXOR(ACC0, ACC0)
275 MOVOU(bswapMask, BSWAP)
276 MOVOU(gcmPoly, POLY)
277
278 TESTQ(autLen, autLen)
279 JEQ(LabelRef("dataBail"))
280
281 CMPQ(autLen, Imm(13))
282 JE(LabelRef("dataTLS"))
283 CMPQ(autLen, Imm(128))
284 JB(LabelRef("startSinglesLoop"))
285 JMP(LabelRef("dataOctaLoop"))
286
287 dataTLS(pTbl, aut, autLen)
288 dataOctaLoop(pTbl, aut, autLen)
289 startSinglesLoop(pTbl)
290 dataSinglesLoop(aut, autLen)
291 dataMul(aut)
292 dataEnd(aut, autLen)
293 dataLoadLoop(aut, autLen)
294 dataBail(tPtr)
295 }
296
297 func reduceRound(a VecPhysical) {
298 MOVOU(POLY, T0)
299 PCLMULQDQ(Imm(0x01), a, T0)
300 PSHUFD(Imm(78), a, a)
301 PXOR(T0, a)
302 }
303
304 func mulRoundAAD(X VecPhysical, i int, pTbl GPPhysical) {
305 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
306 MOVOU(T1, T2)
307 PCLMULQDQ(Imm(0x00), X, T1)
308 PXOR(T1, ACC0)
309 PCLMULQDQ(Imm(0x11), X, T2)
310 PXOR(T2, ACC1)
311 PSHUFD(Imm(78), X, T1)
312 PXOR(T1, X)
313 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T1)
314 PCLMULQDQ(Imm(0x00), X, T1)
315 PXOR(T1, ACCM)
316 }
317
318 func dataTLS(pTbl, aut, autLen GPPhysical) {
319 Label("dataTLS")
320 MOVOU(Mem{Base: pTbl}.Offset(16*14), T1)
321 MOVOU(Mem{Base: pTbl}.Offset(16*15), T2)
322 PXOR(B0, B0)
323 MOVQ(Mem{Base: aut}, B0)
324 PINSRD(Imm(2), Mem{Base: aut}.Offset(8), B0)
325 PINSRB(Imm(12), Mem{Base: aut}.Offset(12), B0)
326 XORQ(autLen, autLen)
327 JMP(LabelRef("dataMul"))
328 }
329
330 func dataOctaLoop(pTbl, aut, autLen GPPhysical) {
331 Label("dataOctaLoop")
332 CMPQ(autLen, Imm(128))
333 JB(LabelRef("startSinglesLoop"))
334 SUBQ(Imm(128), autLen)
335
336 MOVOU(Mem{Base: aut}.Offset(16*0), X0)
337 MOVOU(Mem{Base: aut}.Offset(16*1), X1)
338 MOVOU(Mem{Base: aut}.Offset(16*2), X2)
339 MOVOU(Mem{Base: aut}.Offset(16*3), X3)
340 MOVOU(Mem{Base: aut}.Offset(16*4), X4)
341 MOVOU(Mem{Base: aut}.Offset(16*5), X5)
342 MOVOU(Mem{Base: aut}.Offset(16*6), X6)
343 MOVOU(Mem{Base: aut}.Offset(16*7), X7)
344 LEAQ(Mem{Base: aut}.Offset(16*8), aut)
345 PSHUFB(BSWAP, X0)
346 PSHUFB(BSWAP, X1)
347 PSHUFB(BSWAP, X2)
348 PSHUFB(BSWAP, X3)
349 PSHUFB(BSWAP, X4)
350 PSHUFB(BSWAP, X5)
351 PSHUFB(BSWAP, X6)
352 PSHUFB(BSWAP, X7)
353 PXOR(ACC0, X0)
354
355 MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
356 MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
357 MOVOU(ACC0, ACC1)
358 PSHUFD(Imm(78), X0, T1)
359 PXOR(X0, T1)
360 PCLMULQDQ(Imm(0x00), X0, ACC0)
361 PCLMULQDQ(Imm(0x11), X0, ACC1)
362 PCLMULQDQ(Imm(0x00), T1, ACCM)
363
364 mulRoundAAD(X1, 1, pTbl)
365 mulRoundAAD(X2, 2, pTbl)
366 mulRoundAAD(X3, 3, pTbl)
367 mulRoundAAD(X4, 4, pTbl)
368 mulRoundAAD(X5, 5, pTbl)
369 mulRoundAAD(X6, 6, pTbl)
370 mulRoundAAD(X7, 7, pTbl)
371
372 PXOR(ACC0, ACCM)
373 PXOR(ACC1, ACCM)
374 MOVOU(ACCM, T0)
375 PSRLDQ(Imm(8), ACCM)
376 PSLLDQ(Imm(8), T0)
377 PXOR(ACCM, ACC1)
378 PXOR(T0, ACC0)
379 reduceRound(ACC0)
380 reduceRound(ACC0)
381 PXOR(ACC1, ACC0)
382 JMP(LabelRef("dataOctaLoop"))
383 }
384
385 func startSinglesLoop(pTbl GPPhysical) {
386 Label("startSinglesLoop")
387 MOVOU(Mem{Base: pTbl}.Offset(16*14), T1)
388 MOVOU(Mem{Base: pTbl}.Offset(16*15), T2)
389
390 }
391
392 func dataSinglesLoop(aut, autLen GPPhysical) {
393 Label("dataSinglesLoop")
394
395 CMPQ(autLen, Imm(16))
396 JB(LabelRef("dataEnd"))
397 SUBQ(Imm(16), autLen)
398
399 MOVOU(Mem{Base: aut}, B0)
400 }
401
402 func dataMul(aut GPPhysical) {
403 Label("dataMul")
404 PSHUFB(BSWAP, B0)
405 PXOR(ACC0, B0)
406
407 MOVOU(T1, ACC0)
408 MOVOU(T2, ACCM)
409 MOVOU(T1, ACC1)
410
411 PSHUFD(Imm(78), B0, T0)
412 PXOR(B0, T0)
413 PCLMULQDQ(Imm(0x00), B0, ACC0)
414 PCLMULQDQ(Imm(0x11), B0, ACC1)
415 PCLMULQDQ(Imm(0x00), T0, ACCM)
416
417 PXOR(ACC0, ACCM)
418 PXOR(ACC1, ACCM)
419 MOVOU(ACCM, T0)
420 PSRLDQ(Imm(8), ACCM)
421 PSLLDQ(Imm(8), T0)
422 PXOR(ACCM, ACC1)
423 PXOR(T0, ACC0)
424
425 MOVOU(POLY, T0)
426 PCLMULQDQ(Imm(0x01), ACC0, T0)
427 PSHUFD(Imm(78), ACC0, ACC0)
428 PXOR(T0, ACC0)
429
430 MOVOU(POLY, T0)
431 PCLMULQDQ(Imm(0x01), ACC0, T0)
432 PSHUFD(Imm(78), ACC0, ACC0)
433 PXOR(T0, ACC0)
434 PXOR(ACC1, ACC0)
435
436 LEAQ(Mem{Base: aut}.Offset(16), aut)
437
438 JMP(LabelRef("dataSinglesLoop"))
439 }
440
441 func dataEnd(aut, autLen GPPhysical) {
442 Label("dataEnd")
443
444 TESTQ(autLen, autLen)
445 JEQ(LabelRef("dataBail"))
446
447 PXOR(B0, B0)
448
449 LEAQ(Mem{Base: aut, Index: autLen, Scale: 1}.Offset(-1), aut)
450 }
451
452 func dataLoadLoop(aut, autLen GPPhysical) {
453 Label("dataLoadLoop")
454
455 PSLLDQ(Imm(1), B0)
456 PINSRB(Imm(0), Mem{Base: aut}, B0)
457
458 LEAQ(Mem{Base: aut}.Offset(-1), aut)
459 DECQ(autLen)
460 JNE(LabelRef("dataLoadLoop"))
461
462 JMP(LabelRef("dataMul"))
463 }
464
465 func dataBail(tPtr GPPhysical) {
466 Label("dataBail")
467 MOVOU(ACC0, Mem{Base: tPtr})
468 RET()
469 }
470
471 func gcmAesEnc() {
472 Implement("gcmAesEnc")
473 Attributes(0)
474 AllocLocal(256)
475
476 var (
477 pTbl GPPhysical = RDI
478 ctx = RDX
479 ctrPtr = RCX
480 ptx = RSI
481 ks = RAX
482 tPtr = R8
483 ptxLen = R9
484 aluCTR = R10L
485 aluTMP = R11L
486 aluK = R12L
487 NR = R13
488 )
489
490 Load(Param("productTable"), pTbl)
491 Load(Param("dst").Base(), ctx)
492 Load(Param("src").Base(), ptx)
493 Load(Param("src").Len(), ptxLen)
494 Load(Param("ctr"), ctrPtr)
495 Load(Param("T"), tPtr)
496 Load(Param("ks").Base(), ks)
497 Load(Param("ks").Len(), NR)
498
499 SHRQ(Imm(2), NR)
500 DECQ(NR)
501
502 bswapMask := bswapMask_DATA()
503 gcmPoly := gcmPoly_DATA()
504 MOVOU(bswapMask, BSWAP)
505 MOVOU(gcmPoly, POLY)
506
507 MOVOU(Mem{Base: tPtr}, ACC0)
508 PXOR(ACC1, ACC1)
509 PXOR(ACCM, ACCM)
510 MOVOU(Mem{Base: ctrPtr}, B0)
511 MOVL(Mem{Base: ctrPtr}.Offset(3*4), aluCTR)
512 MOVOU(Mem{Base: ks}, T0)
513 MOVL(Mem{Base: ks}.Offset(3*4), aluK)
514 BSWAPL(aluCTR)
515 BSWAPL(aluK)
516
517 PXOR(B0, T0)
518 MOVOU(T0, Mem{Base: SP}.Offset(8*16+0*16))
519 incrementEnc(0, aluCTR, aluTMP, aluK)
520
521 CMPQ(ptxLen, Imm(128))
522 JB(LabelRef("gcmAesEncSingles"))
523 SUBQ(Imm(128), ptxLen)
524
525 Comment("We have at least 8 blocks to encrypt, prepare the rest of the counters")
526 MOVOU(T0, Mem{Base: SP}.Offset(8*16+1*16))
527 incrementEnc(1, aluCTR, aluTMP, aluK)
528 MOVOU(T0, Mem{Base: SP}.Offset(8*16+2*16))
529 incrementEnc(2, aluCTR, aluTMP, aluK)
530 MOVOU(T0, Mem{Base: SP}.Offset(8*16+3*16))
531 incrementEnc(3, aluCTR, aluTMP, aluK)
532 MOVOU(T0, Mem{Base: SP}.Offset(8*16+4*16))
533 incrementEnc(4, aluCTR, aluTMP, aluK)
534 MOVOU(T0, Mem{Base: SP}.Offset(8*16+5*16))
535 incrementEnc(5, aluCTR, aluTMP, aluK)
536 MOVOU(T0, Mem{Base: SP}.Offset(8*16+6*16))
537 incrementEnc(6, aluCTR, aluTMP, aluK)
538 MOVOU(T0, Mem{Base: SP}.Offset(8*16+7*16))
539 incrementEnc(7, aluCTR, aluTMP, aluK)
540
541 MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
542 MOVOU(Mem{Base: SP}.Offset(8*16+1*16), B1)
543 MOVOU(Mem{Base: SP}.Offset(8*16+2*16), B2)
544 MOVOU(Mem{Base: SP}.Offset(8*16+3*16), B3)
545 MOVOU(Mem{Base: SP}.Offset(8*16+4*16), B4)
546 MOVOU(Mem{Base: SP}.Offset(8*16+5*16), B5)
547 MOVOU(Mem{Base: SP}.Offset(8*16+6*16), B6)
548 MOVOU(Mem{Base: SP}.Offset(8*16+7*16), B7)
549
550 aesRound(1, ks)
551 incrementEnc(0, aluCTR, aluTMP, aluK)
552 aesRound(2, ks)
553 incrementEnc(1, aluCTR, aluTMP, aluK)
554 aesRound(3, ks)
555 incrementEnc(2, aluCTR, aluTMP, aluK)
556 aesRound(4, ks)
557 incrementEnc(3, aluCTR, aluTMP, aluK)
558 aesRound(5, ks)
559 incrementEnc(4, aluCTR, aluTMP, aluK)
560 aesRound(6, ks)
561 incrementEnc(5, aluCTR, aluTMP, aluK)
562 aesRound(7, ks)
563 incrementEnc(6, aluCTR, aluTMP, aluK)
564 aesRound(8, ks)
565 incrementEnc(7, aluCTR, aluTMP, aluK)
566 aesRound(9, ks)
567 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
568 CMPQ(NR, Imm(12))
569 JB(LabelRef("encLast1"))
570 aesRnd(T0)
571 aesRound(11, ks)
572 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
573 JE(LabelRef("encLast1"))
574 aesRnd(T0)
575 aesRound(13, ks)
576 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
577
578 encLast1(ctx, ptx)
579 gcmAesEncOctetsLoop(pTbl, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
580 encLast2(ctx, ptx)
581 gcmAesEncOctetsEnd(pTbl, ptxLen, aluCTR)
582 gcmAesEncSingles(pTbl, ks)
583 gcmAesEncSinglesLoop(ks, ptxLen, aluCTR, aluTMP, aluK, NR)
584 encLast3(pTbl, ctx, ptx)
585 gcmAesEncTail(ks, ptxLen, NR)
586 encLast4(ptx, ptxLen, aluCTR, aluTMP)
587 ptxLoadLoop(pTbl, ctx, ptx, ptxLen)
588 gcmAesEncDone(tPtr)
589 }
590
591 func incrementEnc(i int, aluCTR, aluTMP, aluK GPPhysical) {
592 ADDL(Imm(1), aluCTR)
593 MOVL(aluCTR, aluTMP)
594 XORL(aluK, aluTMP)
595 BSWAPL(aluTMP)
596 MOVL(aluTMP, Mem{Base: SP}.Offset(3*4+8*16+i*16))
597 }
598
599 func aesRnd(k VecPhysical) {
600 AESENC(k, B0)
601 AESENC(k, B1)
602 AESENC(k, B2)
603 AESENC(k, B3)
604 AESENC(k, B4)
605 AESENC(k, B5)
606 AESENC(k, B6)
607 AESENC(k, B7)
608 }
609
610 func aesRound(i int, ks GPPhysical) {
611
612 MOVOU(Mem{Base: ks}.Offset(16*i), T0)
613 AESENC(T0, B0)
614 AESENC(T0, B1)
615 AESENC(T0, B2)
616 AESENC(T0, B3)
617 AESENC(T0, B4)
618 AESENC(T0, B5)
619 AESENC(T0, B6)
620 AESENC(T0, B7)
621 }
622
623 func aesRndLast(k VecPhysical) {
624 AESENCLAST(k, B0)
625 AESENCLAST(k, B1)
626 AESENCLAST(k, B2)
627 AESENCLAST(k, B3)
628 AESENCLAST(k, B4)
629 AESENCLAST(k, B5)
630 AESENCLAST(k, B6)
631 AESENCLAST(k, B7)
632 }
633
634 func combinedRound(i int, pTbl, ks GPPhysical) {
635 MOVOU(Mem{Base: ks}.Offset(16*i), T0)
636 AESENC(T0, B0)
637 AESENC(T0, B1)
638 AESENC(T0, B2)
639 AESENC(T0, B3)
640 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
641 MOVOU(T1, T2)
642 AESENC(T0, B4)
643 AESENC(T0, B5)
644 AESENC(T0, B6)
645 AESENC(T0, B7)
646 MOVOU(Mem{Base: SP}.Offset(16*i), T0)
647 PCLMULQDQ(Imm(0x00), T0, T1)
648 PXOR(T1, ACC0)
649 PSHUFD(Imm(78), T0, T1)
650 PCLMULQDQ(Imm(0x11), T0, T2)
651 PXOR(T1, T0)
652 PXOR(T2, ACC1)
653 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T2)
654 PCLMULQDQ(Imm(0x00), T2, T0)
655 PXOR(T0, ACCM)
656 }
657
658 func mulRound(i int, pTbl GPPhysical) {
659 MOVOU(Mem{Base: SP}.Offset(16*i), T0)
660 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
661 MOVOU(T1, T2)
662 PCLMULQDQ(Imm(0x00), T0, T1)
663 PXOR(T1, ACC0)
664 PCLMULQDQ(Imm(0x11), T0, T2)
665 PXOR(T2, ACC1)
666 PSHUFD(Imm(78), T0, T1)
667 PXOR(T1, T0)
668 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T1)
669 PCLMULQDQ(Imm(0x00), T0, T1)
670 PXOR(T1, ACCM)
671 }
672
673 func encLast1(ctx, ptx GPPhysical) {
674 Label("encLast1")
675 aesRndLast(T0)
676
677 MOVOU(Mem{Base: ptx}.Offset(16*0), T0)
678 PXOR(T0, B0)
679 MOVOU(Mem{Base: ptx}.Offset(16*1), T0)
680 PXOR(T0, B1)
681 MOVOU(Mem{Base: ptx}.Offset(16*2), T0)
682 PXOR(T0, B2)
683 MOVOU(Mem{Base: ptx}.Offset(16*3), T0)
684 PXOR(T0, B3)
685 MOVOU(Mem{Base: ptx}.Offset(16*4), T0)
686 PXOR(T0, B4)
687 MOVOU(Mem{Base: ptx}.Offset(16*5), T0)
688 PXOR(T0, B5)
689 MOVOU(Mem{Base: ptx}.Offset(16*6), T0)
690 PXOR(T0, B6)
691 MOVOU(Mem{Base: ptx}.Offset(16*7), T0)
692 PXOR(T0, B7)
693
694 MOVOU(B0, Mem{Base: ctx}.Offset(16*0))
695 PSHUFB(BSWAP, B0)
696 PXOR(ACC0, B0)
697 MOVOU(B1, Mem{Base: ctx}.Offset(16*1))
698 PSHUFB(BSWAP, B1)
699 MOVOU(B2, Mem{Base: ctx}.Offset(16*2))
700 PSHUFB(BSWAP, B2)
701 MOVOU(B3, Mem{Base: ctx}.Offset(16*3))
702 PSHUFB(BSWAP, B3)
703 MOVOU(B4, Mem{Base: ctx}.Offset(16*4))
704 PSHUFB(BSWAP, B4)
705 MOVOU(B5, Mem{Base: ctx}.Offset(16*5))
706 PSHUFB(BSWAP, B5)
707 MOVOU(B6, Mem{Base: ctx}.Offset(16*6))
708 PSHUFB(BSWAP, B6)
709 MOVOU(B7, Mem{Base: ctx}.Offset(16*7))
710 PSHUFB(BSWAP, B7)
711
712 MOVOU(B0, Mem{Base: SP}.Offset(16*0))
713 MOVOU(B1, Mem{Base: SP}.Offset(16*1))
714 MOVOU(B2, Mem{Base: SP}.Offset(16*2))
715 MOVOU(B3, Mem{Base: SP}.Offset(16*3))
716 MOVOU(B4, Mem{Base: SP}.Offset(16*4))
717 MOVOU(B5, Mem{Base: SP}.Offset(16*5))
718 MOVOU(B6, Mem{Base: SP}.Offset(16*6))
719 MOVOU(B7, Mem{Base: SP}.Offset(16*7))
720
721 LEAQ(Mem{Base: ptx}.Offset(128), ptx)
722 LEAQ(Mem{Base: ctx}.Offset(128), ctx)
723 }
724
725 func gcmAesEncOctetsLoop(pTbl, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
726 Label("gcmAesEncOctetsLoop")
727
728 CMPQ(ptxLen, Imm(128))
729 JB(LabelRef("gcmAesEncOctetsEnd"))
730 SUBQ(Imm(128), ptxLen)
731
732 MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
733 MOVOU(Mem{Base: SP}.Offset(8*16+1*16), B1)
734 MOVOU(Mem{Base: SP}.Offset(8*16+2*16), B2)
735 MOVOU(Mem{Base: SP}.Offset(8*16+3*16), B3)
736 MOVOU(Mem{Base: SP}.Offset(8*16+4*16), B4)
737 MOVOU(Mem{Base: SP}.Offset(8*16+5*16), B5)
738 MOVOU(Mem{Base: SP}.Offset(8*16+6*16), B6)
739 MOVOU(Mem{Base: SP}.Offset(8*16+7*16), B7)
740
741 MOVOU(Mem{Base: SP}.Offset(16*0), T0)
742 PSHUFD(Imm(78), T0, T1)
743 PXOR(T0, T1)
744
745 MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
746 MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
747 MOVOU(ACC0, ACC1)
748
749 PCLMULQDQ(Imm(0x00), T1, ACCM)
750 PCLMULQDQ(Imm(0x00), T0, ACC0)
751 PCLMULQDQ(Imm(0x11), T0, ACC1)
752
753 combinedRound(1, pTbl, ks)
754 incrementEnc(0, aluCTR, aluTMP, aluK)
755 combinedRound(2, pTbl, ks)
756 incrementEnc(1, aluCTR, aluTMP, aluK)
757 combinedRound(3, pTbl, ks)
758 incrementEnc(2, aluCTR, aluTMP, aluK)
759 combinedRound(4, pTbl, ks)
760 incrementEnc(3, aluCTR, aluTMP, aluK)
761 combinedRound(5, pTbl, ks)
762 incrementEnc(4, aluCTR, aluTMP, aluK)
763 combinedRound(6, pTbl, ks)
764 incrementEnc(5, aluCTR, aluTMP, aluK)
765 combinedRound(7, pTbl, ks)
766 incrementEnc(6, aluCTR, aluTMP, aluK)
767
768 aesRound(8, ks)
769 incrementEnc(7, aluCTR, aluTMP, aluK)
770
771 PXOR(ACC0, ACCM)
772 PXOR(ACC1, ACCM)
773 MOVOU(ACCM, T0)
774 PSRLDQ(Imm(8), ACCM)
775 PSLLDQ(Imm(8), T0)
776 PXOR(ACCM, ACC1)
777 PXOR(T0, ACC0)
778
779 reduceRound(ACC0)
780 aesRound(9, ks)
781
782 reduceRound(ACC0)
783 PXOR(ACC1, ACC0)
784
785 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
786 CMPQ(NR, Imm(12))
787 JB(LabelRef("encLast2"))
788 aesRnd(T0)
789 aesRound(11, ks)
790 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
791 JE(LabelRef("encLast2"))
792 aesRnd(T0)
793 aesRound(13, ks)
794 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
795 }
796
797 func encLast2(ctx, ptx GPPhysical) {
798 Label("encLast2")
799 aesRndLast(T0)
800
801 MOVOU(Mem{Base: ptx}.Offset(16*0), T0)
802 PXOR(T0, B0)
803 MOVOU(Mem{Base: ptx}.Offset(16*1), T0)
804 PXOR(T0, B1)
805 MOVOU(Mem{Base: ptx}.Offset(16*2), T0)
806 PXOR(T0, B2)
807 MOVOU(Mem{Base: ptx}.Offset(16*3), T0)
808 PXOR(T0, B3)
809 MOVOU(Mem{Base: ptx}.Offset(16*4), T0)
810 PXOR(T0, B4)
811 MOVOU(Mem{Base: ptx}.Offset(16*5), T0)
812 PXOR(T0, B5)
813 MOVOU(Mem{Base: ptx}.Offset(16*6), T0)
814 PXOR(T0, B6)
815 MOVOU(Mem{Base: ptx}.Offset(16*7), T0)
816 PXOR(T0, B7)
817
818 MOVOU(B0, Mem{Base: ctx}.Offset(16*0))
819 PSHUFB(BSWAP, B0)
820 PXOR(ACC0, B0)
821 MOVOU(B1, Mem{Base: ctx}.Offset(16*1))
822 PSHUFB(BSWAP, B1)
823 MOVOU(B2, Mem{Base: ctx}.Offset(16*2))
824 PSHUFB(BSWAP, B2)
825 MOVOU(B3, Mem{Base: ctx}.Offset(16*3))
826 PSHUFB(BSWAP, B3)
827 MOVOU(B4, Mem{Base: ctx}.Offset(16*4))
828 PSHUFB(BSWAP, B4)
829 MOVOU(B5, Mem{Base: ctx}.Offset(16*5))
830 PSHUFB(BSWAP, B5)
831 MOVOU(B6, Mem{Base: ctx}.Offset(16*6))
832 PSHUFB(BSWAP, B6)
833 MOVOU(B7, Mem{Base: ctx}.Offset(16*7))
834 PSHUFB(BSWAP, B7)
835
836 MOVOU(B0, Mem{Base: SP}.Offset(16*0))
837 MOVOU(B1, Mem{Base: SP}.Offset(16*1))
838 MOVOU(B2, Mem{Base: SP}.Offset(16*2))
839 MOVOU(B3, Mem{Base: SP}.Offset(16*3))
840 MOVOU(B4, Mem{Base: SP}.Offset(16*4))
841 MOVOU(B5, Mem{Base: SP}.Offset(16*5))
842 MOVOU(B6, Mem{Base: SP}.Offset(16*6))
843 MOVOU(B7, Mem{Base: SP}.Offset(16*7))
844
845 LEAQ(Mem{Base: ptx}.Offset(128), ptx)
846 LEAQ(Mem{Base: ctx}.Offset(128), ctx)
847
848 JMP(LabelRef("gcmAesEncOctetsLoop"))
849 }
850
851 func gcmAesEncOctetsEnd(pTbl, ptxLen, aluCTR GPPhysical) {
852 Label("gcmAesEncOctetsEnd")
853
854 MOVOU(Mem{Base: SP}.Offset(16*0), T0)
855 MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
856 MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
857 MOVOU(ACC0, ACC1)
858 PSHUFD(Imm(78), T0, T1)
859 PXOR(T0, T1)
860 PCLMULQDQ(Imm(0x00), T0, ACC0)
861 PCLMULQDQ(Imm(0x11), T0, ACC1)
862 PCLMULQDQ(Imm(0x00), T1, ACCM)
863
864 mulRound(1, pTbl)
865 mulRound(2, pTbl)
866 mulRound(3, pTbl)
867 mulRound(4, pTbl)
868 mulRound(5, pTbl)
869 mulRound(6, pTbl)
870 mulRound(7, pTbl)
871
872 PXOR(ACC0, ACCM)
873 PXOR(ACC1, ACCM)
874 MOVOU(ACCM, T0)
875 PSRLDQ(Imm(8), ACCM)
876 PSLLDQ(Imm(8), T0)
877 PXOR(ACCM, ACC1)
878 PXOR(T0, ACC0)
879
880 reduceRound(ACC0)
881 reduceRound(ACC0)
882 PXOR(ACC1, ACC0)
883
884 TESTQ(ptxLen, ptxLen)
885 JE(LabelRef("gcmAesEncDone"))
886
887
888
889 Instruction(&ir.Instruction{Opcode: "SUBQ", Operands: []Op{Imm(7), aluCTR}})
890 }
891
892 func gcmAesEncSingles(pTbl, ks GPPhysical) {
893 Label("gcmAesEncSingles")
894
895 MOVOU(Mem{Base: ks}.Offset(16*1), B1)
896 MOVOU(Mem{Base: ks}.Offset(16*2), B2)
897 MOVOU(Mem{Base: ks}.Offset(16*3), B3)
898 MOVOU(Mem{Base: ks}.Offset(16*4), B4)
899 MOVOU(Mem{Base: ks}.Offset(16*5), B5)
900 MOVOU(Mem{Base: ks}.Offset(16*6), B6)
901 MOVOU(Mem{Base: ks}.Offset(16*7), B7)
902
903 MOVOU(Mem{Base: pTbl}.Offset(16*14), T2)
904 }
905
906 func gcmAesEncSinglesLoop(ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
907 Label("gcmAesEncSinglesLoop")
908
909 CMPQ(ptxLen, Imm(16))
910 JB(LabelRef("gcmAesEncTail"))
911 SUBQ(Imm(16), ptxLen)
912
913 MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
914 incrementEnc(0, aluCTR, aluTMP, aluK)
915
916 AESENC(B1, B0)
917 AESENC(B2, B0)
918 AESENC(B3, B0)
919 AESENC(B4, B0)
920 AESENC(B5, B0)
921 AESENC(B6, B0)
922 AESENC(B7, B0)
923 MOVOU(Mem{Base: ks}.Offset(16*8), T0)
924 AESENC(T0, B0)
925 MOVOU(Mem{Base: ks}.Offset(16*9), T0)
926 AESENC(T0, B0)
927 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
928 CMPQ(NR, Imm(12))
929 JB(LabelRef("encLast3"))
930 AESENC(T0, B0)
931 MOVOU(Mem{Base: ks}.Offset(16*11), T0)
932 AESENC(T0, B0)
933 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
934 JE(LabelRef("encLast3"))
935 AESENC(T0, B0)
936 MOVOU(Mem{Base: ks}.Offset(16*13), T0)
937 AESENC(T0, B0)
938 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
939 }
940
941 func encLast3(pTbl, ctx, ptx GPPhysical) {
942 Label("encLast3")
943 AESENCLAST(T0, B0)
944
945 MOVOU(Mem{Base: ptx}, T0)
946 PXOR(T0, B0)
947 MOVOU(B0, Mem{Base: ctx})
948
949 PSHUFB(BSWAP, B0)
950 PXOR(ACC0, B0)
951
952 MOVOU(T2, ACC0)
953 MOVOU(T2, ACC1)
954 MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
955
956 PSHUFD(Imm(78), B0, T0)
957 PXOR(B0, T0)
958 PCLMULQDQ(Imm(0x00), B0, ACC0)
959 PCLMULQDQ(Imm(0x11), B0, ACC1)
960 PCLMULQDQ(Imm(0x00), T0, ACCM)
961
962 PXOR(ACC0, ACCM)
963 PXOR(ACC1, ACCM)
964 MOVOU(ACCM, T0)
965 PSRLDQ(Imm(8), ACCM)
966 PSLLDQ(Imm(8), T0)
967 PXOR(ACCM, ACC1)
968 PXOR(T0, ACC0)
969
970 reduceRound(ACC0)
971 reduceRound(ACC0)
972 PXOR(ACC1, ACC0)
973
974 LEAQ(Mem{Base: ptx}.Offset(16*1), ptx)
975 LEAQ(Mem{Base: ctx}.Offset(16*1), ctx)
976
977 JMP(LabelRef("gcmAesEncSinglesLoop"))
978 }
979
980 func gcmAesEncTail(ks, ptxLen, NR GPPhysical) {
981 Label("gcmAesEncTail")
982 TESTQ(ptxLen, ptxLen)
983 JE(LabelRef("gcmAesEncDone"))
984
985 MOVOU(Mem{Base: SP}.Offset(8*16+0*16), B0)
986 AESENC(B1, B0)
987 AESENC(B2, B0)
988 AESENC(B3, B0)
989 AESENC(B4, B0)
990 AESENC(B5, B0)
991 AESENC(B6, B0)
992 AESENC(B7, B0)
993 MOVOU(Mem{Base: ks}.Offset(16*8), T0)
994 AESENC(T0, B0)
995 MOVOU(Mem{Base: ks}.Offset(16*9), T0)
996 AESENC(T0, B0)
997 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
998 CMPQ(NR, Imm(12))
999 JB(LabelRef("encLast4"))
1000 AESENC(T0, B0)
1001 MOVOU(Mem{Base: ks}.Offset(16*11), T0)
1002 AESENC(T0, B0)
1003 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
1004 JE(LabelRef("encLast4"))
1005 AESENC(T0, B0)
1006 MOVOU(Mem{Base: ks}.Offset(16*13), T0)
1007 AESENC(T0, B0)
1008 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
1009 }
1010
1011 func encLast4(ptx, ptxLen, aluCTR, aluTMP GPPhysical) {
1012 Label("encLast4")
1013 AESENCLAST(T0, B0)
1014 MOVOU(B0, T0)
1015
1016 LEAQ(Mem{Base: ptx, Index: ptxLen, Scale: 1}.Offset(-1), ptx)
1017
1018
1019
1020 Instruction(&ir.Instruction{Opcode: "MOVQ", Operands: []Op{ptxLen, aluTMP}})
1021
1022
1023 Instruction(&ir.Instruction{Opcode: "SHLQ", Operands: []Op{Imm(4), aluTMP}})
1024
1025 andMask := andMask_DATA()
1026
1027
1028 Instruction(&ir.Instruction{Opcode: "LEAQ", Operands: []Op{andMask, aluCTR}})
1029 MOVOU(Mem{Base: aluCTR, Index: aluTMP, Scale: 1}.Offset(-16), T1)
1030
1031 PXOR(B0, B0)
1032 }
1033
1034 func ptxLoadLoop(pTbl, ctx, ptx, ptxLen GPPhysical) {
1035 Label("ptxLoadLoop")
1036 PSLLDQ(Imm(1), B0)
1037 PINSRB(Imm(0), Mem{Base: ptx}, B0)
1038 LEAQ(Mem{Base: ptx}.Offset(-1), ptx)
1039 DECQ(ptxLen)
1040 JNE(LabelRef("ptxLoadLoop"))
1041
1042 PXOR(T0, B0)
1043 PAND(T1, B0)
1044 MOVOU(B0, Mem{Base: ctx})
1045
1046 PSHUFB(BSWAP, B0)
1047 PXOR(ACC0, B0)
1048
1049 MOVOU(T2, ACC0)
1050 MOVOU(T2, ACC1)
1051 MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
1052
1053 PSHUFD(Imm(78), B0, T0)
1054 PXOR(B0, T0)
1055 PCLMULQDQ(Imm(0x00), B0, ACC0)
1056 PCLMULQDQ(Imm(0x11), B0, ACC1)
1057 PCLMULQDQ(Imm(0x00), T0, ACCM)
1058
1059 PXOR(ACC0, ACCM)
1060 PXOR(ACC1, ACCM)
1061 MOVOU(ACCM, T0)
1062 PSRLDQ(Imm(8), ACCM)
1063 PSLLDQ(Imm(8), T0)
1064 PXOR(ACCM, ACC1)
1065 PXOR(T0, ACC0)
1066
1067 reduceRound(ACC0)
1068 reduceRound(ACC0)
1069 PXOR(ACC1, ACC0)
1070 }
1071
1072 func gcmAesEncDone(tPtr GPPhysical) {
1073 Label("gcmAesEncDone")
1074 MOVOU(ACC0, Mem{Base: tPtr})
1075 RET()
1076 }
1077
1078 func gcmAesDec() {
1079 Implement("gcmAesDec")
1080 Attributes(0)
1081 AllocLocal(128)
1082
1083 var (
1084 pTbl GPPhysical = RDI
1085 ctx = RDX
1086 ctrPtr = RCX
1087 ptx = RSI
1088 ks = RAX
1089 tPtr = R8
1090 ptxLen = R9
1091 aluCTR = R10L
1092 aluTMP = R11L
1093 aluK = R12L
1094 NR = R13
1095 )
1096
1097 Load(Param("productTable"), pTbl)
1098 Load(Param("dst").Base(), ptx)
1099 Load(Param("src").Base(), ctx)
1100 Load(Param("src").Len(), ptxLen)
1101 Load(Param("ctr"), ctrPtr)
1102 Load(Param("T"), tPtr)
1103 Load(Param("ks").Base(), ks)
1104 Load(Param("ks").Len(), NR)
1105
1106 SHRQ(Imm(2), NR)
1107 DECQ(NR)
1108
1109 bswapMask := bswapMask_DATA()
1110 gcmPoly := gcmPoly_DATA()
1111 MOVOU(bswapMask, BSWAP)
1112 MOVOU(gcmPoly, POLY)
1113
1114 MOVOU(Mem{Base: tPtr}, ACC0)
1115 PXOR(ACC1, ACC1)
1116 PXOR(ACCM, ACCM)
1117 MOVOU(Mem{Base: ctrPtr}, B0)
1118 MOVL(Mem{Base: ctrPtr}.Offset(3*4), aluCTR)
1119 MOVOU(Mem{Base: ks}, T0)
1120 MOVL(Mem{Base: ks}.Offset(3*4), aluK)
1121 BSWAPL(aluCTR)
1122 BSWAPL(aluK)
1123
1124 PXOR(B0, T0)
1125 MOVOU(T0, Mem{Base: SP}.Offset(0*16))
1126 incrementDec(0, aluCTR, aluTMP, aluK)
1127
1128 CMPQ(ptxLen, Imm(128))
1129 JB(LabelRef("gcmAesDecSingles"))
1130
1131 MOVOU(T0, Mem{Base: SP}.Offset(1*16))
1132 incrementDec(1, aluCTR, aluTMP, aluK)
1133 MOVOU(T0, Mem{Base: SP}.Offset(2*16))
1134 incrementDec(2, aluCTR, aluTMP, aluK)
1135 MOVOU(T0, Mem{Base: SP}.Offset(3*16))
1136 incrementDec(3, aluCTR, aluTMP, aluK)
1137 MOVOU(T0, Mem{Base: SP}.Offset(4*16))
1138 incrementDec(4, aluCTR, aluTMP, aluK)
1139 MOVOU(T0, Mem{Base: SP}.Offset(5*16))
1140 incrementDec(5, aluCTR, aluTMP, aluK)
1141 MOVOU(T0, Mem{Base: SP}.Offset(6*16))
1142 incrementDec(6, aluCTR, aluTMP, aluK)
1143 MOVOU(T0, Mem{Base: SP}.Offset(7*16))
1144 incrementDec(7, aluCTR, aluTMP, aluK)
1145
1146 gcmAesDecOctetsLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
1147 decLast1(ctx, ptx)
1148 gcmAesDecEndOctets(aluCTR)
1149 gcmAesDecSingles(pTbl, ks)
1150 gcmAesDecSinglesLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
1151 decLast2(ctx, ptx)
1152 gcmAesDecTail(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR)
1153 decLast3()
1154 ptxStoreLoop(ptx, ptxLen)
1155 gcmAesDecDone(tPtr)
1156 }
1157
1158 func incrementDec(i int, aluCTR, aluTMP, aluK GPPhysical) {
1159 ADDL(Imm(1), aluCTR)
1160 MOVL(aluCTR, aluTMP)
1161 XORL(aluK, aluTMP)
1162 BSWAPL(aluTMP)
1163 MOVL(aluTMP, Mem{Base: SP}.Offset(3*4+i*16))
1164 }
1165
1166 func combinedDecRound(i int, pTbl, ctx, ks GPPhysical) {
1167 MOVOU(Mem{Base: ks}.Offset(16*i), T0)
1168 AESENC(T0, B0)
1169 AESENC(T0, B1)
1170 AESENC(T0, B2)
1171 AESENC(T0, B3)
1172 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2)), T1)
1173 MOVOU(T1, T2)
1174 AESENC(T0, B4)
1175 AESENC(T0, B5)
1176 AESENC(T0, B6)
1177 AESENC(T0, B7)
1178 MOVOU(Mem{Base: ctx}.Offset(16*i), T0)
1179 PSHUFB(BSWAP, T0)
1180 PCLMULQDQ(Imm(0x00), T0, T1)
1181 PXOR(T1, ACC0)
1182 PSHUFD(Imm(78), T0, T1)
1183 PCLMULQDQ(Imm(0x11), T0, T2)
1184 PXOR(T1, T0)
1185 PXOR(T2, ACC1)
1186 MOVOU(Mem{Base: pTbl}.Offset(16*(i*2+1)), T2)
1187 PCLMULQDQ(Imm(0x00), T2, T0)
1188 PXOR(T0, ACCM)
1189 }
1190
1191 func gcmAesDecOctetsLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
1192 Label("gcmAesDecOctetsLoop")
1193
1194 CMPQ(ptxLen, Imm(128))
1195 JB(LabelRef("gcmAesDecEndOctets"))
1196 SUBQ(Imm(128), ptxLen)
1197
1198 MOVOU(Mem{Base: SP}.Offset(0*16), B0)
1199 MOVOU(Mem{Base: SP}.Offset(1*16), B1)
1200 MOVOU(Mem{Base: SP}.Offset(2*16), B2)
1201 MOVOU(Mem{Base: SP}.Offset(3*16), B3)
1202 MOVOU(Mem{Base: SP}.Offset(4*16), B4)
1203 MOVOU(Mem{Base: SP}.Offset(5*16), B5)
1204 MOVOU(Mem{Base: SP}.Offset(6*16), B6)
1205 MOVOU(Mem{Base: SP}.Offset(7*16), B7)
1206
1207 MOVOU(Mem{Base: ctx}.Offset(16*0), T0)
1208 PSHUFB(BSWAP, T0)
1209 PXOR(ACC0, T0)
1210 PSHUFD(Imm(78), T0, T1)
1211 PXOR(T0, T1)
1212
1213 MOVOU(Mem{Base: pTbl}.Offset(16*0), ACC0)
1214 MOVOU(Mem{Base: pTbl}.Offset(16*1), ACCM)
1215 MOVOU(ACC0, ACC1)
1216
1217 PCLMULQDQ(Imm(0x00), T1, ACCM)
1218 PCLMULQDQ(Imm(0x00), T0, ACC0)
1219 PCLMULQDQ(Imm(0x11), T0, ACC1)
1220
1221 combinedDecRound(1, pTbl, ctx, ks)
1222 incrementDec(0, aluCTR, aluTMP, aluK)
1223 combinedDecRound(2, pTbl, ctx, ks)
1224 incrementDec(1, aluCTR, aluTMP, aluK)
1225 combinedDecRound(3, pTbl, ctx, ks)
1226 incrementDec(2, aluCTR, aluTMP, aluK)
1227 combinedDecRound(4, pTbl, ctx, ks)
1228 incrementDec(3, aluCTR, aluTMP, aluK)
1229 combinedDecRound(5, pTbl, ctx, ks)
1230 incrementDec(4, aluCTR, aluTMP, aluK)
1231 combinedDecRound(6, pTbl, ctx, ks)
1232 incrementDec(5, aluCTR, aluTMP, aluK)
1233 combinedDecRound(7, pTbl, ctx, ks)
1234 incrementDec(6, aluCTR, aluTMP, aluK)
1235
1236 aesRound(8, ks)
1237 incrementDec(7, aluCTR, aluTMP, aluK)
1238
1239 PXOR(ACC0, ACCM)
1240 PXOR(ACC1, ACCM)
1241 MOVOU(ACCM, T0)
1242 PSRLDQ(Imm(8), ACCM)
1243 PSLLDQ(Imm(8), T0)
1244 PXOR(ACCM, ACC1)
1245 PXOR(T0, ACC0)
1246
1247 reduceRound(ACC0)
1248 aesRound(9, ks)
1249
1250 reduceRound(ACC0)
1251 PXOR(ACC1, ACC0)
1252
1253 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
1254 CMPQ(NR, Imm(12))
1255 JB(LabelRef("decLast1"))
1256 aesRnd(T0)
1257 aesRound(11, ks)
1258 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
1259 JE(LabelRef("decLast1"))
1260 aesRnd(T0)
1261 aesRound(13, ks)
1262 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
1263 }
1264
1265 func decLast1(ctx, ptx GPPhysical) {
1266 Label("decLast1")
1267 aesRndLast(T0)
1268
1269 MOVOU(Mem{Base: ctx}.Offset(16*0), T0)
1270 PXOR(T0, B0)
1271 MOVOU(Mem{Base: ctx}.Offset(16*1), T0)
1272 PXOR(T0, B1)
1273 MOVOU(Mem{Base: ctx}.Offset(16*2), T0)
1274 PXOR(T0, B2)
1275 MOVOU(Mem{Base: ctx}.Offset(16*3), T0)
1276 PXOR(T0, B3)
1277 MOVOU(Mem{Base: ctx}.Offset(16*4), T0)
1278 PXOR(T0, B4)
1279 MOVOU(Mem{Base: ctx}.Offset(16*5), T0)
1280 PXOR(T0, B5)
1281 MOVOU(Mem{Base: ctx}.Offset(16*6), T0)
1282 PXOR(T0, B6)
1283 MOVOU(Mem{Base: ctx}.Offset(16*7), T0)
1284 PXOR(T0, B7)
1285
1286 MOVOU(B0, Mem{Base: ptx}.Offset(16*0))
1287 MOVOU(B1, Mem{Base: ptx}.Offset(16*1))
1288 MOVOU(B2, Mem{Base: ptx}.Offset(16*2))
1289 MOVOU(B3, Mem{Base: ptx}.Offset(16*3))
1290 MOVOU(B4, Mem{Base: ptx}.Offset(16*4))
1291 MOVOU(B5, Mem{Base: ptx}.Offset(16*5))
1292 MOVOU(B6, Mem{Base: ptx}.Offset(16*6))
1293 MOVOU(B7, Mem{Base: ptx}.Offset(16*7))
1294
1295 LEAQ(Mem{Base: ptx}.Offset(128), ptx)
1296 LEAQ(Mem{Base: ctx}.Offset(128), ctx)
1297
1298 JMP(LabelRef("gcmAesDecOctetsLoop"))
1299 }
1300
1301 func gcmAesDecEndOctets(aluCTR GPPhysical) {
1302 Label("gcmAesDecEndOctets")
1303
1304
1305 Instruction(&ir.Instruction{Opcode: "SUBQ", Operands: []Op{Imm(7), aluCTR}})
1306 }
1307
1308 func gcmAesDecSingles(pTbl, ks GPPhysical) {
1309 Label("gcmAesDecSingles")
1310
1311 MOVOU(Mem{Base: ks}.Offset(16*1), B1)
1312 MOVOU(Mem{Base: ks}.Offset(16*2), B2)
1313 MOVOU(Mem{Base: ks}.Offset(16*3), B3)
1314 MOVOU(Mem{Base: ks}.Offset(16*4), B4)
1315 MOVOU(Mem{Base: ks}.Offset(16*5), B5)
1316 MOVOU(Mem{Base: ks}.Offset(16*6), B6)
1317 MOVOU(Mem{Base: ks}.Offset(16*7), B7)
1318
1319 MOVOU(Mem{Base: pTbl}.Offset(16*14), T2)
1320 }
1321
1322 func gcmAesDecSinglesLoop(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
1323 Label("gcmAesDecSinglesLoop")
1324
1325 CMPQ(ptxLen, Imm(16))
1326 JB(LabelRef("gcmAesDecTail"))
1327 SUBQ(Imm(16), ptxLen)
1328
1329 MOVOU(Mem{Base: ctx}, B0)
1330 MOVOU(B0, T1)
1331 PSHUFB(BSWAP, B0)
1332 PXOR(ACC0, B0)
1333
1334 MOVOU(T2, ACC0)
1335 MOVOU(T2, ACC1)
1336 MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
1337
1338 PCLMULQDQ(Imm(0x00), B0, ACC0)
1339 PCLMULQDQ(Imm(0x11), B0, ACC1)
1340 PSHUFD(Imm(78), B0, T0)
1341 PXOR(B0, T0)
1342 PCLMULQDQ(Imm(0x00), T0, ACCM)
1343
1344 PXOR(ACC0, ACCM)
1345 PXOR(ACC1, ACCM)
1346 MOVOU(ACCM, T0)
1347 PSRLDQ(Imm(8), ACCM)
1348 PSLLDQ(Imm(8), T0)
1349 PXOR(ACCM, ACC1)
1350 PXOR(T0, ACC0)
1351
1352 reduceRound(ACC0)
1353 reduceRound(ACC0)
1354 PXOR(ACC1, ACC0)
1355
1356 MOVOU(Mem{Base: SP}.Offset(0*16), B0)
1357 incrementDec(0, aluCTR, aluTMP, aluK)
1358 AESENC(B1, B0)
1359 AESENC(B2, B0)
1360 AESENC(B3, B0)
1361 AESENC(B4, B0)
1362 AESENC(B5, B0)
1363 AESENC(B6, B0)
1364 AESENC(B7, B0)
1365 MOVOU(Mem{Base: ks}.Offset(16*8), T0)
1366 AESENC(T0, B0)
1367 MOVOU(Mem{Base: ks}.Offset(16*9), T0)
1368 AESENC(T0, B0)
1369 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
1370 CMPQ(NR, Imm(12))
1371 JB(LabelRef("decLast2"))
1372 AESENC(T0, B0)
1373 MOVOU(Mem{Base: ks}.Offset(16*11), T0)
1374 AESENC(T0, B0)
1375 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
1376 JE(LabelRef("decLast2"))
1377 AESENC(T0, B0)
1378 MOVOU(Mem{Base: ks}.Offset(16*13), T0)
1379 AESENC(T0, B0)
1380 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
1381 }
1382
1383 func decLast2(ctx, ptx GPPhysical) {
1384 Label("decLast2")
1385 AESENCLAST(T0, B0)
1386
1387 PXOR(T1, B0)
1388 MOVOU(B0, Mem{Base: ptx})
1389
1390 LEAQ(Mem{Base: ptx}.Offset(16*1), ptx)
1391 LEAQ(Mem{Base: ctx}.Offset(16*1), ctx)
1392
1393 JMP(LabelRef("gcmAesDecSinglesLoop"))
1394 }
1395
1396 func gcmAesDecTail(pTbl, ctx, ks, ptxLen, aluCTR, aluTMP, aluK, NR GPPhysical) {
1397 Label("gcmAesDecTail")
1398
1399 TESTQ(ptxLen, ptxLen)
1400 JE(LabelRef("gcmAesDecDone"))
1401
1402
1403
1404 Instruction(&ir.Instruction{Opcode: "MOVQ", Operands: []Op{ptxLen, aluTMP}})
1405
1406
1407 Instruction(&ir.Instruction{Opcode: "SHLQ", Operands: []Op{Imm(4), aluTMP}})
1408
1409 andMask := andMask_DATA()
1410
1411
1412 Instruction(&ir.Instruction{Opcode: "LEAQ", Operands: []Op{andMask, aluCTR}})
1413 MOVOU(Mem{Base: aluCTR, Index: aluTMP, Scale: 1}.Offset(-16), T1)
1414
1415 MOVOU(Mem{Base: ctx}, B0)
1416 PAND(T1, B0)
1417
1418 MOVOU(B0, T1)
1419 PSHUFB(BSWAP, B0)
1420 PXOR(ACC0, B0)
1421
1422 MOVOU(Mem{Base: pTbl}.Offset(16*14), ACC0)
1423 MOVOU(Mem{Base: pTbl}.Offset(16*15), ACCM)
1424 MOVOU(ACC0, ACC1)
1425
1426 PCLMULQDQ(Imm(0x00), B0, ACC0)
1427 PCLMULQDQ(Imm(0x11), B0, ACC1)
1428 PSHUFD(Imm(78), B0, T0)
1429 PXOR(B0, T0)
1430 PCLMULQDQ(Imm(0x00), T0, ACCM)
1431
1432 PXOR(ACC0, ACCM)
1433 PXOR(ACC1, ACCM)
1434 MOVOU(ACCM, T0)
1435 PSRLDQ(Imm(8), ACCM)
1436 PSLLDQ(Imm(8), T0)
1437 PXOR(ACCM, ACC1)
1438 PXOR(T0, ACC0)
1439
1440 reduceRound(ACC0)
1441 reduceRound(ACC0)
1442 PXOR(ACC1, ACC0)
1443
1444 MOVOU(Mem{Base: SP}.Offset(0*16), B0)
1445 incrementDec(0, aluCTR, aluTMP, aluK)
1446 AESENC(B1, B0)
1447 AESENC(B2, B0)
1448 AESENC(B3, B0)
1449 AESENC(B4, B0)
1450 AESENC(B5, B0)
1451 AESENC(B6, B0)
1452 AESENC(B7, B0)
1453 MOVOU(Mem{Base: ks}.Offset(16*8), T0)
1454 AESENC(T0, B0)
1455 MOVOU(Mem{Base: ks}.Offset(16*9), T0)
1456 AESENC(T0, B0)
1457 MOVOU(Mem{Base: ks}.Offset(16*10), T0)
1458 CMPQ(NR, Imm(12))
1459 JB(LabelRef("decLast3"))
1460 AESENC(T0, B0)
1461 MOVOU(Mem{Base: ks}.Offset(16*11), T0)
1462 AESENC(T0, B0)
1463 MOVOU(Mem{Base: ks}.Offset(16*12), T0)
1464 JE(LabelRef("decLast3"))
1465 AESENC(T0, B0)
1466 MOVOU(Mem{Base: ks}.Offset(16*13), T0)
1467 AESENC(T0, B0)
1468 MOVOU(Mem{Base: ks}.Offset(16*14), T0)
1469 }
1470
1471 func decLast3() {
1472 Label("decLast3")
1473 AESENCLAST(T0, B0)
1474 PXOR(T1, B0)
1475 }
1476
1477 func ptxStoreLoop(ptx, ptxLen GPPhysical) {
1478 Label("ptxStoreLoop")
1479 PEXTRB(Imm(0), B0, Mem{Base: ptx})
1480 PSRLDQ(Imm(1), B0)
1481 LEAQ(Mem{Base: ptx}.Offset(1), ptx)
1482 DECQ(ptxLen)
1483
1484 JNE(LabelRef("ptxStoreLoop"))
1485 }
1486
1487 func gcmAesDecDone(tPtr GPPhysical) {
1488 Label("gcmAesDecDone")
1489 MOVOU(ACC0, Mem{Base: tPtr})
1490 RET()
1491 }
1492
1493
1494
1495 var bswapMask_DATA_ptr, gcmPoly_DATA_ptr, andMask_DATA_ptr *Mem
1496
1497 func bswapMask_DATA() Mem {
1498 if bswapMask_DATA_ptr != nil {
1499 return *bswapMask_DATA_ptr
1500 }
1501
1502 bswapMask := GLOBL("bswapMask", NOPTR|RODATA)
1503 bswapMask_DATA_ptr = &bswapMask
1504 DATA(0x00, U64(0x08090a0b0c0d0e0f))
1505 DATA(0x08, U64(0x0001020304050607))
1506
1507 return bswapMask
1508 }
1509
1510 func gcmPoly_DATA() Mem {
1511 if gcmPoly_DATA_ptr != nil {
1512 return *gcmPoly_DATA_ptr
1513 }
1514
1515 gcmPoly := GLOBL("gcmPoly", NOPTR|RODATA)
1516 gcmPoly_DATA_ptr = &gcmPoly
1517 DATA(0x00, U64(0x0000000000000001))
1518 DATA(0x08, U64(0xc200000000000000))
1519
1520 return gcmPoly
1521 }
1522
// andMask_K holds fifteen 128-bit masks, each stored as two consecutive
// 64-bit words (low word first). Mask n, for n = 1..15, occupies words
// 2*(n-1) and 2*(n-1)+1 and has its n least-significant bytes set to 0xff,
// with every other byte zero.
var andMask_K = func() [30]uint64 {
	// lowBytes returns a word whose n least-significant bytes are 0xff,
	// saturating at zero for n <= 0 and at all ones for n >= 8.
	lowBytes := func(n int) uint64 {
		switch {
		case n <= 0:
			return 0
		case n >= 8:
			return ^uint64(0)
		}
		return uint64(1)<<(8*uint(n)) - 1
	}

	var table [30]uint64
	for n := 1; n <= len(table)/2; n++ {
		table[2*(n-1)] = lowBytes(n)       // bytes 0..7 of the mask
		table[2*(n-1)+1] = lowBytes(n - 8) // bytes 8..15 of the mask
	}
	return table
}()
1555
1556 func andMask_DATA() Mem {
1557 if andMask_DATA_ptr != nil {
1558 return *andMask_DATA_ptr
1559 }
1560 andMask := GLOBL("andMask", NOPTR|RODATA)
1561 andMask_DATA_ptr = &andMask
1562
1563 for i, k := range andMask_K {
1564 DATA(i*8, U64(k))
1565 }
1566
1567 return andMask
1568 }
1569
View as plain text