1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 // This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
8 // The implementation uses several optimizations described in:
9 // [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
10 // Instruction and its Usage for Computing the GCM Mode rev. 2.02
11 // [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
12 // Hardware
13
14 #include "textflag.h"
15
// Symbolic names for the XMM registers used by the functions below.
// B0-B7 (X0-X7) are the eight working block registers.
16 #define B0 X0
17 #define B1 X1
18 #define B2 X2
19 #define B3 X3
20 #define B4 X4
21 #define B5 X5
22 #define B6 X6
23 #define B7 X7
24 // ACC0 and ACC1 receive the PCLMULQDQ $0x00 and $0x11 products; ACCM receives the product of the XORed halves.
25 #define ACC0 X8
26 #define ACC1 X9
27 #define ACCM X10
28 // T0-T2 are temporaries. POLY and BSWAP are loaded from gcmPoly and bswapMask by each function that uses them.
29 #define T0 X11
30 #define T1 X12
31 #define T2 X13
32 #define POLY X14
33 #define BSWAP X15
34
// bswapMask is a PSHUFB control: destination byte i takes source byte 15-i,
// i.e. it reverses the 16 bytes of an XMM register.
35 DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
36 DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
37 // gcmPoly: low quadword 1, high quadword 0xc2<<56. It is the constant used by the reduction steps and by the "H * 2" step in gcmAesInit.
38 DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
39 DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
40 // andMask: 15 entries of 16 bytes. The entry at byte offset 16*(n-1) has its low n bytes set to 0xff (n = 1..15). The Enc/Dec tail code indexes it by the number of leftover bytes.
41 DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
42 DATA andMask<>+0x08(SB)/8, $0x0000000000000000
43 DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
44 DATA andMask<>+0x18(SB)/8, $0x0000000000000000
45 DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
46 DATA andMask<>+0x28(SB)/8, $0x0000000000000000
47 DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
48 DATA andMask<>+0x38(SB)/8, $0x0000000000000000
49 DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
50 DATA andMask<>+0x48(SB)/8, $0x0000000000000000
51 DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
52 DATA andMask<>+0x58(SB)/8, $0x0000000000000000
53 DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
54 DATA andMask<>+0x68(SB)/8, $0x0000000000000000
55 DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
56 DATA andMask<>+0x78(SB)/8, $0x0000000000000000
57 DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
58 DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
59 DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
60 DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
61 DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
62 DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
63 DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
64 DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
65 DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
66 DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
67 DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
68 DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
69 DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
70 DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
71 // All three tables are declared read-only and pointer-free; sizes are 16, 16 and 15*16 = 240 bytes.
72 GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
73 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
74 GLOBL andMask<>(SB), (NOPTR+RODATA), $240
75
76 // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
// gcmAesFinish completes the value stored at T in place. It XORs a block made
// of pLen*8 (low quadword) and dLen*8 (high quadword) into the 16 bytes read
// from T, multiplies by the table entry at offset 16*14 (written by gcmAesInit
// after its "H * 2" step), reduces, byte-reverses the result, XORs it with
// tagMask and stores the 16 bytes back to T.
// Leaf function: NOSPLIT, no stack frame.
77 TEXT ·gcmAesFinish(SB),NOSPLIT,$0
78 #define pTbl DI
79 #define tMsk SI
80 #define tPtr DX
81 #define plen AX
82 #define dlen CX
83 // Load the five arguments.
84 MOVQ productTable+0(FP), pTbl
85 MOVQ tagMask+8(FP), tMsk
86 MOVQ T+16(FP), tPtr
87 MOVQ pLen+24(FP), plen
88 MOVQ dLen+32(FP), dlen
89 // ACC0 = current 16 bytes at T, T2 = tagMask (kept until the final XOR).
90 MOVOU (tPtr), ACC0
91 MOVOU (tMsk), T2
92
93 MOVOU bswapMask<>(SB), BSWAP
94 MOVOU gcmPoly<>(SB), POLY
95 // Convert both lengths from bytes to bits.
96 SHLQ $3, plen
97 SHLQ $3, dlen
98 // B0 = dlen (high quadword) : plen (low quadword).
99 MOVQ plen, B0
100 PINSRQ $1, dlen, B0
101 // Fold the lengths block into the running value.
102 PXOR ACC0, B0
103 // Three carry-less products: ACC0 = low*low, ACC1 = high*high, ACCM = (B0.low^B0.high) * table entry 15.
104 MOVOU (16*14)(pTbl), ACC0
105 MOVOU (16*15)(pTbl), ACCM
106 MOVOU ACC0, ACC1
107
108 PCLMULQDQ $0x00, B0, ACC0
109 PCLMULQDQ $0x11, B0, ACC1
110 PSHUFD $78, B0, T0 // T0 = B0 with its two quadwords swapped
111 PXOR B0, T0
112 PCLMULQDQ $0x00, T0, ACCM
113 // Karatsuba recombination: ACCM ^= ACC0 ^ ACC1, then its halves are added across the ACC1:ACC0 boundary.
114 PXOR ACC0, ACCM
115 PXOR ACC1, ACCM
116 MOVOU ACCM, T0
117 PSRLDQ $8, ACCM
118 PSLLDQ $8, T0
119 PXOR ACCM, ACC1
120 PXOR T0, ACC0
121 // First reduction step on the low 128 bits (same sequence as the reduceRound macro defined later in this file).
122 MOVOU POLY, T0
123 PCLMULQDQ $0x01, ACC0, T0
124 PSHUFD $78, ACC0, ACC0
125 PXOR T0, ACC0
126 // Second reduction step.
127 MOVOU POLY, T0
128 PCLMULQDQ $0x01, ACC0, T0
129 PSHUFD $78, ACC0, ACC0
130 PXOR T0, ACC0
131 // Add the high 128 bits to obtain the reduced product.
132 PXOR ACC1, ACC0
133 // Byte-reverse, apply tagMask and write the result over T.
134 PSHUFB BSWAP, ACC0
135 PXOR T2, ACC0
136 MOVOU ACC0, (tPtr)
137
138 RET
139 #undef pTbl
140 #undef tMsk
141 #undef tPtr
142 #undef plen
143 #undef dlen
144
145 // func gcmAesInit(productTable *[256]byte, ks []uint32)
// gcmAesInit fills the 256-byte productTable from the expanded AES key ks.
// It first runs the AES rounds starting from round key 0 alone (i.e. it
// encrypts the all-zero block) to obtain H, byte-reverses it and applies the
// "H * 2" step. The table is then written from the top down as 8 pairs of
// 16-byte entries: entry 2k holds a power of that value and entry 2k+1 holds
// the XOR of its two quadwords (in both halves) for the Karatsuba middle term.
// Offsets 16*14/16*15 hold the first power; each loop pass multiplies by the
// first power again and stores 32 bytes lower, ending at offsets 0/16.
// Leaf function: NOSPLIT, no stack frame.
146 TEXT ·gcmAesInit(SB),NOSPLIT,$0
147 #define dst DI
148 #define KS SI
149 #define NR DX
150
151 MOVQ productTable+0(FP), dst
152 MOVQ ks_base+8(FP), KS
153 MOVQ ks_len+16(FP), NR
154 // NR = len(ks)/4 - 1: ks is counted in uint32 words, four per 16-byte round key.
155 SHRQ $2, NR
156 DECQ NR
157
158 MOVOU bswapMask<>(SB), BSWAP
159 MOVOU gcmPoly<>(SB), POLY
160
161 // Encrypt block 0, with the AES key to generate the hash key H
162 MOVOU (16*0)(KS), B0
163 MOVOU (16*1)(KS), T0
164 AESENC T0, B0
165 MOVOU (16*2)(KS), T0
166 AESENC T0, B0
167 MOVOU (16*3)(KS), T0
168 AESENC T0, B0
169 MOVOU (16*4)(KS), T0
170 AESENC T0, B0
171 MOVOU (16*5)(KS), T0
172 AESENC T0, B0
173 MOVOU (16*6)(KS), T0
174 AESENC T0, B0
175 MOVOU (16*7)(KS), T0
176 AESENC T0, B0
177 MOVOU (16*8)(KS), T0
178 AESENC T0, B0
179 MOVOU (16*9)(KS), T0
180 AESENC T0, B0
181 MOVOU (16*10)(KS), T0
182 CMPQ NR, $12
183 JB initEncLast // NR < 12: round key 10 is the last one
184 AESENC T0, B0
185 MOVOU (16*11)(KS), T0
186 AESENC T0, B0
187 MOVOU (16*12)(KS), T0
188 JE initEncLast // flags are still those of CMPQ NR, $12 (MOVOU/AESENC leave them alone): NR == 12
189 AESENC T0, B0
190 MOVOU (16*13)(KS), T0
191 AESENC T0, B0
192 MOVOU (16*14)(KS), T0
193 initEncLast:
194 AESENCLAST T0, B0
195
196 PSHUFB BSWAP, B0
197 // H * 2
// T0 becomes POLY if the top bit of B0 is set, else zero (broadcast top dword,
// arithmetic shift by 31, AND). T1 carries each dword's top bit into the next
// dword up. B0 is shifted left by one within each dword, then both are XORed in.
198 PSHUFD $0xff, B0, T0
199 MOVOU B0, T1
200 PSRAL $31, T0
201 PAND POLY, T0
202 PSRLL $31, T1
203 PSLLDQ $4, T1
204 PSLLL $1, B0
205 PXOR T0, B0
206 PXOR T1, B0
207 // Karatsuba pre-computations
208 MOVOU B0, (16*14)(dst)
209 PSHUFD $78, B0, B1
210 PXOR B0, B1
211 MOVOU B1, (16*15)(dst)
212 // B2/B3 hold the most recent power and its XORed halves; B0/B1 keep the first power throughout the loop.
213 MOVOU B0, B2
214 MOVOU B1, B3
215 // Now prepare powers of H and pre-computations for them
216 MOVQ $7, AX
217
218 initLoop:
219 MOVOU B2, T0
220 MOVOU B2, T1
221 MOVOU B3, T2
222 PCLMULQDQ $0x00, B0, T0
223 PCLMULQDQ $0x11, B0, T1
224 PCLMULQDQ $0x00, B1, T2
225 // Karatsuba recombination into T1:T0 (B4 is used as scratch).
226 PXOR T0, T2
227 PXOR T1, T2
228 MOVOU T2, B4
229 PSLLDQ $8, B4
230 PSRLDQ $8, T2
231 PXOR B4, T0
232 PXOR T2, T1
233 // Two reduction steps; the reduced product ends up in B2.
234 MOVOU POLY, B2
235 PCLMULQDQ $0x01, T0, B2
236 PSHUFD $78, T0, T0
237 PXOR B2, T0
238 MOVOU POLY, B2
239 PCLMULQDQ $0x01, T0, B2
240 PSHUFD $78, T0, T0
241 PXOR T0, B2
242 PXOR T1, B2
243 // Store the new power and its XORed halves one pair below the previous one.
244 MOVOU B2, (16*12)(dst)
245 PSHUFD $78, B2, B3
246 PXOR B2, B3
247 MOVOU B3, (16*13)(dst)
248
249 DECQ AX
250 LEAQ (-16*2)(dst), dst // LEAQ does not modify flags, so JNE below still tests the DECQ result
251 JNE initLoop
252
253 RET
254 #undef NR
255 #undef KS
256 #undef dst
257
258 // func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
// gcmAesData hashes data starting from an all-zero accumulator and stores the
// resulting 16 bytes to T (what was previously at T is not read).
// Paths:
//   - len(data) == 0: zero is stored.
//   - len(data) == 13: dedicated load of 8+4+1 bytes into a zero-padded block.
//   - while at least 128 bytes remain: eight blocks per pass, using all 16
//     table entries, with a single reduction per pass.
//   - remaining full 16-byte blocks: one at a time with table entries 14/15.
//   - a final partial block is loaded byte by byte, zero-padded, and hashed.
// Leaf function: NOSPLIT, no stack frame.
259 TEXT ·gcmAesData(SB),NOSPLIT,$0
260 #define pTbl DI
261 #define aut SI
262 #define tPtr CX
263 #define autLen DX
264
// reduceRound(a): one reduction step. T0 = POLY.high * a.low (carry-less), the
// two quadwords of a are swapped, and T0 is XORed in. Clobbers T0.
// mulRoundAAD(X, i): multiplies block X by table entries 2i and 2i+1 and XORs
// the three partial products into ACC0, ACC1 and ACCM. Clobbers T1, T2 and X.
// Neither macro is #undef'ed at the end of this function; reduceRound is used
// again by gcmAesEnc and gcmAesDec.
265 #define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
266 #define mulRoundAAD(X ,i) \
267 MOVOU (16*(i*2))(pTbl), T1;\
268 MOVOU T1, T2;\
269 PCLMULQDQ $0x00, X, T1;\
270 PXOR T1, ACC0;\
271 PCLMULQDQ $0x11, X, T2;\
272 PXOR T2, ACC1;\
273 PSHUFD $78, X, T1;\
274 PXOR T1, X;\
275 MOVOU (16*(i*2+1))(pTbl), T1;\
276 PCLMULQDQ $0x00, X, T1;\
277 PXOR T1, ACCM
278 // data occupies three words (base, len, cap) at offsets 8..31, so T is at offset 32.
279 MOVQ productTable+0(FP), pTbl
280 MOVQ data_base+8(FP), aut
281 MOVQ data_len+16(FP), autLen
282 MOVQ T+32(FP), tPtr
283 // Start from a zero accumulator.
284 PXOR ACC0, ACC0
285 MOVOU bswapMask<>(SB), BSWAP
286 MOVOU gcmPoly<>(SB), POLY
287
288 TESTQ autLen, autLen
289 JEQ dataBail
290
291 CMPQ autLen, $13 // optimize the TLS case
292 JE dataTLS
293 CMPQ autLen, $128
294 JB startSinglesLoop
295 JMP dataOctaLoop
296
297 dataTLS:
298 MOVOU (16*14)(pTbl), T1
299 MOVOU (16*15)(pTbl), T2
300 PXOR B0, B0
301 MOVQ (aut), B0 // bytes 0-7
302 PINSRD $2, 8(aut), B0 // bytes 8-11
303 PINSRB $12, 12(aut), B0 // byte 12; bytes 13-15 stay zero
304 XORQ autLen, autLen // nothing left: after dataMul the singles loop falls through to dataBail
305 JMP dataMul
306
307 dataOctaLoop:
308 CMPQ autLen, $128
309 JB startSinglesLoop
310 SUBQ $128, autLen
311 // Load eight blocks and byte-reverse each of them.
312 MOVOU (16*0)(aut), X0
313 MOVOU (16*1)(aut), X1
314 MOVOU (16*2)(aut), X2
315 MOVOU (16*3)(aut), X3
316 MOVOU (16*4)(aut), X4
317 MOVOU (16*5)(aut), X5
318 MOVOU (16*6)(aut), X6
319 MOVOU (16*7)(aut), X7
320 LEAQ (16*8)(aut), aut
321 PSHUFB BSWAP, X0
322 PSHUFB BSWAP, X1
323 PSHUFB BSWAP, X2
324 PSHUFB BSWAP, X3
325 PSHUFB BSWAP, X4
326 PSHUFB BSWAP, X5
327 PSHUFB BSWAP, X6
328 PSHUFB BSWAP, X7
329 PXOR ACC0, X0 // the running accumulator is folded into the first block only
330 // First block times table entries 0/1 initialises ACC0/ACC1/ACCM; blocks 1-7 accumulate on top.
331 MOVOU (16*0)(pTbl), ACC0
332 MOVOU (16*1)(pTbl), ACCM
333 MOVOU ACC0, ACC1
334 PSHUFD $78, X0, T1
335 PXOR X0, T1
336 PCLMULQDQ $0x00, X0, ACC0
337 PCLMULQDQ $0x11, X0, ACC1
338 PCLMULQDQ $0x00, T1, ACCM
339
340 mulRoundAAD(X1, 1)
341 mulRoundAAD(X2, 2)
342 mulRoundAAD(X3, 3)
343 mulRoundAAD(X4, 4)
344 mulRoundAAD(X5, 5)
345 mulRoundAAD(X6, 6)
346 mulRoundAAD(X7, 7)
347 // Karatsuba recombination followed by a single reduction for all eight blocks.
348 PXOR ACC0, ACCM
349 PXOR ACC1, ACCM
350 MOVOU ACCM, T0
351 PSRLDQ $8, ACCM
352 PSLLDQ $8, T0
353 PXOR ACCM, ACC1
354 PXOR T0, ACC0
355 reduceRound(ACC0)
356 reduceRound(ACC0)
357 PXOR ACC1, ACC0
358 JMP dataOctaLoop
359
360 startSinglesLoop:
361 MOVOU (16*14)(pTbl), T1
362 MOVOU (16*15)(pTbl), T2
363
364 dataSinglesLoop:
365
366 CMPQ autLen, $16
367 JB dataEnd
368 SUBQ $16, autLen
369
370 MOVOU (aut), B0
371 dataMul:
// Entered from the loop above and also from dataTLS and dataLoadLoop, with the
// block in B0, table entries 14/15 in T1/T2 and the remaining length in autLen.
372 PSHUFB BSWAP, B0
373 PXOR ACC0, B0
374
375 MOVOU T1, ACC0
376 MOVOU T2, ACCM
377 MOVOU T1, ACC1
378
379 PSHUFD $78, B0, T0
380 PXOR B0, T0
381 PCLMULQDQ $0x00, B0, ACC0
382 PCLMULQDQ $0x11, B0, ACC1
383 PCLMULQDQ $0x00, T0, ACCM
384
385 PXOR ACC0, ACCM
386 PXOR ACC1, ACCM
387 MOVOU ACCM, T0
388 PSRLDQ $8, ACCM
389 PSLLDQ $8, T0
390 PXOR ACCM, ACC1
391 PXOR T0, ACC0
392
393 MOVOU POLY, T0
394 PCLMULQDQ $0x01, ACC0, T0
395 PSHUFD $78, ACC0, ACC0
396 PXOR T0, ACC0
397
398 MOVOU POLY, T0
399 PCLMULQDQ $0x01, ACC0, T0
400 PSHUFD $78, ACC0, ACC0
401 PXOR T0, ACC0
402 PXOR ACC1, ACC0
403
404 LEAQ 16(aut), aut
405
406 JMP dataSinglesLoop
407
408 dataEnd:
409 // Fewer than 16 bytes remain (possibly none).
410 TESTQ autLen, autLen
411 JEQ dataBail
412 // Point aut at the last remaining byte; the block is then built back to front.
413 PXOR B0, B0
414 LEAQ -1(aut)(autLen*1), aut
415
416 dataLoadLoop:
417 // Shift B0 up one byte and insert the next byte (walking backwards) at position 0.
418 PSLLDQ $1, B0
419 PINSRB $0, (aut), B0
420
421 LEAQ -1(aut), aut
422 DECQ autLen
423 JNE dataLoadLoop
424 // autLen is now 0, so after dataMul the singles loop exits through dataEnd to dataBail.
425 JMP dataMul
426
427 dataBail:
428 MOVOU ACC0, (tPtr)
429 RET
430 #undef pTbl
431 #undef aut
432 #undef tPtr
433 #undef autLen
434
435 // func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// gcmAesEnc encrypts len(src) bytes from src into dst in counter mode and
// folds the produced ciphertext into the 16-byte hash state at T (read at
// entry, written back at gcmAesEncDone). Only the base pointer of dst is read.
//
// Frame: 256 bytes of locals, 96 bytes of arguments.
//   (0*16 .. 7*16)(SP)        eight byte-reversed ciphertext blocks waiting to be hashed
//   (8*16 .. 15*16)(SP)       eight counter blocks, already XORed with round key 0
//
// The counter read from ctr is incremented before its first use: slot 0 is
// stored and then immediately patched by increment(0). The updated counter is
// not written back to ctr.
//
// Flow: with at least 128 bytes, the first eight blocks are encrypted without
// hashing; each later pass encrypts eight blocks while hashing the previous
// eight (combinedRound); gcmAesEncOctetsEnd hashes the last saved eight. Any
// remaining full blocks are handled one at a time, then a partial tail.
//
// The register aliases and the aesRnd/aesRound/aesRndLast/combinedRound/
// mulRound macros defined here are not #undef'ed (only increment is); gcmAesDec
// below relies on them.
436 TEXT ·gcmAesEnc(SB),0,$256-96
437 #define pTbl DI
438 #define ctx DX
439 #define ctrPtr CX
440 #define ptx SI
441 #define ks AX
442 #define tPtr R8
443 #define ptxLen R9
444 #define aluCTR R10
445 #define aluTMP R11
446 #define aluK R12
447 #define NR R13
448
// increment(i): aluCTR += 1 (32-bit), then the last dword of counter slot i is
// overwritten with byte-swapped (aluCTR XOR aluK). Clobbers aluTMP and flags.
// aesRnd(k) / aesRndLast(k): one AESENC / AESENCLAST with key register k on B0-B7.
// aesRound(i): loads round key i into T0 and applies AESENC to B0-B7.
// combinedRound(i): AES round i on B0-B7 interleaved with multiplying the saved
// block at (16*i)(SP) by table entries 2i and 2i+1 into ACC0/ACC1/ACCM.
// Clobbers T0, T1, T2.
// mulRound(i): the multiply part of combinedRound(i) alone.
449 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
450 #define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
451 #define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
452 #define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
453 #define combinedRound(i) \
454 MOVOU (16*i)(ks), T0;\
455 AESENC T0, B0;\
456 AESENC T0, B1;\
457 AESENC T0, B2;\
458 AESENC T0, B3;\
459 MOVOU (16*(i*2))(pTbl), T1;\
460 MOVOU T1, T2;\
461 AESENC T0, B4;\
462 AESENC T0, B5;\
463 AESENC T0, B6;\
464 AESENC T0, B7;\
465 MOVOU (16*i)(SP), T0;\
466 PCLMULQDQ $0x00, T0, T1;\
467 PXOR T1, ACC0;\
468 PSHUFD $78, T0, T1;\
469 PCLMULQDQ $0x11, T0, T2;\
470 PXOR T1, T0;\
471 PXOR T2, ACC1;\
472 MOVOU (16*(i*2+1))(pTbl), T2;\
473 PCLMULQDQ $0x00, T2, T0;\
474 PXOR T0, ACCM
475 #define mulRound(i) \
476 MOVOU (16*i)(SP), T0;\
477 MOVOU (16*(i*2))(pTbl), T1;\
478 MOVOU T1, T2;\
479 PCLMULQDQ $0x00, T0, T1;\
480 PXOR T1, ACC0;\
481 PCLMULQDQ $0x11, T0, T2;\
482 PXOR T2, ACC1;\
483 PSHUFD $78, T0, T1;\
484 PXOR T1, T0;\
485 MOVOU (16*(i*2+1))(pTbl), T1;\
486 PCLMULQDQ $0x00, T0, T1;\
487 PXOR T1, ACCM
488 // Argument offsets: dst and src are 3-word slices at 8 and 32; ctr, T and ks follow at 56, 64 and 72.
489 MOVQ productTable+0(FP), pTbl
490 MOVQ dst+8(FP), ctx
491 MOVQ src_base+32(FP), ptx
492 MOVQ src_len+40(FP), ptxLen
493 MOVQ ctr+56(FP), ctrPtr
494 MOVQ T+64(FP), tPtr
495 MOVQ ks_base+72(FP), ks
496 MOVQ ks_len+80(FP), NR
497 // NR = len(ks)/4 - 1; compared against 12 below to pick the last round key (10, 12 or 14).
498 SHRQ $2, NR
499 DECQ NR
500
501 MOVOU bswapMask<>(SB), BSWAP
502 MOVOU gcmPoly<>(SB), POLY
503 // ACC0 = hash state from T. aluCTR / aluK = last dword of the counter block / of round key 0, byte-swapped.
504 MOVOU (tPtr), ACC0
505 PXOR ACC1, ACC1
506 PXOR ACCM, ACCM
507 MOVOU (ctrPtr), B0
508 MOVL (3*4)(ctrPtr), aluCTR
509 MOVOU (ks), T0
510 MOVL (3*4)(ks), aluK
511 BSWAPL aluCTR
512 BSWAPL aluK
513 // T0 = counter block XOR round key 0; it seeds every counter slot, increment(i) then patches the last dword.
514 PXOR B0, T0
515 MOVOU T0, (8*16 + 0*16)(SP)
516 increment(0)
517
518 CMPQ ptxLen, $128
519 JB gcmAesEncSingles
520 SUBQ $128, ptxLen
521
522 // We have at least 8 blocks to encrypt, prepare the rest of the counters
523 MOVOU T0, (8*16 + 1*16)(SP)
524 increment(1)
525 MOVOU T0, (8*16 + 2*16)(SP)
526 increment(2)
527 MOVOU T0, (8*16 + 3*16)(SP)
528 increment(3)
529 MOVOU T0, (8*16 + 4*16)(SP)
530 increment(4)
531 MOVOU T0, (8*16 + 5*16)(SP)
532 increment(5)
533 MOVOU T0, (8*16 + 6*16)(SP)
534 increment(6)
535 MOVOU T0, (8*16 + 7*16)(SP)
536 increment(7)
537
538 MOVOU (8*16 + 0*16)(SP), B0
539 MOVOU (8*16 + 1*16)(SP), B1
540 MOVOU (8*16 + 2*16)(SP), B2
541 MOVOU (8*16 + 3*16)(SP), B3
542 MOVOU (8*16 + 4*16)(SP), B4
543 MOVOU (8*16 + 5*16)(SP), B5
544 MOVOU (8*16 + 6*16)(SP), B6
545 MOVOU (8*16 + 7*16)(SP), B7
546 // Encrypt the first eight counters; the slots are refreshed for the next pass between rounds.
547 aesRound(1)
548 increment(0)
549 aesRound(2)
550 increment(1)
551 aesRound(3)
552 increment(2)
553 aesRound(4)
554 increment(3)
555 aesRound(5)
556 increment(4)
557 aesRound(6)
558 increment(5)
559 aesRound(7)
560 increment(6)
561 aesRound(8)
562 increment(7)
563 aesRound(9)
564 MOVOU (16*10)(ks), T0
565 CMPQ NR, $12
566 JB encLast1
567 aesRnd(T0)
568 aesRound(11)
569 MOVOU (16*12)(ks), T0
570 JE encLast1 // flags are still those of CMPQ NR, $12; MOVOU/AESENC do not modify them
571 aesRnd(T0)
572 aesRound(13)
573 MOVOU (16*14)(ks), T0
574 encLast1:
575 aesRndLast(T0)
576 // XOR the keystream with eight plaintext blocks.
577 MOVOU (16*0)(ptx), T0
578 PXOR T0, B0
579 MOVOU (16*1)(ptx), T0
580 PXOR T0, B1
581 MOVOU (16*2)(ptx), T0
582 PXOR T0, B2
583 MOVOU (16*3)(ptx), T0
584 PXOR T0, B3
585 MOVOU (16*4)(ptx), T0
586 PXOR T0, B4
587 MOVOU (16*5)(ptx), T0
588 PXOR T0, B5
589 MOVOU (16*6)(ptx), T0
590 PXOR T0, B6
591 MOVOU (16*7)(ptx), T0
592 PXOR T0, B7
593 // Store the ciphertext, then byte-reverse each block for hashing; the hash state is folded into block 0.
594 MOVOU B0, (16*0)(ctx)
595 PSHUFB BSWAP, B0
596 PXOR ACC0, B0
597 MOVOU B1, (16*1)(ctx)
598 PSHUFB BSWAP, B1
599 MOVOU B2, (16*2)(ctx)
600 PSHUFB BSWAP, B2
601 MOVOU B3, (16*3)(ctx)
602 PSHUFB BSWAP, B3
603 MOVOU B4, (16*4)(ctx)
604 PSHUFB BSWAP, B4
605 MOVOU B5, (16*5)(ctx)
606 PSHUFB BSWAP, B5
607 MOVOU B6, (16*6)(ctx)
608 PSHUFB BSWAP, B6
609 MOVOU B7, (16*7)(ctx)
610 PSHUFB BSWAP, B7
611 // Save the eight blocks to the low half of the frame; they are hashed during the next pass (or at gcmAesEncOctetsEnd).
612 MOVOU B0, (16*0)(SP)
613 MOVOU B1, (16*1)(SP)
614 MOVOU B2, (16*2)(SP)
615 MOVOU B3, (16*3)(SP)
616 MOVOU B4, (16*4)(SP)
617 MOVOU B5, (16*5)(SP)
618 MOVOU B6, (16*6)(SP)
619 MOVOU B7, (16*7)(SP)
620
621 LEAQ 128(ptx), ptx
622 LEAQ 128(ctx), ctx
623
624 gcmAesEncOctetsLoop:
625
626 CMPQ ptxLen, $128
627 JB gcmAesEncOctetsEnd
628 SUBQ $128, ptxLen
629
630 MOVOU (8*16 + 0*16)(SP), B0
631 MOVOU (8*16 + 1*16)(SP), B1
632 MOVOU (8*16 + 2*16)(SP), B2
633 MOVOU (8*16 + 3*16)(SP), B3
634 MOVOU (8*16 + 4*16)(SP), B4
635 MOVOU (8*16 + 5*16)(SP), B5
636 MOVOU (8*16 + 6*16)(SP), B6
637 MOVOU (8*16 + 7*16)(SP), B7
638 // Start hashing the previously saved blocks: saved block 0 times table entries 0/1.
639 MOVOU (16*0)(SP), T0
640 PSHUFD $78, T0, T1
641 PXOR T0, T1
642
643 MOVOU (16*0)(pTbl), ACC0
644 MOVOU (16*1)(pTbl), ACCM
645 MOVOU ACC0, ACC1
646
647 PCLMULQDQ $0x00, T1, ACCM
648 PCLMULQDQ $0x00, T0, ACC0
649 PCLMULQDQ $0x11, T0, ACC1
650 // AES rounds 1-7 on the new counters, each interleaved with one saved-block multiply and a counter refresh.
651 combinedRound(1)
652 increment(0)
653 combinedRound(2)
654 increment(1)
655 combinedRound(3)
656 increment(2)
657 combinedRound(4)
658 increment(3)
659 combinedRound(5)
660 increment(4)
661 combinedRound(6)
662 increment(5)
663 combinedRound(7)
664 increment(6)
665
666 aesRound(8)
667 increment(7)
668 // Karatsuba recombination of the eight accumulated products.
669 PXOR ACC0, ACCM
670 PXOR ACC1, ACCM
671 MOVOU ACCM, T0
672 PSRLDQ $8, ACCM
673 PSLLDQ $8, T0
674 PXOR ACCM, ACC1
675 PXOR T0, ACC0
676 // The two reduction steps are interleaved with AES round 9.
677 reduceRound(ACC0)
678 aesRound(9)
679
680 reduceRound(ACC0)
681 PXOR ACC1, ACC0
682
683 MOVOU (16*10)(ks), T0
684 CMPQ NR, $12
685 JB encLast2
686 aesRnd(T0)
687 aesRound(11)
688 MOVOU (16*12)(ks), T0
689 JE encLast2 // flags are still those of CMPQ NR, $12
690 aesRnd(T0)
691 aesRound(13)
692 MOVOU (16*14)(ks), T0
693 encLast2:
694 aesRndLast(T0)
695
696 MOVOU (16*0)(ptx), T0
697 PXOR T0, B0
698 MOVOU (16*1)(ptx), T0
699 PXOR T0, B1
700 MOVOU (16*2)(ptx), T0
701 PXOR T0, B2
702 MOVOU (16*3)(ptx), T0
703 PXOR T0, B3
704 MOVOU (16*4)(ptx), T0
705 PXOR T0, B4
706 MOVOU (16*5)(ptx), T0
707 PXOR T0, B5
708 MOVOU (16*6)(ptx), T0
709 PXOR T0, B6
710 MOVOU (16*7)(ptx), T0
711 PXOR T0, B7
712
713 MOVOU B0, (16*0)(ctx)
714 PSHUFB BSWAP, B0
715 PXOR ACC0, B0
716 MOVOU B1, (16*1)(ctx)
717 PSHUFB BSWAP, B1
718 MOVOU B2, (16*2)(ctx)
719 PSHUFB BSWAP, B2
720 MOVOU B3, (16*3)(ctx)
721 PSHUFB BSWAP, B3
722 MOVOU B4, (16*4)(ctx)
723 PSHUFB BSWAP, B4
724 MOVOU B5, (16*5)(ctx)
725 PSHUFB BSWAP, B5
726 MOVOU B6, (16*6)(ctx)
727 PSHUFB BSWAP, B6
728 MOVOU B7, (16*7)(ctx)
729 PSHUFB BSWAP, B7
730
731 MOVOU B0, (16*0)(SP)
732 MOVOU B1, (16*1)(SP)
733 MOVOU B2, (16*2)(SP)
734 MOVOU B3, (16*3)(SP)
735 MOVOU B4, (16*4)(SP)
736 MOVOU B5, (16*5)(SP)
737 MOVOU B6, (16*6)(SP)
738 MOVOU B7, (16*7)(SP)
739
740 LEAQ 128(ptx), ptx
741 LEAQ 128(ctx), ctx
742
743 JMP gcmAesEncOctetsLoop
744
745 gcmAesEncOctetsEnd:
746 // Hash the last eight saved ciphertext blocks (no further AES work to interleave with).
747 MOVOU (16*0)(SP), T0
748 MOVOU (16*0)(pTbl), ACC0
749 MOVOU (16*1)(pTbl), ACCM
750 MOVOU ACC0, ACC1
751 PSHUFD $78, T0, T1
752 PXOR T0, T1
753 PCLMULQDQ $0x00, T0, ACC0
754 PCLMULQDQ $0x11, T0, ACC1
755 PCLMULQDQ $0x00, T1, ACCM
756
757 mulRound(1)
758 mulRound(2)
759 mulRound(3)
760 mulRound(4)
761 mulRound(5)
762 mulRound(6)
763 mulRound(7)
764
765 PXOR ACC0, ACCM
766 PXOR ACC1, ACCM
767 MOVOU ACCM, T0
768 PSRLDQ $8, ACCM
769 PSLLDQ $8, T0
770 PXOR ACCM, ACC1
771 PXOR T0, ACC0
772
773 reduceRound(ACC0)
774 reduceRound(ACC0)
775 PXOR ACC1, ACC0
776
777 TESTQ ptxLen, ptxLen
778 JE gcmAesEncDone
779 // aluCTR matches the counter in slot 7, but the code below only uses slot 0, which is 7 lower.
780 SUBQ $7, aluCTR
781
782 gcmAesEncSingles:
783 // Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the single-block loop and the tail.
784 MOVOU (16*1)(ks), B1
785 MOVOU (16*2)(ks), B2
786 MOVOU (16*3)(ks), B3
787 MOVOU (16*4)(ks), B4
788 MOVOU (16*5)(ks), B5
789 MOVOU (16*6)(ks), B6
790 MOVOU (16*7)(ks), B7
791
792 MOVOU (16*14)(pTbl), T2
793
794 gcmAesEncSinglesLoop:
795
796 CMPQ ptxLen, $16
797 JB gcmAesEncTail
798 SUBQ $16, ptxLen
799 // Take the current counter from slot 0 and prepare the next one in its place.
800 MOVOU (8*16 + 0*16)(SP), B0
801 increment(0)
802
803 AESENC B1, B0
804 AESENC B2, B0
805 AESENC B3, B0
806 AESENC B4, B0
807 AESENC B5, B0
808 AESENC B6, B0
809 AESENC B7, B0
810 MOVOU (16*8)(ks), T0
811 AESENC T0, B0
812 MOVOU (16*9)(ks), T0
813 AESENC T0, B0
814 MOVOU (16*10)(ks), T0
815 CMPQ NR, $12
816 JB encLast3
817 AESENC T0, B0
818 MOVOU (16*11)(ks), T0
819 AESENC T0, B0
820 MOVOU (16*12)(ks), T0
821 JE encLast3 // flags are still those of CMPQ NR, $12
822 AESENC T0, B0
823 MOVOU (16*13)(ks), T0
824 AESENC T0, B0
825 MOVOU (16*14)(ks), T0
826 encLast3:
827 AESENCLAST T0, B0
828 // Ciphertext = keystream XOR plaintext; store it, then hash it.
829 MOVOU (ptx), T0
830 PXOR T0, B0
831 MOVOU B0, (ctx)
832
833 PSHUFB BSWAP, B0
834 PXOR ACC0, B0
835
836 MOVOU T2, ACC0
837 MOVOU T2, ACC1
838 MOVOU (16*15)(pTbl), ACCM
839
840 PSHUFD $78, B0, T0
841 PXOR B0, T0
842 PCLMULQDQ $0x00, B0, ACC0
843 PCLMULQDQ $0x11, B0, ACC1
844 PCLMULQDQ $0x00, T0, ACCM
845
846 PXOR ACC0, ACCM
847 PXOR ACC1, ACCM
848 MOVOU ACCM, T0
849 PSRLDQ $8, ACCM
850 PSLLDQ $8, T0
851 PXOR ACCM, ACC1
852 PXOR T0, ACC0
853
854 reduceRound(ACC0)
855 reduceRound(ACC0)
856 PXOR ACC1, ACC0
857
858 LEAQ (16*1)(ptx), ptx
859 LEAQ (16*1)(ctx), ctx
860
861 JMP gcmAesEncSinglesLoop
862
863 gcmAesEncTail:
// 0 to 15 bytes remain. B1-B7 still hold round keys 1-7 and T2 table entry 14.
864 TESTQ ptxLen, ptxLen
865 JE gcmAesEncDone
866 // Produce one more keystream block from slot 0 (the slot is not refreshed afterwards).
867 MOVOU (8*16 + 0*16)(SP), B0
868 AESENC B1, B0
869 AESENC B2, B0
870 AESENC B3, B0
871 AESENC B4, B0
872 AESENC B5, B0
873 AESENC B6, B0
874 AESENC B7, B0
875 MOVOU (16*8)(ks), T0
876 AESENC T0, B0
877 MOVOU (16*9)(ks), T0
878 AESENC T0, B0
879 MOVOU (16*10)(ks), T0
880 CMPQ NR, $12
881 JB encLast4
882 AESENC T0, B0
883 MOVOU (16*11)(ks), T0
884 AESENC T0, B0
885 MOVOU (16*12)(ks), T0
886 JE encLast4 // flags are still those of CMPQ NR, $12
887 AESENC T0, B0
888 MOVOU (16*13)(ks), T0
889 AESENC T0, B0
890 MOVOU (16*14)(ks), T0
891 encLast4:
892 AESENCLAST T0, B0
893 MOVOU B0, T0 // T0 = keystream block
894 // Point ptx at the last plaintext byte; the partial block is built back to front.
895 LEAQ -1(ptx)(ptxLen*1), ptx
896 // T1 = andMask entry for ptxLen bytes (offset 16*(ptxLen-1)). aluCTR is reused as the table pointer.
897 MOVQ ptxLen, aluTMP
898 SHLQ $4, aluTMP
899
900 LEAQ andMask<>(SB), aluCTR
901 MOVOU -16(aluCTR)(aluTMP*1), T1
902
903 PXOR B0, B0
904 ptxLoadLoop:
905 PSLLDQ $1, B0
906 PINSRB $0, (ptx), B0
907 LEAQ -1(ptx), ptx
908 DECQ ptxLen
909 JNE ptxLoadLoop
910 // Encrypt, then clear the bytes beyond the message so they are not hashed. A full 16 bytes are written to ctx.
911 PXOR T0, B0
912 PAND T1, B0
913 MOVOU B0, (ctx) // I assume there is always space, due to TAG in the end of the CT
914
915 PSHUFB BSWAP, B0
916 PXOR ACC0, B0
917
918 MOVOU T2, ACC0
919 MOVOU T2, ACC1
920 MOVOU (16*15)(pTbl), ACCM
921
922 PSHUFD $78, B0, T0
923 PXOR B0, T0
924 PCLMULQDQ $0x00, B0, ACC0
925 PCLMULQDQ $0x11, B0, ACC1
926 PCLMULQDQ $0x00, T0, ACCM
927
928 PXOR ACC0, ACCM
929 PXOR ACC1, ACCM
930 MOVOU ACCM, T0
931 PSRLDQ $8, ACCM
932 PSLLDQ $8, T0
933 PXOR ACCM, ACC1
934 PXOR T0, ACC0
935
936 reduceRound(ACC0)
937 reduceRound(ACC0)
938 PXOR ACC1, ACC0
939
940 gcmAesEncDone:
941 MOVOU ACC0, (tPtr)
942 RET
943 #undef increment
944
945 // func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
// gcmAesDec folds len(src) bytes of ciphertext from src into the 16-byte hash
// state at T and writes the counter-mode decryption to dst. Only the base
// pointer of dst is read. No tag comparison happens in this function.
//
// It reuses the register aliases and the aesRnd/aesRound/aesRndLast macros
// left defined by gcmAesEnc, with the roles swapped: ctx is the input (src)
// and ptx is the output (dst).
//
// Frame: 128 bytes of locals holding eight counter blocks, already XORed with
// round key 0, at (0*16 .. 7*16)(SP). No ciphertext is staged on the stack:
// combinedDecRound reads the blocks to hash directly from ctx, so each
// eight-block pass hashes the same blocks it decrypts.
//
// As in gcmAesEnc, the counter read from ctr is incremented before its first
// use and is not written back.
946 TEXT ·gcmAesDec(SB),0,$128-96
// increment(i): as in gcmAesEnc, but the counter slots start at offset 0 of the frame.
// combinedDecRound(i): AES round i on B0-B7 interleaved with loading ciphertext
// block i from ctx, byte-reversing it and multiplying it by table entries 2i
// and 2i+1 into ACC0/ACC1/ACCM. Clobbers T0, T1, T2.
947 #define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
948 #define combinedDecRound(i) \
949 MOVOU (16*i)(ks), T0;\
950 AESENC T0, B0;\
951 AESENC T0, B1;\
952 AESENC T0, B2;\
953 AESENC T0, B3;\
954 MOVOU (16*(i*2))(pTbl), T1;\
955 MOVOU T1, T2;\
956 AESENC T0, B4;\
957 AESENC T0, B5;\
958 AESENC T0, B6;\
959 AESENC T0, B7;\
960 MOVOU (16*i)(ctx), T0;\
961 PSHUFB BSWAP, T0;\
962 PCLMULQDQ $0x00, T0, T1;\
963 PXOR T1, ACC0;\
964 PSHUFD $78, T0, T1;\
965 PCLMULQDQ $0x11, T0, T2;\
966 PXOR T1, T0;\
967 PXOR T2, ACC1;\
968 MOVOU (16*(i*2+1))(pTbl), T2;\
969 PCLMULQDQ $0x00, T2, T0;\
970 PXOR T0, ACCM
971 // Note the swap relative to gcmAesEnc: dst goes to ptx, src to ctx.
972 MOVQ productTable+0(FP), pTbl
973 MOVQ dst+8(FP), ptx
974 MOVQ src_base+32(FP), ctx
975 MOVQ src_len+40(FP), ptxLen
976 MOVQ ctr+56(FP), ctrPtr
977 MOVQ T+64(FP), tPtr
978 MOVQ ks_base+72(FP), ks
979 MOVQ ks_len+80(FP), NR
980 // NR = len(ks)/4 - 1; compared against 12 below to pick the last round key (10, 12 or 14).
981 SHRQ $2, NR
982 DECQ NR
983
984 MOVOU bswapMask<>(SB), BSWAP
985 MOVOU gcmPoly<>(SB), POLY
986 // ACC0 = hash state from T. aluCTR / aluK = last dword of the counter block / of round key 0, byte-swapped.
987 MOVOU (tPtr), ACC0
988 PXOR ACC1, ACC1
989 PXOR ACCM, ACCM
990 MOVOU (ctrPtr), B0
991 MOVL (3*4)(ctrPtr), aluCTR
992 MOVOU (ks), T0
993 MOVL (3*4)(ks), aluK
994 BSWAPL aluCTR
995 BSWAPL aluK
996 // T0 = counter block XOR round key 0; it seeds every counter slot, increment(i) then patches the last dword.
997 PXOR B0, T0
998 MOVOU T0, (0*16)(SP)
999 increment(0)
1000
1001 CMPQ ptxLen, $128
1002 JB gcmAesDecSingles
1003 // At least eight blocks: prepare the remaining seven counter slots. ptxLen is decremented inside the loop.
1004 MOVOU T0, (1*16)(SP)
1005 increment(1)
1006 MOVOU T0, (2*16)(SP)
1007 increment(2)
1008 MOVOU T0, (3*16)(SP)
1009 increment(3)
1010 MOVOU T0, (4*16)(SP)
1011 increment(4)
1012 MOVOU T0, (5*16)(SP)
1013 increment(5)
1014 MOVOU T0, (6*16)(SP)
1015 increment(6)
1016 MOVOU T0, (7*16)(SP)
1017 increment(7)
1018
1019 gcmAesDecOctetsLoop:
1020
1021 CMPQ ptxLen, $128
1022 JB gcmAesDecEndOctets
1023 SUBQ $128, ptxLen
1024
1025 MOVOU (0*16)(SP), B0
1026 MOVOU (1*16)(SP), B1
1027 MOVOU (2*16)(SP), B2
1028 MOVOU (3*16)(SP), B3
1029 MOVOU (4*16)(SP), B4
1030 MOVOU (5*16)(SP), B5
1031 MOVOU (6*16)(SP), B6
1032 MOVOU (7*16)(SP), B7
1033 // Ciphertext block 0: byte-reverse, fold in the hash state, multiply by table entries 0/1.
1034 MOVOU (16*0)(ctx), T0
1035 PSHUFB BSWAP, T0
1036 PXOR ACC0, T0
1037 PSHUFD $78, T0, T1
1038 PXOR T0, T1
1039
1040 MOVOU (16*0)(pTbl), ACC0
1041 MOVOU (16*1)(pTbl), ACCM
1042 MOVOU ACC0, ACC1
1043
1044 PCLMULQDQ $0x00, T1, ACCM
1045 PCLMULQDQ $0x00, T0, ACC0
1046 PCLMULQDQ $0x11, T0, ACC1
1047 // AES rounds 1-7, each interleaved with hashing one more ciphertext block and a counter refresh.
1048 combinedDecRound(1)
1049 increment(0)
1050 combinedDecRound(2)
1051 increment(1)
1052 combinedDecRound(3)
1053 increment(2)
1054 combinedDecRound(4)
1055 increment(3)
1056 combinedDecRound(5)
1057 increment(4)
1058 combinedDecRound(6)
1059 increment(5)
1060 combinedDecRound(7)
1061 increment(6)
1062
1063 aesRound(8)
1064 increment(7)
1065 // Karatsuba recombination of the eight accumulated products.
1066 PXOR ACC0, ACCM
1067 PXOR ACC1, ACCM
1068 MOVOU ACCM, T0
1069 PSRLDQ $8, ACCM
1070 PSLLDQ $8, T0
1071 PXOR ACCM, ACC1
1072 PXOR T0, ACC0
1073 // The two reduction steps are interleaved with AES round 9.
1074 reduceRound(ACC0)
1075 aesRound(9)
1076
1077 reduceRound(ACC0)
1078 PXOR ACC1, ACC0
1079
1080 MOVOU (16*10)(ks), T0
1081 CMPQ NR, $12
1082 JB decLast1
1083 aesRnd(T0)
1084 aesRound(11)
1085 MOVOU (16*12)(ks), T0
1086 JE decLast1 // flags are still those of CMPQ NR, $12; MOVOU/AESENC do not modify them
1087 aesRnd(T0)
1088 aesRound(13)
1089 MOVOU (16*14)(ks), T0
1090 decLast1:
1091 aesRndLast(T0)
1092 // Plaintext = keystream XOR ciphertext.
1093 MOVOU (16*0)(ctx), T0
1094 PXOR T0, B0
1095 MOVOU (16*1)(ctx), T0
1096 PXOR T0, B1
1097 MOVOU (16*2)(ctx), T0
1098 PXOR T0, B2
1099 MOVOU (16*3)(ctx), T0
1100 PXOR T0, B3
1101 MOVOU (16*4)(ctx), T0
1102 PXOR T0, B4
1103 MOVOU (16*5)(ctx), T0
1104 PXOR T0, B5
1105 MOVOU (16*6)(ctx), T0
1106 PXOR T0, B6
1107 MOVOU (16*7)(ctx), T0
1108 PXOR T0, B7
1109
1110 MOVOU B0, (16*0)(ptx)
1111 MOVOU B1, (16*1)(ptx)
1112 MOVOU B2, (16*2)(ptx)
1113 MOVOU B3, (16*3)(ptx)
1114 MOVOU B4, (16*4)(ptx)
1115 MOVOU B5, (16*5)(ptx)
1116 MOVOU B6, (16*6)(ptx)
1117 MOVOU B7, (16*7)(ptx)
1118
1119 LEAQ 128(ptx), ptx
1120 LEAQ 128(ctx), ctx
1121
1122 JMP gcmAesDecOctetsLoop
1123
1124 gcmAesDecEndOctets:
1125 // aluCTR matches the counter in slot 7, but the code below only uses slot 0, which is 7 lower.
1126 SUBQ $7, aluCTR
1127
1128 gcmAesDecSingles:
1129 // Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the single-block loop.
1130 MOVOU (16*1)(ks), B1
1131 MOVOU (16*2)(ks), B2
1132 MOVOU (16*3)(ks), B3
1133 MOVOU (16*4)(ks), B4
1134 MOVOU (16*5)(ks), B5
1135 MOVOU (16*6)(ks), B6
1136 MOVOU (16*7)(ks), B7
1137
1138 MOVOU (16*14)(pTbl), T2
1139
1140 gcmAesDecSinglesLoop:
1141
1142 CMPQ ptxLen, $16
1143 JB gcmAesDecTail
1144 SUBQ $16, ptxLen
1145 // Load one ciphertext block; T1 keeps the raw copy for the final XOR, B0 is hashed first.
1146 MOVOU (ctx), B0
1147 MOVOU B0, T1
1148 PSHUFB BSWAP, B0
1149 PXOR ACC0, B0
1150
1151 MOVOU T2, ACC0
1152 MOVOU T2, ACC1
1153 MOVOU (16*15)(pTbl), ACCM
1154
1155 PCLMULQDQ $0x00, B0, ACC0
1156 PCLMULQDQ $0x11, B0, ACC1
1157 PSHUFD $78, B0, T0
1158 PXOR B0, T0
1159 PCLMULQDQ $0x00, T0, ACCM
1160
1161 PXOR ACC0, ACCM
1162 PXOR ACC1, ACCM
1163 MOVOU ACCM, T0
1164 PSRLDQ $8, ACCM
1165 PSLLDQ $8, T0
1166 PXOR ACCM, ACC1
1167 PXOR T0, ACC0
1168
1169 reduceRound(ACC0)
1170 reduceRound(ACC0)
1171 PXOR ACC1, ACC0
1172 // Generate the keystream block from slot 0 and prepare the next counter in its place.
1173 MOVOU (0*16)(SP), B0
1174 increment(0)
1175 AESENC B1, B0
1176 AESENC B2, B0
1177 AESENC B3, B0
1178 AESENC B4, B0
1179 AESENC B5, B0
1180 AESENC B6, B0
1181 AESENC B7, B0
1182 MOVOU (16*8)(ks), T0
1183 AESENC T0, B0
1184 MOVOU (16*9)(ks), T0
1185 AESENC T0, B0
1186 MOVOU (16*10)(ks), T0
1187 CMPQ NR, $12
1188 JB decLast2
1189 AESENC T0, B0
1190 MOVOU (16*11)(ks), T0
1191 AESENC T0, B0
1192 MOVOU (16*12)(ks), T0
1193 JE decLast2 // flags are still those of CMPQ NR, $12
1194 AESENC T0, B0
1195 MOVOU (16*13)(ks), T0
1196 AESENC T0, B0
1197 MOVOU (16*14)(ks), T0
1198 decLast2:
1199 AESENCLAST T0, B0
1200 // Plaintext = keystream XOR the raw ciphertext kept in T1.
1201 PXOR T1, B0
1202 MOVOU B0, (ptx)
1203
1204 LEAQ (16*1)(ptx), ptx
1205 LEAQ (16*1)(ctx), ctx
1206
1207 JMP gcmAesDecSinglesLoop
1208
1209 gcmAesDecTail:
1210 // 0 to 15 bytes remain. B1-B7 still hold round keys 1-7.
1211 TESTQ ptxLen, ptxLen
1212 JE gcmAesDecDone
1213 // T1 = andMask entry for ptxLen bytes (offset 16*(ptxLen-1)). aluCTR is reused as the table pointer.
1214 MOVQ ptxLen, aluTMP
1215 SHLQ $4, aluTMP
1216 LEAQ andMask<>(SB), aluCTR
1217 MOVOU -16(aluCTR)(aluTMP*1), T1
1218 // A full 16 bytes are read from ctx; the mask then clears everything beyond ptxLen bytes.
1219 MOVOU (ctx), B0 // I assume there is TAG attached to the ctx, and there is no read overflow
1220 PAND T1, B0
1221 // T1 now keeps the masked ciphertext for the final XOR; B0 is hashed.
1222 MOVOU B0, T1
1223 PSHUFB BSWAP, B0
1224 PXOR ACC0, B0
1225
1226 MOVOU (16*14)(pTbl), ACC0
1227 MOVOU (16*15)(pTbl), ACCM
1228 MOVOU ACC0, ACC1
1229
1230 PCLMULQDQ $0x00, B0, ACC0
1231 PCLMULQDQ $0x11, B0, ACC1
1232 PSHUFD $78, B0, T0
1233 PXOR B0, T0
1234 PCLMULQDQ $0x00, T0, ACCM
1235
1236 PXOR ACC0, ACCM
1237 PXOR ACC1, ACCM
1238 MOVOU ACCM, T0
1239 PSRLDQ $8, ACCM
1240 PSLLDQ $8, T0
1241 PXOR ACCM, ACC1
1242 PXOR T0, ACC0
1243
1244 reduceRound(ACC0)
1245 reduceRound(ACC0)
1246 PXOR ACC1, ACC0
1247 // Generate the last keystream block from slot 0.
1248 MOVOU (0*16)(SP), B0
1249 increment(0)
1250 AESENC B1, B0
1251 AESENC B2, B0
1252 AESENC B3, B0
1253 AESENC B4, B0
1254 AESENC B5, B0
1255 AESENC B6, B0
1256 AESENC B7, B0
1257 MOVOU (16*8)(ks), T0
1258 AESENC T0, B0
1259 MOVOU (16*9)(ks), T0
1260 AESENC T0, B0
1261 MOVOU (16*10)(ks), T0
1262 CMPQ NR, $12
1263 JB decLast3
1264 AESENC T0, B0
1265 MOVOU (16*11)(ks), T0
1266 AESENC T0, B0
1267 MOVOU (16*12)(ks), T0
1268 JE decLast3 // flags are still those of CMPQ NR, $12
1269 AESENC T0, B0
1270 MOVOU (16*13)(ks), T0
1271 AESENC T0, B0
1272 MOVOU (16*14)(ks), T0
1273 decLast3:
1274 AESENCLAST T0, B0
1275 PXOR T1, B0
1276 // Write exactly ptxLen plaintext bytes, lowest byte first, so nothing past the end of the output is touched.
1277 ptxStoreLoop:
1278 PEXTRB $0, B0, (ptx)
1279 PSRLDQ $1, B0
1280 LEAQ 1(ptx), ptx
1281 DECQ ptxLen
1282
1283 JNE ptxStoreLoop
1284
1285 gcmAesDecDone:
1286 // Store the updated hash state back to T.
1287 MOVOU ACC0, (tPtr)
1288 RET
1289
View as plain text