// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
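
// This file implements GCM for arm64 using the hardware AES
// instructions (AESE/AESMC) and the carry-less multiply instructions
// (VPMULL/VPMULL2). GHASH is evaluated with Karatsuba multiplication
// over GF(2^128) on byte-reversed (VREV64) blocks, and AES-CTR is
// interleaved with the hashing in the bulk loops below.
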
8
9 #define B0 V0
10 #define B1 V1
11 #define B2 V2
12 #define B3 V3
13 #define B4 V4
14 #define B5 V5
15 #define B6 V6
16 #define B7 V7
17
18 #define ACC0 V8
19 #define ACC1 V9
20 #define ACCM V10
21
22 #define T0 V11
23 #define T1 V12
24 #define T2 V13
25 #define T3 V14
26
27 #define POLY V15
28 #define ZERO V16
29 #define INC V17
30 #define CTR V18
31
32 #define K0 V19
33 #define K1 V20
34 #define K2 V21
35 #define K3 V22
36 #define K4 V23
37 #define K5 V24
38 #define K6 V25
39 #define K7 V26
40 #define K8 V27
41 #define K9 V28
42 #define K10 V29
43 #define K11 V30
44 #define KLAST V31
45
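// reduce() folds the 256-bit Karatsuba product held in ACC0 (high),
// ACC1 (low) and ACCM (middle) into a 128-bit result in ACC0. The
// middle limb is first corrected (XORed with the high and low limbs)
// and split across the two halves; two VPMULLs by POLY then fold the
// high half down, reducing modulo the GHASH polynomial.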
#define reduce() \
	VEOR ACC0.B16, ACCM.B16, ACCM.B16 \
	VEOR ACC1.B16, ACCM.B16, ACCM.B16 \
	VEXT $8, ZERO.B16, ACCM.B16, T0.B16 \
	VEXT $8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR ACCM.B16, ACC0.B16, ACC0.B16 \
	VEOR T0.B16, ACC1.B16, ACC1.B16 \
	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
	VEXT $8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR T0.B16, ACC0.B16, ACC0.B16 \
	VPMULL POLY.D1, ACC0.D1, T0.Q1 \
	VEOR T0.B16, ACC1.B16, ACC1.B16 \
	VEXT $8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR ACC1.B16, ACC0.B16, ACC0.B16 \

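// gcmAesFinish computes the final authentication tag: it folds the
// GHASH length block (dLen and pLen scaled to bit counts) into the
// running digest at T, multiplies by H via the precomputed table, and
// XORs in tagMask, the encrypted initial counter block.
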
// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD $0xC2, R1
	LSL $56, R1
	MOVD $1, R0
	VMOV R1, POLY.D[0]
	VMOV R0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	MOVD productTable+0(FP), pTbl
	MOVD tagMask+8(FP), tMsk
	MOVD T+16(FP), tPtr
	MOVD pLen+24(FP), plen
	MOVD dLen+32(FP), dlen

	VLD1 (tPtr), [ACC0.B16]
	VLD1 (tMsk), [B1.B16]

	LSL $3, plen
	LSL $3, dlen

	VMOV dlen, B0.D[0]
	VMOV plen, B0.D[1]

	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]

	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64 ACC0.B16, ACC0.B16
	VEOR B1.B16, ACC0.B16, ACC0.B16

	VST1 [ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

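// gcmAesInit derives the hash key H by encrypting the all-zero block
// with the AES key schedule ks, then fills productTable with the
// powers H^1 .. H^8. Each power is stored alongside its Karatsuba
// precomputation (the XOR of its two 64-bit halves), with H^8 at the
// start of the table and H^1 at offset 14*16, the order in which the
// eight-block hashing loops consume them.
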
// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD productTable+0(FP), pTbl
	MOVD ks_base+8(FP), KS
	MOVD ks_len+16(FP), NR

	MOVD $0xC2, I
	LSL $56, I
	VMOV I, POLY.D[0]
	MOVD $1, I
	VMOV I, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P 64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR B0.B16, B0.B16, B0.B16
	AESE T0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T1.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T3.B16, B0.B16
	AESMC B0.B16, B0.B16
	VLD1.P 64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE T0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T1.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T3.B16, B0.B16
	AESMC B0.B16, B0.B16
	TBZ $4, NR, initEncFinish
	VLD1.P 32(KS), [T0.B16, T1.B16]
	AESE T0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T1.B16, B0.B16
	AESMC B0.B16, B0.B16
	TBZ $3, NR, initEncFinish
	VLD1.P 32(KS), [T0.B16, T1.B16]
	AESE T0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T1.B16, B0.B16
	AESMC B0.B16, B0.B16
initEncFinish:
	VLD1 (KS), [T0.B16, T1.B16, T2.B16]
	AESE T0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE T1.B16, B0.B16
	VEOR T2.B16, B0.B16, B0.B16

	VREV64 B0.B16, B0.B16

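	// The hash key is kept byte-reversed (VREV64 above). Doubling it
	// modulo the field polynomial here is the usual trick that lets
	// the VPMULL-based multiplies below work on this reflected
	// representation without a per-block bit reversal.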
	// Multiply by 2 modulo P
	VMOV B0.D[0], I
	ASR $63, I
	VMOV I, T1.D[0]
	VMOV I, T1.D[1]
	VAND POLY.B16, T1.B16, T1.B16
	VUSHR $63, B0.D2, T2.D2
	VEXT $8, ZERO.B16, T2.B16, T2.B16
	VSHL $1, B0.D2, B0.D2
	VEOR T1.B16, B0.B16, B0.B16
	VEOR T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
	VEXT $8, B0.B16, B0.B16, B1.B16
	VEOR B0.B16, B1.B16, B1.B16

	ADD $14*16, pTbl
	VST1 [B0.B16, B1.B16], (pTbl)
	SUB $2*16, pTbl

	VMOV B0.B16, B2.B16
	VMOV B1.B16, B3.B16

	MOVD $7, I

initLoop:
	// Compute powers of H
	SUBS $1, I

	VPMULL B0.D1, B2.D1, T1.Q1
	VPMULL2 B0.D2, B2.D2, T0.Q1
	VPMULL B1.D1, B3.D1, T2.Q1
	VEOR T0.B16, T2.B16, T2.B16
	VEOR T1.B16, T2.B16, T2.B16
	VEXT $8, ZERO.B16, T2.B16, T3.B16
	VEXT $8, T2.B16, ZERO.B16, T2.B16
	VEOR T2.B16, T0.B16, T0.B16
	VEOR T3.B16, T1.B16, T1.B16
	VPMULL POLY.D1, T0.D1, T2.Q1
	VEXT $8, T0.B16, T0.B16, T0.B16
	VEOR T2.B16, T0.B16, T0.B16
	VPMULL POLY.D1, T0.D1, T2.Q1
	VEXT $8, T0.B16, T0.B16, T0.B16
	VEOR T2.B16, T0.B16, T0.B16
	VEOR T1.B16, T0.B16, B2.B16
	VMOV B2.B16, B3.B16
	VEXT $8, B2.B16, B2.B16, B2.B16
	VEOR B2.B16, B3.B16, B3.B16

	VST1 [B2.B16, B3.B16], (pTbl)
	SUB $2*16, pTbl

	BNE initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

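// gcmAesData folds the additional authenticated data into the digest
// at T. It has a fast path for the fixed 13-byte AAD used by TLS
// records, an eight-blocks-per-iteration loop, a single-block loop,
// and a byte-by-byte load for the final partial block.
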
// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P 32(pTbl), [T1.B16, T2.B16] \
	VREV64 X.B16, X.B16 \
	VEXT $8, X.B16, X.B16, T0.B16 \
	VEOR X.B16, T0.B16, T0.B16 \
	VPMULL X.D1, T1.D1, T3.Q1 \
	VEOR T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2 X.D2, T1.D2, T3.Q1 \
	VEOR T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL T0.D1, T2.D1, T3.Q1 \
	VEOR T3.B16, ACCM.B16, ACCM.B16

	MOVD productTable+0(FP), pTbl
	MOVD data_base+8(FP), aut
	MOVD data_len+16(FP), autLen
	MOVD T+32(FP), tPtr

	VEOR ACC0.B16, ACC0.B16, ACC0.B16
	CBZ autLen, dataBail

	MOVD $0xC2, H0
	LSL $56, H0
	VMOV H0, POLY.D[0]
	MOVD $1, H0
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	MOVD pTbl, pTblSave

	CMP $13, autLen
	BEQ dataTLS
	CMP $128, autLen
	BLT startSinglesLoop
	B octetsLoop

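	// TLS AAD is always 13 bytes: load it as 8 + 4 + 1 bytes into the
	// low lanes of B0 (the upper lanes stay zero) and hash the single
	// block.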
dataTLS:
	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]
	VEOR B0.B16, B0.B16, B0.B16

	MOVD (aut), H0
	VMOV H0, B0.D[0]
	MOVW 8(aut), H0
	VMOV H0, B0.S[2]
	MOVB 12(aut), H0
	VMOV H0, B0.B[12]

	MOVD $0, autLen
	B dataMul

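	// Hash eight blocks per iteration. The first block is multiplied
	// by H^8, the next by H^7, and so on down to H^1, so a single
	// reduce() per iteration suffices.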
octetsLoop:
	CMP $128, autLen
	BLT startSinglesLoop
	SUB $128, autLen

	VLD1.P 32(aut), [B0.B16, B1.B16]

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	VLD1.P 32(aut), [B2.B16, B3.B16]
	mulRound(B2)
	mulRound(B3)
	VLD1.P 32(aut), [B4.B16, B5.B16]
	mulRound(B4)
	mulRound(B5)
	VLD1.P 32(aut), [B6.B16, B7.B16]
	mulRound(B6)
	mulRound(B7)

	MOVD pTblSave, pTbl
	reduce()
	B octetsLoop

startSinglesLoop:

	ADD $14*16, pTbl
	VLD1.P (pTbl), [T1.B16, T2.B16]

singlesLoop:

	CMP $16, autLen
	BLT dataEnd
	SUB $16, autLen

	VLD1.P 16(aut), [B0.B16]
dataMul:
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	B singlesLoop

dataEnd:

	CBZ autLen, dataBail
	VEOR B0.B16, B0.B16, B0.B16
	ADD autLen, aut

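	// Load the trailing <16 bytes back to front, shifting B0 up one
	// byte per iteration, so the data ends up in the low lanes of B0
	// with zero padding above, as if a full zero-padded block had been
	// loaded.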
dataLoadLoop:
	MOVB.W -1(aut), H0
	VEXT $15, B0.B16, ZERO.B16, B0.B16
	VMOV H0, B0.B[0]
	SUBS $1, autLen
	BNE dataLoadLoop
	B dataMul

dataBail:
	VST1 [ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

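// gcmAesEnc encrypts src into dst with AES-CTR and folds the produced
// ciphertext into the GHASH digest at T. The wide loop interleaves the
// AES rounds of eight counter blocks at a time; shorter inputs fall
// through to a single-block loop and a masked tail.
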
// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

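// aesrndx8 applies one AES round to the eight blocks in B0-B7 with a
// shared round key, keeping the AESE/AESMC pipeline busy; aesrndlastx8
// is the final AESE-only round, with the last AddRoundKey applied
// separately by XORing KLAST.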
#define aesrndx8(K) \
	AESE K.B16, B0.B16 \
	AESMC B0.B16, B0.B16 \
	AESE K.B16, B1.B16 \
	AESMC B1.B16, B1.B16 \
	AESE K.B16, B2.B16 \
	AESMC B2.B16, B2.B16 \
	AESE K.B16, B3.B16 \
	AESMC B3.B16, B3.B16 \
	AESE K.B16, B4.B16 \
	AESMC B4.B16, B4.B16 \
	AESE K.B16, B5.B16 \
	AESMC B5.B16, B5.B16 \
	AESE K.B16, B6.B16 \
	AESMC B6.B16, B6.B16 \
	AESE K.B16, B7.B16 \
	AESMC B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE K.B16, B0.B16 \
	AESE K.B16, B1.B16 \
	AESE K.B16, B2.B16 \
	AESE K.B16, B3.B16 \
	AESE K.B16, B4.B16 \
	AESE K.B16, B5.B16 \
	AESE K.B16, B6.B16 \
	AESE K.B16, B7.B16

	MOVD productTable+0(FP), pTbl
	MOVD dst+8(FP), dstPtr
	MOVD src_base+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD ctr+56(FP), ctrPtr
	MOVD T+64(FP), tPtr
	MOVD ks_base+72(FP), ks
	MOVD ks_len+80(FP), NR

	MOVD $0xC2, H1
	LSL $56, H1
	MOVD $1, H0
	VMOV H1, POLY.D[0]
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks); the key size is derived from it by the TBZ
	// bit tests below
	MOVD pTbl, pTblSave
	// Current tag, after AAD
	VLD1 (tPtr), [ACC0.B16]
	VEOR ACC1.B16, ACC1.B16, ACC1.B16
	VEOR ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1 (ctrPtr), [CTR.B16]
	VEOR INC.B16, INC.B16, INC.B16
	MOVD $1, H0
	VMOV H0, INC.S[3]
	VREV32 CTR.B16, CTR.B16
	VADD CTR.S4, INC.S4, CTR.S4
	// Skip to <8 blocks loop
	CMP $128, srcPtrLen

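	// len(ks) is 44, 52 or 60 for AES-128/192/256, so bit 4 of NR is
	// clear only for AES-128 and, once that case is excluded, bit 3 is
	// clear only for AES-192. The flags set by the CMP above are not
	// consumed until the BLT after the round keys are loaded.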
	MOVD ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P 64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P 64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P 48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV K10.B16, KLAST.B16

	BLT startSingles
	// There are at least 8 blocks to encrypt
	TBZ $4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV K8.B16, K10.B16
	VMOV K9.B16, K11.B16
	VMOV KLAST.B16, K8.B16
	VLD1.P 16(H0), [K9.B16]
	VLD1.P 16(H0), [KLAST.B16]
	TBZ $3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV KLAST.B16, K8.B16
	VLD1.P 16(H0), [K9.B16]
	VLD1.P 16(H0), [KLAST.B16]
	ADD $10*16, ks, H0
	MOVD H0, curK

octetsLoop:
	SUB $128, srcPtrLen

	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VREV32 B0.B16, B0.B16
	VADD B1.S4, INC.S4, B2.S4
	VREV32 B1.B16, B1.B16
	VADD B2.S4, INC.S4, B3.S4
	VREV32 B2.B16, B2.B16
	VADD B3.S4, INC.S4, B4.S4
	VREV32 B3.B16, B3.B16
	VADD B4.S4, INC.S4, B5.S4
	VREV32 B4.B16, B4.B16
	VADD B5.S4, INC.S4, B6.S4
	VREV32 B5.B16, B5.B16
	VADD B6.S4, INC.S4, B7.S4
	VREV32 B6.B16, B6.B16
	VADD B7.S4, INC.S4, CTR.S4
	VREV32 B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ $4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ $3, NR, octetsFinish
	VLD1.P 32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR KLAST.B16, B0.B16, B0.B16
	VEOR KLAST.B16, B1.B16, B1.B16
	VEOR KLAST.B16, B2.B16, B2.B16
	VEOR KLAST.B16, B3.B16, B3.B16
	VEOR KLAST.B16, B4.B16, B4.B16
	VEOR KLAST.B16, B5.B16, B5.B16
	VEOR KLAST.B16, B6.B16, B6.B16
	VEOR KLAST.B16, B7.B16, B7.B16

	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B0.B16, T1.B16, B0.B16
	VEOR B1.B16, T2.B16, B1.B16
	VST1.P [B0.B16, B1.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B2.B16, T1.B16, B2.B16
	VEOR B3.B16, T2.B16, B3.B16
	VST1.P [B2.B16, B3.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B4.B16, T1.B16, B4.B16
	VEOR B5.B16, T2.B16, B5.B16
	VST1.P [B4.B16, B5.B16], 32(dstPtr)
	VLD1.P 32(srcPtr), [T1.B16, T2.B16]
	VEOR B6.B16, T1.B16, B6.B16
	VEOR B7.B16, T2.B16, B7.B16
	VST1.P [B6.B16, B7.B16], 32(dstPtr)

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD pTblSave, pTbl
	reduce()

	CMP $128, srcPtrLen
	BGE octetsLoop

startSingles:
	CBZ srcPtrLen, done
	ADD $14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P (pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD $128, ks
	VLD1.P 48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV K10.B16, KLAST.B16
	TBZ $4, NR, singlesLoop
	VLD1.P 32(ks), [B1.B16, B2.B16]
	VMOV B2.B16, KLAST.B16
	TBZ $3, NR, singlesLoop
	VLD1.P 32(ks), [B3.B16, B4.B16]
	VMOV B4.B16, KLAST.B16

singlesLoop:
	CMP $16, srcPtrLen
	BLT tail
	SUB $16, srcPtrLen

	VLD1.P 16(srcPtr), [T0.B16]
	VEOR KLAST.B16, T0.B16, T0.B16

	VREV32 CTR.B16, B0.B16
	VADD CTR.S4, INC.S4, CTR.S4

	AESE K0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K1.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K3.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K4.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K5.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K6.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K7.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K8.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K9.B16, B0.B16
	TBZ $4, NR, singlesLast
	AESMC B0.B16, B0.B16
	AESE K10.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B1.B16, B0.B16
	TBZ $3, NR, singlesLast
	AESMC B0.B16, B0.B16
	AESE B2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B3.B16, B0.B16
singlesLast:
	VEOR T0.B16, B0.B16, B0.B16
encReduce:
	VST1.P [B0.B16], 16(dstPtr)

	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16

	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1

	reduce()

	B singlesLoop
tail:
	CBZ srcPtrLen, done

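	// Encrypt the final partial block: gather the remaining bytes back
	// to front into T0 and build a matching all-ones byte mask in T3,
	// so the GHASH input can be limited to the bytes actually present.
	// encReduce still stores a full 16 bytes; the overrun past the end
	// of the message presumably lands in the space the caller reserves
	// for the tag, mirroring the over-read in gcmAesDec below.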
	VEOR T0.B16, T0.B16, T0.B16
	VEOR T3.B16, T3.B16, T3.B16
	MOVD $0, H1
	SUB $1, H1
	ADD srcPtrLen, srcPtr

	TBZ $3, srcPtrLen, ld4
	MOVD.W -8(srcPtr), H0
	VMOV H0, T0.D[0]
	VMOV H1, T3.D[0]
ld4:
	TBZ $2, srcPtrLen, ld2
	MOVW.W -4(srcPtr), H0
	VEXT $12, T0.B16, ZERO.B16, T0.B16
	VEXT $12, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.S[0]
	VMOV H1, T3.S[0]
ld2:
	TBZ $1, srcPtrLen, ld1
	MOVH.W -2(srcPtr), H0
	VEXT $14, T0.B16, ZERO.B16, T0.B16
	VEXT $14, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.H[0]
	VMOV H1, T3.H[0]
ld1:
	TBZ $0, srcPtrLen, ld0
	MOVB.W -1(srcPtr), H0
	VEXT $15, T0.B16, ZERO.B16, T0.B16
	VEXT $15, T3.B16, ZERO.B16, T3.B16
	VMOV H0, T0.B[0]
	VMOV H1, T3.B[0]
ld0:

	MOVD ZR, srcPtrLen
	VEOR KLAST.B16, T0.B16, T0.B16
	VREV32 CTR.B16, B0.B16

	AESE K0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K1.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K3.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K4.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K5.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K6.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K7.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K8.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K9.B16, B0.B16
	TBZ $4, NR, tailLast
	AESMC B0.B16, B0.B16
	AESE K10.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B1.B16, B0.B16
	TBZ $3, NR, tailLast
	AESMC B0.B16, B0.B16
	AESE B2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B3.B16, B0.B16

tailLast:
	VEOR T0.B16, B0.B16, B0.B16
	VAND T3.B16, B0.B16, B0.B16
	B encReduce

done:
	VST1 [ACC0.B16], (tPtr)
	RET

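// gcmAesDec mirrors gcmAesEnc, except that the GHASH digest must be
// computed over the ciphertext read from src, so the loaded input
// blocks (rather than the outputs) are fed to the multiplications.
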
// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD productTable+0(FP), pTbl
	MOVD dst+8(FP), dstPtr
	MOVD src_base+32(FP), srcPtr
	MOVD src_len+40(FP), srcPtrLen
	MOVD ctr+56(FP), ctrPtr
	MOVD T+64(FP), tPtr
	MOVD ks_base+72(FP), ks
	MOVD ks_len+80(FP), NR

	MOVD $0xC2, H1
	LSL $56, H1
	MOVD $1, H0
	VMOV H1, POLY.D[0]
	VMOV H0, POLY.D[1]
	VEOR ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks); the key size is derived from it by the TBZ
	// bit tests below
	MOVD pTbl, pTblSave
	// Current tag, after AAD
	VLD1 (tPtr), [ACC0.B16]
	VEOR ACC1.B16, ACC1.B16, ACC1.B16
	VEOR ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1 (ctrPtr), [CTR.B16]
	VEOR INC.B16, INC.B16, INC.B16
	MOVD $1, H0
	VMOV H0, INC.S[3]
	VREV32 CTR.B16, CTR.B16
	VADD CTR.S4, INC.S4, CTR.S4

	MOVD ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P 64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P 64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P 48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP $128, srcPtrLen
	BLT startSingles
	// There are at least 8 blocks to decrypt
	TBZ $4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV K8.B16, K10.B16
	VMOV K9.B16, K11.B16
	VMOV KLAST.B16, K8.B16
	VLD1.P 16(H0), [K9.B16]
	VLD1.P 16(H0), [KLAST.B16]
	TBZ $3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV KLAST.B16, K8.B16
	VLD1.P 16(H0), [K9.B16]
	VLD1.P 16(H0), [KLAST.B16]
	ADD $10*16, ks, H0
	MOVD H0, curK

octetsLoop:
	SUB $128, srcPtrLen

	VMOV CTR.B16, B0.B16
	VADD B0.S4, INC.S4, B1.S4
	VREV32 B0.B16, B0.B16
	VADD B1.S4, INC.S4, B2.S4
	VREV32 B1.B16, B1.B16
	VADD B2.S4, INC.S4, B3.S4
	VREV32 B2.B16, B2.B16
	VADD B3.S4, INC.S4, B4.S4
	VREV32 B3.B16, B3.B16
	VADD B4.S4, INC.S4, B5.S4
	VREV32 B4.B16, B4.B16
	VADD B5.S4, INC.S4, B6.S4
	VREV32 B5.B16, B5.B16
	VADD B6.S4, INC.S4, B7.S4
	VREV32 B6.B16, B6.B16
	VADD B7.S4, INC.S4, CTR.S4
	VREV32 B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ $4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ $3, NR, octetsFinish
	VLD1.P 32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR KLAST.B16, B0.B16, T1.B16
	VEOR KLAST.B16, B1.B16, T2.B16
	VEOR KLAST.B16, B2.B16, B2.B16
	VEOR KLAST.B16, B3.B16, B3.B16
	VEOR KLAST.B16, B4.B16, B4.B16
	VEOR KLAST.B16, B5.B16, B5.B16
	VEOR KLAST.B16, B6.B16, B6.B16
	VEOR KLAST.B16, B7.B16, B7.B16

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B0.B16, T1.B16, T1.B16
	VEOR B1.B16, T2.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)

	VLD1.P 32(pTbl), [T1.B16, T2.B16]
	VREV64 B0.B16, B0.B16
	VEOR ACC0.B16, B0.B16, B0.B16
	VEXT $8, B0.B16, B0.B16, T0.B16
	VEOR B0.B16, T0.B16, T0.B16
	VPMULL B0.D1, T1.D1, ACC1.Q1
	VPMULL2 B0.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B2.B16, B0.B16, T1.B16
	VEOR B3.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B4.B16, B0.B16, T1.B16
	VEOR B5.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P 32(srcPtr), [B0.B16, B1.B16]
	VEOR B6.B16, B0.B16, T1.B16
	VEOR B7.B16, B1.B16, T2.B16
	VST1.P [T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD pTblSave, pTbl
	reduce()

	CMP $128, srcPtrLen
	BGE octetsLoop

startSingles:
	CBZ srcPtrLen, done
	ADD $14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P (pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD $128, ks
	VLD1.P 48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV K10.B16, KLAST.B16
	TBZ $4, NR, singlesLoop
	VLD1.P 32(ks), [B1.B16, B2.B16]
	VMOV B2.B16, KLAST.B16
	TBZ $3, NR, singlesLoop
	VLD1.P 32(ks), [B3.B16, B4.B16]
	VMOV B4.B16, KLAST.B16

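	// B5 keeps the byte-reversed ciphertext block for the GHASH
	// multiply while B0 carries the counter-mode keystream.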
singlesLoop:
	CMP $16, srcPtrLen
	BLT tail
	SUB $16, srcPtrLen

	VLD1.P 16(srcPtr), [T0.B16]
	VREV64 T0.B16, B5.B16
	VEOR KLAST.B16, T0.B16, T0.B16

	VREV32 CTR.B16, B0.B16
	VADD CTR.S4, INC.S4, CTR.S4

	AESE K0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K1.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K3.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K4.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K5.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K6.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K7.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K8.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K9.B16, B0.B16
	TBZ $4, NR, singlesLast
	AESMC B0.B16, B0.B16
	AESE K10.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B1.B16, B0.B16
	TBZ $3, NR, singlesLast
	AESMC B0.B16, B0.B16
	AESE B2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B3.B16, B0.B16
singlesLast:
	VEOR T0.B16, B0.B16, B0.B16

	VST1.P [B0.B16], 16(dstPtr)

	VEOR ACC0.B16, B5.B16, B5.B16
	VEXT $8, B5.B16, B5.B16, T0.B16
	VEOR B5.B16, T0.B16, T0.B16
	VPMULL B5.D1, T1.D1, ACC1.Q1
	VPMULL2 B5.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	reduce()

	B singlesLoop
tail:
	CBZ srcPtrLen, done

	VREV32 CTR.B16, B0.B16
	VADD CTR.S4, INC.S4, CTR.S4

	AESE K0.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K1.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K3.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K4.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K5.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K6.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K7.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K8.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE K9.B16, B0.B16
	TBZ $4, NR, tailLast
	AESMC B0.B16, B0.B16
	AESE K10.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B1.B16, B0.B16
	TBZ $3, NR, tailLast
	AESMC B0.B16, B0.B16
	AESE B2.B16, B0.B16
	AESMC B0.B16, B0.B16
	AESE B3.B16, B0.B16
tailLast:
	VEOR KLAST.B16, B0.B16, B0.B16

	// It is safe to load a full 16 bytes from srcPtr even though fewer
	// remain: the ciphertext is followed by the 16-byte tag, so the
	// over-read stays within the source buffer.
	VLD1 (srcPtr), [B5.B16]

	VEOR B5.B16, B0.B16, B0.B16

	VEOR T3.B16, T3.B16, T3.B16
	MOVD $0, H1
	SUB $1, H1

	TBZ $3, srcPtrLen, ld4
	VMOV B0.D[0], H0
	MOVD.P H0, 8(dstPtr)
	VMOV H1, T3.D[0]
	VEXT $8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ $2, srcPtrLen, ld2
	VMOV B0.S[0], H0
	MOVW.P H0, 4(dstPtr)
	VEXT $12, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.S[0]
	VEXT $4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ $1, srcPtrLen, ld1
	VMOV B0.H[0], H0
	MOVH.P H0, 2(dstPtr)
	VEXT $14, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.H[0]
	VEXT $2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ $0, srcPtrLen, ld0
	VMOV B0.B[0], H0
	MOVB.P H0, 1(dstPtr)
	VEXT $15, T3.B16, ZERO.B16, T3.B16
	VMOV H1, T3.B[0]
ld0:

	VAND T3.B16, B5.B16, B5.B16
	VREV64 B5.B16, B5.B16

	VEOR ACC0.B16, B5.B16, B5.B16
	VEXT $8, B5.B16, B5.B16, T0.B16
	VEOR B5.B16, T0.B16, T0.B16
	VPMULL B5.D1, T1.D1, ACC1.Q1
	VPMULL2 B5.D2, T1.D2, ACC0.Q1
	VPMULL T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1 [ACC0.B16], (tPtr)

	RET
