src/crypto/aes/gcm_ppc64x.s
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Portions based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// from commit d47afb3c.

// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.

// Portions that use the stitched AES-GCM approach in counterCryptASM
// are based on code found in
// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s

#include "textflag.h"

#define XIP R3
#define HTBL R4
#define INP R5
#define LEN R6

#define XL V0
#define XM V1
#define XH V2
#define IN V3
#define ZERO V4
#define T0 V5
#define T1 V6
#define T2 V7
#define XC2 V8
#define H V9
#define HH V10
#define HL V11
#define LEMASK V12
#define XL1 V13
#define XM1 V14
#define XH1 V15
#define IN1 V16
#define H2 V17
#define H2H V18
#define H2L V19
#define XL3 V20
#define XM2 V21
#define IN2 V22
#define H3L V23
#define H3 V24
#define H3H V25
#define XH3 V26
#define XM3 V27
#define IN3 V28
#define H4L V29
#define H4 V30
#define H4H V31

#define IN0 IN
#define H21L HL
#define H21H HH
#define LOPERM H2L
#define HIPERM H2H

#define VXL VS32
#define VIN VS35
#define VXC2 VS40
#define VH VS41
#define VHH VS42
#define VHL VS43
#define VIN1 VS48
#define VH2 VS49
#define VH2H VS50
#define VH2L VS51

#define VIN2 VS54
#define VH3L VS55
#define VH3 VS56
#define VH3H VS57
#define VIN3 VS60
#define VH4L VS61
#define VH4 VS62
#define VH4H VS63

#define VIN0 VIN

#define ESPERM V10
#define TMP2 V11

// The following macros provide implementations
// appropriate for the target endianness as well as
// ISA-specific variants for POWER8 and POWER9.
#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA)(RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
# else
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT \
	VPERM VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM VS, VS, ESPERM, TMP2; \
	STXVD2X TMP2, (RA+RB)

# endif
#else
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT

#define P8_STXVB16X(VS,RA,RB) \
	STXVD2X VS, (RA+RB)

#endif
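
// Note: on little endian POWER8 the P8_LXVB16X/P8_STXVB16X macros emulate
// the POWER9 byte-reversed vector load/store (LXVB16X/STXVB16X) by pairing
// LXVD2X/STXVD2X with a VPERM through ESPERM, which is why NEEDS_ESPERM is
// defined for that case; on big endian the plain load/store already has the
// desired byte order.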

#define MASK_PTR R8

#define MASKV V0
#define INV V1

// The following macros are used for
// the stitched implementation within
// counterCryptASM.

// Load the initial GCM counter value
// in V30 and set up the counter increment
// in V31
#define SETUP_COUNTER \
	P8_LXVB16X(COUNTER, R0, V30); \
	VSPLTISB $1, V28; \
	VXOR V31, V31, V31; \
	VSLDOI $1, V31, V28, V31
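
// V31 ends up as {0, ..., 0, 0x01}: VSPLTISB fills V28 with 0x01 bytes and
// VSLDOI shifts a single one of them into the low byte of the zeroed V31.
// Since VADDUWM below adds 32-bit words, adding V31 to the counter block
// increments only its low-order word, matching GCM's 32-bit counter wrap.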

// These macros set up the initial value
// for a single encryption, or 4 or 8
// stitched encryptions implemented
// with interleaving vciphers.
//
// The input value for each encryption
// is generated by XORing the counter
// from V30 with the first key in VS0
// and incrementing the counter.
//
// Single encryption in V15
#define GEN_VCIPHER_INPUT \
	XXLOR VS0, VS0, V29 \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30

// 4 encryptions in V15 - V18
#define GEN_VCIPHER_4_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30

// 8 encryptions in V15 - V22
#define GEN_VCIPHER_8_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V19; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V20; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V21; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V22; \
	VADDUWM V30, V31, V30
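
// In effect each generated input is the current counter block with the
// round-0 key (VS0) already XORed in (AES AddRoundKey), roughly the
// pseudo-Go sketch below; the names input, counter, roundKey0 and incr32
// are illustrative only:
//
//	for i := range input {               // 1, 4 or 8 blocks
//		input[i] = counter ^ roundKey0   // whiten with the round-0 key
//		counter = incr32(counter)        // bump the low 32 bits, see above
//	}
//
// so the key loops that follow can start directly at round 1 (VS1).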

// Load the keys to be used for
// encryption based on key_len.
// Keys are in VS0 - VS14
// depending on key_len.
// Valid key sizes are verified
// here. CR2 is set and used
// throughout to check key_len.
#define LOAD_KEYS(blk_key, key_len) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	LXVD2X (blk_key)(R0), VS0; \
	LXVD2X (blk_key)(R16), VS1; \
	LXVD2X (blk_key)(R17), VS2; \
	LXVD2X (blk_key)(R18), VS3; \
	LXVD2X (blk_key)(R19), VS4; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS5; \
	LXVD2X (blk_key)(R17), VS6; \
	LXVD2X (blk_key)(R18), VS7; \
	LXVD2X (blk_key)(R19), VS8; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS9; \
	LXVD2X (blk_key)(R17), VS10; \
	CMP key_len, $12, CR2; \
	CMP key_len, $10; \
	BEQ keysLoaded; \
	LXVD2X (blk_key)(R18), VS11; \
	LXVD2X (blk_key)(R19), VS12; \
	BEQ CR2, keysLoaded; \
	ADD $64, R16; \
	ADD $64, R17; \
	LXVD2X (blk_key)(R16), VS13; \
	LXVD2X (blk_key)(R17), VS14; \
	CMP key_len, $14; \
	BEQ keysLoaded; \
	MOVD R0,0(R0); \
keysLoaded:
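
// Note: key_len here is the AES round count (10, 12 or 14 for 128-, 192-
// and 256-bit keys), so the expanded schedule of round_count+1 keys lands
// in VS0-VS10, VS0-VS12 or VS0-VS14 respectively. Any other value reaches
// the MOVD R0, 0(R0) store to address zero, which faults rather than
// continuing with an incomplete key schedule.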

// Encrypt 1 (vin) with first 9
// keys from VS1 - VS9.
#define VCIPHER_1X9_KEYS(vin) \
	XXLOR VS1, VS1, V23; \
	XXLOR VS2, VS2, V24; \
	XXLOR VS3, VS3, V25; \
	XXLOR VS4, VS4, V26; \
	XXLOR VS5, VS5, V27; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin; \
	VCIPHER vin, V27, vin; \
	XXLOR VS6, VS6, V23; \
	XXLOR VS7, VS7, V24; \
	XXLOR VS8, VS8, V25; \
	XXLOR VS9, VS9, V26; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt 1 value (vin) with
// 2 specified keys
#define VCIPHER_1X2_KEYS(vin, key1, key2) \
	XXLOR key1, key1, V25; \
	XXLOR key2, key2, V26; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt 4 values in V15 - V18
// with the specified key from
// VS1 - VS9.
#define VCIPHER_4X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18

// Encrypt 8 values in V15 - V22
// with the specified key,
// assuming it is a VSreg
#define VCIPHER_8X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18; \
	VCIPHER V19, V23, V19; \
	VCIPHER V20, V23, V20; \
	VCIPHER V21, V23, V21; \
	VCIPHER V22, V23, V22
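
// Applying one round key across four or eight independent blocks before
// moving on to the next key is what this file calls stitching: the VCIPHER
// results do not depend on each other, so the processor can overlap their
// latencies instead of waiting for a single block to finish each round.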

// Load input block into V1-V4
// in big endian order and
// update blk_inp by 64.
#define LOAD_INPUT_BLOCK64(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	ADD $64, blk_inp

// Load input block into V1-V8
// in big endian order and
// Update blk_inp by 128
#define LOAD_INPUT_BLOCK128(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	MOVD $80, R20; \
	MOVD $96, R21; \
	MOVD $112, R22; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	P8_LXVB16X(blk_inp,R19,V5); \
	P8_LXVB16X(blk_inp,R20,V6); \
	P8_LXVB16X(blk_inp,R21,V7); \
	P8_LXVB16X(blk_inp,R22,V8); \
	ADD $128, blk_inp

// Finish encryption on 8 streams and
// XOR with input block
#define VCIPHERLAST8_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	VCIPHERLAST V19, V23, V19; \
	VCIPHERLAST V20, V23, V20; \
	VCIPHERLAST V21, V23, V21; \
	VCIPHERLAST V22, V23, V22; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4; \
	XXLXOR V5, V19, V5; \
	XXLXOR V6, V20, V6; \
	XXLXOR V7, V21, V7; \
	XXLXOR V8, V22, V8

// Finish encryption on 4 streams and
// XOR with input block
#define VCIPHERLAST4_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4

// Store output block from V1-V8
// in big endian order and
// Update blk_out by 128
#define STORE_OUTPUT_BLOCK128(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	P8_STXVB16X(V5,blk_out,R19); \
	P8_STXVB16X(V6,blk_out,R20); \
	P8_STXVB16X(V7,blk_out,R21); \
	P8_STXVB16X(V8,blk_out,R22); \
	ADD $128, blk_out

// Store output block from V1-V4
// in big endian order and
// Update blk_out by 64
#define STORE_OUTPUT_BLOCK64(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	ADD $64, blk_out

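// gcmInit fills productTable with the constants GHASH needs later on,
// roughly laid out by the stores below as:
//
//	0x00        reduction constant derived from the GHASH polynomial (0xc2...)
//	0x10-0x30   H   split as low half, full value, high half
//	0x40-0x60   H^2 split the same way
//	0x70-0x90   H^3 split the same way
//	0xa0-0xc0   H^4 split the same way
//
// where H is the (twisted) hash key computed from the input h.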
// func gcmInit(productTable *[256]byte, h []byte)
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	VSPLTISB $-16, XC2 // 0xf0
	VSPLTISB $1, T0 // one
	VADDUBM XC2, XC2, XC2 // 0xe0
	VXOR ZERO, ZERO, ZERO
	VOR XC2, T0, XC2 // 0xe1
	VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI $1, ZERO, T0, T1 // ...1
	VADDUBM XC2, XC2, XC2 // 0xc2...
	VSPLTISB $7, T2
	VOR XC2, T1, XC2 // 0xc2....01
	VSPLTB $0, H, T1 // most significant byte
	VSL H, T0, H // H<<=1
	VSRAB T1, T2, T1 // broadcast carry bit
	VAND T1, XC2, T1
	VXOR H, T1, IN // twisted H

	VSLDOI $8, IN, IN, H // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
	VSLDOI $8, ZERO, H, HL // ... and split
	VSLDOI $8, H, ZERO, HH

	STXVD2X VXC2, (XIP+R0) // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0x60, R10

	VPMSUMD IN, HL, XL // H.lo·H.lo
	VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, IN1

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VH2L, (XIP+R8) // save H^2
	MOVD $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD $0x90, R10

	VPMSUMD IN, H2L, XL // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VPMSUMD XL1, XC2, HH // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR XL, T0, XL
	VXOR XH, T1, XH
	VXOR XL1, HL, XL1
	VXOR XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR XL, T2, XL
	VXOR XL1, HH, XL1

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VSLDOI $8, XL1, XL1, H // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR T1, XH, T1
	VXOR H, XH1, H
	VXOR XL, T1, XL
	VXOR XL1, H, XL1

	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VHL, (XIP+R8) // save H^3
	MOVD $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0xc0, R10
	STXVD2X VH2L, (XIP+R8) // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET

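// GHASH processes the input as 16-byte blocks I0, I1, ... over GF(2^128),
// updating the running digest as Xi = (Xi ^ Ij) * H. The four-wide path
// below uses the precomputed powers of H to aggregate four blocks per
// reduction:
//
//	Xi = ((Xi ^ I0)*H^4) ^ (I1*H^3) ^ (I2*H^2) ^ (I3*H)
//
// Each multiplication is done with three VPMSUMD carry-less multiplies
// (low, middle and high partial products) followed by the two reduction
// phases against the 0xc2... constant stored by gcmInit.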
// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	MOVD $0x40, R8
	LXVD2X (HTBL)(R9), VH
	MOVD $0x50, R9
	LXVD2X (HTBL)(R10), VHH
	MOVD $0x60, R10
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM XL, XL, LEMASK, XL
#endif
	VXOR ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE gcm_ghash_p8_4x

	LXVD2X (INP)(R0), VIN
	ADD $16, INP, INP
	SUBCCC $16, LEN, LEN
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR IN, XL, IN
	BEQ short

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD LEN, INP, R9 // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	LXVD2X (INP)(R0), VIN1
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
#endif

	SUBC $32, LEN, LEN
	VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
	SUBE R11, R11, R11 // borrow?-1:0
	VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND LEN, R11, R11
	VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
	ADD R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH1, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL
	LXVD2X (INP)(R8), VIN
	ADD $32, INP, INP

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR T1, XH, T1
	VXOR IN, T1, IN
	VXOR IN, XL, IN
	CMP R9, INP
	BGT loop_2x // done yet?

	CMPWU LEN, $0
	BNE even

short:
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1

even:
	VXOR XL, T1, XL
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0)

	OR R12, R12, R12 // write out Xi
	RET

gcm_ghash_p8_4x:
	LVSL (R8)(R0), T0 // 0x0001..0e0f
	MOVD $0x70, R8
	LXVD2X (HTBL)(R9), VH2
	MOVD $0x80, R9
	VSPLTISB $8, T1 // 0x0808..0808
	MOVD $0x90, R10
	LXVD2X (HTBL)(R8), VH3L // load H^3
	MOVD $0xa0, R8
	LXVD2X (HTBL)(R9), VH3
	MOVD $0xb0, R9
	LXVD2X (HTBL)(R10), VH3H
	MOVD $0xc0, R10
	LXVD2X (HTBL)(R8), VH4L // load H^4
	MOVD $0x10, R8
	LXVD2X (HTBL)(R9), VH4
	MOVD $0x20, R9
	LXVD2X (HTBL)(R10), VH4H
	MOVD $0x30, R10

	VSLDOI $8, ZERO, T1, T2 // 0x0000..0808
	VADDUBM T0, T2, HIPERM // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // this allows the sign bit to be used as carry

	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
#endif

	VXOR IN0, XL, XH

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VPERM H2, H, HIPERM, H21L
	VPERM IN2, IN3, LOPERM, T0
	VPERM H2, H, LOPERM, H21H
	VPERM IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x

loop_4x:
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM
	VXOR XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2 // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH
	VXOR XM2, XM1, XM2
	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE loop_4x

tail_4x:
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH3, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ done_4x

	LXVD2X (INP)(R0), VIN0
	CMPU LEN, $2
	MOVD $-4, LEN
	BLT one
	LXVD2X (INP)(R8), VIN1
	BEQ two

three:
	LXVD2X (INP)(R9), VIN2
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
#endif

	VXOR IN0, XL, XH
	VOR H3L, H3L, H4L
	VOR H3, H3, H4
	VOR H3H, H3H, H4H

	VPERM IN1, IN2, LOPERM, T0
	VPERM IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP tail_4x

two:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
#endif

	VXOR IN, XL, XH
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	VSLDOI $8, ZERO, H2, H4L
	VOR H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+2.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VSLDOI $8, ZERO, H, H4L
	VOR H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

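// gcmMul performs a single GHASH multiplication, Xi <- Xi*H, using the same
// three-way VPMSUMD split and two-phase reduction as gcmHash above.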
// func gcmMul(output []byte, productTable *[256]byte)
TEXT ·gcmMul(SB), NOSPLIT, $0-32
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VIN // load Xi

	LXVD2X (HTBL)(R8), VHL // Load pre-computed table
	LXVD2X (HTBL)(R9), VH
	LXVD2X (HTBL)(R10), VHH
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR ZERO, ZERO, ZERO

	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

#define BLK_INP R3
#define BLK_OUT R4
#define BLK_KEY R5
#define KEY_LEN R6
#define BLK_IDX R7
#define IDX R8
#define IN_LEN R9
#define COUNTER R10
#define CONPTR R14
#define MASK V5

// Implementation of the counterCrypt function in assembler.
// The original loop is unrolled so that multiple encryption
// streams can be processed in parallel, which is achieved by
// interleaving the vcipher instructions from each stream. This
// is also referred to as stitching, and provides significant
// performance improvements. Some macros are defined which enable
// execution for big or little endian as well as different ISA targets.
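//
// For orientation only, a hypothetical Go sketch of the scalar logic that
// the stitched loops below unroll (ctrSketch and encBlock are illustrative
// names; encBlock stands in for one AES block encryption of the counter,
// e.g. cipher.Block.Encrypt; imports "crypto/subtle" and "encoding/binary"
// assumed):
//
//	func ctrSketch(out, in []byte, ctr *[16]byte, encBlock func(dst, src []byte)) {
//		var ks [16]byte
//		for len(in) > 0 {
//			encBlock(ks[:], ctr[:])              // E_K(counter) -> keystream block
//			n := subtle.XORBytes(out, in, ks[:]) // XOR up to 16 bytes of input
//			out, in = out[n:], in[n:]
//			// GCM increments only the low 32 bits of the counter block.
//			binary.BigEndian.PutUint32(ctr[12:], binary.BigEndian.Uint32(ctr[12:])+1)
//		}
//	}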
//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
//func counterCryptASM(xr, out, in, counter, key)
TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
	MOVD xr(FP), KEY_LEN
	MOVD out+8(FP), BLK_OUT
	MOVD out_len+16(FP), R8
	MOVD in+32(FP), BLK_INP
	MOVD in_len+40(FP), IN_LEN
	MOVD counter+56(FP), COUNTER
	MOVD key+64(FP), BLK_KEY

	// Set up permute string when needed.
#ifdef NEEDS_ESPERM
	MOVD $·rcon(SB), R14
	LVX (R14), ESPERM // Permute value for P8_ macros.
#endif
	SETUP_COUNTER // V30 Counter V31 BE {0, 0, 0, 1}
	LOAD_KEYS(BLK_KEY, KEY_LEN) // VS1 - VS10/12/14 based on keysize
	CMP IN_LEN, $128
	BLT block64
block128_loop:
	// Do 8 encryptions in parallel by setting
	// input values in V15-V22 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_8_INPUTS
	VCIPHER_8X1_KEY(VS1)
	VCIPHER_8X1_KEY(VS2)
	VCIPHER_8X1_KEY(VS3)
	VCIPHER_8X1_KEY(VS4)
	VCIPHER_8X1_KEY(VS5)
	VCIPHER_8X1_KEY(VS6)
	VCIPHER_8X1_KEY(VS7)
	VCIPHER_8X1_KEY(VS8)
	VCIPHER_8X1_KEY(VS9)
	// Additional encryptions are done based on
	// the key length, with the last key moved
	// to V23 for use with VCIPHERLAST.
	// CR2 = CMP key_len, $12
	XXLOR VS10, VS10, V23
	BLT CR2, block128_last // key_len = 10
	VCIPHER_8X1_KEY(VS10)
	VCIPHER_8X1_KEY(VS11)
	XXLOR VS12,VS12,V23
	BEQ CR2, block128_last // key_len = 12
	VCIPHER_8X1_KEY(VS12)
	VCIPHER_8X1_KEY(VS13)
	XXLOR VS14,VS14,V23 // key_len = 14
block128_last:
	// vcipher encryptions are in V15-V22 at this
	// point with vcipherlast remaining to be done.
	// Load input block into V1-V8, setting index offsets
	// in R16-R22 to use with the STORE.
	LOAD_INPUT_BLOCK128(BLK_INP)
	// Do VCIPHERLAST on the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST8_XOR_INPUT
	// Store the results (8*16) and update BLK_OUT by 128.
	STORE_OUTPUT_BLOCK128(BLK_OUT)
	ADD $-128, IN_LEN // input size
	CMP IN_LEN, $128 // check if >= blocksize
	BGE block128_loop // next input block
	CMP IN_LEN, $0
	BEQ done
block64:
	CMP IN_LEN, $64 // Check if >= 64
	BLT block16_loop
	// Do 4 encryptions in parallel by setting
	// input values in V15-V18 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_4_INPUTS
	VCIPHER_4X1_KEY(VS1)
	VCIPHER_4X1_KEY(VS2)
	VCIPHER_4X1_KEY(VS3)
	VCIPHER_4X1_KEY(VS4)
	VCIPHER_4X1_KEY(VS5)
	VCIPHER_4X1_KEY(VS6)
	VCIPHER_4X1_KEY(VS7)
	VCIPHER_4X1_KEY(VS8)
	VCIPHER_4X1_KEY(VS9)
	// Check key length based on CR2
	// Move last key to V23 for use with later vcipherlast
	XXLOR VS10, VS10, V23
	BLT CR2, block64_last // size = 10
	VCIPHER_4X1_KEY(VS10) // Encrypt next 2 keys
	VCIPHER_4X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block64_last // size = 12
	VCIPHER_4X1_KEY(VS12) // Encrypt last 2 keys
	VCIPHER_4X1_KEY(VS13)
	XXLOR VS14, VS14, V23 // size = 14
block64_last:
	LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input
	// Do VCIPHERLAST on the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST4_XOR_INPUT
	// Store the results (4*16) and update BLK_OUT by 64.
	STORE_OUTPUT_BLOCK64(BLK_OUT)
	ADD $-64, IN_LEN // decrement input block length
	CMP IN_LEN, $0 // check for remaining length
	BEQ done
block16_loop:
	CMP IN_LEN, $16 // More input
	BLT final_block // If not, then handle partial block
	// Single encryption, no stitching
	GEN_VCIPHER_INPUT // Generate input value for single encryption
	VCIPHER_1X9_KEYS(V15) // Encrypt V15 value with 9 keys
	XXLOR VS10, VS10, V23 // Last key -> V23 for later vcipherlast
	// Key length based on CR2. (LT=10, EQ=12, GT=14)
	BLT CR2, block16_last // Finish for key size 10
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
	XXLOR VS12, VS12, V23 // Last key -> V23 for later vcipherlast
	BEQ CR2, block16_last // Finish for key size 12
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
	XXLOR VS14, VS14, V23 // Last key -> V23 for vcipherlast with key size 14
block16_last:
	P8_LXVB16X(BLK_INP, R0, V1) // Load input
	VCIPHERLAST V15, V23, V15 // Final round with the last key in V23
	XXLXOR V15, V1, V1 // XOR with input
	P8_STXVB16X(V1,R0,BLK_OUT) // Store final encryption value to output
	ADD $16, BLK_INP // Increment input pointer
	ADD $16, BLK_OUT // Increment output pointer
	ADD $-16, IN_LEN // Decrement input length
	BR block16_loop // Check for next
final_block:
	CMP IN_LEN, $0
	BEQ done
	GEN_VCIPHER_INPUT // Generate input value for partial encryption
	VCIPHER_1X9_KEYS(V15) // Encrypt V15 with 9 keys
	XXLOR VS10, VS10, V23 // Save possible last key
	BLT CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with next 2 keys
	XXLOR VS12, VS12, V23 // Save possible last key
	BEQ CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
	XXLOR VS14, VS14, V23 // Save last key
final_block_last:
	VCIPHERLAST V15, V23, V15 // Finish encryption
#ifdef GOPPC64_power10
	// set up length
	SLD $56, IN_LEN, R17
	LXVLL BLK_INP, R17, V25
	VXOR V25, V15, V25
	STXVLL V25, BLK_OUT, R17
#else
	ADD $32, R1, MASK_PTR
	MOVD $0, R16
	P8_STXVB16X(V15, MASK_PTR, R0)
	CMP IN_LEN, $8
	BLT next4
	MOVD 0(MASK_PTR), R14
	MOVD 0(BLK_INP), R15
	XOR R14, R15, R14
	MOVD R14, 0(BLK_OUT)
	ADD $8, R16
	ADD $-8, IN_LEN
next4:
	CMP IN_LEN, $4
	BLT next2
	MOVWZ (BLK_INP)(R16), R15
	MOVWZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVW R14, (R16)(BLK_OUT)
	ADD $4, R16
	ADD $-4, IN_LEN
next2:
	CMP IN_LEN, $2
	BLT next1
	MOVHZ (BLK_INP)(R16), R15
	MOVHZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVH R14, (R16)(BLK_OUT)
	ADD $2, R16
	ADD $-2, IN_LEN
next1:
	CMP IN_LEN, $1
	BLT done
	MOVBZ (MASK_PTR)(R16), R14
	MOVBZ (BLK_INP)(R16), R15
	XOR R14, R15, R14
	MOVB R14, (R16)(BLK_OUT)
#endif
done:
	// Save the updated counter value
	P8_STXVB16X(V30, COUNTER, R0)
	// Clear the keys
	XXLXOR VS0, VS0, VS0
	XXLXOR VS1, VS1, VS1
	XXLXOR VS2, VS2, VS2
	XXLXOR VS3, VS3, VS3
	XXLXOR VS4, VS4, VS4
	XXLXOR VS5, VS5, VS5
	XXLXOR VS6, VS6, VS6
	XXLXOR VS7, VS7, VS7
	XXLXOR VS8, VS8, VS8
	XXLXOR VS9, VS9, VS9
	XXLXOR VS10, VS10, VS10
	XXLXOR VS11, VS11, VS11
	XXLXOR VS12, VS12, VS12
	XXLXOR VS13, VS13, VS13
	XXLXOR VS14, VS14, VS14
	RET
