// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are
// due to the calling conventions and initialization of constants.

//go:build gc && !purego && (ppc64 || ppc64le)

#include "textflag.h"

#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
#define TMP R15

#define CONSTBASE R16
#define BLOCKS R17

// for VPERMXOR
#define MASK R18

DATA consts<>+0x00(SB)/4, $0x61707865
DATA consts<>+0x04(SB)/4, $0x3320646e
DATA consts<>+0x08(SB)/4, $0x79622d32
DATA consts<>+0x0c(SB)/4, $0x6b206574
DATA consts<>+0x10(SB)/4, $0x00000001
DATA consts<>+0x14(SB)/4, $0x00000000
DATA consts<>+0x18(SB)/4, $0x00000000
DATA consts<>+0x1c(SB)/4, $0x00000000
DATA consts<>+0x20(SB)/4, $0x00000004
DATA consts<>+0x24(SB)/4, $0x00000000
DATA consts<>+0x28(SB)/4, $0x00000000
DATA consts<>+0x2c(SB)/4, $0x00000000
DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
DATA consts<>+0x34(SB)/4, $0x0a0b0809
DATA consts<>+0x38(SB)/4, $0x06070405
DATA consts<>+0x3c(SB)/4, $0x02030001
DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
DATA consts<>+0x44(SB)/4, $0x090a0b08
DATA consts<>+0x48(SB)/4, $0x05060704
DATA consts<>+0x4c(SB)/4, $0x01020300
DATA consts<>+0x50(SB)/4, $0x61707865
DATA consts<>+0x54(SB)/4, $0x61707865
DATA consts<>+0x58(SB)/4, $0x61707865
DATA consts<>+0x5c(SB)/4, $0x61707865
DATA consts<>+0x60(SB)/4, $0x3320646e
DATA consts<>+0x64(SB)/4, $0x3320646e
DATA consts<>+0x68(SB)/4, $0x3320646e
DATA consts<>+0x6c(SB)/4, $0x3320646e
DATA consts<>+0x70(SB)/4, $0x79622d32
DATA consts<>+0x74(SB)/4, $0x79622d32
DATA consts<>+0x78(SB)/4, $0x79622d32
DATA consts<>+0x7c(SB)/4, $0x79622d32
DATA consts<>+0x80(SB)/4, $0x6b206574
DATA consts<>+0x84(SB)/4, $0x6b206574
DATA consts<>+0x88(SB)/4, $0x6b206574
DATA consts<>+0x8c(SB)/4, $0x6b206574
DATA consts<>+0x90(SB)/4, $0x00000000
DATA consts<>+0x94(SB)/4, $0x00000001
DATA consts<>+0x98(SB)/4, $0x00000002
DATA consts<>+0x9c(SB)/4, $0x00000003
DATA consts<>+0xa0(SB)/4, $0x11223300
DATA consts<>+0xa4(SB)/4, $0x55667744
DATA consts<>+0xa8(SB)/4, $0x99aabb88
DATA consts<>+0xac(SB)/4, $0xddeeffcc
DATA consts<>+0xb0(SB)/4, $0x22330011
DATA consts<>+0xb4(SB)/4, $0x66774455
DATA consts<>+0xb8(SB)/4, $0xaabb8899
DATA consts<>+0xbc(SB)/4, $0xeeffccdd
GLOBL consts<>(SB), RODATA, $0xc0
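
// The consts<> table above is laid out as follows (the notes on the
// unused entries are inferred, not documented upstream):
//   0x00: the four ChaCha sigma words, "expand 32-byte k"
//   0x10, 0x20: counter increments of one and four blocks; these appear
//     to be retained from the original CRYPTOGAMS code and are not
//     loaded on this code path
//   0x30, 0x40: VPERM-style rotate masks, likewise apparently unused
//     here since the rotates are done with VPERMXOR and VRLW below
//   0x50-0x8f: each sigma word splatted across a full vector, the first
//     state row of the four-way kernel
//   0x90: per-lane block counter offsets 0..3
//   0xa0, 0xb0: VPERMXOR masks realizing rotate-left by 8 and by 16 as
//     byte permutations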

#ifdef GOARCH_ppc64
#define BE_XXBRW_INIT() \
	LVSL (R0)(R0), V24 \
	VSPLTISB $3, V25 \
	VXOR V24, V25, V24 \

#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
#else
#define BE_XXBRW_INIT()
#define BE_XXBRW(vr)
#endif
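
// On big-endian ppc64, LXVW4X/STXVW4X operate on big-endian word order,
// while the ChaCha keystream is defined in little-endian byte order. The
// macros above compensate: BE_XXBRW_INIT builds a VPERM control in V24
// that reverses the bytes within each 32-bit word (LVSL with a zero
// address yields the indices 0..15, and XORing each index with 3 swaps
// the bytes inside every word), and BE_XXBRW applies it to one register.
// Both macros compile to nothing on ppc64le.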

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
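//
// Arguments, loaded below from the Go ABI0 frame:
//   OUT, INP: destination and source byte pointers
//   LEN:      length in bytes; whole 64-byte blocks are streamed in the
//             main loop, any remainder is handled at tail_vsx
//   KEY:      pointer to the eight 32-bit key words
//   CNT:      pointer to the counter; the full 16-byte counter||nonce
//             quadword is loaded into V19, and the counter word is
//             advanced by the number of full blocks before returning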
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// Load the sigma constants into V16 (VS48)
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	BE_XXBRW_INIT()

	// Load the block counter offsets 0..3 into V28 (VS60)
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// splat counter word from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26
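
	// Four 64-byte blocks are produced per outer-loop iteration. V26
	// now holds the per-lane block counters: the counter word splatted
	// from V19 plus the lane offsets 0..3 from V28. The VSLDOI pair
	// above cleared the counter slot in V19, leaving just the three
	// nonce words.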

	MOVD $10, R14
	MOVD R14, CTR
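
	// Each pass through loop_vsx below performs one column round and
	// one diagonal round, so ten iterations give ChaCha20's twenty
	// rounds.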
	PCALIGN $16
loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the per-lane counters V26 -> V12
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
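
	// Reference quarter-round (RFC 8439), applied here to four whole
	// columns at a time, one 32-bit word per vector lane:
	//	a += b; d ^= a; d <<<= 16
	//	c += d; b ^= c; b <<<= 12
	//	a += b; d ^= a; d <<<= 8
	//	c += d; b ^= c; b <<<= 7
	// The rotates by 16 and 8 are byte permutations, done with VPERMXOR
	// and the masks in V21 and V20; the rotates by 12 and 7 use VRLW
	// with the splatted shift counts in V28 and V30.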
	PCALIGN $16
loop_vsx:
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

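	// Second half of the double round: the same quarter-round applied
	// along the diagonals, e.g. (V0, V5, V10, V15).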
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ loop_vsx

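	// Rounds done. Add the per-lane counters into the counter row, then
	// transpose: the VMRGEW/VMRGOW word merges and XXPERMDI doubleword
	// permutes regroup the sixteen lane-interleaved vectors into four
	// contiguous 64-byte keystream blocks (block 0 ends up in
	// V0/V4/V8/V12, block 1 in V1/V5/V9/V13, and so on). Along the way,
	// V26 is advanced by four for the next outer iteration.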
	VADDUWM V12, V26, V12

	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

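	// Feed-forward for block 0: add the initial state (V16-V19; the
	// counter contribution is already in V12), then byte-swap the
	// keystream words on big endian.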
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
	MOVWZ (CNT), R14
	ADD BLOCKS, R14
	MOVWZ R14, (CNT)
	RET

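	// Fewer than 64 bytes remain. Spill the current keystream block
	// (V0, V4, V8, V12) to the scratch space at R1+32, then XOR it into
	// the output one byte at a time.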
tail_vsx:
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
	PCALIGN $16
looptail_vsx:
	// Copy the result to OUT byte by byte.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BDNZ looptail_vsx

	// Scrub the keystream copy on the stack by overwriting it with the
	// public constants in V16.
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx
