// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// This is a port of the s390x asm implementation
// to ppc64le.

// Some changes were needed due to differences in
// the Go opcodes and/or available instructions
// between s390x and ppc64le.

// 1. There were operand order differences in the
// VSUBUQM, VSUBCUQ, and VSEL instructions.

// 2. ppc64 does not have multiply high and low
// instructions like s390x, so those were implemented
// using macros to compute the equivalent values.

// 3. The LVX, STVX instructions on ppc64 require
// 16 byte alignment of the data. To avoid that
// requirement, data is loaded using LXVD2X and
// STXVD2X with VPERM to reorder bytes correctly.

// I have identified some areas where I believe
// changes would be needed to make this work for big
// endian; however, additional changes beyond those
// noted are most likely needed to make it work:
// - The string used with VPERM to swap the byte order
//   for loads and stores.
// - The constants that are loaded from CPOOL.
//
// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// External declarations for constants
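// (The 8 flag is RODATA, from textflag.h.)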
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160

// The following macros implement the ppc64le equivalents
// of the corresponding s390x instructions for vector
// multiply high, low, and multiply-and-add, since there
// are no exact equivalent instructions.
// The corresponding s390x instructions appear in the
// comments.
// Implementation for big endian would have to be
// investigated, I think it would be different.
//
//
// Vector multiply word
//
// VMLF x0, x1, out_low
// VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low
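
// A rough Go sketch of what VMULT computes for each 32-bit lane i,
// matching the s390x VMLF/VMLHF pair it replaces:
//
//	p := uint64(x1[i]) * uint64(x2[i])
//	out_low[i] = uint32(p)       // VMLF
//	out_hi[i]  = uint32(p >> 32) // VMLHF
//
// VMULEUW/VMULOUW form the full 64-bit products of the even/odd
// lanes; the VMRGEW/VMRGOW pair then regroups their high and low
// 32-bit halves.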

//
// Vector multiply add word
//
// VMALF x0, x1, y, out_low
// VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP2; \
	VMULOUW y, one, TMP1; \
	VMULEUW x1, x2, out_low; \
	VMULOUW x1, x2, out_hi; \
	VADDUDM TMP2, out_low, TMP2; \
	VADDUDM TMP1, out_hi, TMP1; \
	VMRGOW TMP2, TMP1, out_low; \
	VMRGEW TMP2, TMP1, out_hi
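
// Likewise, a rough per-lane sketch of VMULT_ADD (s390x VMALF/VMALHF):
//
//	p := uint64(x1[i])*uint64(x2[i]) + uint64(y[i])
//	out_low[i] = uint32(p)       // VMALF
//	out_hi[i]  = uint32(p >> 32) // VMALHF
//
// one must hold 1 in every word: multiplying y by it simply widens
// the 32-bit lanes of y into 64-bit addends for the even/odd products.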

#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

#define P1ptr R3
#define CPOOL R7

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define PL V30
#define PH V31

#define CAR1 V6
// func p256NegCond(val *p256Point, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	MOVD cond+8(FP), R6
	CMP $0, R6
	BC 12, 2, LR // just return if cond == 0

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry
	VSUBUQM PL, Y1L, T1L // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef CAR1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define SEL V12
#define ZER V13

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// the order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20

	// cond is at R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER

	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL
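	// In Go terms this function is, in constant time:
	//	if cond == 0 { *res = *b } else { *res = *a }
	// SEL is now all 1s where cond == 0, so each VSEL below
	// picks the b limbs there and the a limbs otherwise.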

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21
// func p256Select(res *p256Point, table *p256Table, idx int)
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB $7, SEL1, IDX // splat byte
	VSPLTISB $1, ONE // VREPIB $1, ONE
	VSPLTISB $1, SEL2 // VREPIB $1, SEL2
	MOVD $17, COUNT
	MOVD COUNT, CTR // set up ctr

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

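	// Scan all 17 table entries in constant time: SEL2 counts
	// 1..17 in every byte, and the compare/select pair below
	// accumulates only the entry whose position matches idx.
	// Rough Go equivalent:
	//	for i := 1; i <= 17; i++ { if i == idx { *res = table[i-1] } }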
loop_select:

	// LXVD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD $96, P1ptr
	BDNZ loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// The following functions all reverse the byte order.

//func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

//func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

//func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

//func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

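// p256InternalEndianSwap reverses the 32 bytes at in (R4) into
// res (R3): each MOVDBR load byte-reverses one doubleword and the
// stores are issued in the opposite order, so res[i] = in[31-i].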
TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for the byte-reversed (MOVDBR) loads
	MOVD $8, R9
	MOVD $16, R10
	MOVD $24, R14

	MOVDBR (R0)(R4), R5
	MOVDBR (R9)(R4), R6
	MOVDBR (R10)(R4), R7
	MOVDBR (R14)(R4), R8

	MOVD R8, 0(R3)
	MOVD R7, 8(R3)
	MOVD R6, 16(R3)
	MOVD R5, 24(R3)

	RET

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s
	MOVD $64, COUNT
	MOVD COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD $64, P1ptr // Next chunk
	BDNZ loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4

#define ZER V6
#define SEL1 V7
#define SEL2 V8
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V13
#define PH V14

// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2 // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

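	// Each of the four rounds below is one 64-bit limb of Montgomery
	// reduction. The low limb of P256 is all 1s, so -P256^(-1) mod 2^64
	// is 1 and no multiply is needed: the VPERM selections assemble the
	// required multiple of P256 directly from d0, and the VSLDOIs then
	// shift T right by one limb. (A sketch of the intent; the selection
	// constants encode P256's special form.)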
	// First round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// ---------------------------------------------------

	VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1
	VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// ---------------------------------------
// p256MulInternal
// V0-V3 V30,V31 - Not Modified
// V4-V15 V27-V29 - Volatile

#define CPOOL R7

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4 // Result
#define T1 V5 // Result
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG V6 // Overloaded with CAR1
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3 V9 // Overloaded with SEL2,SEL5
#define ADD4 V10 // Overloaded with SEL3,SEL6
#define RED1 V11 // Overloaded with CAR2
#define RED2 V12
#define RED3 V13 // Overloaded with SEL1
#define T2 V14
// Overloaded temporaries
#define ADD1 V4 // Overloaded with T0
#define ADD2 V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER V28 // Overloaded with TMP1
#define CAR1 V6 // Overloaded with YDIG
#define CAR2 V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR1
#define SEL5 V9 // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13 // Overloaded with RED3
#define TMP2 V27
#define ONE V29 // 1s splatted by word

/* *
 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
 * With you, SIMD be...
 *
 * +--------+--------+
 * +--------| RED2 | RED1 |
 * | +--------+--------+
 * | ---+--------+--------+
 * | +---- T2| T1 | T0 |--+
 * | | ---+--------+--------+ |
 * | | |
 * | | ======================= |
 * | | |
 * | | +--------+--------+<-+
 * | +-------| ADD2 | ADD1 |--|-----+
 * | | +--------+--------+ | |
 * | | +--------+--------+<---+ |
 * | | | ADD2H | ADD1H |--+ |
 * | | +--------+--------+ | |
 * | | +--------+--------+<-+ |
 * | | | ADD4 | ADD3 |--|-+ |
 * | | +--------+--------+ | | |
 * | | +--------+--------+<---+ | |
 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
 * | | +--------+--------+ | | V
 * | | ------------------------ | | +--------+
 * | | | | | RED3 | [d0 0 0 d0]
 * | | | | +--------+
 * | +---->+--------+--------+ | | |
 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
 * | +--------+--------+ | | |
 * +---->---+--------+--------+ | | |
 * T2| T1 | T0 |----+ | |
 * ---+--------+--------+ | | |
 * ---+--------+--------+<---+ | |
 * +--- T2| T1 | T0 |----------+
 * | ---+--------+--------+ | |
 * | +--------+--------+<-------------+
 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
 * | +--------+--------+ | | |
 * | +--------+<----------------------+
 * | | RED3 |--------------+ | [0 0 d1 d0]
 * | +--------+ | |
 * +--->+--------+--------+ | |
 * | T1 | T0 |--------+
 * +--------+--------+ | |
 * --------------------------- | |
 * | |
 * +--------+--------+<----+ |
 * | RED2 | RED1 | |
 * +--------+--------+ |
 * ---+--------+--------+<-------+
 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
 * ---+--------+--------+
 *
 * *My work of art of the 21st century @vpaprots
 *
 *
 * First group is special, doesn't get the two inputs:
 * +--------+--------+<-+
 * +-------| ADD2 | ADD1 |--|-----+
 * | +--------+--------+ | |
 * | +--------+--------+<---+ |
 * | | ADD2H | ADD1H |--+ |
 * | +--------+--------+ | |
 * | +--------+--------+<-+ |
 * | | ADD4 | ADD3 |--|-+ |
 * | +--------+--------+ | | |
 * | +--------+--------+<---+ | |
 * | | ADD4H | ADD3H |------|-+ |(+vzero)
 * | +--------+--------+ | | V
 * | ------------------------ | | +--------+
 * | | | | RED3 | [d0 0 0 d0]
 * | | | +--------+
 * +---->+--------+--------+ | | |
 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
 * +--------+--------+ | | |
 * ---+--------+--------+<---+ | |
 * +--- T2| T1 | T0 |----------+
 * | ---+--------+--------+ | |
 * | +--------+--------+<-------------+
 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
 * | +--------+--------+ | | |
 * | +--------+<----------------------+
 * | | RED3 |--------------+ | [0 0 d1 d0]
 * | +--------+ | |
 * +--->+--------+--------+ | |
 * | T1 | T0 |--------+
 * +--------+--------+ | |
 * --------------------------- | |
 * | |
 * +--------+--------+<----+ |
 * | RED2 | RED1 | |
 * +--------+--------+ |
 * ---+--------+--------+<-------+
 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
 * ---+--------+--------+
 *
 * Last 'group' needs RED2||RED1 shifted less
 */
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	// ---------------------------------------------------

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	// VMLF X0, YDIG, ADD1
	// VMLF X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW $2, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER // VZERO ZER
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1 // ADD2 Free // VSLDB

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y0, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
	// VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $3, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1 // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R21)(CPOOL), SEL5
	LXVD2X (R22)(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
	VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	// ---------------------------------------------------

	VSPLTISB $0, RED3 // VZERO RED3
	VSUBCUQ T0, P0, CAR1 // VSCBIQ
	VSUBUQM T0, P0, ADD1H // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

#undef TMP1
#undef TMP2

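// p256SubInternal computes T = (X - Y) mod P256 in constant time:
// subtract, expand the final borrow into an all-0s/all-1s mask (SEL1),
// and select the P-corrected sum only where the subtraction borrowed.
// Rough Go sketch over 256-bit values:
//	t := x - y
//	if x < y { t += p }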
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER \ // VZERO
	VSUBCUQ X0, Y0, CAR1 \
	VSUBUQM X0, Y0, T0 \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1 \
	VSUBUQM ZER, SEL1, SEL1 \ // VSQ
	\
	VADDCUQ T0, PL, CAR1 \ // VACCQ
	VADDUQM T0, PL, TT0 \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1 \ // VACQ
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1

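// p256AddInternal computes T = (X + Y) mod P256: add (carry out into
// T2), trial-subtract P, and use the combined carry/borrow to select
// either the raw or the reduced sum. Rough Go sketch:
//	t := x + y
//	if carried || t >= p { t -= p }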
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ X0, Y0, CAR1 \
	VADDUQM X0, Y0, T0 \
	VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1 \
	\
	VSPLTISB $0, ZER \
	VSUBCUQ T0, PL, CAR1 \ // VSCBIQ
	VSUBUQM T0, PL, TT0 \
	VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1

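// p256HalfInternal computes T = X/2 mod P256. The first VSUBEUQM
// expands the low bit of X into a mask; if X is odd, P is added
// (exact, since P is odd) and the 257-bit result T2||T1||T0 is then
// shifted right one bit. Rough Go sketch:
//	if x&1 == 1 { x += p } // may carry into bit 256
//	return x >> 1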
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	\
	VADDCUQ X0, PL, CAR1 \
	VADDUQM X0, PL, T0 \
	VADDECUQ X1, PH, CAR1, T2 \
	VADDEUQM X1, PH, CAR1, T1 \
	\
	VSEL T0, X0, SEL1, T0 \
	VSEL T1, X1, SEL1, T1 \
	VSEL T2, ZER, SEL1, T2 \
	\
	VSLDOI $15, T2, ZER, TT1 \
	VSLDOI $15, T1, ZER, TT0 \
	VSPLTISB $1, SEL1 \
	VSR T0, SEL1, T0 \ // VSRL
	VSR T1, SEL1, T1 \
	VSPLTISB $7, SEL1 \ // VREPIB
	VSL TT0, SEL1, TT0 \
	VSL TT1, SEL1, TT1 \
	VOR T0, TT0, T0 \
	VOR T1, TT1, T1

#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define TEMP R8
#define N R9

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
// func p256Mul(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

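	// Square n times, feeding each result back in as the next input.
	// n is reloaded from and saved back to the frame around the CALL
	// because p256MulInternal may clobber the counter register.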
sqrLoop:
	// Sqr uses same value for both

	VOR X0, X0, Y0
	VOR X1, X1, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD n+16(FP), N
	ADD $-1, N
	CMP $0, N
	BEQ done
	MOVD N, n+16(FP) // Save counter to avoid clobber
	VOR T0, T0, X0
	VOR T1, T1, X1
	BR sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4

 * Three operand formulas, but with MulInternal X,Y used to store temps
X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4

*/
//
// V27 is clobbered by p256MulInternal so must be
// saved in a temp.
//
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP)

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSUBCUQ PL, Y2L, CAR1
	VSUBUQM PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

	/* *
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
	VOR T0, T0, X0
	VOR T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), Y0 // X2H
	LXVD2X (R16)(P2ptr), Y1 // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T1L
	VOR T1, T1, T1H

	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR Y2L, Y2L, Y0
	VOR Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
	VOR Y0, Y0, X0
	VOR Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, X0
	VOR T1, T1, X1

	// X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T4L
	VOR T1, T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0 // X1H
	LXVD2X (R16)(P1ptr), Y1 // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
	VOR T4L, T4L, X0
	VOR T4H, T4H, X1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0 // Y1H
	LXVD2X (R18)(P1ptr), Y1 // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	//	copy(P3.x[:], X1)
	//	copy(P3.y[:], Y1)
	//	copy(P3.z[:], Z1)
	// }

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	MOVD $112, R26 // Get offset of sel+32(FP)
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	LXVD2X (R23)(CPOOL), Z2L
	LXVD2X (R24)(CPOOL), Z2H

	MOVD $120, R26 // Get the value from zero+40(FP)
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define P3ptr R3
#define P1ptr R4
#define CPOOL R7

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
/*
 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
 * B = 2Y₁
 * Z₃ = B×Z₁
 * C = B²
 * D = C×X₁
 * X₃ = A²-2D
 * Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 * T1 = Z1²
 * T2 = X1-T1
 * T1 = X1+T1
 * T2 = T2*T1
 * T2 = 3*T2
 * Y3 = 2*Y1
 * Z3 = Y3*Z1
 * Y3 = Y3²
 * T3 = Y3*X1
 * Y3 = Y3²
 * Y3 = half*Y3
 * X3 = T2²
 * T1 = 2*T3
 * X3 = X3-T1
 * T1 = T3-X3
 * T1 = T1*T2
 * Y3 = T1-Y3
 */
// p256PointDoubleAsm(res, in *p256Point)
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1L, X1L, $2, X1L
	XXPERMDI X1H, X1H, $2, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	XXPERMDI Y1H, Y1H, $2, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0
	LXVD2X (R20)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	CALL p256MulInternal<>(SB)

	// Leave T0, T1 as is.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VOR T0, T0, X0
	VOR T1, T1, X1
	LXVD2X (R0)(P1ptr), Y0
	LXVD2X (R16)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	XXPERMDI X3L, X3L, $2, TT0
	XXPERMDI X3H, X3H, $2, TT1
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7
#define TRUE R14
#define RES1 R9
#define RES2 R10

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H = X2*T1
 * H = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R = Z1*T1
 * R = Y2*R
 * R = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg

// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
// X- ; Y=T ; MUL; R=T // R = Z1*T1
// X=X2; Y- ; MUL; H=T // H = X2*T1
// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
// SUB(H<H-T) // H = H-U1
// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
// X=Y2; Y=R ; MUL; T- // R = Y2*R
// SUB(R<T-S1) // R = R-S1
// X=H ; Y=H ; MUL; T- // T1 = H*H
// X- ; Y=T ; MUL; T2=T // T2 = H*T1
// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
// X=R ; Y=R ; MUL; T- // X3 = R*R
// SUB(T<T-T2) // X3 = X3-T2
// ADD(X<U1+U1) // T1 = 2*U1
// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
// SUB(Y<U1-T) // Y3 = U1-X3
// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
// X=S1; Y=T2; MUL; T- // T2 = S1*T2
// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
*/
// p256PointAddAsm(res, in1, in2 *p256Point) int
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, RL // SAVE: RL
	VOR T1, T1, RH // SAVE: RH

	STXVD2X RH, (R1)(R17) // V27 has to be saved

	// X=X2; Y- ; MUL; H=T // H = X2*T1
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0 // X2L
	LXVD2X (R16)(P2ptr), X1 // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, HL // SAVE: HL
	VOR T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0 // Z2L
	LXVD2X (R20)(P2ptr), X1 // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L // SAVE: S1L
	VOR T1, T1, S1H // SAVE: S1H

	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0 // X1L
	LXVD2X (R16)(P1ptr), X1 // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L // SAVE: U1L
	VOR T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T) // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR HL, PL, T1L // SAVE: T1L
	VXOR HH, PH, T1H // SAVE: T1H
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD RES1, ret+24(FP)
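	// At this point RES1 is 1 when the x coordinates match
	// (H == 0 mod P, i.e. X1*Z2^2 == X2*Z1^2); it is ANDed below with
	// the matching check on R (the y difference), so the final ret
	// flags in1 == in2, where this addition formula degenerates and
	// the caller must use the doubling path instead.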

	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0 // Z2L
	LXVD2X (R20)(P2ptr), Y1 // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR T0, T0, X0
	VOR T1, T1, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0
	LXVD2X (R18)(P1ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR S1L, S1L, Y0
	VOR S1H, S1H, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L
	VOR T1, T1, S1H

	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	MOVD in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0
	LXVD2X (R18)(P2ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR RL, RL, Y0

	// VOR RH, RH, Y1 - RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1) // R = R-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR RL, PL, T1L // SAVE: T1L
	VXOR RH, PH, T1H // SAVE: T1H
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T- // T1 = H*H
	VOR HL, HL, X0
	VOR HH, HH, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	VOR U1L, U1L, X0
	VOR U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=R ; Y=R ; MUL; T- // X3 = R*R
	VOR RL, RL, X0

	// VOR RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1
	VOR X1, X1, Y1

	// VOR RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2) // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1) // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T) // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR RH, RH, X1
	LXVD2X (R1)(R17), X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	VOR S1L, S1L, X0
	VOR S1H, S1H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET