1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8
// This is a port of the s390x asm implementation
// to ppc64le.
11
12 // Some changes were needed due to differences in
13 // the Go opcodes and/or available instructions
14 // between s390x and ppc64le.
15
16 // 1. There were operand order differences in the
17 // VSUBUQM, VSUBCUQ, and VSEL instructions.
18
19 // 2. ppc64 does not have a multiply high and low
20 // like s390x, so those were implemented using
21 // macros to compute the equivalent values.
22
23 // 3. The LVX, STVX instructions on ppc64 require
24 // 16 byte alignment of the data. To avoid that
25 // requirement, data is loaded using LXVD2X and
26 // STXVD2X with VPERM to reorder bytes correctly.
27
28 // I have identified some areas where I believe
29 // changes would be needed to make this work for big
30 // endian; however additional changes beyond what I
31 // have noted are most likely needed to make it work.
32 // - The string used with VPERM to swap the byte order
33 // for loads and stores.
34 // - The constants that are loaded from CPOOL.
35 //
36
37 // The following constants are defined in an order
38 // that is correct for use with LXVD2X/STXVD2X
39 // on little endian.
// P-256 prime, reduction constants, and VPERM byte-selection masks.
// Doubleword order below is chosen so LXVD2X/STXVD2X (little endian)
// load them correctly without extra swaps.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// External declarations for constants
// (flag 8 = RODATA, declared in textflag.h)
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160
75
76 // The following macros are used to implement the ppc64le
77 // equivalent function from the corresponding s390x
78 // instruction for vector multiply high, low, and add,
79 // since there aren't exact equivalent instructions.
80 // The corresponding s390x instructions appear in the
81 // comments.
82 // Implementation for big endian would have to be
83 // investigated, I think it would be different.
84 //
85 //
86 // Vector multiply word
87 //
88 // VMLF x0, x1, out_low
89 // VMLHF x0, x1, out_hi
// VMULT multiplies corresponding unsigned words of x1 and x2.
// The even/odd 32x32->64 products are recombined with merge-even/
// merge-odd so out_low holds the low 32 bits and out_hi the high
// 32 bits of each product (emulating s390x VMLF/VMLHF).
// Clobbers TMP1, TMP2.
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low
95
96 //
97 // Vector multiply add word
98 //
99 // VMALF x0, x1, y, out_low
100 // VMALHF x0, x1, y, out_hi
// VMULT_ADD computes x1*x2 + y per word (emulating s390x
// VMALF/VMALHF). y is widened to doubleword lanes by multiplying
// it with `one` (a vector of word 1s), then added to the widened
// even/odd products before the merge back to word lanes.
// Clobbers TMP1, TMP2.
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP2; \
	VMULOUW y, one, TMP1; \
	VMULEUW x1, x2, out_low; \
	VMULOUW x1, x2, out_hi; \
	VADDUDM TMP2, out_low, TMP2; \
	VADDUDM TMP1, out_hi, TMP1; \
	VMRGOW TMP2, TMP1, out_low; \
	VMRGEW TMP2, TMP1, out_hi
110
111 #define res_ptr R3
112 #define a_ptr R4
113
114 #undef res_ptr
115 #undef a_ptr
116
117 #define P1ptr R3
118 #define CPOOL R7
119
120 #define Y1L V0
121 #define Y1H V1
122 #define T1L V2
123 #define T1H V3
124
125 #define PL V30
126 #define PH V31
127
128 #define CAR1 V6
129
130 #define SEL V8
131 #define ZER V9
132
// p256NegCond conditionally negates the 256-bit value at *val
// modulo P256: if cond != 0 it stores P256 - val, otherwise it
// stores val back unchanged. Both candidates are always computed
// and combined with VSEL, so the code is branch-free (constant
// time with respect to cond).
// func p256NegCond(val *p256Point, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	// Copy cond into SEL (cond is R1 + 8 (cond offset) + 32)
	MOVD $40, R17
	LXVDSX (R1)(R17), SEL
	// Zeroize ZER
	VSPLTISB $0, ZER
	// SEL controls whether to return the original value (Y1H/Y1L)
	// or the negated value (T1H/T1L).
	// After VCMPEQUD, SEL is all 1s where cond == 0.
	VCMPEQUD SEL, ZER, SEL

	// PL/PH <- the prime P256 from the constant pool.
	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	// Swap doublewords into true little endian order for arithmetic.
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	// T1H||T1L = P256 - (Y1H||Y1L), chaining the 128-bit borrow.
	VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry
	VSUBUQM PL, Y1L, T1L // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	VSEL T1H, Y1H, SEL, T1H
	VSEL T1L, Y1L, SEL, T1L

	// Swap back to the memory doubleword order before storing.
	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET
171
172 #undef P1ptr
173 #undef CPOOL
174 #undef Y1L
175 #undef Y1H
176 #undef T1L
177 #undef T1H
178 #undef PL
179 #undef PH
180 #undef CAR1
181 #undef SEL
182 #undef ZER
183
184 #define P3ptr R3
185 #define P1ptr R4
186 #define P2ptr R5
187
188 #define X1L V0
189 #define X1H V1
190 #define Y1L V2
191 #define Y1H V3
192 #define Z1L V4
193 #define Z1H V5
194 #define X2L V6
195 #define X2H V7
196 #define Y2L V8
197 #define Y2H V9
198 #define Z2L V10
199 #define Z2H V11
200 #define SEL V12
201 #define ZER V13
202
// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// p256MovCond copies a whole p256Point (6 x 16 bytes) into res:
// res = b when cond == 0, res = a otherwise. Both points are
// always read and merged with VSEL, so the memory access pattern
// is independent of cond (constant time).
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	// Offsets of the six 16-byte halves of a p256Point (x, y, z).
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20
	// cond is R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b
	// (all 1s where cond == 0, selecting b below).
	VCMPEQUD SEL, ZER, SEL

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET
254
255 #undef P3ptr
256 #undef P1ptr
257 #undef P2ptr
258 #undef X1L
259 #undef X1H
260 #undef Y1L
261 #undef Y1H
262 #undef Z1L
263 #undef Z1H
264 #undef X2L
265 #undef X2H
266 #undef Y2L
267 #undef Y2H
268 #undef Z2L
269 #undef Z2H
270 #undef SEL
271 #undef ZER
272
273 #define P3ptr R3
274 #define P1ptr R4
275 #define COUNT R5
276
277 #define X1L V0
278 #define X1H V1
279 #define Y1L V2
280 #define Y1H V3
281 #define Z1L V4
282 #define Z1H V5
283 #define X2L V6
284 #define X2H V7
285 #define Y2L V8
286 #define Y2H V9
287 #define Z2L V10
288 #define Z2H V11
289
290 #define ONE V18
291 #define IDX V19
292 #define SEL1 V20
293 #define SEL2 V21
// p256Select copies into the output point the table entry whose
// 1-based position equals idx. All 16 entries are scanned and
// merged with an equality mask, so the memory access pattern and
// timing are independent of idx (constant time).
// func p256Select(point *p256Point, table *p256Table, idx int)
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	// idx is at R1 + 16 (idx offset) + 32; splat its low byte.
	LXVDSX (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB $7, SEL1, IDX // splat byte
	VSPLTISB $1, ONE // VREPIB $1, ONE
	VSPLTISB $1, SEL2 // VREPIB $1, SEL2 (running 1-based counter)
	MOVD $16, COUNT // len(p256Table)
	MOVD COUNT, CTR // set up ctr

	// Start with a zero accumulator for every limb.
	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD $96, P1ptr // advance to next 96-byte table entry
	BDNZ loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET
358
359 #undef P3ptr
360 #undef P1ptr
361 #undef COUNT
362 #undef X1L
363 #undef X1H
364 #undef Y1L
365 #undef Y1H
366 #undef Z1L
367 #undef Z1H
368 #undef X2L
369 #undef X2H
370 #undef Y2L
371 #undef Y2H
372 #undef Z2L
373 #undef Z2H
374 #undef ONE
375 #undef IDX
376 #undef SEL1
377 #undef SEL2
378
379 #define P3ptr R3
380 #define P1ptr R4
381 #define COUNT R5
382
383 #define X1L V0
384 #define X1H V1
385 #define Y1L V2
386 #define Y1H V3
387 #define Z1L V4
388 #define Z1H V5
389 #define X2L V6
390 #define X2H V7
391 #define Y2L V8
392 #define Y2H V9
393 #define Z2L V10
394 #define Z2H V11
395
396 #define ONE V18
397 #define IDX V19
398 #define SEL1 V20
399 #define SEL2 V21
400
// p256SelectAffine copies into res the affine table entry (x, y only,
// 4 x 16 bytes) whose 1-based position equals idx. All 32 entries
// are scanned and merged with an equality mask, so the access
// pattern and timing are independent of idx (constant time).
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	// idx is at R1 + 16 (idx offset) + 32; splat its low byte.
	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s (running 1-based counter)
	MOVD $32, COUNT // len(p256AffineTable)
	MOVD COUNT, CTR // loop count

	// Start with a zero accumulator for every limb.
	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD $64, P1ptr // Next chunk
	BDNZ loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET
444
445 #undef P3ptr
446 #undef P1ptr
447 #undef COUNT
448 #undef X1L
449 #undef X1H
450 #undef Y1L
451 #undef Y1H
452 #undef Z1L
453 #undef Z1H
454 #undef X2L
455 #undef X2H
456 #undef Y2L
457 #undef Y2H
458 #undef Z2L
459 #undef Z2H
460 #undef ONE
461 #undef IDX
462 #undef SEL1
463 #undef SEL2
464
465 #define res_ptr R3
466 #define x_ptr R4
467 #define CPOOL R7
468
469 #define T0 V0
470 #define T1 V1
471 #define T2 V2
472 #define TT0 V3
473 #define TT1 V4
474
475 #define ZER V6
476 #define SEL1 V7
477 #define SEL2 V8
478 #define CAR1 V9
479 #define CAR2 V10
480 #define RED1 V11
481 #define RED2 V12
482 #define PL V13
483 #define PH V14
484
// p256FromMont converts in out of the Montgomery domain:
// res = in * 2^-256 mod P256. Four identical reduction rounds each
// fold 64 bits (the VSLDOI $8 shift), followed by a final
// conditional subtraction of P selected branch-free with VSEL.
// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2 // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	// Shift T2||T1||T0 right by 64 bits.
	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// ---------------------------------------------------

	// Final conditional subtraction of P: compute T - P; T2 becomes
	// the underflow indicator used as the VSEL mask below.
	VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1
	VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET
590
591 #undef res_ptr
592 #undef x_ptr
593 #undef CPOOL
594 #undef T0
595 #undef T1
596 #undef T2
597 #undef TT0
598 #undef TT1
599 #undef ZER
600 #undef SEL1
601 #undef SEL2
602 #undef CAR1
603 #undef CAR2
604 #undef RED1
605 #undef RED2
606 #undef PL
607 #undef PH
608
609 // ---------------------------------------
610 // p256MulInternal
611 // V0-V3 V30,V31 - Not Modified
612 // V4-V15 V27-V29 - Volatile
613
614 #define CPOOL R7
615
616 // Parameters
617 #define X0 V0 // Not modified
618 #define X1 V1 // Not modified
619 #define Y0 V2 // Not modified
620 #define Y1 V3 // Not modified
621 #define T0 V4 // Result
622 #define T1 V5 // Result
623 #define P0 V30 // Not modified
624 #define P1 V31 // Not modified
625
626 // Temporaries: lots of reused vector regs
627 #define YDIG V6 // Overloaded with CAR2
628 #define ADD1H V7 // Overloaded with ADD3H
629 #define ADD2H V8 // Overloaded with ADD4H
630 #define ADD3 V9 // Overloaded with SEL2,SEL5
631 #define ADD4 V10 // Overloaded with SEL3,SEL6
632 #define RED1 V11 // Overloaded with CAR2
633 #define RED2 V12
634 #define RED3 V13 // Overloaded with SEL1
635 #define T2 V14
636 // Overloaded temporaries
637 #define ADD1 V4 // Overloaded with T0
638 #define ADD2 V5 // Overloaded with T1
639 #define ADD3H V7 // Overloaded with ADD1H
640 #define ADD4H V8 // Overloaded with ADD2H
641 #define ZER V28 // Overloaded with TMP1
642 #define CAR1 V6 // Overloaded with YDIG
643 #define CAR2 V11 // Overloaded with RED1
644 // Constant Selects
645 #define SEL1 V13 // Overloaded with RED3
646 #define SEL2 V9 // Overloaded with ADD3,SEL5
647 #define SEL3 V10 // Overloaded with ADD4,SEL6
648 #define SEL4 V6 // Overloaded with YDIG,CAR1
649 #define SEL5 V9 // Overloaded with ADD3,SEL2
650 #define SEL6 V10 // Overloaded with ADD4,SEL3
651
652 // TMP1, TMP2 used in
653 // VMULT macros
654 #define TMP1 V13 // Overloaded with RED3
655 #define TMP2 V27
656 #define ONE V29 // 1s splatted by word
657
658 /* *
659 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
660 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
661 * With you, SIMD be...
662 *
663 * +--------+--------+
664 * +--------| RED2 | RED1 |
665 * | +--------+--------+
666 * | ---+--------+--------+
667 * | +---- T2| T1 | T0 |--+
668 * | | ---+--------+--------+ |
669 * | | |
670 * | | ======================= |
671 * | | |
672 * | | +--------+--------+<-+
673 * | +-------| ADD2 | ADD1 |--|-----+
674 * | | +--------+--------+ | |
675 * | | +--------+--------+<---+ |
676 * | | | ADD2H | ADD1H |--+ |
677 * | | +--------+--------+ | |
678 * | | +--------+--------+<-+ |
679 * | | | ADD4 | ADD3 |--|-+ |
680 * | | +--------+--------+ | | |
681 * | | +--------+--------+<---+ | |
682 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
683 * | | +--------+--------+ | | V
684 * | | ------------------------ | | +--------+
685 * | | | | | RED3 | [d0 0 0 d0]
686 * | | | | +--------+
687 * | +---->+--------+--------+ | | |
688 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
689 * | +--------+--------+ | | |
690 * +---->---+--------+--------+ | | |
691 * T2| T1 | T0 |----+ | |
692 * ---+--------+--------+ | | |
693 * ---+--------+--------+<---+ | |
694 * +--- T2| T1 | T0 |----------+
695 * | ---+--------+--------+ | |
696 * | +--------+--------+<-------------+
697 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
698 * | +--------+--------+ | | |
699 * | +--------+<----------------------+
700 * | | RED3 |--------------+ | [0 0 d1 d0]
701 * | +--------+ | |
702 * +--->+--------+--------+ | |
703 * | T1 | T0 |--------+
704 * +--------+--------+ | |
705 * --------------------------- | |
706 * | |
707 * +--------+--------+<----+ |
708 * | RED2 | RED1 | |
709 * +--------+--------+ |
710 * ---+--------+--------+<-------+
711 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
712 * ---+--------+--------+
713 *
714 * *Mi obra de arte de siglo XXI @vpaprots
715 *
716 *
717 * First group is special, doesn't get the two inputs:
718 * +--------+--------+<-+
719 * +-------| ADD2 | ADD1 |--|-----+
720 * | +--------+--------+ | |
721 * | +--------+--------+<---+ |
722 * | | ADD2H | ADD1H |--+ |
723 * | +--------+--------+ | |
724 * | +--------+--------+<-+ |
725 * | | ADD4 | ADD3 |--|-+ |
726 * | +--------+--------+ | | |
727 * | +--------+--------+<---+ | |
728 * | | ADD4H | ADD3H |------|-+ |(+vzero)
729 * | +--------+--------+ | | V
730 * | ------------------------ | | +--------+
731 * | | | | RED3 | [d0 0 0 d0]
732 * | | | +--------+
733 * +---->+--------+--------+ | | |
734 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
735 * +--------+--------+ | | |
736 * ---+--------+--------+<---+ | |
737 * +--- T2| T1 | T0 |----------+
738 * | ---+--------+--------+ | |
739 * | +--------+--------+<-------------+
740 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
741 * | +--------+--------+ | | |
742 * | +--------+<----------------------+
743 * | | RED3 |--------------+ | [0 0 d1 d0]
744 * | +--------+ | |
745 * +--->+--------+--------+ | |
746 * | T1 | T0 |--------+
747 * +--------+--------+ | |
748 * --------------------------- | |
749 * | |
750 * +--------+--------+<----+ |
751 * | RED2 | RED1 | |
752 * +--------+--------+ |
753 * ---+--------+--------+<-------+
754 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
755 * ---+--------+--------+
756 *
757 * Last 'group' needs to RED2||RED1 shifted less
758 */
// p256MulInternal computes the Montgomery product
// T1||T0 = (X1||X0) * (Y1||Y0) * 2^-256 mod P256.
// Inputs X0/X1, Y0/Y1 and the prime P0/P1 are not modified; the
// result is returned in T0/T1. CPOOL must already point at
// p256mul<> (set by the caller, see p256Mul/p256Sqr).
// Fix: the third round previously loaded SEL1 twice from
// (R17)(CPOOL) with no intervening write; the redundant second
// load has been removed (rounds 1, 2 and 4 load it once).
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	// ---------------------------------------------------

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	// VMLF X0, YDIG, ADD1
	// VMLF X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW $2, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER // VZERO ZER
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1 // ADD2 Free // VSLDB

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y0, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
	// VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $3, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER // VZERO ZER
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------

	VSPLTW $1, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1 // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	// Last 'group': RED2||RED1 shifted less (SEL5/SEL6 masks).
	LXVD2X (R21)(CPOOL), SEL5
	LXVD2X (R22)(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
	VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	// ---------------------------------------------------

	// Final conditional subtraction of P, selected branch-free.
	VSPLTISB $0, RED3 // VZERO RED3
	VSUBCUQ T0, P0, CAR1 // VSCBIQ
	VSUBUQM T0, P0, ADD1H // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET
999
1000 #undef CPOOL
1001
1002 #undef X0
1003 #undef X1
1004 #undef Y0
1005 #undef Y1
1006 #undef T0
1007 #undef T1
1008 #undef P0
1009 #undef P1
1010
1011 #undef SEL1
1012 #undef SEL2
1013 #undef SEL3
1014 #undef SEL4
1015 #undef SEL5
1016 #undef SEL6
1017
1018 #undef YDIG
1019 #undef ADD1H
1020 #undef ADD2H
1021 #undef ADD3
1022 #undef ADD4
1023 #undef RED1
1024 #undef RED2
1025 #undef RED3
1026 #undef T2
1027 #undef ADD1
1028 #undef ADD2
1029 #undef ADD3H
1030 #undef ADD4H
1031 #undef ZER
1032 #undef CAR1
1033 #undef CAR2
1034
1035 #undef TMP1
1036 #undef TMP2
1037
// p256SubInternal computes (T1||T0) = (X1||X0) - (Y1||Y0) mod P256.
// SEL1 captures the 256-bit borrow (0 - borrow gives an all-ones
// mask on underflow); the candidate X-Y+P is then merged in with
// VSEL, keeping the macro branch-free (constant time).
// Clobbers ZER, CAR1, SEL1, TT0, TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER \ // VZERO
	VSUBCUQ X0, Y0, CAR1 \
	VSUBUQM X0, Y0, T0 \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1 \
	VSUBUQM ZER, SEL1, SEL1 \ // VSQ
	\
	VADDCUQ T0, PL, CAR1 \ // VACCQ
	VADDUQM T0, PL, TT0 \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1 \ // VACQ
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1 \

// p256AddInternal computes (T1||T0) = (X1||X0) + (Y1||Y0) mod P256.
// The sum (with its 257th bit in T2) has P subtracted; the borrow
// out of that trial subtraction becomes the SEL1 mask choosing
// between sum and sum-P, keeping the macro branch-free.
// Clobbers ZER, CAR1, CAR2, T2, SEL1, TT0, TT1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ X0, Y0, CAR1 \
	VADDUQM X0, Y0, T0 \
	VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1 \
	\
	VSPLTISB $0, ZER \
	VSUBCUQ T0, PL, CAR1 \ // VSCBIQ
	VSUBUQM T0, PL, TT0 \
	VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1
1068
// p256HalfInternal computes (T1||T0) = (X1||X0) / 2 mod P256.
// SEL1 is an all-ones/all-zeros mask derived from the low bit of
// X0: when X is odd, P is added first so the value becomes even,
// then the whole 257-bit quantity is shifted right one bit (the
// VSLDOI $15 / VSL $7 pairs carry each limb's low bit across the
// 128-bit boundary). Branch-free (constant time).
// Clobbers ZER, CAR1, T2, SEL1, TT0, TT1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	\
	VADDCUQ X0, PL, CAR1 \
	VADDUQM X0, PL, T0 \
	VADDECUQ X1, PH, CAR1, T2 \
	VADDEUQM X1, PH, CAR1, T1 \
	\
	VSEL T0, X0, SEL1, T0 \
	VSEL T1, X1, SEL1, T1 \
	VSEL T2, ZER, SEL1, T2 \
	\
	VSLDOI $15, T2, ZER, TT1 \
	VSLDOI $15, T1, ZER, TT0 \
	VSPLTISB $1, SEL1 \
	VSR T0, SEL1, T0 \ // VSRL
	VSR T1, SEL1, T1 \
	VSPLTISB $7, SEL1 \ // VREPIB
	VSL TT0, SEL1, TT0 \
	VSL TT1, SEL1, TT1 \
	VOR T0, TT0, T0 \
	VOR T1, TT1, T1
1092
1093 #define res_ptr R3
1094 #define x_ptr R4
1095 #define y_ptr R5
1096 #define CPOOL R7
1097 #define TEMP R8
1098 #define N R9
1099
1100 // Parameters
1101 #define X0 V0
1102 #define X1 V1
1103 #define Y0 V2
1104 #define Y1 V3
1105 #define T0 V4
1106 #define T1 V5
1107
1108 // Constants
1109 #define P0 V30
1110 #define P1 V31
// p256Mul computes res = in1 * in2 * 2^-256 mod P256 (Montgomery
// multiplication) by loading both operands in true little endian
// order and delegating to p256MulInternal, which expects CPOOL to
// point at p256mul<>.
// func p256Mul(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL


	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	// Swap doublewords into true little endian order.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	// P0/P1 <- the prime, as expected by p256MulInternal.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	// Reload CPOOL; the callee may have clobbered R7.
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap back to memory order and store the result T1||T0.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1146
1147 // func p256Sqr(res, in *p256Element, n int)
1148 TEXT ·p256Sqr(SB), NOSPLIT, $0-24
1149 MOVD res+0(FP), res_ptr
1150 MOVD in+8(FP), x_ptr
1151 MOVD $16, R16
1152 MOVD $32, R17
1153
1154 MOVD $p256mul<>+0x00(SB), CPOOL
1155
1156 LXVD2X (R0)(x_ptr), X0
1157 LXVD2X (R16)(x_ptr), X1
1158
1159 XXPERMDI X0, X0, $2, X0
1160 XXPERMDI X1, X1, $2, X1
1161
1162 sqrLoop:
1163 // Sqr uses same value for both
1164
1165 VOR X0, X0, Y0
1166 VOR X1, X1, Y1
1167
1168 LXVD2X (R16)(CPOOL), P1
1169 LXVD2X (R0)(CPOOL), P0
1170
1171 CALL p256MulInternal<>(SB)
1172
1173 MOVD n+16(FP), N
1174 ADD $-1, N
1175 CMP $0, N
1176 BEQ done
1177 MOVD N, n+16(FP) // Save counter to avoid clobber
1178 VOR T0, T0, X0
1179 VOR T1, T1, X1
1180 BR sqrLoop
1181
1182 done:
1183 MOVD $p256mul<>+0x00(SB), CPOOL
1184
1185 XXPERMDI T0, T0, $2, T0
1186 XXPERMDI T1, T1, $2, T1
1187 STXVD2X T0, (R0)(res_ptr)
1188 STXVD2X T1, (R16)(res_ptr)
1189 RET
1190
1191 #undef res_ptr
1192 #undef x_ptr
1193 #undef y_ptr
1194 #undef CPOOL
1195
1196 #undef X0
1197 #undef X1
1198 #undef Y0
1199 #undef Y1
1200 #undef T0
1201 #undef T1
1202 #undef P0
1203 #undef P1
1204
1205 #define P3ptr R3
1206 #define P1ptr R4
1207 #define P2ptr R5
1208 #define CPOOL R7
1209
1210 // Temporaries in REGs
1211 #define Y2L V15
1212 #define Y2H V16
1213 #define T1L V17
1214 #define T1H V18
1215 #define T2L V19
1216 #define T2H V20
1217 #define T3L V21
1218 #define T3H V22
1219 #define T4L V23
1220 #define T4H V24
1221
1222 // Temps for Sub and Add
1223 #define TT0 V11
1224 #define TT1 V12
1225 #define T2 V13
1226
1227 // p256MulAsm Parameters
1228 #define X0 V0
1229 #define X1 V1
1230 #define Y0 V2
1231 #define Y1 V3
1232 #define T0 V4
1233 #define T1 V5
1234
1235 #define PL V30
1236 #define PH V31
1237
1238 // Names for zero/sel selects
1239 #define X1L V0
1240 #define X1H V1
1241 #define Y1L V2 // p256MulAsmParmY
1242 #define Y1H V3 // p256MulAsmParmY
1243 #define Z1L V4
1244 #define Z1H V5
1245 #define X2L V0
1246 #define X2H V1
1247 #define Z2L V4
1248 #define Z2H V5
1249 #define X3L V17 // T1L
1250 #define X3H V18 // T1H
1251 #define Y3L V21 // T3L
1252 #define Y3H V22 // T3H
1253 #define Z3L V25
1254 #define Z3H V26
1255
1256 #define ZER V6
1257 #define SEL1 V7
1258 #define CAR1 V8
1259 #define CAR2 V9
1260 /* *
1261 * Three operand formula:
1262 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1263 * T1 = Z1²
1264 * T2 = T1*Z1
1265 * T1 = T1*X2
1266 * T2 = T2*Y2
1267 * T1 = T1-X1
1268 * T2 = T2-Y1
1269 * Z3 = Z1*T1
1270 * T3 = T1²
1271 * T4 = T3*T1
1272 * T3 = T3*X1
1273 * T1 = 2*T3
1274 * X3 = T2²
1275 * X3 = X3-T1
1276 * X3 = X3-T4
1277 * T3 = T3-X3
1278 * T3 = T3*T2
1279 * T4 = T4*Y1
1280 * Y3 = T3-T4
1281
1282 * Three operand formulas, but with MulInternal X,Y used to store temps
1283 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1284 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1285 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1286 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1287 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1288 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1289 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1290 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1291 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1292 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1293 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1294 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1295 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1296 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1297 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1298 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1299 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1300 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1301
1302 */
1303 //
1304 // V27 is clobbered by p256MulInternal so must be
1305 // saved in a temp.
1306 //
1307 // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
1308 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
1309 MOVD res+0(FP), P3ptr
1310 MOVD in1+8(FP), P1ptr
1311 MOVD in2+16(FP), P2ptr
1312
1313 MOVD $p256mul<>+0x00(SB), CPOOL
1314
1315 MOVD $16, R16
1316 MOVD $32, R17
1317 MOVD $48, R18
1318 MOVD $64, R19
1319 MOVD $80, R20
1320 MOVD $96, R21
1321 MOVD $112, R22
1322 MOVD $128, R23
1323 MOVD $144, R24
1324 MOVD $160, R25
1325 MOVD $104, R26 // offset of sign+24(FP)
1326
1327 LXVD2X (R16)(CPOOL), PH
1328 LXVD2X (R0)(CPOOL), PL
1329
1330 LXVD2X (R17)(P2ptr), Y2L
1331 LXVD2X (R18)(P2ptr), Y2H
1332 XXPERMDI Y2H, Y2H, $2, Y2H
1333 XXPERMDI Y2L, Y2L, $2, Y2L
1334
1335 // Equivalent of VLREPG sign+24(FP), SEL1
1336 LXVDSX (R1)(R26), SEL1
1337 VSPLTISB $0, ZER
1338 VCMPEQUD SEL1, ZER, SEL1
1339
1340 VSUBCUQ PL, Y2L, CAR1
1341 VSUBUQM PL, Y2L, T1L
1342 VSUBEUQM PH, Y2H, CAR1, T1H
1343
1344 VSEL T1L, Y2L, SEL1, Y2L
1345 VSEL T1H, Y2H, SEL1, Y2H
1346
1347 /* *
1348 * Three operand formula:
1349 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1350 */
1351 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
1352 LXVD2X (R19)(P1ptr), X0 // Z1H
1353 LXVD2X (R20)(P1ptr), X1 // Z1L
1354 XXPERMDI X0, X0, $2, X0
1355 XXPERMDI X1, X1, $2, X1
1356 VOR X0, X0, Y0
1357 VOR X1, X1, Y1
1358 CALL p256MulInternal<>(SB)
1359
1360 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
1361 VOR T0, T0, X0
1362 VOR T1, T1, X1
1363 CALL p256MulInternal<>(SB)
1364 VOR T0, T0, T2L
1365 VOR T1, T1, T2H
1366
1367 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
1368 MOVD in2+16(FP), P2ptr
1369 LXVD2X (R0)(P2ptr), Y0 // X2H
1370 LXVD2X (R16)(P2ptr), Y1 // X2L
1371 XXPERMDI Y0, Y0, $2, Y0
1372 XXPERMDI Y1, Y1, $2, Y1
1373 CALL p256MulInternal<>(SB)
1374 VOR T0, T0, T1L
1375 VOR T1, T1, T1H
1376
1377 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
1378 VOR T2L, T2L, X0
1379 VOR T2H, T2H, X1
1380 VOR Y2L, Y2L, Y0
1381 VOR Y2H, Y2H, Y1
1382 CALL p256MulInternal<>(SB)
1383
1384 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1385 MOVD in1+8(FP), P1ptr
1386 LXVD2X (R17)(P1ptr), Y1L
1387 LXVD2X (R18)(P1ptr), Y1H
1388 XXPERMDI Y1H, Y1H, $2, Y1H
1389 XXPERMDI Y1L, Y1L, $2, Y1L
1390 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
1391
1392 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1393 LXVD2X (R0)(P1ptr), X1L
1394 LXVD2X (R16)(P1ptr), X1H
1395 XXPERMDI X1H, X1H, $2, X1H
1396 XXPERMDI X1L, X1L, $2, X1L
1397 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
1398
1399 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
1400 LXVD2X (R19)(P1ptr), X0 // Z1H
1401 LXVD2X (R20)(P1ptr), X1 // Z1L
1402 XXPERMDI X0, X0, $2, X0
1403 XXPERMDI X1, X1, $2, X1
1404 CALL p256MulInternal<>(SB)
1405
1406 VOR T0, T0, Z3L
1407 VOR T1, T1, Z3H
1408
1409 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
1410 VOR Y0, Y0, X0
1411 VOR Y1, Y1, X1
1412 CALL p256MulInternal<>(SB)
1413 VOR T0, T0, X0
1414 VOR T1, T1, X1
1415
1416 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
1417 CALL p256MulInternal<>(SB)
1418 VOR T0, T0, T4L
1419 VOR T1, T1, T4H
1420
1421 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
1422 MOVD in1+8(FP), P1ptr
1423 LXVD2X (R0)(P1ptr), Y0 // X1H
1424 LXVD2X (R16)(P1ptr), Y1 // X1L
1425 XXPERMDI Y1, Y1, $2, Y1
1426 XXPERMDI Y0, Y0, $2, Y0
1427 CALL p256MulInternal<>(SB)
1428 VOR T0, T0, T3L
1429 VOR T1, T1, T3H
1430
1431 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1432 p256AddInternal(T1H,T1L, T1,T0,T1,T0)
1433
1434 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
1435 VOR T2L, T2L, X0
1436 VOR T2H, T2H, X1
1437 VOR T2L, T2L, Y0
1438 VOR T2H, T2H, Y1
1439 CALL p256MulInternal<>(SB)
1440
1441 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
1442 p256SubInternal(T1,T0,T1,T0,T1H,T1L)
1443
1444 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1445 p256SubInternal(T1,T0,T1,T0,T4H,T4L)
1446 VOR T0, T0, X3L
1447 VOR T1, T1, X3H
1448
1449 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1450 p256SubInternal(X1,X0,T3H,T3L,T1,T0)
1451
1452 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
1453 CALL p256MulInternal<>(SB)
1454 VOR T0, T0, T3L
1455 VOR T1, T1, T3H
1456
1457 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
1458 VOR T4L, T4L, X0
1459 VOR T4H, T4H, X1
1460 MOVD in1+8(FP), P1ptr
1461 LXVD2X (R17)(P1ptr), Y0 // Y1H
1462 LXVD2X (R18)(P1ptr), Y1 // Y1L
1463 XXPERMDI Y0, Y0, $2, Y0
1464 XXPERMDI Y1, Y1, $2, Y1
1465 CALL p256MulInternal<>(SB)
1466
1467 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
1468 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
1469
1470 // if (sel == 0) {
1471 // copy(P3.x[:], X1)
1472 // copy(P3.y[:], Y1)
1473 // copy(P3.z[:], Z1)
1474 // }
1475
1476 LXVD2X (R0)(P1ptr), X1L
1477 LXVD2X (R16)(P1ptr), X1H
1478 XXPERMDI X1H, X1H, $2, X1H
1479 XXPERMDI X1L, X1L, $2, X1L
1480
1481 // Y1 already loaded, left over from addition
1482 LXVD2X (R19)(P1ptr), Z1L
1483 LXVD2X (R20)(P1ptr), Z1H
1484 XXPERMDI Z1H, Z1H, $2, Z1H
1485 XXPERMDI Z1L, Z1L, $2, Z1L
1486
1487 MOVD $112, R26 // Get offset to sel+32
1488 LXVDSX (R1)(R26), SEL1
1489 VSPLTISB $0, ZER
1490 VCMPEQUD SEL1, ZER, SEL1
1491
1492 VSEL X3L, X1L, SEL1, X3L
1493 VSEL X3H, X1H, SEL1, X3H
1494 VSEL Y3L, Y1L, SEL1, Y3L
1495 VSEL Y3H, Y1H, SEL1, Y3H
1496 VSEL Z3L, Z1L, SEL1, Z3L
1497 VSEL Z3H, Z1H, SEL1, Z3H
1498
1499 MOVD in2+16(FP), P2ptr
1500 LXVD2X (R0)(P2ptr), X2L
1501 LXVD2X (R16)(P2ptr), X2H
1502 XXPERMDI X2H, X2H, $2, X2H
1503 XXPERMDI X2L, X2L, $2, X2L
1504
1505 // Y2 already loaded
1506 LXVD2X (R23)(CPOOL), Z2L
1507 LXVD2X (R24)(CPOOL), Z2H
1508
1509 MOVD $120, R26 // Get the value from zero+40(FP)
1510 LXVDSX (R1)(R26), SEL1
1511 VSPLTISB $0, ZER
1512 VCMPEQUD SEL1, ZER, SEL1
1513
1514 VSEL X3L, X2L, SEL1, X3L
1515 VSEL X3H, X2H, SEL1, X3H
1516 VSEL Y3L, Y2L, SEL1, Y3L
1517 VSEL Y3H, Y2H, SEL1, Y3H
1518 VSEL Z3L, Z2L, SEL1, Z3L
1519 VSEL Z3H, Z2H, SEL1, Z3H
1520
1521 // Reorder the bytes so they can be stored using STXVD2X.
1522 MOVD res+0(FP), P3ptr
1523 XXPERMDI X3H, X3H, $2, X3H
1524 XXPERMDI X3L, X3L, $2, X3L
1525 XXPERMDI Y3H, Y3H, $2, Y3H
1526 XXPERMDI Y3L, Y3L, $2, Y3L
1527 XXPERMDI Z3H, Z3H, $2, Z3H
1528 XXPERMDI Z3L, Z3L, $2, Z3L
1529 STXVD2X X3L, (R0)(P3ptr)
1530 STXVD2X X3H, (R16)(P3ptr)
1531 STXVD2X Y3L, (R17)(P3ptr)
1532 STXVD2X Y3H, (R18)(P3ptr)
1533 STXVD2X Z3L, (R19)(P3ptr)
1534 STXVD2X Z3H, (R20)(P3ptr)
1535
1536 RET
1537
1538 #undef P3ptr
1539 #undef P1ptr
1540 #undef P2ptr
1541 #undef CPOOL
1542
1543 #undef Y2L
1544 #undef Y2H
1545 #undef T1L
1546 #undef T1H
1547 #undef T2L
1548 #undef T2H
1549 #undef T3L
1550 #undef T3H
1551 #undef T4L
1552 #undef T4H
1553
1554 #undef TT0
1555 #undef TT1
1556 #undef T2
1557
1558 #undef X0
1559 #undef X1
1560 #undef Y0
1561 #undef Y1
1562 #undef T0
1563 #undef T1
1564
1565 #undef PL
1566 #undef PH
1567
1568 #undef X1L
1569 #undef X1H
1570 #undef Y1L
1571 #undef Y1H
1572 #undef Z1L
1573 #undef Z1H
1574 #undef X2L
1575 #undef X2H
1576 #undef Z2L
1577 #undef Z2H
1578 #undef X3L
1579 #undef X3H
1580 #undef Y3L
1581 #undef Y3H
1582 #undef Z3L
1583 #undef Z3H
1584
1585 #undef ZER
1586 #undef SEL1
1587 #undef CAR1
1588 #undef CAR2
1589
1590 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1591 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1592 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
1593 #define P3ptr R3
1594 #define P1ptr R4
1595 #define CPOOL R7
1596
1597 // Temporaries in REGs
1598 #define X3L V15
1599 #define X3H V16
1600 #define Y3L V17
1601 #define Y3H V18
1602 #define T1L V19
1603 #define T1H V20
1604 #define T2L V21
1605 #define T2H V22
1606 #define T3L V23
1607 #define T3H V24
1608
1609 #define X1L V6
1610 #define X1H V7
1611 #define Y1L V8
1612 #define Y1H V9
1613 #define Z1L V10
1614 #define Z1H V11
1615
1616 // Temps for Sub and Add
1617 #define TT0 V11
1618 #define TT1 V12
1619 #define T2 V13
1620
1621 // p256MulAsm Parameters
1622 #define X0 V0
1623 #define X1 V1
1624 #define Y0 V2
1625 #define Y1 V3
1626 #define T0 V4
1627 #define T1 V5
1628
1629 #define PL V30
1630 #define PH V31
1631
1632 #define Z3L V23
1633 #define Z3H V24
1634
1635 #define ZER V26
1636 #define SEL1 V27
1637 #define CAR1 V28
1638 #define CAR2 V29
1639 /*
1640 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1641 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1642 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1643 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1644 * B = 2Y₁
1645 * Z₃ = B×Z₁
1646 * C = B²
1647 * D = C×X₁
1648 * X₃ = A²-2D
1649 * Y₃ = (D-X₃)×A-C²/2
1650 *
1651 * Three-operand formula:
1652 * T1 = Z1²
1653 * T2 = X1-T1
1654 * T1 = X1+T1
1655 * T2 = T2*T1
1656 * T2 = 3*T2
1657 * Y3 = 2*Y1
1658 * Z3 = Y3*Z1
1659 * Y3 = Y3²
1660 * T3 = Y3*X1
1661 * Y3 = Y3²
1662 * Y3 = half*Y3
1663 * X3 = T2²
1664 * T1 = 2*T3
1665 * X3 = X3-T1
1666 * T1 = T3-X3
1667 * T1 = T1*T2
1668 * Y3 = T1-Y3
1669 */
1670 // p256PointDoubleAsm(res, in1 *p256Point)
1671 TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
1672 MOVD res+0(FP), P3ptr
1673 MOVD in+8(FP), P1ptr
1674
1675 MOVD $p256mul<>+0x00(SB), CPOOL
1676
1677 MOVD $16, R16
1678 MOVD $32, R17
1679 MOVD $48, R18
1680 MOVD $64, R19
1681 MOVD $80, R20
1682
1683 LXVD2X (R16)(CPOOL), PH
1684 LXVD2X (R0)(CPOOL), PL
1685
1686 // X=Z1; Y=Z1; MUL; T- // T1 = Z1²
1687 LXVD2X (R19)(P1ptr), X0 // Z1H
1688 LXVD2X (R20)(P1ptr), X1 // Z1L
1689
1690 XXPERMDI X0, X0, $2, X0
1691 XXPERMDI X1, X1, $2, X1
1692
1693 VOR X0, X0, Y0
1694 VOR X1, X1, Y1
1695 CALL p256MulInternal<>(SB)
1696
1697 // SUB(X<X1-T) // T2 = X1-T1
1698 LXVD2X (R0)(P1ptr), X1L
1699 LXVD2X (R16)(P1ptr), X1H
1700 XXPERMDI X1L, X1L, $2, X1L
1701 XXPERMDI X1H, X1H, $2, X1H
1702
1703 p256SubInternal(X1,X0,X1H,X1L,T1,T0)
1704
1705 // ADD(Y<X1+T) // T1 = X1+T1
1706 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
1707
1708 // X- ; Y- ; MUL; T- // T2 = T2*T1
1709 CALL p256MulInternal<>(SB)
1710
1711 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
1712 p256AddInternal(T2H,T2L,T1,T0,T1,T0)
1713 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
1714
1715 // ADD(X<Y1+Y1) // Y3 = 2*Y1
1716 LXVD2X (R17)(P1ptr), Y1L
1717 LXVD2X (R18)(P1ptr), Y1H
1718 XXPERMDI Y1L, Y1L, $2, Y1L
1719 XXPERMDI Y1H, Y1H, $2, Y1H
1720
1721 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
1722
1723 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
1724 LXVD2X (R19)(P1ptr), Y0
1725 LXVD2X (R20)(P1ptr), Y1
1726 XXPERMDI Y0, Y0, $2, Y0
1727 XXPERMDI Y1, Y1, $2, Y1
1728
1729 CALL p256MulInternal<>(SB)
1730
1731 // Leave T0, T1 as is.
1732 XXPERMDI T0, T0, $2, TT0
1733 XXPERMDI T1, T1, $2, TT1
1734 STXVD2X TT0, (R19)(P3ptr)
1735 STXVD2X TT1, (R20)(P3ptr)
1736
1737 // X- ; Y=X ; MUL; T- // Y3 = Y3²
1738 VOR X0, X0, Y0
1739 VOR X1, X1, Y1
1740 CALL p256MulInternal<>(SB)
1741
1742 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
1743 VOR T0, T0, X0
1744 VOR T1, T1, X1
1745 LXVD2X (R0)(P1ptr), Y0
1746 LXVD2X (R16)(P1ptr), Y1
1747 XXPERMDI Y0, Y0, $2, Y0
1748 XXPERMDI Y1, Y1, $2, Y1
1749 CALL p256MulInternal<>(SB)
1750 VOR T0, T0, T3L
1751 VOR T1, T1, T3H
1752
1753 // X- ; Y=X ; MUL; T- // Y3 = Y3²
1754 VOR X0, X0, Y0
1755 VOR X1, X1, Y1
1756 CALL p256MulInternal<>(SB)
1757
1758 // HAL(Y3<T) // Y3 = half*Y3
1759 p256HalfInternal(Y3H,Y3L, T1,T0)
1760
1761 // X=T2; Y=T2; MUL; T- // X3 = T2²
1762 VOR T2L, T2L, X0
1763 VOR T2H, T2H, X1
1764 VOR T2L, T2L, Y0
1765 VOR T2H, T2H, Y1
1766 CALL p256MulInternal<>(SB)
1767
1768 // ADD(T1<T3+T3) // T1 = 2*T3
1769 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
1770
1771 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
1772 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
1773
1774 XXPERMDI X3L, X3L, $2, TT0
1775 XXPERMDI X3H, X3H, $2, TT1
1776 STXVD2X TT0, (R0)(P3ptr)
1777 STXVD2X TT1, (R16)(P3ptr)
1778
1779 // SUB(X<T3-X3) // T1 = T3-X3
1780 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
1781
1782 // X- ; Y- ; MUL; T- // T1 = T1*T2
1783 CALL p256MulInternal<>(SB)
1784
1785 // SUB(Y3<T-Y3) // Y3 = T1-Y3
1786 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
1787
1788 XXPERMDI Y3L, Y3L, $2, Y3L
1789 XXPERMDI Y3H, Y3H, $2, Y3H
1790 STXVD2X Y3L, (R17)(P3ptr)
1791 STXVD2X Y3H, (R18)(P3ptr)
1792 RET
1793
1794 #undef P3ptr
1795 #undef P1ptr
1796 #undef CPOOL
1797 #undef X3L
1798 #undef X3H
1799 #undef Y3L
1800 #undef Y3H
1801 #undef T1L
1802 #undef T1H
1803 #undef T2L
1804 #undef T2H
1805 #undef T3L
1806 #undef T3H
1807 #undef X1L
1808 #undef X1H
1809 #undef Y1L
1810 #undef Y1H
1811 #undef Z1L
1812 #undef Z1H
1813 #undef TT0
1814 #undef TT1
1815 #undef T2
1816 #undef X0
1817 #undef X1
1818 #undef Y0
1819 #undef Y1
1820 #undef T0
1821 #undef T1
1822 #undef PL
1823 #undef PH
1824 #undef Z3L
1825 #undef Z3H
1826 #undef ZER
1827 #undef SEL1
1828 #undef CAR1
1829 #undef CAR2
1830
1831 #define P3ptr R3
1832 #define P1ptr R4
1833 #define P2ptr R5
1834 #define CPOOL R7
1835 #define TRUE R14
1836 #define RES1 R9
1837 #define RES2 R10
1838
1839 // Temporaries in REGs
1840 #define T1L V16
1841 #define T1H V17
1842 #define T2L V18
1843 #define T2H V19
1844 #define U1L V20
1845 #define U1H V21
1846 #define S1L V22
1847 #define S1H V23
1848 #define HL V24
1849 #define HH V25
1850 #define RL V26
1851 #define RH V27
1852
1853 // Temps for Sub and Add
1854 #define ZER V6
1855 #define SEL1 V7
1856 #define CAR1 V8
1857 #define CAR2 V9
1858 #define TT0 V11
1859 #define TT1 V12
1860 #define T2 V13
1861
1862 // p256MulAsm Parameters
1863 #define X0 V0
1864 #define X1 V1
1865 #define Y0 V2
1866 #define Y1 V3
1867 #define T0 V4
1868 #define T1 V5
1869
1870 #define PL V30
1871 #define PH V31
1872 /*
1873 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
1874 *
1875 * A = X₁×Z₂²
1876 * B = Y₁×Z₂³
1877 * C = X₂×Z₁²-A
1878 * D = Y₂×Z₁³-B
1879 * X₃ = D² - 2A×C² - C³
1880 * Y₃ = D×(A×C² - X₃) - B×C³
1881 * Z₃ = Z₁×Z₂×C
1882 *
1883 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
1884 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
1885 *
1886 * T1 = Z1*Z1
1887 * T2 = Z2*Z2
1888 * U1 = X1*T2
1889 * H = X2*T1
1890 * H = H-U1
1891 * Z3 = Z1*Z2
1892 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1893 *
1894 * S1 = Z2*T2
1895 * S1 = Y1*S1
1896 * R = Z1*T1
1897 * R = Y2*R
1898 * R = R-S1
1899 *
1900 * T1 = H*H
1901 * T2 = H*T1
1902 * U1 = U1*T1
1903 *
1904 * X3 = R*R
1905 * X3 = X3-T2
1906 * T1 = 2*U1
1907 * X3 = X3-T1 << store-out X3 result reg
1908 *
1909 * T2 = S1*T2
1910 * Y3 = U1-X3
1911 * Y3 = R*Y3
1912 * Y3 = Y3-T2 << store-out Y3 result reg
1913
1914 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
1915 // X- ; Y=T ; MUL; R=T // R = Z1*T1
1916 // X=X2; Y- ; MUL; H=T // H = X2*T1
1917 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
1918 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
1919 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
1920 // SUB(H<H-T) // H = H-U1
1921 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
1922 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1923 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1924 // X=Y2; Y=R ; MUL; T- // R = Y2*R
1925 // SUB(R<T-S1) // R = R-S1
1926 // X=H ; Y=H ; MUL; T- // T1 = H*H
1927 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
1928 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
1929 // X=R ; Y=R ; MUL; T- // X3 = R*R
1930 // SUB(T<T-T2) // X3 = X3-T2
1931 // ADD(X<U1+U1) // T1 = 2*U1
1932 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
1933 // SUB(Y<U1-T) // Y3 = U1-X3
1934 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
1935 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
1936 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
1937 */
1938 // p256PointAddAsm(res, in1, in2 *p256Point)
1939 TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
1940 MOVD res+0(FP), P3ptr
1941 MOVD in1+8(FP), P1ptr
1942 MOVD $p256mul<>+0x00(SB), CPOOL
1943 MOVD $16, R16
1944 MOVD $32, R17
1945 MOVD $48, R18
1946 MOVD $64, R19
1947 MOVD $80, R20
1948
1949 LXVD2X (R16)(CPOOL), PH
1950 LXVD2X (R0)(CPOOL), PL
1951
1952 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
1953 LXVD2X (R19)(P1ptr), X0 // Z1L
1954 LXVD2X (R20)(P1ptr), X1 // Z1H
1955 XXPERMDI X0, X0, $2, X0
1956 XXPERMDI X1, X1, $2, X1
1957 VOR X0, X0, Y0
1958 VOR X1, X1, Y1
1959 CALL p256MulInternal<>(SB)
1960
1961 // X- ; Y=T ; MUL; R=T // R = Z1*T1
1962 VOR T0, T0, Y0
1963 VOR T1, T1, Y1
1964 CALL p256MulInternal<>(SB)
1965 VOR T0, T0, RL // SAVE: RL
1966 VOR T1, T1, RH // SAVE: RH
1967
1968 STXVD2X RH, (R1)(R17) // V27 has to be saved
1969
1970 // X=X2; Y- ; MUL; H=T // H = X2*T1
1971 MOVD in2+16(FP), P2ptr
1972 LXVD2X (R0)(P2ptr), X0 // X2L
1973 LXVD2X (R16)(P2ptr), X1 // X2H
1974 XXPERMDI X0, X0, $2, X0
1975 XXPERMDI X1, X1, $2, X1
1976 CALL p256MulInternal<>(SB)
1977 VOR T0, T0, HL // SAVE: HL
1978 VOR T1, T1, HH // SAVE: HH
1979
1980 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
1981 MOVD in2+16(FP), P2ptr
1982 LXVD2X (R19)(P2ptr), X0 // Z2L
1983 LXVD2X (R20)(P2ptr), X1 // Z2H
1984 XXPERMDI X0, X0, $2, X0
1985 XXPERMDI X1, X1, $2, X1
1986 VOR X0, X0, Y0
1987 VOR X1, X1, Y1
1988 CALL p256MulInternal<>(SB)
1989
1990 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
1991 VOR T0, T0, Y0
1992 VOR T1, T1, Y1
1993 CALL p256MulInternal<>(SB)
1994 VOR T0, T0, S1L // SAVE: S1L
1995 VOR T1, T1, S1H // SAVE: S1H
1996
1997 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
1998 MOVD in1+8(FP), P1ptr
1999 LXVD2X (R0)(P1ptr), X0 // X1L
2000 LXVD2X (R16)(P1ptr), X1 // X1H
2001 XXPERMDI X0, X0, $2, X0
2002 XXPERMDI X1, X1, $2, X1
2003 CALL p256MulInternal<>(SB)
2004 VOR T0, T0, U1L // SAVE: U1L
2005 VOR T1, T1, U1H // SAVE: U1H
2006
2007 // SUB(H<H-T) // H = H-U1
2008 p256SubInternal(HH,HL,HH,HL,T1,T0)
2009
2010 // if H == 0 or H^P == 0 then ret=1 else ret=0
2011 // clobbers T1H and T1L
2012 MOVD $1, TRUE
2013 VSPLTISB $0, ZER
2014 VOR HL, HH, T1H
2015 VCMPEQUDCC ZER, T1H, T1H
2016
2017 // 26 = CR6 NE
2018 ISEL $26, R0, TRUE, RES1
2019 VXOR HL, PL, T1L // SAVE: T1L
2020 VXOR HH, PH, T1H // SAVE: T1H
2021 VOR T1L, T1H, T1H
2022 VCMPEQUDCC ZER, T1H, T1H
2023
2024 // 26 = CR6 NE
2025 ISEL $26, R0, TRUE, RES2
2026 OR RES2, RES1, RES1
2027 MOVD RES1, ret+24(FP)
2028
2029 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2030 MOVD in1+8(FP), P1ptr
2031 MOVD in2+16(FP), P2ptr
2032 LXVD2X (R19)(P1ptr), X0 // Z1L
2033 LXVD2X (R20)(P1ptr), X1 // Z1H
2034 XXPERMDI X0, X0, $2, X0
2035 XXPERMDI X1, X1, $2, X1
2036 LXVD2X (R19)(P2ptr), Y0 // Z2L
2037 LXVD2X (R20)(P2ptr), Y1 // Z2H
2038 XXPERMDI Y0, Y0, $2, Y0
2039 XXPERMDI Y1, Y1, $2, Y1
2040 CALL p256MulInternal<>(SB)
2041
2042 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
2043 VOR T0, T0, X0
2044 VOR T1, T1, X1
2045 VOR HL, HL, Y0
2046 VOR HH, HH, Y1
2047 CALL p256MulInternal<>(SB)
2048 MOVD res+0(FP), P3ptr
2049 XXPERMDI T1, T1, $2, TT1
2050 XXPERMDI T0, T0, $2, TT0
2051 STXVD2X TT0, (R19)(P3ptr)
2052 STXVD2X TT1, (R20)(P3ptr)
2053
2054 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2055 MOVD in1+8(FP), P1ptr
2056 LXVD2X (R17)(P1ptr), X0
2057 LXVD2X (R18)(P1ptr), X1
2058 XXPERMDI X0, X0, $2, X0
2059 XXPERMDI X1, X1, $2, X1
2060 VOR S1L, S1L, Y0
2061 VOR S1H, S1H, Y1
2062 CALL p256MulInternal<>(SB)
2063 VOR T0, T0, S1L
2064 VOR T1, T1, S1H
2065
2066 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2067 MOVD in2+16(FP), P2ptr
2068 LXVD2X (R17)(P2ptr), X0
2069 LXVD2X (R18)(P2ptr), X1
2070 XXPERMDI X0, X0, $2, X0
2071 XXPERMDI X1, X1, $2, X1
2072 VOR RL, RL, Y0
2073
2074 // VOR RH, RH, Y1 RH was saved above in D2X format
2075 LXVD2X (R1)(R17), Y1
2076 CALL p256MulInternal<>(SB)
2077
2078 // SUB(R<T-S1) // R = T-S1
2079 p256SubInternal(RH,RL,T1,T0,S1H,S1L)
2080
2081 STXVD2X RH, (R1)(R17) // Save RH
2082
2083 // if R == 0 or R^P == 0 then ret=ret else ret=0
2084 // clobbers T1H and T1L
2085 // Redo this using ISEL??
2086 MOVD $1, TRUE
2087 VSPLTISB $0, ZER
2088 VOR RL, RH, T1H
2089 VCMPEQUDCC ZER, T1H, T1H
2090
2091 // 24 = CR6 NE
2092 ISEL $26, R0, TRUE, RES1
2093 VXOR RL, PL, T1L
2094 VXOR RH, PH, T1H // SAVE: T1L
2095 VOR T1L, T1H, T1H
2096 VCMPEQUDCC ZER, T1H, T1H
2097
2098 // 26 = CR6 NE
2099 ISEL $26, R0, TRUE, RES2
2100 OR RES2, RES1, RES1
2101 MOVD ret+24(FP), RES2
2102 AND RES2, RES1, RES1
2103 MOVD RES1, ret+24(FP)
2104
2105 // X=H ; Y=H ; MUL; T- // T1 = H*H
2106 VOR HL, HL, X0
2107 VOR HH, HH, X1
2108 VOR HL, HL, Y0
2109 VOR HH, HH, Y1
2110 CALL p256MulInternal<>(SB)
2111
2112 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2113 VOR T0, T0, Y0
2114 VOR T1, T1, Y1
2115 CALL p256MulInternal<>(SB)
2116 VOR T0, T0, T2L
2117 VOR T1, T1, T2H
2118
2119 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2120 VOR U1L, U1L, X0
2121 VOR U1H, U1H, X1
2122 CALL p256MulInternal<>(SB)
2123 VOR T0, T0, U1L
2124 VOR T1, T1, U1H
2125
2126 // X=R ; Y=R ; MUL; T- // X3 = R*R
2127 VOR RL, RL, X0
2128
2129 // VOR RH, RH, X1
2130 VOR RL, RL, Y0
2131
2132 // RH was saved above using STXVD2X
2133 LXVD2X (R1)(R17), X1
2134 VOR X1, X1, Y1
2135
2136 // VOR RH, RH, Y1
2137 CALL p256MulInternal<>(SB)
2138
2139 // SUB(T<T-T2) // X3 = X3-T2
2140 p256SubInternal(T1,T0,T1,T0,T2H,T2L)
2141
2142 // ADD(X<U1+U1) // T1 = 2*U1
2143 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
2144
2145 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2146 p256SubInternal(T1,T0,T1,T0,X1,X0)
2147 MOVD res+0(FP), P3ptr
2148 XXPERMDI T1, T1, $2, TT1
2149 XXPERMDI T0, T0, $2, TT0
2150 STXVD2X TT0, (R0)(P3ptr)
2151 STXVD2X TT1, (R16)(P3ptr)
2152
2153 // SUB(Y<U1-T) // Y3 = U1-X3
2154 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
2155
2156 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2157 VOR RL, RL, X0
2158
2159 // VOR RH, RH, X1
2160 LXVD2X (R1)(R17), X1
2161 CALL p256MulInternal<>(SB)
2162 VOR T0, T0, U1L
2163 VOR T1, T1, U1H
2164
2165 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2166 VOR S1L, S1L, X0
2167 VOR S1H, S1H, X1
2168 VOR T2L, T2L, Y0
2169 VOR T2H, T2H, Y1
2170 CALL p256MulInternal<>(SB)
2171
2172 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2173 p256SubInternal(T1,T0,U1H,U1L,T1,T0)
2174 MOVD res+0(FP), P3ptr
2175 XXPERMDI T1, T1, $2, TT1
2176 XXPERMDI T0, T0, $2, TT0
2177 STXVD2X TT0, (R17)(P3ptr)
2178 STXVD2X TT1, (R18)(P3ptr)
2179
2180 RET
2181