1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 // This file contains a constant-time, 64-bit assembly implementation of
8 // P256. The optimizations performed here are described in detail in:
9 // S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
10 // 256-bit primes"
11 // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
12 // https://eprint.iacr.org/2013/816.pdf
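//
// Field elements are four 64-bit little-endian limbs and, unless noted
// otherwise, are kept in the Montgomery domain with R = 2^256, so that
// p256Mul(res, a, b) computes a*b*R^-1 mod p. The p256Ord* routines use the
// same representation modulo the group order n. As a rough reference only
// (a sketch using math/big, not part of this package), the same product is:
//
//	// montMulRef returns a*b*R^-1 mod p, with R = 1 << 256.
//	func montMulRef(a, b, p *big.Int) *big.Int {
//		rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p)
//		t := new(big.Int).Mul(a, b)
//		t.Mul(t, rInv)
//		return t.Mod(t, p)
//	}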
13
14 #include "textflag.h"
15
16 #define res_ptr DI
17 #define x_ptr SI
18 #define y_ptr CX
19
20 #define acc0 R8
21 #define acc1 R9
22 #define acc2 R10
23 #define acc3 R11
24 #define acc4 R12
25 #define acc5 R13
26 #define t0 R14
27 #define t1 R15
28
29 DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
30 DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
31 DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
32 DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
33 DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
34 DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
35 DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
36 DATA p256one<>+0x00(SB)/8, $0x0000000000000001
37 DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
38 DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
39 DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
40 GLOBL p256const0<>(SB), RODATA, $8
41 GLOBL p256const1<>(SB), RODATA, $8
42 GLOBL p256ordK0<>(SB), RODATA, $8
43 GLOBL p256ord<>(SB), RODATA, $32
44 GLOBL p256one<>(SB), RODATA, $32
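// For reference, the constants above are:
//   p256const0, p256const1  limbs 1 and 3 of the field prime
//                           p = 2^256 - 2^224 + 2^192 + 2^96 - 1
//                           (limbs 0 and 2 of p are all-ones and zero, and are
//                           encoded as immediates where needed)
//   p256ord                 the group order n, as little-endian limbs
//   p256ordK0               -n^-1 mod 2^64, the Montgomery factor for n
//   p256one                 R mod p = 2^256 mod p, i.e. 1 in the Montgomery domain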
45
46 /* ---------------------------------------*/
47 // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
48 TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
49 JMP ·p256BigToLittle(SB)
50 /* ---------------------------------------*/
51 // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
52 TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
53 JMP ·p256BigToLittle(SB)
54 /* ---------------------------------------*/
55 // func p256LittleToBig(res *[32]byte, in *p256Element)
56 TEXT ·p256LittleToBig(SB),NOSPLIT,$0
57 JMP ·p256BigToLittle(SB)
58 /* ---------------------------------------*/
59 // func p256BigToLittle(res *p256Element, in *[32]byte)
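// Note: byte-swapping each limb and reversing the limb order is the same as
// reversing all 32 bytes, and that operation is its own inverse, which is why
// the three conversions above can all jump to this one body.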
60 TEXT ·p256BigToLittle(SB),NOSPLIT,$0
61 MOVQ res+0(FP), res_ptr
62 MOVQ in+8(FP), x_ptr
63
64 MOVQ (8*0)(x_ptr), acc0
65 MOVQ (8*1)(x_ptr), acc1
66 MOVQ (8*2)(x_ptr), acc2
67 MOVQ (8*3)(x_ptr), acc3
68
69 BSWAPQ acc0
70 BSWAPQ acc1
71 BSWAPQ acc2
72 BSWAPQ acc3
73
74 MOVQ acc3, (8*0)(res_ptr)
75 MOVQ acc2, (8*1)(res_ptr)
76 MOVQ acc1, (8*2)(res_ptr)
77 MOVQ acc0, (8*3)(res_ptr)
78
79 RET
80 /* ---------------------------------------*/
81 // func p256MovCond(res, a, b *P256Point, cond int)
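// Constant-time select: X12 becomes an all-ones mask when cond == 0 and zero
// otherwise, so each 16-byte chunk is computed as (a AND NOT mask) XOR
// (b AND mask), i.e. res = b if cond == 0 and res = a otherwise, with no
// data-dependent branches.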
82 TEXT ·p256MovCond(SB),NOSPLIT,$0
83 MOVQ res+0(FP), res_ptr
84 MOVQ a+8(FP), x_ptr
85 MOVQ b+16(FP), y_ptr
86 MOVQ cond+24(FP), X12
87
88 PXOR X13, X13
89 PSHUFD $0, X12, X12
90 PCMPEQL X13, X12
91
92 MOVOU X12, X0
93 MOVOU (16*0)(x_ptr), X6
94 PANDN X6, X0
95 MOVOU X12, X1
96 MOVOU (16*1)(x_ptr), X7
97 PANDN X7, X1
98 MOVOU X12, X2
99 MOVOU (16*2)(x_ptr), X8
100 PANDN X8, X2
101 MOVOU X12, X3
102 MOVOU (16*3)(x_ptr), X9
103 PANDN X9, X3
104 MOVOU X12, X4
105 MOVOU (16*4)(x_ptr), X10
106 PANDN X10, X4
107 MOVOU X12, X5
108 MOVOU (16*5)(x_ptr), X11
109 PANDN X11, X5
110
111 MOVOU (16*0)(y_ptr), X6
112 MOVOU (16*1)(y_ptr), X7
113 MOVOU (16*2)(y_ptr), X8
114 MOVOU (16*3)(y_ptr), X9
115 MOVOU (16*4)(y_ptr), X10
116 MOVOU (16*5)(y_ptr), X11
117
118 PAND X12, X6
119 PAND X12, X7
120 PAND X12, X8
121 PAND X12, X9
122 PAND X12, X10
123 PAND X12, X11
124
125 PXOR X6, X0
126 PXOR X7, X1
127 PXOR X8, X2
128 PXOR X9, X3
129 PXOR X10, X4
130 PXOR X11, X5
131
132 MOVOU X0, (16*0)(res_ptr)
133 MOVOU X1, (16*1)(res_ptr)
134 MOVOU X2, (16*2)(res_ptr)
135 MOVOU X3, (16*3)(res_ptr)
136 MOVOU X4, (16*4)(res_ptr)
137 MOVOU X5, (16*5)(res_ptr)
138
139 RET
140 /* ---------------------------------------*/
141 // func p256NegCond(val *p256Element, cond int)
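// Conditional negation modulo p: p - val is computed unconditionally and the
// CMOVs below keep the original value when cond == 0, so there is no
// data-dependent branch.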
142 TEXT ·p256NegCond(SB),NOSPLIT,$0
143 MOVQ val+0(FP), res_ptr
144 MOVQ cond+8(FP), t0
145 // acc = poly
146 MOVQ $-1, acc0
147 MOVQ p256const0<>(SB), acc1
148 MOVQ $0, acc2
149 MOVQ p256const1<>(SB), acc3
150 // Load the original value
151 MOVQ (8*0)(res_ptr), acc5
152 MOVQ (8*1)(res_ptr), x_ptr
153 MOVQ (8*2)(res_ptr), y_ptr
154 MOVQ (8*3)(res_ptr), t1
155 // Speculatively subtract
156 SUBQ acc5, acc0
157 SBBQ x_ptr, acc1
158 SBBQ y_ptr, acc2
159 SBBQ t1, acc3
160 // If condition is 0, keep original value
161 TESTQ t0, t0
162 CMOVQEQ acc5, acc0
163 CMOVQEQ x_ptr, acc1
164 CMOVQEQ y_ptr, acc2
165 CMOVQEQ t1, acc3
166 // Store result
167 MOVQ acc0, (8*0)(res_ptr)
168 MOVQ acc1, (8*1)(res_ptr)
169 MOVQ acc2, (8*2)(res_ptr)
170 MOVQ acc3, (8*3)(res_ptr)
171
172 RET
173 /* ---------------------------------------*/
174 // func p256Sqr(res, in *p256Element, n int)
175 TEXT ·p256Sqr(SB),NOSPLIT,$0
176 MOVQ res+0(FP), res_ptr
177 MOVQ in+8(FP), x_ptr
178 MOVQ n+16(FP), BX
179
180 sqrLoop:
181
182 // y[1:] * y[0]
183 MOVQ (8*0)(x_ptr), t0
184
185 MOVQ (8*1)(x_ptr), AX
186 MULQ t0
187 MOVQ AX, acc1
188 MOVQ DX, acc2
189
190 MOVQ (8*2)(x_ptr), AX
191 MULQ t0
192 ADDQ AX, acc2
193 ADCQ $0, DX
194 MOVQ DX, acc3
195
196 MOVQ (8*3)(x_ptr), AX
197 MULQ t0
198 ADDQ AX, acc3
199 ADCQ $0, DX
200 MOVQ DX, acc4
201 // y[2:] * y[1]
202 MOVQ (8*1)(x_ptr), t0
203
204 MOVQ (8*2)(x_ptr), AX
205 MULQ t0
206 ADDQ AX, acc3
207 ADCQ $0, DX
208 MOVQ DX, t1
209
210 MOVQ (8*3)(x_ptr), AX
211 MULQ t0
212 ADDQ t1, acc4
213 ADCQ $0, DX
214 ADDQ AX, acc4
215 ADCQ $0, DX
216 MOVQ DX, acc5
217 // y[3] * y[2]
218 MOVQ (8*2)(x_ptr), t0
219
220 MOVQ (8*3)(x_ptr), AX
221 MULQ t0
222 ADDQ AX, acc5
223 ADCQ $0, DX
224 MOVQ DX, y_ptr
225 XORQ t1, t1
226 // *2
227 ADDQ acc1, acc1
228 ADCQ acc2, acc2
229 ADCQ acc3, acc3
230 ADCQ acc4, acc4
231 ADCQ acc5, acc5
232 ADCQ y_ptr, y_ptr
233 ADCQ $0, t1
234 // Missing products
235 MOVQ (8*0)(x_ptr), AX
236 MULQ AX
237 MOVQ AX, acc0
238 MOVQ DX, t0
239
240 MOVQ (8*1)(x_ptr), AX
241 MULQ AX
242 ADDQ t0, acc1
243 ADCQ AX, acc2
244 ADCQ $0, DX
245 MOVQ DX, t0
246
247 MOVQ (8*2)(x_ptr), AX
248 MULQ AX
249 ADDQ t0, acc3
250 ADCQ AX, acc4
251 ADCQ $0, DX
252 MOVQ DX, t0
253
254 MOVQ (8*3)(x_ptr), AX
255 MULQ AX
256 ADDQ t0, acc5
257 ADCQ AX, y_ptr
258 ADCQ DX, t1
259 MOVQ t1, x_ptr
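// The four reduction steps below are a word-by-word Montgomery reduction.
// Since p = [2^64-1, 2^32-1, 0, 2^64-2^32+1] and p[0] = 2^64-1, the Montgomery
// factor is m = acc0 itself (-p^-1 mod 2^64 = 1). Adding m*p turns the low
// limb into exactly m*2^64, which is then dropped by renaming the registers:
// limb 1 gains m*2^32 (the two 32-bit shifts of m, spread over limbs 1 and 2)
// and limb 3 gains m*p[3] (the single MULQ by p256const1). p256Mul,
// p256FromMont, p256MulInternal and p256SqrInternal use the same pattern.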
260 // First reduction step
261 MOVQ acc0, AX
262 MOVQ acc0, t1
263 SHLQ $32, acc0
264 MULQ p256const1<>(SB)
265 SHRQ $32, t1
266 ADDQ acc0, acc1
267 ADCQ t1, acc2
268 ADCQ AX, acc3
269 ADCQ $0, DX
270 MOVQ DX, acc0
271 // Second reduction step
272 MOVQ acc1, AX
273 MOVQ acc1, t1
274 SHLQ $32, acc1
275 MULQ p256const1<>(SB)
276 SHRQ $32, t1
277 ADDQ acc1, acc2
278 ADCQ t1, acc3
279 ADCQ AX, acc0
280 ADCQ $0, DX
281 MOVQ DX, acc1
282 // Third reduction step
283 MOVQ acc2, AX
284 MOVQ acc2, t1
285 SHLQ $32, acc2
286 MULQ p256const1<>(SB)
287 SHRQ $32, t1
288 ADDQ acc2, acc3
289 ADCQ t1, acc0
290 ADCQ AX, acc1
291 ADCQ $0, DX
292 MOVQ DX, acc2
293 // Last reduction step
294 XORQ t0, t0
295 MOVQ acc3, AX
296 MOVQ acc3, t1
297 SHLQ $32, acc3
298 MULQ p256const1<>(SB)
299 SHRQ $32, t1
300 ADDQ acc3, acc0
301 ADCQ t1, acc1
302 ADCQ AX, acc2
303 ADCQ $0, DX
304 MOVQ DX, acc3
305 // Add bits [511:256] of the sqr result
306 ADCQ acc4, acc0
307 ADCQ acc5, acc1
308 ADCQ y_ptr, acc2
309 ADCQ x_ptr, acc3
310 ADCQ $0, t0
311
312 MOVQ acc0, acc4
313 MOVQ acc1, acc5
314 MOVQ acc2, y_ptr
315 MOVQ acc3, t1
316 // Subtract p256
317 SUBQ $-1, acc0
318 SBBQ p256const0<>(SB), acc1
319 SBBQ $0, acc2
320 SBBQ p256const1<>(SB), acc3
321 SBBQ $0, t0
322
323 CMOVQCS acc4, acc0
324 CMOVQCS acc5, acc1
325 CMOVQCS y_ptr, acc2
326 CMOVQCS t1, acc3
327
328 MOVQ acc0, (8*0)(res_ptr)
329 MOVQ acc1, (8*1)(res_ptr)
330 MOVQ acc2, (8*2)(res_ptr)
331 MOVQ acc3, (8*3)(res_ptr)
332 MOVQ res_ptr, x_ptr
333 DECQ BX
334 JNE sqrLoop
335
336 RET
337 /* ---------------------------------------*/
338 // func p256Mul(res, in1, in2 *p256Element)
339 TEXT ·p256Mul(SB),NOSPLIT,$0
340 MOVQ res+0(FP), res_ptr
341 MOVQ in1+8(FP), x_ptr
342 MOVQ in2+16(FP), y_ptr
343 // x * y[0]
344 MOVQ (8*0)(y_ptr), t0
345
346 MOVQ (8*0)(x_ptr), AX
347 MULQ t0
348 MOVQ AX, acc0
349 MOVQ DX, acc1
350
351 MOVQ (8*1)(x_ptr), AX
352 MULQ t0
353 ADDQ AX, acc1
354 ADCQ $0, DX
355 MOVQ DX, acc2
356
357 MOVQ (8*2)(x_ptr), AX
358 MULQ t0
359 ADDQ AX, acc2
360 ADCQ $0, DX
361 MOVQ DX, acc3
362
363 MOVQ (8*3)(x_ptr), AX
364 MULQ t0
365 ADDQ AX, acc3
366 ADCQ $0, DX
367 MOVQ DX, acc4
368 XORQ acc5, acc5
369 // First reduction step
370 MOVQ acc0, AX
371 MOVQ acc0, t1
372 SHLQ $32, acc0
373 MULQ p256const1<>(SB)
374 SHRQ $32, t1
375 ADDQ acc0, acc1
376 ADCQ t1, acc2
377 ADCQ AX, acc3
378 ADCQ DX, acc4
379 ADCQ $0, acc5
380 XORQ acc0, acc0
381 // x * y[1]
382 MOVQ (8*1)(y_ptr), t0
383
384 MOVQ (8*0)(x_ptr), AX
385 MULQ t0
386 ADDQ AX, acc1
387 ADCQ $0, DX
388 MOVQ DX, t1
389
390 MOVQ (8*1)(x_ptr), AX
391 MULQ t0
392 ADDQ t1, acc2
393 ADCQ $0, DX
394 ADDQ AX, acc2
395 ADCQ $0, DX
396 MOVQ DX, t1
397
398 MOVQ (8*2)(x_ptr), AX
399 MULQ t0
400 ADDQ t1, acc3
401 ADCQ $0, DX
402 ADDQ AX, acc3
403 ADCQ $0, DX
404 MOVQ DX, t1
405
406 MOVQ (8*3)(x_ptr), AX
407 MULQ t0
408 ADDQ t1, acc4
409 ADCQ $0, DX
410 ADDQ AX, acc4
411 ADCQ DX, acc5
412 ADCQ $0, acc0
413 // Second reduction step
414 MOVQ acc1, AX
415 MOVQ acc1, t1
416 SHLQ $32, acc1
417 MULQ p256const1<>(SB)
418 SHRQ $32, t1
419 ADDQ acc1, acc2
420 ADCQ t1, acc3
421 ADCQ AX, acc4
422 ADCQ DX, acc5
423 ADCQ $0, acc0
424 XORQ acc1, acc1
425 // x * y[2]
426 MOVQ (8*2)(y_ptr), t0
427
428 MOVQ (8*0)(x_ptr), AX
429 MULQ t0
430 ADDQ AX, acc2
431 ADCQ $0, DX
432 MOVQ DX, t1
433
434 MOVQ (8*1)(x_ptr), AX
435 MULQ t0
436 ADDQ t1, acc3
437 ADCQ $0, DX
438 ADDQ AX, acc3
439 ADCQ $0, DX
440 MOVQ DX, t1
441
442 MOVQ (8*2)(x_ptr), AX
443 MULQ t0
444 ADDQ t1, acc4
445 ADCQ $0, DX
446 ADDQ AX, acc4
447 ADCQ $0, DX
448 MOVQ DX, t1
449
450 MOVQ (8*3)(x_ptr), AX
451 MULQ t0
452 ADDQ t1, acc5
453 ADCQ $0, DX
454 ADDQ AX, acc5
455 ADCQ DX, acc0
456 ADCQ $0, acc1
457 // Third reduction step
458 MOVQ acc2, AX
459 MOVQ acc2, t1
460 SHLQ $32, acc2
461 MULQ p256const1<>(SB)
462 SHRQ $32, t1
463 ADDQ acc2, acc3
464 ADCQ t1, acc4
465 ADCQ AX, acc5
466 ADCQ DX, acc0
467 ADCQ $0, acc1
468 XORQ acc2, acc2
469 // x * y[3]
470 MOVQ (8*3)(y_ptr), t0
471
472 MOVQ (8*0)(x_ptr), AX
473 MULQ t0
474 ADDQ AX, acc3
475 ADCQ $0, DX
476 MOVQ DX, t1
477
478 MOVQ (8*1)(x_ptr), AX
479 MULQ t0
480 ADDQ t1, acc4
481 ADCQ $0, DX
482 ADDQ AX, acc4
483 ADCQ $0, DX
484 MOVQ DX, t1
485
486 MOVQ (8*2)(x_ptr), AX
487 MULQ t0
488 ADDQ t1, acc5
489 ADCQ $0, DX
490 ADDQ AX, acc5
491 ADCQ $0, DX
492 MOVQ DX, t1
493
494 MOVQ (8*3)(x_ptr), AX
495 MULQ t0
496 ADDQ t1, acc0
497 ADCQ $0, DX
498 ADDQ AX, acc0
499 ADCQ DX, acc1
500 ADCQ $0, acc2
501 // Last reduction step
502 MOVQ acc3, AX
503 MOVQ acc3, t1
504 SHLQ $32, acc3
505 MULQ p256const1<>(SB)
506 SHRQ $32, t1
507 ADDQ acc3, acc4
508 ADCQ t1, acc5
509 ADCQ AX, acc0
510 ADCQ DX, acc1
511 ADCQ $0, acc2
512 // Copy result [255:0]
513 MOVQ acc4, x_ptr
514 MOVQ acc5, acc3
515 MOVQ acc0, t0
516 MOVQ acc1, t1
517 // Subtract p256
518 SUBQ $-1, acc4
519 SBBQ p256const0<>(SB), acc5
520 SBBQ $0, acc0
521 SBBQ p256const1<>(SB), acc1
522 SBBQ $0, acc2
523
524 CMOVQCS x_ptr, acc4
525 CMOVQCS acc3, acc5
526 CMOVQCS t0, acc0
527 CMOVQCS t1, acc1
528
529 MOVQ acc4, (8*0)(res_ptr)
530 MOVQ acc5, (8*1)(res_ptr)
531 MOVQ acc0, (8*2)(res_ptr)
532 MOVQ acc1, (8*3)(res_ptr)
533
534 RET
535 /* ---------------------------------------*/
536 // func p256FromMont(res, in *p256Element)
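// p256FromMont runs only the four reduction steps, with no multiplication
// passes; that is a Montgomery multiplication by 1, so it returns
// in * R^-1 mod p, converting the value out of the Montgomery domain.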
537 TEXT ·p256FromMont(SB),NOSPLIT,$0
538 MOVQ res+0(FP), res_ptr
539 MOVQ in+8(FP), x_ptr
540
541 MOVQ (8*0)(x_ptr), acc0
542 MOVQ (8*1)(x_ptr), acc1
543 MOVQ (8*2)(x_ptr), acc2
544 MOVQ (8*3)(x_ptr), acc3
545 XORQ acc4, acc4
546
547 // Only reduce, no multiplications are needed
548 // First stage
549 MOVQ acc0, AX
550 MOVQ acc0, t1
551 SHLQ $32, acc0
552 MULQ p256const1<>(SB)
553 SHRQ $32, t1
554 ADDQ acc0, acc1
555 ADCQ t1, acc2
556 ADCQ AX, acc3
557 ADCQ DX, acc4
558 XORQ acc5, acc5
559 // Second stage
560 MOVQ acc1, AX
561 MOVQ acc1, t1
562 SHLQ $32, acc1
563 MULQ p256const1<>(SB)
564 SHRQ $32, t1
565 ADDQ acc1, acc2
566 ADCQ t1, acc3
567 ADCQ AX, acc4
568 ADCQ DX, acc5
569 XORQ acc0, acc0
570 // Third stage
571 MOVQ acc2, AX
572 MOVQ acc2, t1
573 SHLQ $32, acc2
574 MULQ p256const1<>(SB)
575 SHRQ $32, t1
576 ADDQ acc2, acc3
577 ADCQ t1, acc4
578 ADCQ AX, acc5
579 ADCQ DX, acc0
580 XORQ acc1, acc1
581 // Last stage
582 MOVQ acc3, AX
583 MOVQ acc3, t1
584 SHLQ $32, acc3
585 MULQ p256const1<>(SB)
586 SHRQ $32, t1
587 ADDQ acc3, acc4
588 ADCQ t1, acc5
589 ADCQ AX, acc0
590 ADCQ DX, acc1
591
592 MOVQ acc4, x_ptr
593 MOVQ acc5, acc3
594 MOVQ acc0, t0
595 MOVQ acc1, t1
596
597 SUBQ $-1, acc4
598 SBBQ p256const0<>(SB), acc5
599 SBBQ $0, acc0
600 SBBQ p256const1<>(SB), acc1
601
602 CMOVQCS x_ptr, acc4
603 CMOVQCS acc3, acc5
604 CMOVQCS t0, acc0
605 CMOVQCS t1, acc1
606
607 MOVQ acc4, (8*0)(res_ptr)
608 MOVQ acc5, (8*1)(res_ptr)
609 MOVQ acc0, (8*2)(res_ptr)
610 MOVQ acc1, (8*3)(res_ptr)
611
612 RET
613 /* ---------------------------------------*/
614 // func p256Select(res *P256Point, table *p256Table, idx int)
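// p256Select scans all 16 table entries regardless of idx: the mask in X12 is
// all-ones only for the entry whose 1-based position equals idx, and the
// masked entries are XORed into the accumulator. If idx == 0 nothing matches
// and the result is all zeros. A rough Go equivalent of the masking step
// (hypothetical limb-array types, for illustration only):
//
//	for i := 0; i < 16; i++ {
//		mask := -uint64(subtle.ConstantTimeEq(int32(i+1), int32(idx))) // 0 or all-ones
//		for j := range res {
//			res[j] ^= table[i][j] & mask
//		}
//	}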
615 TEXT ·p256Select(SB),NOSPLIT,$0
616 MOVQ idx+16(FP), AX
617 MOVQ table+8(FP), DI
618 MOVQ res+0(FP), DX
619
620 PXOR X15, X15 // X15 = 0
621 PCMPEQL X14, X14 // X14 = -1
622 PSUBL X14, X15 // X15 = 1
623 MOVL AX, X14
624 PSHUFD $0, X14, X14
625
626 PXOR X0, X0
627 PXOR X1, X1
628 PXOR X2, X2
629 PXOR X3, X3
630 PXOR X4, X4
631 PXOR X5, X5
632 MOVQ $16, AX
633
634 MOVOU X15, X13
635
636 loop_select:
637
638 MOVOU X13, X12
639 PADDL X15, X13
640 PCMPEQL X14, X12
641
642 MOVOU (16*0)(DI), X6
643 MOVOU (16*1)(DI), X7
644 MOVOU (16*2)(DI), X8
645 MOVOU (16*3)(DI), X9
646 MOVOU (16*4)(DI), X10
647 MOVOU (16*5)(DI), X11
648 ADDQ $(16*6), DI
649
650 PAND X12, X6
651 PAND X12, X7
652 PAND X12, X8
653 PAND X12, X9
654 PAND X12, X10
655 PAND X12, X11
656
657 PXOR X6, X0
658 PXOR X7, X1
659 PXOR X8, X2
660 PXOR X9, X3
661 PXOR X10, X4
662 PXOR X11, X5
663
664 DECQ AX
665 JNE loop_select
666
667 MOVOU X0, (16*0)(DX)
668 MOVOU X1, (16*1)(DX)
669 MOVOU X2, (16*2)(DX)
670 MOVOU X3, (16*3)(DX)
671 MOVOU X4, (16*4)(DX)
672 MOVOU X5, (16*5)(DX)
673
674 RET
675 /* ---------------------------------------*/
676 // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
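// p256SelectAffine works like p256Select, but over a table of 32 affine points
// (8 limbs each): the loop runs 16 times and each iteration tests two
// consecutive 1-based positions against idx, masking and folding both
// candidates into the accumulator.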
677 TEXT ·p256SelectAffine(SB),NOSPLIT,$0
678 MOVQ idx+16(FP), AX
679 MOVQ table+8(FP), DI
680 MOVQ res+0(FP), DX
681
682 PXOR X15, X15 // X15 = 0
683 PCMPEQL X14, X14 // X14 = -1
684 PSUBL X14, X15 // X15 = 1
685 MOVL AX, X14
686 PSHUFD $0, X14, X14
687
688 PXOR X0, X0
689 PXOR X1, X1
690 PXOR X2, X2
691 PXOR X3, X3
692 MOVQ $16, AX
693
694 MOVOU X15, X13
695
696 loop_select_base:
697
698 MOVOU X13, X12
699 PADDL X15, X13
700 PCMPEQL X14, X12
701
702 MOVOU (16*0)(DI), X4
703 MOVOU (16*1)(DI), X5
704 MOVOU (16*2)(DI), X6
705 MOVOU (16*3)(DI), X7
706
707 MOVOU (16*4)(DI), X8
708 MOVOU (16*5)(DI), X9
709 MOVOU (16*6)(DI), X10
710 MOVOU (16*7)(DI), X11
711
712 ADDQ $(16*8), DI
713
714 PAND X12, X4
715 PAND X12, X5
716 PAND X12, X6
717 PAND X12, X7
718
719 MOVOU X13, X12
720 PADDL X15, X13
721 PCMPEQL X14, X12
722
723 PAND X12, X8
724 PAND X12, X9
725 PAND X12, X10
726 PAND X12, X11
727
728 PXOR X4, X0
729 PXOR X5, X1
730 PXOR X6, X2
731 PXOR X7, X3
732
733 PXOR X8, X0
734 PXOR X9, X1
735 PXOR X10, X2
736 PXOR X11, X3
737
738 DECQ AX
739 JNE loop_select_base
740
741 MOVOU X0, (16*0)(DX)
742 MOVOU X1, (16*1)(DX)
743 MOVOU X2, (16*2)(DX)
744 MOVOU X3, (16*3)(DX)
745
746 RET
747 /* ---------------------------------------*/
748 // func p256OrdMul(res, in1, in2 *p256OrdElement)
749 TEXT ·p256OrdMul(SB),NOSPLIT,$0
750 MOVQ res+0(FP), res_ptr
751 MOVQ in1+8(FP), x_ptr
752 MOVQ in2+16(FP), y_ptr
753 // x * y[0]
754 MOVQ (8*0)(y_ptr), t0
755
756 MOVQ (8*0)(x_ptr), AX
757 MULQ t0
758 MOVQ AX, acc0
759 MOVQ DX, acc1
760
761 MOVQ (8*1)(x_ptr), AX
762 MULQ t0
763 ADDQ AX, acc1
764 ADCQ $0, DX
765 MOVQ DX, acc2
766
767 MOVQ (8*2)(x_ptr), AX
768 MULQ t0
769 ADDQ AX, acc2
770 ADCQ $0, DX
771 MOVQ DX, acc3
772
773 MOVQ (8*3)(x_ptr), AX
774 MULQ t0
775 ADDQ AX, acc3
776 ADCQ $0, DX
777 MOVQ DX, acc4
778 XORQ acc5, acc5
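// The reduction steps below are word-by-word Montgomery reduction modulo the
// group order n: m = (low limb) * p256ordK0 mod 2^64, with
// p256ordK0 = -n^-1 mod 2^64, so that adding m*n clears the low limb. Each
// step here multiplies m by all four limbs of n; p256OrdSqr below instead
// exploits the shape of n[2] and n[3] to replace two of those MULQs with
// shifts and subtractions.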
779 // First reduction step
780 MOVQ acc0, AX
781 MULQ p256ordK0<>(SB)
782 MOVQ AX, t0
783
784 MOVQ p256ord<>+0x00(SB), AX
785 MULQ t0
786 ADDQ AX, acc0
787 ADCQ $0, DX
788 MOVQ DX, t1
789
790 MOVQ p256ord<>+0x08(SB), AX
791 MULQ t0
792 ADDQ t1, acc1
793 ADCQ $0, DX
794 ADDQ AX, acc1
795 ADCQ $0, DX
796 MOVQ DX, t1
797
798 MOVQ p256ord<>+0x10(SB), AX
799 MULQ t0
800 ADDQ t1, acc2
801 ADCQ $0, DX
802 ADDQ AX, acc2
803 ADCQ $0, DX
804 MOVQ DX, t1
805
806 MOVQ p256ord<>+0x18(SB), AX
807 MULQ t0
808 ADDQ t1, acc3
809 ADCQ $0, DX
810 ADDQ AX, acc3
811 ADCQ DX, acc4
812 ADCQ $0, acc5
813 // x * y[1]
814 MOVQ (8*1)(y_ptr), t0
815
816 MOVQ (8*0)(x_ptr), AX
817 MULQ t0
818 ADDQ AX, acc1
819 ADCQ $0, DX
820 MOVQ DX, t1
821
822 MOVQ (8*1)(x_ptr), AX
823 MULQ t0
824 ADDQ t1, acc2
825 ADCQ $0, DX
826 ADDQ AX, acc2
827 ADCQ $0, DX
828 MOVQ DX, t1
829
830 MOVQ (8*2)(x_ptr), AX
831 MULQ t0
832 ADDQ t1, acc3
833 ADCQ $0, DX
834 ADDQ AX, acc3
835 ADCQ $0, DX
836 MOVQ DX, t1
837
838 MOVQ (8*3)(x_ptr), AX
839 MULQ t0
840 ADDQ t1, acc4
841 ADCQ $0, DX
842 ADDQ AX, acc4
843 ADCQ DX, acc5
844 ADCQ $0, acc0
845 // Second reduction step
846 MOVQ acc1, AX
847 MULQ p256ordK0<>(SB)
848 MOVQ AX, t0
849
850 MOVQ p256ord<>+0x00(SB), AX
851 MULQ t0
852 ADDQ AX, acc1
853 ADCQ $0, DX
854 MOVQ DX, t1
855
856 MOVQ p256ord<>+0x08(SB), AX
857 MULQ t0
858 ADDQ t1, acc2
859 ADCQ $0, DX
860 ADDQ AX, acc2
861 ADCQ $0, DX
862 MOVQ DX, t1
863
864 MOVQ p256ord<>+0x10(SB), AX
865 MULQ t0
866 ADDQ t1, acc3
867 ADCQ $0, DX
868 ADDQ AX, acc3
869 ADCQ $0, DX
870 MOVQ DX, t1
871
872 MOVQ p256ord<>+0x18(SB), AX
873 MULQ t0
874 ADDQ t1, acc4
875 ADCQ $0, DX
876 ADDQ AX, acc4
877 ADCQ DX, acc5
878 ADCQ $0, acc0
879 // x * y[2]
880 MOVQ (8*2)(y_ptr), t0
881
882 MOVQ (8*0)(x_ptr), AX
883 MULQ t0
884 ADDQ AX, acc2
885 ADCQ $0, DX
886 MOVQ DX, t1
887
888 MOVQ (8*1)(x_ptr), AX
889 MULQ t0
890 ADDQ t1, acc3
891 ADCQ $0, DX
892 ADDQ AX, acc3
893 ADCQ $0, DX
894 MOVQ DX, t1
895
896 MOVQ (8*2)(x_ptr), AX
897 MULQ t0
898 ADDQ t1, acc4
899 ADCQ $0, DX
900 ADDQ AX, acc4
901 ADCQ $0, DX
902 MOVQ DX, t1
903
904 MOVQ (8*3)(x_ptr), AX
905 MULQ t0
906 ADDQ t1, acc5
907 ADCQ $0, DX
908 ADDQ AX, acc5
909 ADCQ DX, acc0
910 ADCQ $0, acc1
911 // Third reduction step
912 MOVQ acc2, AX
913 MULQ p256ordK0<>(SB)
914 MOVQ AX, t0
915
916 MOVQ p256ord<>+0x00(SB), AX
917 MULQ t0
918 ADDQ AX, acc2
919 ADCQ $0, DX
920 MOVQ DX, t1
921
922 MOVQ p256ord<>+0x08(SB), AX
923 MULQ t0
924 ADDQ t1, acc3
925 ADCQ $0, DX
926 ADDQ AX, acc3
927 ADCQ $0, DX
928 MOVQ DX, t1
929
930 MOVQ p256ord<>+0x10(SB), AX
931 MULQ t0
932 ADDQ t1, acc4
933 ADCQ $0, DX
934 ADDQ AX, acc4
935 ADCQ $0, DX
936 MOVQ DX, t1
937
938 MOVQ p256ord<>+0x18(SB), AX
939 MULQ t0
940 ADDQ t1, acc5
941 ADCQ $0, DX
942 ADDQ AX, acc5
943 ADCQ DX, acc0
944 ADCQ $0, acc1
945 // x * y[3]
946 MOVQ (8*3)(y_ptr), t0
947
948 MOVQ (8*0)(x_ptr), AX
949 MULQ t0
950 ADDQ AX, acc3
951 ADCQ $0, DX
952 MOVQ DX, t1
953
954 MOVQ (8*1)(x_ptr), AX
955 MULQ t0
956 ADDQ t1, acc4
957 ADCQ $0, DX
958 ADDQ AX, acc4
959 ADCQ $0, DX
960 MOVQ DX, t1
961
962 MOVQ (8*2)(x_ptr), AX
963 MULQ t0
964 ADDQ t1, acc5
965 ADCQ $0, DX
966 ADDQ AX, acc5
967 ADCQ $0, DX
968 MOVQ DX, t1
969
970 MOVQ (8*3)(x_ptr), AX
971 MULQ t0
972 ADDQ t1, acc0
973 ADCQ $0, DX
974 ADDQ AX, acc0
975 ADCQ DX, acc1
976 ADCQ $0, acc2
977 // Last reduction step
978 MOVQ acc3, AX
979 MULQ p256ordK0<>(SB)
980 MOVQ AX, t0
981
982 MOVQ p256ord<>+0x00(SB), AX
983 MULQ t0
984 ADDQ AX, acc3
985 ADCQ $0, DX
986 MOVQ DX, t1
987
988 MOVQ p256ord<>+0x08(SB), AX
989 MULQ t0
990 ADDQ t1, acc4
991 ADCQ $0, DX
992 ADDQ AX, acc4
993 ADCQ $0, DX
994 MOVQ DX, t1
995
996 MOVQ p256ord<>+0x10(SB), AX
997 MULQ t0
998 ADDQ t1, acc5
999 ADCQ $0, DX
1000 ADDQ AX, acc5
1001 ADCQ $0, DX
1002 MOVQ DX, t1
1003
1004 MOVQ p256ord<>+0x18(SB), AX
1005 MULQ t0
1006 ADDQ t1, acc0
1007 ADCQ $0, DX
1008 ADDQ AX, acc0
1009 ADCQ DX, acc1
1010 ADCQ $0, acc2
1011 // Copy result [255:0]
1012 MOVQ acc4, x_ptr
1013 MOVQ acc5, acc3
1014 MOVQ acc0, t0
1015 MOVQ acc1, t1
1016 // Subtract p256
1017 SUBQ p256ord<>+0x00(SB), acc4
1018 SBBQ p256ord<>+0x08(SB), acc5
1019 SBBQ p256ord<>+0x10(SB), acc0
1020 SBBQ p256ord<>+0x18(SB), acc1
1021 SBBQ $0, acc2
1022
1023 CMOVQCS x_ptr, acc4
1024 CMOVQCS acc3, acc5
1025 CMOVQCS t0, acc0
1026 CMOVQCS t1, acc1
1027
1028 MOVQ acc4, (8*0)(res_ptr)
1029 MOVQ acc5, (8*1)(res_ptr)
1030 MOVQ acc0, (8*2)(res_ptr)
1031 MOVQ acc1, (8*3)(res_ptr)
1032
1033 RET
1034 /* ---------------------------------------*/
1035 // func p256OrdSqr(res, in *p256OrdElement, n int)
1036 TEXT ·p256OrdSqr(SB),NOSPLIT,$0
1037 MOVQ res+0(FP), res_ptr
1038 MOVQ in+8(FP), x_ptr
1039 MOVQ n+16(FP), BX
1040
1041 ordSqrLoop:
1042
1043 // y[1:] * y[0]
1044 MOVQ (8*0)(x_ptr), t0
1045
1046 MOVQ (8*1)(x_ptr), AX
1047 MULQ t0
1048 MOVQ AX, acc1
1049 MOVQ DX, acc2
1050
1051 MOVQ (8*2)(x_ptr), AX
1052 MULQ t0
1053 ADDQ AX, acc2
1054 ADCQ $0, DX
1055 MOVQ DX, acc3
1056
1057 MOVQ (8*3)(x_ptr), AX
1058 MULQ t0
1059 ADDQ AX, acc3
1060 ADCQ $0, DX
1061 MOVQ DX, acc4
1062 // y[2:] * y[1]
1063 MOVQ (8*1)(x_ptr), t0
1064
1065 MOVQ (8*2)(x_ptr), AX
1066 MULQ t0
1067 ADDQ AX, acc3
1068 ADCQ $0, DX
1069 MOVQ DX, t1
1070
1071 MOVQ (8*3)(x_ptr), AX
1072 MULQ t0
1073 ADDQ t1, acc4
1074 ADCQ $0, DX
1075 ADDQ AX, acc4
1076 ADCQ $0, DX
1077 MOVQ DX, acc5
1078 // y[3] * y[2]
1079 MOVQ (8*2)(x_ptr), t0
1080
1081 MOVQ (8*3)(x_ptr), AX
1082 MULQ t0
1083 ADDQ AX, acc5
1084 ADCQ $0, DX
1085 MOVQ DX, y_ptr
1086 XORQ t1, t1
1087 // *2
1088 ADDQ acc1, acc1
1089 ADCQ acc2, acc2
1090 ADCQ acc3, acc3
1091 ADCQ acc4, acc4
1092 ADCQ acc5, acc5
1093 ADCQ y_ptr, y_ptr
1094 ADCQ $0, t1
1095 // Missing products
1096 MOVQ (8*0)(x_ptr), AX
1097 MULQ AX
1098 MOVQ AX, acc0
1099 MOVQ DX, t0
1100
1101 MOVQ (8*1)(x_ptr), AX
1102 MULQ AX
1103 ADDQ t0, acc1
1104 ADCQ AX, acc2
1105 ADCQ $0, DX
1106 MOVQ DX, t0
1107
1108 MOVQ (8*2)(x_ptr), AX
1109 MULQ AX
1110 ADDQ t0, acc3
1111 ADCQ AX, acc4
1112 ADCQ $0, DX
1113 MOVQ DX, t0
1114
1115 MOVQ (8*3)(x_ptr), AX
1116 MULQ AX
1117 ADDQ t0, acc5
1118 ADCQ AX, y_ptr
1119 ADCQ DX, t1
1120 MOVQ t1, x_ptr
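// The reduction steps below use the same Montgomery factor m = (low limb) *
// p256ordK0 as p256OrdMul, but exploit the top limbs of the order:
// n[2] = 2^64 - 1 and n[3] = 2^64 - 2^32, so m*n[2] = m*2^64 - m and
// m*n[3] = m*2^64 - m*2^32. After the two MULQs for n[0] and n[1], each step
// therefore subtracts m and a 32-bit-shifted copy of m instead of multiplying
// again.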
1121 // First reduction step
1122 MOVQ acc0, AX
1123 MULQ p256ordK0<>(SB)
1124 MOVQ AX, t0
1125
1126 MOVQ p256ord<>+0x00(SB), AX
1127 MULQ t0
1128 ADDQ AX, acc0
1129 ADCQ $0, DX
1130 MOVQ DX, t1
1131
1132 MOVQ p256ord<>+0x08(SB), AX
1133 MULQ t0
1134 ADDQ t1, acc1
1135 ADCQ $0, DX
1136 ADDQ AX, acc1
1137
1138 MOVQ t0, t1
1139 ADCQ DX, acc2
1140 ADCQ $0, t1
1141 SUBQ t0, acc2
1142 SBBQ $0, t1
1143
1144 MOVQ t0, AX
1145 MOVQ t0, DX
1146 MOVQ t0, acc0
1147 SHLQ $32, AX
1148 SHRQ $32, DX
1149
1150 ADDQ t1, acc3
1151 ADCQ $0, acc0
1152 SUBQ AX, acc3
1153 SBBQ DX, acc0
1154 // Second reduction step
1155 MOVQ acc1, AX
1156 MULQ p256ordK0<>(SB)
1157 MOVQ AX, t0
1158
1159 MOVQ p256ord<>+0x00(SB), AX
1160 MULQ t0
1161 ADDQ AX, acc1
1162 ADCQ $0, DX
1163 MOVQ DX, t1
1164
1165 MOVQ p256ord<>+0x08(SB), AX
1166 MULQ t0
1167 ADDQ t1, acc2
1168 ADCQ $0, DX
1169 ADDQ AX, acc2
1170
1171 MOVQ t0, t1
1172 ADCQ DX, acc3
1173 ADCQ $0, t1
1174 SUBQ t0, acc3
1175 SBBQ $0, t1
1176
1177 MOVQ t0, AX
1178 MOVQ t0, DX
1179 MOVQ t0, acc1
1180 SHLQ $32, AX
1181 SHRQ $32, DX
1182
1183 ADDQ t1, acc0
1184 ADCQ $0, acc1
1185 SUBQ AX, acc0
1186 SBBQ DX, acc1
1187 // Third reduction step
1188 MOVQ acc2, AX
1189 MULQ p256ordK0<>(SB)
1190 MOVQ AX, t0
1191
1192 MOVQ p256ord<>+0x00(SB), AX
1193 MULQ t0
1194 ADDQ AX, acc2
1195 ADCQ $0, DX
1196 MOVQ DX, t1
1197
1198 MOVQ p256ord<>+0x08(SB), AX
1199 MULQ t0
1200 ADDQ t1, acc3
1201 ADCQ $0, DX
1202 ADDQ AX, acc3
1203
1204 MOVQ t0, t1
1205 ADCQ DX, acc0
1206 ADCQ $0, t1
1207 SUBQ t0, acc0
1208 SBBQ $0, t1
1209
1210 MOVQ t0, AX
1211 MOVQ t0, DX
1212 MOVQ t0, acc2
1213 SHLQ $32, AX
1214 SHRQ $32, DX
1215
1216 ADDQ t1, acc1
1217 ADCQ $0, acc2
1218 SUBQ AX, acc1
1219 SBBQ DX, acc2
1220 // Last reduction step
1221 MOVQ acc3, AX
1222 MULQ p256ordK0<>(SB)
1223 MOVQ AX, t0
1224
1225 MOVQ p256ord<>+0x00(SB), AX
1226 MULQ t0
1227 ADDQ AX, acc3
1228 ADCQ $0, DX
1229 MOVQ DX, t1
1230
1231 MOVQ p256ord<>+0x08(SB), AX
1232 MULQ t0
1233 ADDQ t1, acc0
1234 ADCQ $0, DX
1235 ADDQ AX, acc0
1236 ADCQ $0, DX
1237 MOVQ DX, t1
1238
1239 MOVQ t0, t1
1240 ADCQ DX, acc1
1241 ADCQ $0, t1
1242 SUBQ t0, acc1
1243 SBBQ $0, t1
1244
1245 MOVQ t0, AX
1246 MOVQ t0, DX
1247 MOVQ t0, acc3
1248 SHLQ $32, AX
1249 SHRQ $32, DX
1250
1251 ADDQ t1, acc2
1252 ADCQ $0, acc3
1253 SUBQ AX, acc2
1254 SBBQ DX, acc3
1255 XORQ t0, t0
1256 // Add bits [511:256] of the sqr result
1257 ADCQ acc4, acc0
1258 ADCQ acc5, acc1
1259 ADCQ y_ptr, acc2
1260 ADCQ x_ptr, acc3
1261 ADCQ $0, t0
1262
1263 MOVQ acc0, acc4
1264 MOVQ acc1, acc5
1265 MOVQ acc2, y_ptr
1266 MOVQ acc3, t1
1267 // Subtract p256
1268 SUBQ p256ord<>+0x00(SB), acc0
1269 SBBQ p256ord<>+0x08(SB), acc1
1270 SBBQ p256ord<>+0x10(SB), acc2
1271 SBBQ p256ord<>+0x18(SB), acc3
1272 SBBQ $0, t0
1273
1274 CMOVQCS acc4, acc0
1275 CMOVQCS acc5, acc1
1276 CMOVQCS y_ptr, acc2
1277 CMOVQCS t1, acc3
1278
1279 MOVQ acc0, (8*0)(res_ptr)
1280 MOVQ acc1, (8*1)(res_ptr)
1281 MOVQ acc2, (8*2)(res_ptr)
1282 MOVQ acc3, (8*3)(res_ptr)
1283 MOVQ res_ptr, x_ptr
1284 DECQ BX
1285 JNE ordSqrLoop
1286
1287 RET
1288 /* ---------------------------------------*/
1289 #undef res_ptr
1290 #undef x_ptr
1291 #undef y_ptr
1292
1293 #undef acc0
1294 #undef acc1
1295 #undef acc2
1296 #undef acc3
1297 #undef acc4
1298 #undef acc5
1299 #undef t0
1300 #undef t1
1301 /* ---------------------------------------*/
1302 #define mul0 AX
1303 #define mul1 DX
1304 #define acc0 BX
1305 #define acc1 CX
1306 #define acc2 R8
1307 #define acc3 R9
1308 #define acc4 R10
1309 #define acc5 R11
1310 #define acc6 R12
1311 #define acc7 R13
1312 #define t0 R14
1313 #define t1 R15
1314 #define t2 DI
1315 #define t3 SI
1316 #define hlp BP
1317 /* ---------------------------------------*/
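// p256SubInternal computes [acc4..acc7] = [acc4..acc7] - [t0..t3] mod p by
// subtracting, unconditionally adding p back, and using the borrow bit to pick
// which of the two results to keep. A minimal Go sketch of the same idea
// (assuming 4-limb little-endian arrays, not this file's register convention):
//
//	func subModP(a, b, p [4]uint64) (r [4]uint64) {
//		var borrow, carry uint64
//		for i := 0; i < 4; i++ {
//			r[i], borrow = bits.Sub64(a[i], b[i], borrow)
//		}
//		var plusP [4]uint64
//		for i := 0; i < 4; i++ {
//			plusP[i], carry = bits.Add64(r[i], p[i], carry)
//		}
//		if borrow == 1 { // a real implementation selects with masks, not a branch
//			r = plusP
//		}
//		return r
//	}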
1318 TEXT p256SubInternal(SB),NOSPLIT,$0
1319 XORQ mul0, mul0
1320 SUBQ t0, acc4
1321 SBBQ t1, acc5
1322 SBBQ t2, acc6
1323 SBBQ t3, acc7
1324 SBBQ $0, mul0
1325
1326 MOVQ acc4, acc0
1327 MOVQ acc5, acc1
1328 MOVQ acc6, acc2
1329 MOVQ acc7, acc3
1330
1331 ADDQ $-1, acc4
1332 ADCQ p256const0<>(SB), acc5
1333 ADCQ $0, acc6
1334 ADCQ p256const1<>(SB), acc7
1335 ANDQ $1, mul0
1336
1337 CMOVQEQ acc0, acc4
1338 CMOVQEQ acc1, acc5
1339 CMOVQEQ acc2, acc6
1340 CMOVQEQ acc3, acc7
1341
1342 RET
1343 /* ---------------------------------------*/
1344 TEXT p256MulInternal(SB),NOSPLIT,$8
1345 MOVQ acc4, mul0
1346 MULQ t0
1347 MOVQ mul0, acc0
1348 MOVQ mul1, acc1
1349
1350 MOVQ acc4, mul0
1351 MULQ t1
1352 ADDQ mul0, acc1
1353 ADCQ $0, mul1
1354 MOVQ mul1, acc2
1355
1356 MOVQ acc4, mul0
1357 MULQ t2
1358 ADDQ mul0, acc2
1359 ADCQ $0, mul1
1360 MOVQ mul1, acc3
1361
1362 MOVQ acc4, mul0
1363 MULQ t3
1364 ADDQ mul0, acc3
1365 ADCQ $0, mul1
1366 MOVQ mul1, acc4
1367
1368 MOVQ acc5, mul0
1369 MULQ t0
1370 ADDQ mul0, acc1
1371 ADCQ $0, mul1
1372 MOVQ mul1, hlp
1373
1374 MOVQ acc5, mul0
1375 MULQ t1
1376 ADDQ hlp, acc2
1377 ADCQ $0, mul1
1378 ADDQ mul0, acc2
1379 ADCQ $0, mul1
1380 MOVQ mul1, hlp
1381
1382 MOVQ acc5, mul0
1383 MULQ t2
1384 ADDQ hlp, acc3
1385 ADCQ $0, mul1
1386 ADDQ mul0, acc3
1387 ADCQ $0, mul1
1388 MOVQ mul1, hlp
1389
1390 MOVQ acc5, mul0
1391 MULQ t3
1392 ADDQ hlp, acc4
1393 ADCQ $0, mul1
1394 ADDQ mul0, acc4
1395 ADCQ $0, mul1
1396 MOVQ mul1, acc5
1397
1398 MOVQ acc6, mul0
1399 MULQ t0
1400 ADDQ mul0, acc2
1401 ADCQ $0, mul1
1402 MOVQ mul1, hlp
1403
1404 MOVQ acc6, mul0
1405 MULQ t1
1406 ADDQ hlp, acc3
1407 ADCQ $0, mul1
1408 ADDQ mul0, acc3
1409 ADCQ $0, mul1
1410 MOVQ mul1, hlp
1411
1412 MOVQ acc6, mul0
1413 MULQ t2
1414 ADDQ hlp, acc4
1415 ADCQ $0, mul1
1416 ADDQ mul0, acc4
1417 ADCQ $0, mul1
1418 MOVQ mul1, hlp
1419
1420 MOVQ acc6, mul0
1421 MULQ t3
1422 ADDQ hlp, acc5
1423 ADCQ $0, mul1
1424 ADDQ mul0, acc5
1425 ADCQ $0, mul1
1426 MOVQ mul1, acc6
1427
1428 MOVQ acc7, mul0
1429 MULQ t0
1430 ADDQ mul0, acc3
1431 ADCQ $0, mul1
1432 MOVQ mul1, hlp
1433
1434 MOVQ acc7, mul0
1435 MULQ t1
1436 ADDQ hlp, acc4
1437 ADCQ $0, mul1
1438 ADDQ mul0, acc4
1439 ADCQ $0, mul1
1440 MOVQ mul1, hlp
1441
1442 MOVQ acc7, mul0
1443 MULQ t2
1444 ADDQ hlp, acc5
1445 ADCQ $0, mul1
1446 ADDQ mul0, acc5
1447 ADCQ $0, mul1
1448 MOVQ mul1, hlp
1449
1450 MOVQ acc7, mul0
1451 MULQ t3
1452 ADDQ hlp, acc6
1453 ADCQ $0, mul1
1454 ADDQ mul0, acc6
1455 ADCQ $0, mul1
1456 MOVQ mul1, acc7
1457 // First reduction step
1458 MOVQ acc0, mul0
1459 MOVQ acc0, hlp
1460 SHLQ $32, acc0
1461 MULQ p256const1<>(SB)
1462 SHRQ $32, hlp
1463 ADDQ acc0, acc1
1464 ADCQ hlp, acc2
1465 ADCQ mul0, acc3
1466 ADCQ $0, mul1
1467 MOVQ mul1, acc0
1468 // Second reduction step
1469 MOVQ acc1, mul0
1470 MOVQ acc1, hlp
1471 SHLQ $32, acc1
1472 MULQ p256const1<>(SB)
1473 SHRQ $32, hlp
1474 ADDQ acc1, acc2
1475 ADCQ hlp, acc3
1476 ADCQ mul0, acc0
1477 ADCQ $0, mul1
1478 MOVQ mul1, acc1
1479 // Third reduction step
1480 MOVQ acc2, mul0
1481 MOVQ acc2, hlp
1482 SHLQ $32, acc2
1483 MULQ p256const1<>(SB)
1484 SHRQ $32, hlp
1485 ADDQ acc2, acc3
1486 ADCQ hlp, acc0
1487 ADCQ mul0, acc1
1488 ADCQ $0, mul1
1489 MOVQ mul1, acc2
1490 // Last reduction step
1491 MOVQ acc3, mul0
1492 MOVQ acc3, hlp
1493 SHLQ $32, acc3
1494 MULQ p256const1<>(SB)
1495 SHRQ $32, hlp
1496 ADDQ acc3, acc0
1497 ADCQ hlp, acc1
1498 ADCQ mul0, acc2
1499 ADCQ $0, mul1
1500 MOVQ mul1, acc3
1501 MOVQ $0, BP
1502 // Add bits [511:256] of the result
1503 ADCQ acc0, acc4
1504 ADCQ acc1, acc5
1505 ADCQ acc2, acc6
1506 ADCQ acc3, acc7
1507 ADCQ $0, hlp
1508 // Copy result
1509 MOVQ acc4, acc0
1510 MOVQ acc5, acc1
1511 MOVQ acc6, acc2
1512 MOVQ acc7, acc3
1513 // Subtract p256
1514 SUBQ $-1, acc4
1515 SBBQ p256const0<>(SB), acc5
1516 SBBQ $0, acc6
1517 SBBQ p256const1<>(SB), acc7
1518 SBBQ $0, hlp
1519 // If the result of the subtraction is negative, restore the previous result
1520 CMOVQCS acc0, acc4
1521 CMOVQCS acc1, acc5
1522 CMOVQCS acc2, acc6
1523 CMOVQCS acc3, acc7
1524
1525 RET
1526 /* ---------------------------------------*/
1527 TEXT p256SqrInternal(SB),NOSPLIT,$8
1528
1529 MOVQ acc4, mul0
1530 MULQ acc5
1531 MOVQ mul0, acc1
1532 MOVQ mul1, acc2
1533
1534 MOVQ acc4, mul0
1535 MULQ acc6
1536 ADDQ mul0, acc2
1537 ADCQ $0, mul1
1538 MOVQ mul1, acc3
1539
1540 MOVQ acc4, mul0
1541 MULQ acc7
1542 ADDQ mul0, acc3
1543 ADCQ $0, mul1
1544 MOVQ mul1, t0
1545
1546 MOVQ acc5, mul0
1547 MULQ acc6
1548 ADDQ mul0, acc3
1549 ADCQ $0, mul1
1550 MOVQ mul1, hlp
1551
1552 MOVQ acc5, mul0
1553 MULQ acc7
1554 ADDQ hlp, t0
1555 ADCQ $0, mul1
1556 ADDQ mul0, t0
1557 ADCQ $0, mul1
1558 MOVQ mul1, t1
1559
1560 MOVQ acc6, mul0
1561 MULQ acc7
1562 ADDQ mul0, t1
1563 ADCQ $0, mul1
1564 MOVQ mul1, t2
1565 XORQ t3, t3
1566 // *2
1567 ADDQ acc1, acc1
1568 ADCQ acc2, acc2
1569 ADCQ acc3, acc3
1570 ADCQ t0, t0
1571 ADCQ t1, t1
1572 ADCQ t2, t2
1573 ADCQ $0, t3
1574 // Missing products
1575 MOVQ acc4, mul0
1576 MULQ mul0
1577 MOVQ mul0, acc0
1578 MOVQ DX, acc4
1579
1580 MOVQ acc5, mul0
1581 MULQ mul0
1582 ADDQ acc4, acc1
1583 ADCQ mul0, acc2
1584 ADCQ $0, DX
1585 MOVQ DX, acc4
1586
1587 MOVQ acc6, mul0
1588 MULQ mul0
1589 ADDQ acc4, acc3
1590 ADCQ mul0, t0
1591 ADCQ $0, DX
1592 MOVQ DX, acc4
1593
1594 MOVQ acc7, mul0
1595 MULQ mul0
1596 ADDQ acc4, t1
1597 ADCQ mul0, t2
1598 ADCQ DX, t3
1599 // First reduction step
1600 MOVQ acc0, mul0
1601 MOVQ acc0, hlp
1602 SHLQ $32, acc0
1603 MULQ p256const1<>(SB)
1604 SHRQ $32, hlp
1605 ADDQ acc0, acc1
1606 ADCQ hlp, acc2
1607 ADCQ mul0, acc3
1608 ADCQ $0, mul1
1609 MOVQ mul1, acc0
1610 // Second reduction step
1611 MOVQ acc1, mul0
1612 MOVQ acc1, hlp
1613 SHLQ $32, acc1
1614 MULQ p256const1<>(SB)
1615 SHRQ $32, hlp
1616 ADDQ acc1, acc2
1617 ADCQ hlp, acc3
1618 ADCQ mul0, acc0
1619 ADCQ $0, mul1
1620 MOVQ mul1, acc1
1621 // Third reduction step
1622 MOVQ acc2, mul0
1623 MOVQ acc2, hlp
1624 SHLQ $32, acc2
1625 MULQ p256const1<>(SB)
1626 SHRQ $32, hlp
1627 ADDQ acc2, acc3
1628 ADCQ hlp, acc0
1629 ADCQ mul0, acc1
1630 ADCQ $0, mul1
1631 MOVQ mul1, acc2
1632 // Last reduction step
1633 MOVQ acc3, mul0
1634 MOVQ acc3, hlp
1635 SHLQ $32, acc3
1636 MULQ p256const1<>(SB)
1637 SHRQ $32, hlp
1638 ADDQ acc3, acc0
1639 ADCQ hlp, acc1
1640 ADCQ mul0, acc2
1641 ADCQ $0, mul1
1642 MOVQ mul1, acc3
1643 MOVQ $0, BP
1644 // Add bits [511:256] of the result
1645 ADCQ acc0, t0
1646 ADCQ acc1, t1
1647 ADCQ acc2, t2
1648 ADCQ acc3, t3
1649 ADCQ $0, hlp
1650 // Copy result
1651 MOVQ t0, acc4
1652 MOVQ t1, acc5
1653 MOVQ t2, acc6
1654 MOVQ t3, acc7
1655 // Subtract p256
1656 SUBQ $-1, acc4
1657 SBBQ p256const0<>(SB), acc5
1658 SBBQ $0, acc6
1659 SBBQ p256const1<>(SB), acc7
1660 SBBQ $0, hlp
1661 // If the result of the subtraction is negative, restore the previous result
1662 CMOVQCS t0, acc4
1663 CMOVQCS t1, acc5
1664 CMOVQCS t2, acc6
1665 CMOVQCS t3, acc7
1666
1667 RET
1668 /* ---------------------------------------*/
1669 #define p256MulBy2Inline\
1670 XORQ mul0, mul0;\
1671 ADDQ acc4, acc4;\
1672 ADCQ acc5, acc5;\
1673 ADCQ acc6, acc6;\
1674 ADCQ acc7, acc7;\
1675 ADCQ $0, mul0;\
1676 MOVQ acc4, t0;\
1677 MOVQ acc5, t1;\
1678 MOVQ acc6, t2;\
1679 MOVQ acc7, t3;\
1680 SUBQ $-1, t0;\
1681 SBBQ p256const0<>(SB), t1;\
1682 SBBQ $0, t2;\
1683 SBBQ p256const1<>(SB), t3;\
1684 SBBQ $0, mul0;\
1685 CMOVQCS acc4, t0;\
1686 CMOVQCS acc5, t1;\
1687 CMOVQCS acc6, t2;\
1688 CMOVQCS acc7, t3;
1689 /* ---------------------------------------*/
1690 #define p256AddInline \
1691 XORQ mul0, mul0;\
1692 ADDQ t0, acc4;\
1693 ADCQ t1, acc5;\
1694 ADCQ t2, acc6;\
1695 ADCQ t3, acc7;\
1696 ADCQ $0, mul0;\
1697 MOVQ acc4, t0;\
1698 MOVQ acc5, t1;\
1699 MOVQ acc6, t2;\
1700 MOVQ acc7, t3;\
1701 SUBQ $-1, t0;\
1702 SBBQ p256const0<>(SB), t1;\
1703 SBBQ $0, t2;\
1704 SBBQ p256const1<>(SB), t3;\
1705 SBBQ $0, mul0;\
1706 CMOVQCS acc4, t0;\
1707 CMOVQCS acc5, t1;\
1708 CMOVQCS acc6, t2;\
1709 CMOVQCS acc7, t3;
1710 /* ---------------------------------------*/
1711 #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
1712 #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
1713 #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
1714 #define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
1715 #define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
1716 #define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
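// The internal routines above (p256SubInternal, p256MulInternal,
// p256SqrInternal) use a register-based calling convention: one operand and
// the result live in acc4-acc7, the other operand in t0-t3. The p256AddInline
// and p256MulBy2Inline macros instead leave their reduced result in t0-t3,
// which is why callers follow them with STt or t2acc when the value is needed
// elsewhere.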
1717 /* ---------------------------------------*/
1718 #define x1in(off) (32*0 + off)(SP)
1719 #define y1in(off) (32*1 + off)(SP)
1720 #define z1in(off) (32*2 + off)(SP)
1721 #define x2in(off) (32*3 + off)(SP)
1722 #define y2in(off) (32*4 + off)(SP)
1723 #define xout(off) (32*5 + off)(SP)
1724 #define yout(off) (32*6 + off)(SP)
1725 #define zout(off) (32*7 + off)(SP)
1726 #define s2(off) (32*8 + off)(SP)
1727 #define z1sqr(off) (32*9 + off)(SP)
1728 #define h(off) (32*10 + off)(SP)
1729 #define r(off) (32*11 + off)(SP)
1730 #define hsqr(off) (32*12 + off)(SP)
1731 #define rsqr(off) (32*13 + off)(SP)
1732 #define hcub(off) (32*14 + off)(SP)
1733 #define rptr (32*15)(SP)
1734 #define sel_save (32*15 + 8)(SP)
1735 #define zero_save (32*15 + 8 + 4)(SP)
1736
1737 // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
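// As read from the conditional blends at the end of this function: if
// sel == 0 the computed sum is discarded and in1 is returned unchanged, and if
// zero == 0 the result is instead replaced by in2 lifted to Jacobian
// coordinates with Z = p256one (1 in the Montgomery domain). sign selects
// whether in2.y is negated before the addition, i.e. res = in1 - in2 when
// sign != 0.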
1738 TEXT ·p256PointAddAffineAsm(SB),0,$512-48
1739 // Move input to stack in order to free registers
1740 MOVQ res+0(FP), AX
1741 MOVQ in1+8(FP), BX
1742 MOVQ in2+16(FP), CX
1743 MOVQ sign+24(FP), DX
1744 MOVQ sel+32(FP), t1
1745 MOVQ zero+40(FP), t2
1746
1747 MOVOU (16*0)(BX), X0
1748 MOVOU (16*1)(BX), X1
1749 MOVOU (16*2)(BX), X2
1750 MOVOU (16*3)(BX), X3
1751 MOVOU (16*4)(BX), X4
1752 MOVOU (16*5)(BX), X5
1753
1754 MOVOU X0, x1in(16*0)
1755 MOVOU X1, x1in(16*1)
1756 MOVOU X2, y1in(16*0)
1757 MOVOU X3, y1in(16*1)
1758 MOVOU X4, z1in(16*0)
1759 MOVOU X5, z1in(16*1)
1760
1761 MOVOU (16*0)(CX), X0
1762 MOVOU (16*1)(CX), X1
1763
1764 MOVOU X0, x2in(16*0)
1765 MOVOU X1, x2in(16*1)
1766 // Store pointer to result (res+0(FP) is still in AX, which mul0 aliases)
1767 MOVQ mul0, rptr
1768 MOVL t1, sel_save
1769 MOVL t2, zero_save
1770 // Negate y2in based on sign
1771 MOVQ (16*2 + 8*0)(CX), acc4
1772 MOVQ (16*2 + 8*1)(CX), acc5
1773 MOVQ (16*2 + 8*2)(CX), acc6
1774 MOVQ (16*2 + 8*3)(CX), acc7
1775 MOVQ $-1, acc0
1776 MOVQ p256const0<>(SB), acc1
1777 MOVQ $0, acc2
1778 MOVQ p256const1<>(SB), acc3
1779 XORQ mul0, mul0
1780 // Speculatively subtract
1781 SUBQ acc4, acc0
1782 SBBQ acc5, acc1
1783 SBBQ acc6, acc2
1784 SBBQ acc7, acc3
1785 SBBQ $0, mul0
1786 MOVQ acc0, t0
1787 MOVQ acc1, t1
1788 MOVQ acc2, t2
1789 MOVQ acc3, t3
1790 // Add in case the operand was > p256
1791 ADDQ $-1, acc0
1792 ADCQ p256const0<>(SB), acc1
1793 ADCQ $0, acc2
1794 ADCQ p256const1<>(SB), acc3
1795 ADCQ $0, mul0
1796 CMOVQNE t0, acc0
1797 CMOVQNE t1, acc1
1798 CMOVQNE t2, acc2
1799 CMOVQNE t3, acc3
1800 // If condition is 0, keep original value
1801 TESTQ DX, DX
1802 CMOVQEQ acc4, acc0
1803 CMOVQEQ acc5, acc1
1804 CMOVQEQ acc6, acc2
1805 CMOVQEQ acc7, acc3
1806 // Store result
1807 MOVQ acc0, y2in(8*0)
1808 MOVQ acc1, y2in(8*1)
1809 MOVQ acc2, y2in(8*2)
1810 MOVQ acc3, y2in(8*3)
1811 // Begin point add
1812 LDacc (z1in)
1813 CALL p256SqrInternal(SB) // z1ˆ2
1814 ST (z1sqr)
1815
1816 LDt (x2in)
1817 CALL p256MulInternal(SB) // x2 * z1ˆ2
1818
1819 LDt (x1in)
1820 CALL p256SubInternal(SB) // h = u2 - u1
1821 ST (h)
1822
1823 LDt (z1in)
1824 CALL p256MulInternal(SB) // z3 = h * z1
1825 ST (zout)
1826
1827 LDacc (z1sqr)
1828 CALL p256MulInternal(SB) // z1ˆ3
1829
1830 LDt (y2in)
1831 CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3
1832 ST (s2)
1833
1834 LDt (y1in)
1835 CALL p256SubInternal(SB) // r = s2 - s1
1836 ST (r)
1837
1838 CALL p256SqrInternal(SB) // rsqr = rˆ2
1839 ST (rsqr)
1840
1841 LDacc (h)
1842 CALL p256SqrInternal(SB) // hsqr = hˆ2
1843 ST (hsqr)
1844
1845 LDt (h)
1846 CALL p256MulInternal(SB) // hcub = hˆ3
1847 ST (hcub)
1848
1849 LDt (y1in)
1850 CALL p256MulInternal(SB) // y1 * hˆ3
1851 ST (s2)
1852
1853 LDacc (x1in)
1854 LDt (hsqr)
1855 CALL p256MulInternal(SB) // u1 * hˆ2
1856 ST (h)
1857
1858 p256MulBy2Inline // u1 * hˆ2 * 2, inline
1859 LDacc (rsqr)
1860 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
1861
1862 LDt (hcub)
1863 CALL p256SubInternal(SB)
1864 ST (xout)
1865
1866 MOVQ acc4, t0
1867 MOVQ acc5, t1
1868 MOVQ acc6, t2
1869 MOVQ acc7, t3
1870 LDacc (h)
1871 CALL p256SubInternal(SB)
1872
1873 LDt (r)
1874 CALL p256MulInternal(SB)
1875
1876 LDt (s2)
1877 CALL p256SubInternal(SB)
1878 ST (yout)
1879 // Load stored values from stack
1880 MOVQ rptr, AX
1881 MOVL sel_save, BX
1882 MOVL zero_save, CX
1883 // The result is not valid if (sel == 0); conditionally choose in1 instead
1884 MOVOU xout(16*0), X0
1885 MOVOU xout(16*1), X1
1886 MOVOU yout(16*0), X2
1887 MOVOU yout(16*1), X3
1888 MOVOU zout(16*0), X4
1889 MOVOU zout(16*1), X5
1890
1891 MOVL BX, X6
1892 MOVL CX, X7
1893
1894 PXOR X8, X8
1895 PCMPEQL X9, X9
1896
1897 PSHUFD $0, X6, X6
1898 PSHUFD $0, X7, X7
1899
1900 PCMPEQL X8, X6
1901 PCMPEQL X8, X7
1902
1903 MOVOU X6, X15
1904 PANDN X9, X15
1905
1906 MOVOU x1in(16*0), X9
1907 MOVOU x1in(16*1), X10
1908 MOVOU y1in(16*0), X11
1909 MOVOU y1in(16*1), X12
1910 MOVOU z1in(16*0), X13
1911 MOVOU z1in(16*1), X14
1912
1913 PAND X15, X0
1914 PAND X15, X1
1915 PAND X15, X2
1916 PAND X15, X3
1917 PAND X15, X4
1918 PAND X15, X5
1919
1920 PAND X6, X9
1921 PAND X6, X10
1922 PAND X6, X11
1923 PAND X6, X12
1924 PAND X6, X13
1925 PAND X6, X14
1926
1927 PXOR X9, X0
1928 PXOR X10, X1
1929 PXOR X11, X2
1930 PXOR X12, X3
1931 PXOR X13, X4
1932 PXOR X14, X5
1933 // Similarly if zero == 0
1934 PCMPEQL X9, X9
1935 MOVOU X7, X15
1936 PANDN X9, X15
1937
1938 MOVOU x2in(16*0), X9
1939 MOVOU x2in(16*1), X10
1940 MOVOU y2in(16*0), X11
1941 MOVOU y2in(16*1), X12
1942 MOVOU p256one<>+0x00(SB), X13
1943 MOVOU p256one<>+0x10(SB), X14
1944
1945 PAND X15, X0
1946 PAND X15, X1
1947 PAND X15, X2
1948 PAND X15, X3
1949 PAND X15, X4
1950 PAND X15, X5
1951
1952 PAND X7, X9
1953 PAND X7, X10
1954 PAND X7, X11
1955 PAND X7, X12
1956 PAND X7, X13
1957 PAND X7, X14
1958
1959 PXOR X9, X0
1960 PXOR X10, X1
1961 PXOR X11, X2
1962 PXOR X12, X3
1963 PXOR X13, X4
1964 PXOR X14, X5
1965 // Finally output the result
1966 MOVOU X0, (16*0)(AX)
1967 MOVOU X1, (16*1)(AX)
1968 MOVOU X2, (16*2)(AX)
1969 MOVOU X3, (16*3)(AX)
1970 MOVOU X4, (16*4)(AX)
1971 MOVOU X5, (16*5)(AX)
1972 MOVQ $0, rptr
1973
1974 RET
1975 #undef x1in
1976 #undef y1in
1977 #undef z1in
1978 #undef x2in
1979 #undef y2in
1980 #undef xout
1981 #undef yout
1982 #undef zout
1983 #undef s2
1984 #undef z1sqr
1985 #undef h
1986 #undef r
1987 #undef hsqr
1988 #undef rsqr
1989 #undef hcub
1990 #undef rptr
1991 #undef sel_save
1992 #undef zero_save
1993
1994 // p256IsZero returns 1 in AX if [acc4..acc7] represents zero, and 0
1995 // otherwise. It clobbers [acc4..acc7], t0 and t1.
1996 TEXT p256IsZero(SB),NOSPLIT,$0
1997 // AX contains a flag that is set if the input is zero.
1998 XORQ AX, AX
1999 MOVQ $1, t1
2000
2001 // Check whether [acc4..acc7] are all zero.
2002 MOVQ acc4, t0
2003 ORQ acc5, t0
2004 ORQ acc6, t0
2005 ORQ acc7, t0
2006
2007 // If so, set the flag in AX. (CMOV of a constant to a register doesn't
2008 // appear to be supported in Go assembly, hence t1 was preloaded with 1.)
2009 CMOVQEQ t1, AX
2010
2011 // XOR [acc4..acc7] with P and compare with zero again: p itself also represents 0 mod p. (Limb 2 of p is zero, so acc6 needs no XOR.)
2012 XORQ $-1, acc4
2013 XORQ p256const0<>(SB), acc5
2014 XORQ p256const1<>(SB), acc7
2015 ORQ acc5, acc4
2016 ORQ acc6, acc4
2017 ORQ acc7, acc4
2018
2019 // If so, set the flag in AX.
2020 CMOVQEQ t1, AX
2021 RET
2022
2023 /* ---------------------------------------*/
2024 #define x1in(off) (32*0 + off)(SP)
2025 #define y1in(off) (32*1 + off)(SP)
2026 #define z1in(off) (32*2 + off)(SP)
2027 #define x2in(off) (32*3 + off)(SP)
2028 #define y2in(off) (32*4 + off)(SP)
2029 #define z2in(off) (32*5 + off)(SP)
2030
2031 #define xout(off) (32*6 + off)(SP)
2032 #define yout(off) (32*7 + off)(SP)
2033 #define zout(off) (32*8 + off)(SP)
2034
2035 #define u1(off) (32*9 + off)(SP)
2036 #define u2(off) (32*10 + off)(SP)
2037 #define s1(off) (32*11 + off)(SP)
2038 #define s2(off) (32*12 + off)(SP)
2039 #define z1sqr(off) (32*13 + off)(SP)
2040 #define z2sqr(off) (32*14 + off)(SP)
2041 #define h(off) (32*15 + off)(SP)
2042 #define r(off) (32*16 + off)(SP)
2043 #define hsqr(off) (32*17 + off)(SP)
2044 #define rsqr(off) (32*18 + off)(SP)
2045 #define hcub(off) (32*19 + off)(SP)
2046 #define rptr (32*20)(SP)
2047 #define points_eq (32*20+8)(SP)
2048
2049 // func p256PointAddAsm(res, in1, in2 *P256Point) int
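// The return value is 1 when r == 0 and h == 0 below, i.e. when the two
// (valid, non-infinity) inputs represent the same point, in which case the
// addition formula degenerates and the caller is expected to fall back to
// point doubling; otherwise it is 0.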
2050 TEXT ·p256PointAddAsm(SB),0,$680-32
2051 // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
2052 // Move input to stack in order to free registers
2053 MOVQ res+0(FP), AX
2054 MOVQ in1+8(FP), BX
2055 MOVQ in2+16(FP), CX
2056
2057 MOVOU (16*0)(BX), X0
2058 MOVOU (16*1)(BX), X1
2059 MOVOU (16*2)(BX), X2
2060 MOVOU (16*3)(BX), X3
2061 MOVOU (16*4)(BX), X4
2062 MOVOU (16*5)(BX), X5
2063
2064 MOVOU X0, x1in(16*0)
2065 MOVOU X1, x1in(16*1)
2066 MOVOU X2, y1in(16*0)
2067 MOVOU X3, y1in(16*1)
2068 MOVOU X4, z1in(16*0)
2069 MOVOU X5, z1in(16*1)
2070
2071 MOVOU (16*0)(CX), X0
2072 MOVOU (16*1)(CX), X1
2073 MOVOU (16*2)(CX), X2
2074 MOVOU (16*3)(CX), X3
2075 MOVOU (16*4)(CX), X4
2076 MOVOU (16*5)(CX), X5
2077
2078 MOVOU X0, x2in(16*0)
2079 MOVOU X1, x2in(16*1)
2080 MOVOU X2, y2in(16*0)
2081 MOVOU X3, y2in(16*1)
2082 MOVOU X4, z2in(16*0)
2083 MOVOU X5, z2in(16*1)
2084 // Store pointer to result
2085 MOVQ AX, rptr
2086 // Begin point add
2087 LDacc (z2in)
2088 CALL p256SqrInternal(SB) // z2ˆ2
2089 ST (z2sqr)
2090 LDt (z2in)
2091 CALL p256MulInternal(SB) // z2ˆ3
2092 LDt (y1in)
2093 CALL p256MulInternal(SB) // s1 = z2ˆ3*y1
2094 ST (s1)
2095
2096 LDacc (z1in)
2097 CALL p256SqrInternal(SB) // z1ˆ2
2098 ST (z1sqr)
2099 LDt (z1in)
2100 CALL p256MulInternal(SB) // z1ˆ3
2101 LDt (y2in)
2102 CALL p256MulInternal(SB) // s2 = z1ˆ3*y2
2103 ST (s2)
2104
2105 LDt (s1)
2106 CALL p256SubInternal(SB) // r = s2 - s1
2107 ST (r)
2108 CALL p256IsZero(SB)
2109 MOVQ AX, points_eq
2110
2111 LDacc (z2sqr)
2112 LDt (x1in)
2113 CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2
2114 ST (u1)
2115 LDacc (z1sqr)
2116 LDt (x2in)
2117 CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2
2118 ST (u2)
2119
2120 LDt (u1)
2121 CALL p256SubInternal(SB) // h = u2 - u1
2122 ST (h)
2123 CALL p256IsZero(SB)
2124 ANDQ points_eq, AX
2125 MOVQ AX, points_eq
2126
2127 LDacc (r)
2128 CALL p256SqrInternal(SB) // rsqr = rˆ2
2129 ST (rsqr)
2130
2131 LDacc (h)
2132 CALL p256SqrInternal(SB) // hsqr = hˆ2
2133 ST (hsqr)
2134
2135 LDt (h)
2136 CALL p256MulInternal(SB) // hcub = hˆ3
2137 ST (hcub)
2138
2139 LDt (s1)
2140 CALL p256MulInternal(SB)
2141 ST (s2)
2142
2143 LDacc (z1in)
2144 LDt (z2in)
2145 CALL p256MulInternal(SB) // z1 * z2
2146 LDt (h)
2147 CALL p256MulInternal(SB) // z1 * z2 * h
2148 ST (zout)
2149
2150 LDacc (hsqr)
2151 LDt (u1)
2152 CALL p256MulInternal(SB) // hˆ2 * u1
2153 ST (u2)
2154
2155 p256MulBy2Inline // u1 * hˆ2 * 2, inline
2156 LDacc (rsqr)
2157 CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
2158
2159 LDt (hcub)
2160 CALL p256SubInternal(SB)
2161 ST (xout)
2162
2163 MOVQ acc4, t0
2164 MOVQ acc5, t1
2165 MOVQ acc6, t2
2166 MOVQ acc7, t3
2167 LDacc (u2)
2168 CALL p256SubInternal(SB)
2169
2170 LDt (r)
2171 CALL p256MulInternal(SB)
2172
2173 LDt (s2)
2174 CALL p256SubInternal(SB)
2175 ST (yout)
2176
2177 MOVOU xout(16*0), X0
2178 MOVOU xout(16*1), X1
2179 MOVOU yout(16*0), X2
2180 MOVOU yout(16*1), X3
2181 MOVOU zout(16*0), X4
2182 MOVOU zout(16*1), X5
2183 // Finally output the result
2184 MOVQ rptr, AX
2185 MOVQ $0, rptr
2186 MOVOU X0, (16*0)(AX)
2187 MOVOU X1, (16*1)(AX)
2188 MOVOU X2, (16*2)(AX)
2189 MOVOU X3, (16*3)(AX)
2190 MOVOU X4, (16*4)(AX)
2191 MOVOU X5, (16*5)(AX)
2192
2193 MOVQ points_eq, AX
2194 MOVQ AX, ret+24(FP)
2195
2196 RET
2197 #undef x1in
2198 #undef y1in
2199 #undef z1in
2200 #undef x2in
2201 #undef y2in
2202 #undef z2in
2203 #undef xout
2204 #undef yout
2205 #undef zout
2206 #undef s1
2207 #undef s2
2208 #undef u1
2209 #undef u2
2210 #undef z1sqr
2211 #undef z2sqr
2212 #undef h
2213 #undef r
2214 #undef hsqr
2215 #undef rsqr
2216 #undef hcub
2217 #undef rptr
2218 /* ---------------------------------------*/
2219 #define x(off) (32*0 + off)(SP)
2220 #define y(off) (32*1 + off)(SP)
2221 #define z(off) (32*2 + off)(SP)
2222
2223 #define s(off) (32*3 + off)(SP)
2224 #define m(off) (32*4 + off)(SP)
2225 #define zsqr(off) (32*5 + off)(SP)
2226 #define tmp(off) (32*6 + off)(SP)
2227 #define rptr (32*7)(SP)
2228
2229 // func p256PointDoubleAsm(res, in *P256Point)
2230 TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
2231 // Move input to stack in order to free registers
2232 MOVQ res+0(FP), AX
2233 MOVQ in+8(FP), BX
2234
2235 MOVOU (16*0)(BX), X0
2236 MOVOU (16*1)(BX), X1
2237 MOVOU (16*2)(BX), X2
2238 MOVOU (16*3)(BX), X3
2239 MOVOU (16*4)(BX), X4
2240 MOVOU (16*5)(BX), X5
2241
2242 MOVOU X0, x(16*0)
2243 MOVOU X1, x(16*1)
2244 MOVOU X2, y(16*0)
2245 MOVOU X3, y(16*1)
2246 MOVOU X4, z(16*0)
2247 MOVOU X5, z(16*1)
2248 // Store pointer to result
2249 MOVQ AX, rptr
2250 // Begin point double
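// The sequence below is standard Jacobian doubling for a = -3, as read from
// the code: m = 3*(x - z^2)*(x + z^2), s = 4*x*y^2, x3 = m^2 - 2*s,
// y3 = m*(s - x3) - 8*y^4, z3 = 2*y*z.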
2251 LDacc (z)
2252 CALL p256SqrInternal(SB)
2253 ST (zsqr)
2254
2255 LDt (x)
2256 p256AddInline
2257 STt (m)
2258
2259 LDacc (z)
2260 LDt (y)
2261 CALL p256MulInternal(SB)
2262 p256MulBy2Inline
2263 MOVQ rptr, AX
2264 // Store z
2265 MOVQ t0, (16*4 + 8*0)(AX)
2266 MOVQ t1, (16*4 + 8*1)(AX)
2267 MOVQ t2, (16*4 + 8*2)(AX)
2268 MOVQ t3, (16*4 + 8*3)(AX)
2269
2270 LDacc (x)
2271 LDt (zsqr)
2272 CALL p256SubInternal(SB)
2273 LDt (m)
2274 CALL p256MulInternal(SB)
2275 ST (m)
2276 // Multiply by 3
2277 p256MulBy2Inline
2278 LDacc (m)
2279 p256AddInline
2280 STt (m)
2281 ////////////////////////
2282 LDacc (y)
2283 p256MulBy2Inline
2284 t2acc
2285 CALL p256SqrInternal(SB)
2286 ST (s)
2287 CALL p256SqrInternal(SB)
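// Halving mod p: if the value is odd, add p first (which keeps it the same
// mod p but makes it even), then shift the 256-bit value right by one using
// the four double-register SHRQs; mul0 carries the potential 257th bit from
// the addition.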
2288 // Divide by 2
2289 XORQ mul0, mul0
2290 MOVQ acc4, t0
2291 MOVQ acc5, t1
2292 MOVQ acc6, t2
2293 MOVQ acc7, t3
2294
2295 ADDQ $-1, acc4
2296 ADCQ p256const0<>(SB), acc5
2297 ADCQ $0, acc6
2298 ADCQ p256const1<>(SB), acc7
2299 ADCQ $0, mul0
2300 TESTQ $1, t0
2301
2302 CMOVQEQ t0, acc4
2303 CMOVQEQ t1, acc5
2304 CMOVQEQ t2, acc6
2305 CMOVQEQ t3, acc7
2306 ANDQ t0, mul0
2307
2308 SHRQ $1, acc5, acc4
2309 SHRQ $1, acc6, acc5
2310 SHRQ $1, acc7, acc6
2311 SHRQ $1, mul0, acc7
2312 ST (y)
2313 /////////////////////////
2314 LDacc (x)
2315 LDt (s)
2316 CALL p256MulInternal(SB)
2317 ST (s)
2318 p256MulBy2Inline
2319 STt (tmp)
2320
2321 LDacc (m)
2322 CALL p256SqrInternal(SB)
2323 LDt (tmp)
2324 CALL p256SubInternal(SB)
2325
2326 MOVQ rptr, AX
2327 // Store x
2328 MOVQ acc4, (16*0 + 8*0)(AX)
2329 MOVQ acc5, (16*0 + 8*1)(AX)
2330 MOVQ acc6, (16*0 + 8*2)(AX)
2331 MOVQ acc7, (16*0 + 8*3)(AX)
2332
2333 acc2t
2334 LDacc (s)
2335 CALL p256SubInternal(SB)
2336
2337 LDt (m)
2338 CALL p256MulInternal(SB)
2339
2340 LDt (y)
2341 CALL p256SubInternal(SB)
2342 MOVQ rptr, AX
2343 // Store y
2344 MOVQ acc4, (16*2 + 8*0)(AX)
2345 MOVQ acc5, (16*2 + 8*1)(AX)
2346 MOVQ acc6, (16*2 + 8*2)(AX)
2347 MOVQ acc7, (16*2 + 8*3)(AX)
2348 ///////////////////////
2349 MOVQ $0, rptr
2350
2351 RET
2352 /* ---------------------------------------*/
2353