1 // Code generated by command: go run p256_asm_amd64.go -out ../p256_asm_amd64.s -pkg nistec. DO NOT EDIT.
2
3 //go:build !purego
4
5 #include "textflag.h"
6
// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
//
// Byte-order conversion is an involution (swap words + swap bytes), so this
// is a tail-jump alias for p256BigToLittle; only the argument types differ.
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	JMP ·p256BigToLittle(SB)
10
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
//
// Tail-jump alias for p256BigToLittle: same frame layout (two pointers),
// same 32-byte swap; only the argument types differ.
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	JMP ·p256BigToLittle(SB)
14
// func p256LittleToBig(res *[32]byte, in *p256Element)
//
// Tail-jump alias for p256BigToLittle: the conversion between big-endian
// bytes and little-endian limbs is its own inverse.
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	JMP ·p256BigToLittle(SB)
18
// func p256BigToLittle(res *p256Element, in *[32]byte)
//
// Converts a 256-bit value between a 32-byte big-endian encoding and four
// little-endian 64-bit limbs: each quadword is byte-swapped (BSWAPQ) and the
// four words are stored in reverse order. Running it twice round-trips, which
// is why the three entry points above can all JMP here.
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI

	// Load the four source quadwords.
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11

	// Reverse the bytes within each quadword.
	BSWAPQ R8
	BSWAPQ R9
	BSWAPQ R10
	BSWAPQ R11

	// Store the quadwords in reversed order.
	MOVQ R11, (DI)
	MOVQ R10, 8(DI)
	MOVQ R9, 16(DI)
	MOVQ R8, 24(DI)
	RET
36
// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
// Requires: SSE2
//
// Constant-time select over a 96-byte point (three 32-byte coordinates):
// *res = *a if cond != 0, *res = *b if cond == 0. Branch-free: a full-width
// mask X12 = (cond == 0) ? all-ones : zero is built once, then both inputs
// are masked and combined with PANDN/PAND/PXOR.
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVQ res+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), CX
	MOVQ cond+24(FP), X12

	// X12 = (cond == 0) ? 0xffff... : 0, replicated to all four dwords.
	PXOR X13, X13
	PSHUFD $0x00, X12, X12
	PCMPEQL X13, X12

	// Xn = ~mask & a[chunk]: keeps a only when cond != 0.
	MOVOU X12, X0
	MOVOU (SI), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU 16(SI), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU 32(SI), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU 48(SI), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU 64(SI), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU 80(SI), X11
	PANDN X11, X5

	// mask & b[chunk]: keeps b only when cond == 0.
	MOVOU (CX), X6
	MOVOU 16(CX), X7
	MOVOU 32(CX), X8
	MOVOU 48(CX), X9
	MOVOU 64(CX), X10
	MOVOU 80(CX), X11
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// Exactly one side of each XOR is zero, so this merges the selection.
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	MOVOU X4, 64(DI)
	MOVOU X5, 80(DI)
	RET
90
// func p256NegCond(val *p256Element, cond int)
// Requires: CMOV
//
// Constant-time conditional negation modulo p256:
// *val = (cond != 0) ? p - *val : *val. The negation is always computed
// (p loaded limb-wise, then subtracted), and CMOV restores the original
// limbs when cond == 0, so the memory/instruction trace is independent
// of cond.
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVQ val+0(FP), DI
	MOVQ cond+8(FP), R14

	// acc = poly: load p256 = {2^64-1, p256const0, 0, p256const1} into R8..R11.
	MOVQ $-1, R8
	MOVQ p256const0<>+0(SB), R9
	MOVQ $+0, R10
	MOVQ p256const1<>+0(SB), R11

	// Load the original value
	MOVQ (DI), R13
	MOVQ 8(DI), SI
	MOVQ 16(DI), CX
	MOVQ 24(DI), R15

	// Speculatively subtract: R8..R11 = p - val (val is assumed < p,
	// so no final borrow needs handling).
	SUBQ R13, R8
	SBBQ SI, R9
	SBBQ CX, R10
	SBBQ R15, R11

	// If condition is 0, keep original value
	TESTQ R14, R14
	CMOVQEQ R13, R8
	CMOVQEQ SI, R9
	CMOVQEQ CX, R10
	CMOVQEQ R15, R11

	// Store result
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET
128
// Limbs 1 and 3 of the field prime
// p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1
//      = {0xffffffffffffffff, p256const0, 0, p256const1} (little-endian limbs).
// Limbs 0 and 2 are all-ones and zero, so they are materialized inline
// with $-1 / $0 instead of being stored here.
DATA p256const0<>+0(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8

DATA p256const1<>+0(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8
134
// func p256Sqr(res *p256Element, in *p256Element, n int)
// Requires: CMOV
//
// Montgomery squaring, iterated: res = in^(2^n) scaled appropriately, i.e.
// each pass computes x*x*2^-256 mod p256. n must be >= 1. After the first
// pass SI is pointed at res so subsequent iterations square the previous
// output in place.
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

sqrLoop:
	// Schoolbook squaring: compute the off-diagonal products once
	// (they appear twice in the square), double them, then add the
	// diagonal terms y[i]^2.

	// y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, CX
	XORQ R15, R15

	// *2: double the off-diagonal sum; R15 catches the top carry.
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0x00, R15

	// Missing products: add the diagonal terms y[i]*y[i].
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI

	// Montgomery reduction: fold each low limb into the value using the
	// special form of p256 (limb*p needs only shifts and one multiply by
	// p256const1).

	// First reduction step
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Second reduction step
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9

	// Third reduction step
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10

	// Last reduction step
	XORQ R14, R14
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R8
	ADCQ R15, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11

	// Add bits [511:256] of the sqr result
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0x00, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Subtract p256; if it borrows the value was already reduced,
	// so CMOVQCS restores the saved copy (constant-time final reduction).
	SUBQ $-1, R8
	SBBQ p256const0<>+0(SB), R9
	SBBQ $0x00, R10
	SBBQ p256const1<>+0(SB), R11
	SBBQ $0x00, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// Square the freshly written result on the next iteration.
	MOVQ DI, SI
	DECQ BX
	JNE sqrLoop
	RET
295
// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
// Requires: CMOV
//
// Montgomery multiplication: res = in1 * in2 * 2^-256 mod p256.
// Interleaved operand scanning: after accumulating in1 * in2[i], one
// Montgomery reduction step folds the lowest limb away, keeping the
// accumulator in five limbs + carry throughout.
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	XORQ R13, R13

	// First reduction step: fold limb R8 using p256's special form
	// (limb*p computed via shifts plus one MULQ by p256const1).
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	ADCQ $0x00, R13
	XORQ R8, R8

	// x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// Second reduction step
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8
	XORQ R9, R9

	// x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// Third reduction step
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9
	XORQ R10, R10

	// x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Last reduction step
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Copy result [255:0]
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Subtract p256; on borrow (CF set) the pre-subtraction copy is
	// restored, giving a canonical result in constant time.
	SUBQ $-1, R12
	SBBQ p256const0<>+0(SB), R13
	SBBQ $0x00, R8
	SBBQ p256const1<>+0(SB), R9
	SBBQ $0x00, R10
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET
485
// func p256FromMont(res *p256Element, in *p256Element)
// Requires: CMOV
//
// Converts out of the Montgomery domain: res = in * 2^-256 mod p256.
// Equivalent to a Montgomery multiplication by 1, so only the four
// reduction steps are performed — no multiplications of input limbs.
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	XORQ R12, R12

	// Only reduce, no multiplications are needed
	// Each stage folds the current lowest limb by adding limb*p256,
	// using shifts plus one MULQ by p256const1 (the special prime form).
	// First stage
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	XORQ R13, R13

	// Second stage
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	XORQ R8, R8

	// Third stage
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	XORQ R9, R9

	// Last stage
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9

	// Final conditional subtraction of p256: keep the pre-subtraction
	// copy if the subtraction borrows (constant-time via CMOV).
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15
	SUBQ $-1, R12
	SBBQ p256const0<>+0(SB), R13
	SBBQ $0x00, R8
	SBBQ p256const1<>+0(SB), R9
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET
561
// func p256Select(res *P256Point, table *p256Table, idx int)
// Requires: SSE2
//
// Constant-time table lookup: copies entry number idx (1-based) of a
// 16-entry table of 96-byte points into res. Every entry is read and
// masked, so memory access pattern and timing are independent of idx.
// If idx is 0 (or out of range) res is set to all zeros.
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	// X15 = broadcast 1 (counter increment); X14 = broadcast idx.
	PXOR X15, X15
	PCMPEQL X14, X14
	PSUBL X14, X15
	MOVL AX, X14
	PSHUFD $0x00, X14, X14

	// Accumulator (X0..X5) starts at zero; exactly one masked entry
	// will be XORed in.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $0x00000010, AX

	// X13 = running entry counter, starting at 1.
	MOVOU X15, X13

loop_select:
	// X12 = (counter == idx) ? all-ones : 0; then advance the counter.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	// Load the 96-byte entry unconditionally, mask, and accumulate.
	MOVOU (DI), X6
	MOVOU 16(DI), X7
	MOVOU 32(DI), X8
	MOVOU 48(DI), X9
	MOVOU 64(DI), X10
	MOVOU 80(DI), X11
	ADDQ $0x60, DI
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5
	DECQ AX
	JNE loop_select
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	MOVOU X4, 64(DX)
	MOVOU X5, 80(DX)
	RET
614
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Requires: SSE2
//
// Constant-time lookup of entry idx (1-based) from a table of 64-byte
// affine points. Each of the 16 loop iterations scans 128 bytes — two
// consecutive table entries — with the comparison mask recomputed between
// them, so the whole table is touched regardless of idx. idx 0 yields a
// zero result.
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX

	// X15 = broadcast 1; X14 = broadcast idx.
	PXOR X15, X15
	PCMPEQL X14, X14
	PSUBL X14, X15
	MOVL AX, X14
	PSHUFD $0x00, X14, X14

	// 64-byte accumulator, zero-initialized.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $0x00000010, AX

	// X13 = running entry counter, starting at 1.
	MOVOU X15, X13

loop_select_base:
	// Mask for the first entry of this pair, then advance the counter.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	// Load two 64-byte entries (128 bytes total).
	MOVOU (DI), X4
	MOVOU 16(DI), X5
	MOVOU 32(DI), X6
	MOVOU 48(DI), X7
	MOVOU 64(DI), X8
	MOVOU 80(DI), X9
	MOVOU 96(DI), X10
	MOVOU 112(DI), X11
	ADDQ $0x80, DI

	// Mask the first entry with its comparison result.
	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Recompute the mask for the second entry of the pair.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// Accumulate both masked entries (at most one is non-zero).
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3
	DECQ AX
	JNE loop_select_base
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	RET
672
// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
// Requires: CMOV
//
// Montgomery multiplication modulo the group order:
// res = in1 * in2 * 2^-256 mod p256ord. Unlike the field routines, the
// order has no special shift-friendly form, so each reduction step is a
// full 4-limb multiply: m = limb * p256ordK0 mod 2^64 (with
// p256ordK0 = -p256ord^-1 mod 2^64), then the accumulator absorbs
// m * p256ord, clearing its lowest limb.
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	XORQ R13, R13

	// First reduction step: m = acc0 * k0; acc += m * p256ord.
	MOVQ R8, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ DX, R12
	ADCQ $0x00, R13

	// x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// Second reduction step
	MOVQ R9, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// Third reduction step
	MOVQ R10, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Last reduction step
	MOVQ R11, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Copy result [255:0]
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Subtract p256ord; if it borrows, CMOVQCS restores the saved copy
	// (constant-time final reduction).
	SUBQ p256ord<>+0(SB), R12
	SBBQ p256ord<>+8(SB), R13
	SBBQ p256ord<>+16(SB), R8
	SBBQ p256ord<>+24(SB), R9
	SBBQ $0x00, R10
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET
935
// p256ordK0 = -p256ord^-1 mod 2^64, the Montgomery constant used to pick
// the reduction multiplier in p256OrdMul / p256OrdSqr.
DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8

// p256ord: the order of the P-256 base-point group, as four
// little-endian 64-bit limbs (least significant first).
DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32
944
// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
// Requires: CMOV
//
// Iterated Montgomery squaring modulo the group order: each of the n
// passes computes x*x*2^-256 mod p256ord (n must be >= 1). The squaring
// phase matches p256Sqr; the reduction steps use p256ordK0 to derive the
// multiplier m, then absorb m * p256ord. The two upper order limbs have
// the shapes 2^64-1 and 2^64-2^32, so their products are formed with
// SUB/SHL/SHR tricks instead of extra MULQs.
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

ordSqrLoop:
	// Off-diagonal products of the square (each appears twice).

	// y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, CX
	XORQ R15, R15

	// *2: double the off-diagonal sum; R15 catches the top carry.
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0x00, R15

	// Missing products: add the diagonal terms y[i]*y[i].
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI

	// First reduction step: m (R14) = acc0 * k0; add m * p256ord.
	// m * ord[2] (= m * (2^64-1)) is folded in via SUBQ R14; m * ord[3]
	// (= m * (2^64-2^32)) via the SHL/SHR pair.
	MOVQ R8, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	MOVQ R14, R15
	ADCQ DX, R10
	ADCQ $0x00, R15
	SUBQ R14, R10
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R8
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R11
	ADCQ $0x00, R8
	SUBQ AX, R11
	SBBQ DX, R8

	// Second reduction step
	MOVQ R9, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	MOVQ R14, R15
	ADCQ DX, R11
	ADCQ $0x00, R15
	SUBQ R14, R11
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R9
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R8
	ADCQ $0x00, R9
	SUBQ AX, R8
	SBBQ DX, R9

	// Third reduction step
	MOVQ R10, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	MOVQ R14, R15
	ADCQ DX, R8
	ADCQ $0x00, R15
	SUBQ R14, R8
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R10
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R9
	ADCQ $0x00, R10
	SUBQ AX, R9
	SBBQ DX, R10

	// Last reduction step
	MOVQ R11, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ $0x00, DX
	// NOTE(review): this MOVQ DX, R15 is immediately overwritten by the
	// next instruction — looks like a dead store from the code generator;
	// kept verbatim since this file is generated output.
	MOVQ DX, R15
	MOVQ R14, R15
	ADCQ DX, R9
	ADCQ $0x00, R15
	SUBQ R14, R9
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R11
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R10
	ADCQ $0x00, R11
	SUBQ AX, R10
	SBBQ DX, R11
	XORQ R14, R14

	// Add bits [511:256] of the sqr result
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0x00, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Subtract p256ord; on borrow, restore the saved copy (CMOVQCS)
	// for a canonical, constant-time final reduction.
	SUBQ p256ord<>+0(SB), R8
	SBBQ p256ord<>+8(SB), R9
	SBBQ p256ord<>+16(SB), R10
	SBBQ p256ord<>+24(SB), R11
	SBBQ $0x00, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)

	// Square the freshly written result on the next iteration.
	MOVQ DI, SI
	DECQ BX
	JNE ordSqrLoop
	RET
1175
// func p256SubInternal()
// Requires: CMOV
//
// Internal register-ABI helper (file-local, no FP arguments):
// (R10,R11,R12,R13) = (R10,R11,R12,R13) - (R14,R15,DI,SI) mod p256.
// The raw subtraction is done first; if it borrowed (AX becomes 1),
// p256 is added back, and CMOVQEQ keeps the un-corrected value when
// there was no borrow — branch-free either way.
TEXT p256SubInternal(SB), NOSPLIT, $0
	XORQ AX, AX
	SUBQ R14, R10
	SBBQ R15, R11
	SBBQ DI, R12
	SBBQ SI, R13
	SBBQ $0x00, AX

	// Save the raw difference before conditionally adding p back.
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9

	// Add p256 = {-1, p256const0, 0, p256const1}.
	ADDQ $-1, R10
	ADCQ p256const0<>+0(SB), R11
	ADCQ $0x00, R12
	ADCQ p256const1<>+0(SB), R13

	// AX & 1 == 0 means no borrow occurred: restore the raw difference.
	ANDQ $0x01, AX
	CMOVQEQ BX, R10
	CMOVQEQ CX, R11
	CMOVQEQ R8, R12
	CMOVQEQ R9, R13
	RET
1199
// func p256MulInternal()
// Requires: CMOV
//
// Internal register-ABI Montgomery multiplication (file-local):
// (R10,R11,R12,R13) = (R10,R11,R12,R13) * (R14,R15,DI,SI) * 2^-256 mod p256.
// Full 4x4 schoolbook product into BX,CX,R8,R9,R10..R13, then four
// shift-based reduction steps (p256's special form), a conditional
// final subtraction, and the result lands back in R10..R13. BP is used
// as a scratch/carry register ($8 frame).
TEXT p256MulInternal(SB), NOSPLIT, $8
	// x * y[0]
	MOVQ R10, AX
	MULQ R14
	MOVQ AX, BX
	MOVQ DX, CX
	MOVQ R10, AX
	MULQ R15
	ADDQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ DI
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ SI
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10

	// x * y[1]
	MOVQ R11, AX
	MULQ R14
	ADDQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R15
	ADDQ BP, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ DI
	ADDQ BP, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ SI
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11

	// x * y[2]
	MOVQ R12, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ R15
	ADDQ BP, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ DI
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ SI
	ADDQ BP, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// x * y[3]
	MOVQ R13, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ R15
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ DI
	ADDQ BP, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ SI
	ADDQ BP, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// First reduction step
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $0x20, BX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BX

	// Second reduction step
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $0x20, CX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0x00, DX
	MOVQ DX, CX

	// Third reduction step
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Last reduction step
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ $0x00000000, BP

	// Add bits [511:256] of the result
	ADCQ BX, R10
	ADCQ CX, R11
	ADCQ R8, R12
	ADCQ R9, R13
	ADCQ $0x00, BP

	// Copy result
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9

	// Subtract p256
	SUBQ $-1, R10
	SBBQ p256const0<>+0(SB), R11
	SBBQ $0x00, R12
	SBBQ p256const1<>+0(SB), R13
	SBBQ $0x00, BP

	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS BX, R10
	CMOVQCS CX, R11
	CMOVQCS R8, R12
	CMOVQCS R9, R13
	RET
1376
// func p256SqrInternal()
// Requires: CMOV
//
// Internal register-ABI Montgomery squaring (file-local):
// (R10,R11,R12,R13) = (R10,R11,R12,R13)^2 * 2^-256 mod p256.
// Same structure as p256Sqr but fully in registers: off-diagonal
// products, doubling, diagonal terms, four shift-based reduction
// steps, and a constant-time conditional final subtraction.
// Clobbers AX, DX, BX, CX, R8, R9, R14, R15, DI, SI, BP.
TEXT p256SqrInternal(SB), NOSPLIT, $8
	// Off-diagonal products x[i]*x[j], i < j.
	MOVQ R10, AX
	MULQ R11
	MOVQ AX, CX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ R12
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ R11, AX
	MULQ R12
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R13
	ADDQ BP, R14
	ADCQ $0x00, DX
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ R12, AX
	MULQ R13
	ADDQ AX, R15
	ADCQ $0x00, DX
	MOVQ DX, DI
	XORQ SI, SI

	// *2: each off-diagonal product appears twice in the square.
	ADDQ CX, CX
	ADCQ R8, R8
	ADCQ R9, R9
	ADCQ R14, R14
	ADCQ R15, R15
	ADCQ DI, DI
	ADCQ $0x00, SI

	// Missing products: the diagonal terms x[i]^2.
	MOVQ R10, AX
	MULQ AX
	MOVQ AX, BX
	MOVQ DX, R10
	MOVQ R11, AX
	MULQ AX
	ADDQ R10, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R12, AX
	MULQ AX
	ADDQ R10, R9
	ADCQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R13, AX
	MULQ AX
	ADDQ R10, R15
	ADCQ AX, DI
	ADCQ DX, SI

	// First reduction step
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $0x20, BX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BX

	// Second reduction step
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $0x20, CX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0x00, DX
	MOVQ DX, CX

	// Third reduction step
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Last reduction step
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ $0x00000000, BP

	// Add bits [511:256] of the result
	ADCQ BX, R14
	ADCQ CX, R15
	ADCQ R8, DI
	ADCQ R9, SI
	ADCQ $0x00, BP

	// Copy result
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ DI, R12
	MOVQ SI, R13

	// Subtract p256
	SUBQ $-1, R10
	SBBQ p256const0<>+0(SB), R11
	SBBQ $0x00, R12
	SBBQ p256const1<>+0(SB), R13
	SBBQ $0x00, BP

	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS R14, R10
	CMOVQCS R15, R11
	CMOVQCS DI, R12
	CMOVQCS SI, R13
	RET
1520
1521 // func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
1522 // Requires: CMOV, SSE2
1523 TEXT ·p256PointAddAffineAsm(SB), $512-48
1524 MOVQ res+0(FP), AX
1525 MOVQ in1+8(FP), BX
1526 MOVQ in2+16(FP), CX
1527 MOVQ sign+24(FP), DX
1528 MOVQ sel+32(FP), R15
1529 MOVQ zero+40(FP), DI
1530 MOVOU (BX), X0
1531 MOVOU 16(BX), X1
1532 MOVOU 32(BX), X2
1533 MOVOU 48(BX), X3
1534 MOVOU 64(BX), X4
1535 MOVOU 80(BX), X5
1536 MOVOU X0, (SP)
1537 MOVOU X1, 16(SP)
1538 MOVOU X2, 32(SP)
1539 MOVOU X3, 48(SP)
1540 MOVOU X4, 64(SP)
1541 MOVOU X5, 80(SP)
1542 MOVOU (CX), X0
1543 MOVOU 16(CX), X1
1544 MOVOU X0, 96(SP)
1545 MOVOU X1, 112(SP)
1546
1547 // Store pointer to result
1548 MOVQ AX, 480(SP)
1549 MOVL R15, 488(SP)
1550 MOVL DI, 492(SP)
1551
1552 // Negate y2in based on sign
1553 MOVQ 32(CX), R10
1554 MOVQ 40(CX), R11
1555 MOVQ 48(CX), R12
1556 MOVQ 56(CX), R13
1557 MOVQ $-1, BX
1558 MOVQ p256const0<>+0(SB), CX
1559 MOVQ $0x00000000, R8
1560 MOVQ p256const1<>+0(SB), R9
1561 XORQ AX, AX
1562
1563 // Speculatively subtract
1564 SUBQ R10, BX
1565 SBBQ R11, CX
1566 SBBQ R12, R8
1567 SBBQ R13, R9
1568 SBBQ $0x00, AX
1569 MOVQ BX, R14
1570 MOVQ CX, R15
1571 MOVQ R8, DI
1572 MOVQ R9, SI
1573
1574 // Add in case the operand was > p256
1575 ADDQ $-1, BX
1576 ADCQ p256const0<>+0(SB), CX
1577 ADCQ $0x00, R8
1578 ADCQ p256const1<>+0(SB), R9
1579 ADCQ $0x00, AX
1580 CMOVQNE R14, BX
1581 CMOVQNE R15, CX
1582 CMOVQNE DI, R8
1583 CMOVQNE SI, R9
1584
1585 // If condition is 0, keep original value
1586 TESTQ DX, DX
1587 CMOVQEQ R10, BX
1588 CMOVQEQ R11, CX
1589 CMOVQEQ R12, R8
1590 CMOVQEQ R13, R9
1591
1592 // Store result
1593 MOVQ BX, 128(SP)
1594 MOVQ CX, 136(SP)
1595 MOVQ R8, 144(SP)
1596 MOVQ R9, 152(SP)
1597
1598 // Begin point add
1599 MOVQ 64(SP), R10
1600 MOVQ 72(SP), R11
1601 MOVQ 80(SP), R12
1602 MOVQ 88(SP), R13
1603 CALL p256SqrInternal(SB)
1604 MOVQ R10, 288(SP)
1605 MOVQ R11, 296(SP)
1606 MOVQ R12, 304(SP)
1607 MOVQ R13, 312(SP)
1608 MOVQ 96(SP), R14
1609 MOVQ 104(SP), R15
1610 MOVQ 112(SP), DI
1611 MOVQ 120(SP), SI
1612 CALL p256MulInternal(SB)
1613 MOVQ (SP), R14
1614 MOVQ 8(SP), R15
1615 MOVQ 16(SP), DI
1616 MOVQ 24(SP), SI
1617 CALL p256SubInternal(SB)
1618 MOVQ R10, 320(SP)
1619 MOVQ R11, 328(SP)
1620 MOVQ R12, 336(SP)
1621 MOVQ R13, 344(SP)
1622 MOVQ 64(SP), R14
1623 MOVQ 72(SP), R15
1624 MOVQ 80(SP), DI
1625 MOVQ 88(SP), SI
1626 CALL p256MulInternal(SB)
1627 MOVQ R10, 224(SP)
1628 MOVQ R11, 232(SP)
1629 MOVQ R12, 240(SP)
1630 MOVQ R13, 248(SP)
1631 MOVQ 288(SP), R10
1632 MOVQ 296(SP), R11
1633 MOVQ 304(SP), R12
1634 MOVQ 312(SP), R13
1635 CALL p256MulInternal(SB)
1636 MOVQ 128(SP), R14
1637 MOVQ 136(SP), R15
1638 MOVQ 144(SP), DI
1639 MOVQ 152(SP), SI
1640 CALL p256MulInternal(SB)
1641 MOVQ R10, 256(SP)
1642 MOVQ R11, 264(SP)
1643 MOVQ R12, 272(SP)
1644 MOVQ R13, 280(SP)
1645 MOVQ 32(SP), R14
1646 MOVQ 40(SP), R15
1647 MOVQ 48(SP), DI
1648 MOVQ 56(SP), SI
1649 CALL p256SubInternal(SB)
1650 MOVQ R10, 352(SP)
1651 MOVQ R11, 360(SP)
1652 MOVQ R12, 368(SP)
1653 MOVQ R13, 376(SP)
1654 CALL p256SqrInternal(SB)
1655 MOVQ R10, 416(SP)
1656 MOVQ R11, 424(SP)
1657 MOVQ R12, 432(SP)
1658 MOVQ R13, 440(SP)
1659 MOVQ 320(SP), R10
1660 MOVQ 328(SP), R11
1661 MOVQ 336(SP), R12
1662 MOVQ 344(SP), R13
1663 CALL p256SqrInternal(SB)
1664 MOVQ R10, 384(SP)
1665 MOVQ R11, 392(SP)
1666 MOVQ R12, 400(SP)
1667 MOVQ R13, 408(SP)
1668 MOVQ 320(SP), R14
1669 MOVQ 328(SP), R15
1670 MOVQ 336(SP), DI
1671 MOVQ 344(SP), SI
1672 CALL p256MulInternal(SB)
1673 MOVQ R10, 448(SP)
1674 MOVQ R11, 456(SP)
1675 MOVQ R12, 464(SP)
1676 MOVQ R13, 472(SP)
1677 MOVQ 32(SP), R14
1678 MOVQ 40(SP), R15
1679 MOVQ 48(SP), DI
1680 MOVQ 56(SP), SI
1681 CALL p256MulInternal(SB)
1682 MOVQ R10, 256(SP)
1683 MOVQ R11, 264(SP)
1684 MOVQ R12, 272(SP)
1685 MOVQ R13, 280(SP)
1686 MOVQ (SP), R10
1687 MOVQ 8(SP), R11
1688 MOVQ 16(SP), R12
1689 MOVQ 24(SP), R13
1690 MOVQ 384(SP), R14
1691 MOVQ 392(SP), R15
1692 MOVQ 400(SP), DI
1693 MOVQ 408(SP), SI
1694 CALL p256MulInternal(SB)
1695 MOVQ R10, 320(SP)
1696 MOVQ R11, 328(SP)
1697 MOVQ R12, 336(SP)
1698 MOVQ R13, 344(SP)
1699 XORQ AX, AX
1700 ADDQ R10, R10
1701 ADCQ R11, R11
1702 ADCQ R12, R12
1703 ADCQ R13, R13
1704 ADCQ $+0, AX
1705 MOVQ R10, R14
1706 MOVQ R11, R15
1707 MOVQ R12, DI
1708 MOVQ R13, SI
1709 SUBQ $-1, R14
1710 SBBQ p256const0<>+0(SB), R15
1711 SBBQ $+0, DI
1712 SBBQ p256const1<>+0(SB), SI
1713 SBBQ $+0, AX
1714 CMOVQCS R10, R14
1715 CMOVQCS R11, R15
1716 CMOVQCS R12, DI
1717 CMOVQCS R13, SI
1718 MOVQ 416(SP), R10
1719 MOVQ 424(SP), R11
1720 MOVQ 432(SP), R12
1721 MOVQ 440(SP), R13
1722 CALL p256SubInternal(SB)
1723 MOVQ 448(SP), R14
1724 MOVQ 456(SP), R15
1725 MOVQ 464(SP), DI
1726 MOVQ 472(SP), SI
1727 CALL p256SubInternal(SB)
1728 MOVQ R10, 160(SP)
1729 MOVQ R11, 168(SP)
1730 MOVQ R12, 176(SP)
1731 MOVQ R13, 184(SP)
1732 MOVQ R10, R14
1733 MOVQ R11, R15
1734 MOVQ R12, DI
1735 MOVQ R13, SI
1736 MOVQ 320(SP), R10
1737 MOVQ 328(SP), R11
1738 MOVQ 336(SP), R12
1739 MOVQ 344(SP), R13
1740 CALL p256SubInternal(SB)
1741 MOVQ 352(SP), R14
1742 MOVQ 360(SP), R15
1743 MOVQ 368(SP), DI
1744 MOVQ 376(SP), SI
1745 CALL p256MulInternal(SB)
1746 MOVQ 256(SP), R14
1747 MOVQ 264(SP), R15
1748 MOVQ 272(SP), DI
1749 MOVQ 280(SP), SI
1750 CALL p256SubInternal(SB)
1751 MOVQ R10, 192(SP)
1752 MOVQ R11, 200(SP)
1753 MOVQ R12, 208(SP)
1754 MOVQ R13, 216(SP)
1755
1756 // Load stored values from stack
1757 MOVQ 480(SP), AX
1758 MOVL 488(SP), BX
1759 MOVL 492(SP), CX
1760
1761 // The result is not valid if (sel == 0), conditional choose
1762 MOVOU 160(SP), X0
1763 MOVOU 176(SP), X1
1764 MOVOU 192(SP), X2
1765 MOVOU 208(SP), X3
1766 MOVOU 224(SP), X4
1767 MOVOU 240(SP), X5
1768 MOVL BX, X6
1769 MOVL CX, X7
1770 PXOR X8, X8
1771 PCMPEQL X9, X9
1772 PSHUFD $0x00, X6, X6
1773 PSHUFD $0x00, X7, X7
1774 PCMPEQL X8, X6
1775 PCMPEQL X8, X7
1776 MOVOU X6, X15
1777 PANDN X9, X15
1778 MOVOU (SP), X9
1779 MOVOU 16(SP), X10
1780 MOVOU 32(SP), X11
1781 MOVOU 48(SP), X12
1782 MOVOU 64(SP), X13
1783 MOVOU 80(SP), X14
1784 PAND X15, X0
1785 PAND X15, X1
1786 PAND X15, X2
1787 PAND X15, X3
1788 PAND X15, X4
1789 PAND X15, X5
1790 PAND X6, X9
1791 PAND X6, X10
1792 PAND X6, X11
1793 PAND X6, X12
1794 PAND X6, X13
1795 PAND X6, X14
1796 PXOR X9, X0
1797 PXOR X10, X1
1798 PXOR X11, X2
1799 PXOR X12, X3
1800 PXOR X13, X4
1801 PXOR X14, X5
1802
1803 // Similarly if zero == 0
1804 PCMPEQL X9, X9
1805 MOVOU X7, X15
1806 PANDN X9, X15
1807 MOVOU 96(SP), X9
1808 MOVOU 112(SP), X10
1809 MOVOU 128(SP), X11
1810 MOVOU 144(SP), X12
1811 MOVOU p256one<>+0(SB), X13
1812 MOVOU p256one<>+16(SB), X14
1813 PAND X15, X0
1814 PAND X15, X1
1815 PAND X15, X2
1816 PAND X15, X3
1817 PAND X15, X4
1818 PAND X15, X5
1819 PAND X7, X9
1820 PAND X7, X10
1821 PAND X7, X11
1822 PAND X7, X12
1823 PAND X7, X13
1824 PAND X7, X14
1825 PXOR X9, X0
1826 PXOR X10, X1
1827 PXOR X11, X2
1828 PXOR X12, X3
1829 PXOR X13, X4
1830 PXOR X14, X5
1831
1832 // Finally output the result
1833 MOVOU X0, (AX)
1834 MOVOU X1, 16(AX)
1835 MOVOU X2, 32(AX)
1836 MOVOU X3, 48(AX)
1837 MOVOU X4, 64(AX)
1838 MOVOU X5, 80(AX)
1839 MOVQ $0x00000000, 480(SP)
1840 RET
1841
// p256one<> is the field element 1 in the Montgomery domain, i.e.
// 2^256 mod p256 = 2^224 - 2^192 - 2^96 + 1 (limbs little-endian).
// It is selected above as the Z coordinate when the conditional-choose
// logic determines the accumulator point was the identity.
DATA p256one<>+0(SB)/8, $0x0000000000000001
DATA p256one<>+8(SB)/8, $0xffffffff00000000
DATA p256one<>+16(SB)/8, $0xffffffffffffffff
DATA p256one<>+24(SB)/8, $0x00000000fffffffe
GLOBL p256one<>(SB), RODATA, $32
1847
// func p256IsZero()
// Requires: CMOV
//
// Internal helper, not a Go function: it is reached via CALL from the
// point routines below and uses a register calling convention.
//
//	Input:    [R10, R11, R12, R13] — a 256-bit value (little-endian limbs),
//	          possibly not fully reduced (may equal p).
//	Output:   AX = 1 if the input is zero mod p256 (equals 0 or p), else 0.
//	Clobbers: R10, R14, R15, flags (R11 and R13 are also modified).
//
// The check is branch-free (CMOV) so it runs in constant time.
TEXT p256IsZero(SB), NOSPLIT, $0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $0x00000001, R15

	// Check whether [acc4..acc7] are all zero.
	MOVQ R10, R14
	ORQ  R11, R14
	ORQ  R12, R14
	ORQ  R13, R14

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ R15, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	// Note p256's third limb is 0, so R12 needs no XOR and is OR-ed in
	// unchanged below.
	XORQ $-1, R10
	XORQ p256const0<>+0(SB), R11
	XORQ p256const1<>+0(SB), R13
	ORQ  R11, R10
	ORQ  R12, R10
	ORQ  R13, R10

	// Set the zero flag if so.
	CMOVQEQ R15, AX
	RET
1876
// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
// Requires: CMOV, SSE2
//
// Jacobian point addition, all coordinates in the Montgomery domain:
//
//	u1 = x1·z2²   u2 = x2·z1²   h = u2 - u1
//	s1 = y1·z2³   s2 = y2·z1³   r = s2 - s1
//	x3 = r² - h³ - 2·u1·h²
//	y3 = r·(u1·h² - x3) - s1·h³
//	z3 = z1·z2·h
//
// Returns 1 when both r == 0 and h == 0, i.e. the inputs have equal
// affine coordinates; in that case the value stored at *res is not a
// valid sum and the caller must use point doubling instead.
//
// Internal helper convention (inferred from the call sites here —
// the helper bodies are elsewhere in this file): p256MulInternal and
// p256SqrInternal take one operand (and leave the result) in
// [R10,R11,R12,R13] with the second Mul operand in [R14,R15,DI,SI];
// p256SubInternal computes [R10..R13] = [R10..R13] - [R14,R15,DI,SI].
//
// Stack frame layout (byte offsets from SP):
//	0   in1 copy: x1 at 0, y1 at 32, z1 at 64
//	96  in2 copy: x2 at 96, y2 at 128, z2 at 160
//	192 x3    224 y3    256 z3
//	288 u1    320 u2, later u1·h²
//	352 s1    384 s2, later s1·h³
//	416 z1²   448 z2²   480 h    512 r
//	544 h²    576 r²    608 h³
//	640 saved res pointer    648 "points equal" return flag
TEXT ·p256PointAddAsm(SB), $680-32
	// Move input to stack in order to free registers
	MOVQ  res+0(FP), AX
	MOVQ  in1+8(FP), BX
	MOVQ  in2+16(FP), CX
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU 32(CX), X2
	MOVOU 48(CX), X3
	MOVOU 64(CX), X4
	MOVOU 80(CX), X5
	MOVOU X0, 96(SP)
	MOVOU X1, 112(SP)
	MOVOU X2, 128(SP)
	MOVOU X3, 144(SP)
	MOVOU X4, 160(SP)
	MOVOU X5, 176(SP)

	// Store pointer to result
	MOVQ AX, 640(SP)

	// Begin point add
	// z2² -> 448(SP)
	MOVQ 160(SP), R10
	MOVQ 168(SP), R11
	MOVQ 176(SP), R12
	MOVQ 184(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 448(SP)
	MOVQ R11, 456(SP)
	MOVQ R12, 464(SP)
	MOVQ R13, 472(SP)

	// z2³ = z2²·z2, then s1 = y1·z2³ -> 352(SP)
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 352(SP)
	MOVQ R11, 360(SP)
	MOVQ R12, 368(SP)
	MOVQ R13, 376(SP)

	// z1² -> 416(SP)
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 416(SP)
	MOVQ R11, 424(SP)
	MOVQ R12, 432(SP)
	MOVQ R13, 440(SP)

	// z1³ = z1²·z1, then s2 = y2·z1³ -> 384(SP)
	MOVQ 64(SP), R14
	MOVQ 72(SP), R15
	MOVQ 80(SP), DI
	MOVQ 88(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 384(SP)
	MOVQ R11, 392(SP)
	MOVQ R12, 400(SP)
	MOVQ R13, 408(SP)

	// r = s2 - s1 -> 512(SP); record whether r == 0
	MOVQ 352(SP), R14
	MOVQ 360(SP), R15
	MOVQ 368(SP), DI
	MOVQ 376(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 512(SP)
	MOVQ R11, 520(SP)
	MOVQ R12, 528(SP)
	MOVQ R13, 536(SP)
	CALL p256IsZero(SB)
	MOVQ AX, 648(SP)

	// u1 = x1·z2² -> 288(SP)
	MOVQ 448(SP), R10
	MOVQ 456(SP), R11
	MOVQ 464(SP), R12
	MOVQ 472(SP), R13
	MOVQ (SP), R14
	MOVQ 8(SP), R15
	MOVQ 16(SP), DI
	MOVQ 24(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 288(SP)
	MOVQ R11, 296(SP)
	MOVQ R12, 304(SP)
	MOVQ R13, 312(SP)

	// u2 = x2·z1² -> 320(SP)
	MOVQ 416(SP), R10
	MOVQ 424(SP), R11
	MOVQ 432(SP), R12
	MOVQ 440(SP), R13
	MOVQ 96(SP), R14
	MOVQ 104(SP), R15
	MOVQ 112(SP), DI
	MOVQ 120(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)

	// h = u2 - u1 -> 480(SP); flag &= (h == 0)
	MOVQ 288(SP), R14
	MOVQ 296(SP), R15
	MOVQ 304(SP), DI
	MOVQ 312(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 480(SP)
	MOVQ R11, 488(SP)
	MOVQ R12, 496(SP)
	MOVQ R13, 504(SP)
	CALL p256IsZero(SB)
	ANDQ 648(SP), AX
	MOVQ AX, 648(SP)

	// r² -> 576(SP)
	MOVQ 512(SP), R10
	MOVQ 520(SP), R11
	MOVQ 528(SP), R12
	MOVQ 536(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 576(SP)
	MOVQ R11, 584(SP)
	MOVQ R12, 592(SP)
	MOVQ R13, 600(SP)

	// h² -> 544(SP)
	MOVQ 480(SP), R10
	MOVQ 488(SP), R11
	MOVQ 496(SP), R12
	MOVQ 504(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 544(SP)
	MOVQ R11, 552(SP)
	MOVQ R12, 560(SP)
	MOVQ R13, 568(SP)

	// h³ = h²·h -> 608(SP)
	MOVQ 480(SP), R14
	MOVQ 488(SP), R15
	MOVQ 496(SP), DI
	MOVQ 504(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 608(SP)
	MOVQ R11, 616(SP)
	MOVQ R12, 624(SP)
	MOVQ R13, 632(SP)

	// s1·h³ -> 384(SP) (reusing the s2 slot; h³ is still in R10..R13)
	MOVQ 352(SP), R14
	MOVQ 360(SP), R15
	MOVQ 368(SP), DI
	MOVQ 376(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 384(SP)
	MOVQ R11, 392(SP)
	MOVQ R12, 400(SP)
	MOVQ R13, 408(SP)

	// z3 = z1·z2·h -> 256(SP)
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 480(SP), R14
	MOVQ 488(SP), R15
	MOVQ 496(SP), DI
	MOVQ 504(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 256(SP)
	MOVQ R11, 264(SP)
	MOVQ R12, 272(SP)
	MOVQ R13, 280(SP)

	// u1·h² -> 320(SP) (reusing the u2 slot)
	MOVQ 544(SP), R10
	MOVQ 552(SP), R11
	MOVQ 560(SP), R12
	MOVQ 568(SP), R13
	MOVQ 288(SP), R14
	MOVQ 296(SP), R15
	MOVQ 304(SP), DI
	MOVQ 312(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)

	// 2·(u1·h²) mod p, left in [R14,R15,DI,SI]: double with carry into
	// AX, speculatively subtract p, and keep the un-subtracted value if
	// the subtraction borrowed (CS).
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI

	// x3 = r² - 2·u1·h² - h³ -> 192(SP)
	MOVQ 576(SP), R10
	MOVQ 584(SP), R11
	MOVQ 592(SP), R12
	MOVQ 600(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 608(SP), R14
	MOVQ 616(SP), R15
	MOVQ 624(SP), DI
	MOVQ 632(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 192(SP)
	MOVQ R11, 200(SP)
	MOVQ R12, 208(SP)
	MOVQ R13, 216(SP)

	// y3 = r·(u1·h² - x3) - s1·h³ -> 224(SP)
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 320(SP), R10
	MOVQ 328(SP), R11
	MOVQ 336(SP), R12
	MOVQ 344(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 512(SP), R14
	MOVQ 520(SP), R15
	MOVQ 528(SP), DI
	MOVQ 536(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 384(SP), R14
	MOVQ 392(SP), R15
	MOVQ 400(SP), DI
	MOVQ 408(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 224(SP)
	MOVQ R11, 232(SP)
	MOVQ R12, 240(SP)
	MOVQ R13, 248(SP)
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3
	MOVOU 256(SP), X4
	MOVOU 272(SP), X5

	// Finally output the result
	// The saved res pointer slot is zeroed after use, presumably so no
	// stale pointer lingers in the dead frame — TODO confirm intent.
	MOVQ  640(SP), AX
	MOVQ  $0x00000000, 640(SP)
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, 32(AX)
	MOVOU X3, 48(AX)
	MOVOU X4, 64(AX)
	MOVOU X5, 80(AX)

	// Return the "points equal" flag.
	MOVQ 648(SP), AX
	MOVQ AX, ret+24(FP)
	RET
2150
// func p256PointDoubleAsm(res *P256Point, in *P256Point)
// Requires: CMOV, SSE2
//
// Jacobian point doubling, coordinates in the Montgomery domain, using
// the a = -3 short-Weierstrass doubling formulas:
//
//	m  = 3·(x - z²)·(x + z²)
//	z3 = 2·y·z
//	s  = x·(2y)² = 4·x·y²
//	x3 = m² - 2·s
//	y3 = m·(s - x3) - 8·y⁴
//
// Internal helper convention (inferred from the call sites here):
// p256MulInternal/p256SqrInternal operate on [R10,R11,R12,R13] (result
// there too) with the second Mul operand in [R14,R15,DI,SI];
// p256SubInternal computes [R10..R13] - [R14..SI].
//
// Stack frame: 0 x, 32 y (later 8y⁴), 64 z (input copy);
// 96 4y² then s; 128 x+z² then m; 160 z²; 192 2s; 224 saved res pointer.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
	MOVQ  res+0(FP), AX
	MOVQ  in+8(FP), BX
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)

	// Store pointer to result
	MOVQ AX, 224(SP)

	// Begin point double
	// z² -> 160(SP)
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 160(SP)
	MOVQ R11, 168(SP)
	MOVQ R12, 176(SP)
	MOVQ R13, 184(SP)

	// x + z² mod p -> 128(SP): add with carry into AX, speculatively
	// subtract p, keep the un-subtracted value on borrow (CS).
	MOVQ (SP), R14
	MOVQ 8(SP), R15
	MOVQ 16(SP), DI
	MOVQ 24(SP), SI
	XORQ AX, AX
	ADDQ R14, R10
	ADCQ R15, R11
	ADCQ DI, R12
	ADCQ SI, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, 128(SP)
	MOVQ R15, 136(SP)
	MOVQ DI, 144(SP)
	MOVQ SI, 152(SP)

	// z3 = 2·y·z mod p, stored straight into the result.
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256MulInternal(SB)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ 224(SP), AX

	// Store z
	MOVQ R14, 64(AX)
	MOVQ R15, 72(AX)
	MOVQ DI, 80(AX)
	MOVQ SI, 88(AX)

	// α = (x - z²)·(x + z²) -> 128(SP)
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 128(SP)
	MOVQ R11, 136(SP)
	MOVQ R12, 144(SP)
	MOVQ R13, 152(SP)

	// Multiply by 3: m = 2α + α -> 128(SP)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ 128(SP), R10
	MOVQ 136(SP), R11
	MOVQ 144(SP), R12
	MOVQ 152(SP), R13
	XORQ AX, AX
	ADDQ R14, R10
	ADCQ R15, R11
	ADCQ DI, R12
	ADCQ SI, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, 128(SP)
	MOVQ R15, 136(SP)
	MOVQ DI, 144(SP)
	MOVQ SI, 152(SP)

	// (2y)² = 4y² -> 96(SP), then squared again to 16y⁴
	MOVQ 32(SP), R10
	MOVQ 40(SP), R11
	MOVQ 48(SP), R12
	MOVQ 56(SP), R13
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ DI, R12
	MOVQ SI, R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 96(SP)
	MOVQ R11, 104(SP)
	MOVQ R12, 112(SP)
	MOVQ R13, 120(SP)
	CALL p256SqrInternal(SB)

	// Divide by 2: halve 16y⁴ mod p to get 8y⁴ -> 32(SP).
	// If the value is odd, add p first (CMOVQEQ keeps the original when
	// the low bit is clear); AX holds the carry only in the odd case
	// (ANDQ R14, AX masks it by the original low bit). The SHRQ funnel
	// shifts then divide the 257-bit value by 2.
	XORQ AX, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	ADDQ $-1, R10
	ADCQ p256const0<>+0(SB), R11
	ADCQ $0x00, R12
	ADCQ p256const1<>+0(SB), R13
	ADCQ $0x00, AX
	TESTQ $0x00000001, R14
	CMOVQEQ R14, R10
	CMOVQEQ R15, R11
	CMOVQEQ DI, R12
	CMOVQEQ SI, R13
	ANDQ R14, AX
	SHRQ $0x01, R11, R10
	SHRQ $0x01, R12, R11
	SHRQ $0x01, R13, R12
	SHRQ $0x01, AX, R13
	MOVQ R10, 32(SP)
	MOVQ R11, 40(SP)
	MOVQ R12, 48(SP)
	MOVQ R13, 56(SP)

	// s = x·4y² -> 96(SP), then 2s -> 192(SP)
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 96(SP), R14
	MOVQ 104(SP), R15
	MOVQ 112(SP), DI
	MOVQ 120(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 96(SP)
	MOVQ R11, 104(SP)
	MOVQ R12, 112(SP)
	MOVQ R13, 120(SP)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, 192(SP)
	MOVQ R15, 200(SP)
	MOVQ DI, 208(SP)
	MOVQ SI, 216(SP)

	// x3 = m² - 2s
	MOVQ 128(SP), R10
	MOVQ 136(SP), R11
	MOVQ 144(SP), R12
	MOVQ 152(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ 192(SP), R14
	MOVQ 200(SP), R15
	MOVQ 208(SP), DI
	MOVQ 216(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 224(SP), AX

	// Store x
	MOVQ R10, (AX)
	MOVQ R11, 8(AX)
	MOVQ R12, 16(AX)
	MOVQ R13, 24(AX)

	// y3 = m·(s - x3) - 8y⁴
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 96(SP), R10
	MOVQ 104(SP), R11
	MOVQ 112(SP), R12
	MOVQ 120(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 224(SP), AX

	// Store y
	MOVQ R10, 32(AX)
	MOVQ R11, 40(AX)
	MOVQ R12, 48(AX)
	MOVQ R13, 56(AX)

	// Clear the saved result pointer slot before returning.
	MOVQ $0x00000000, 224(SP)
	RET
2456