// Code generated by command: go run p256_asm_amd64.go -out ../p256_asm_amd64.s -pkg nistec. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	JMP ·p256BigToLittle(SB)

// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	JMP ·p256BigToLittle(SB)

// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	JMP ·p256BigToLittle(SB)

// func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	BSWAPQ R8
	BSWAPQ R9
	BSWAPQ R10
	BSWAPQ R11
	MOVQ R11, (DI)
	MOVQ R10, 8(DI)
	MOVQ R9, 16(DI)
	MOVQ R8, 24(DI)
	RET

// func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
// Requires: SSE2
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVQ res+0(FP), DI
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), CX
	MOVQ cond+24(FP), X12
	PXOR X13, X13
	PSHUFD $0x00, X12, X12
	PCMPEQL X13, X12
	MOVOU X12, X0
	MOVOU (SI), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU 16(SI), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU 32(SI), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU 48(SI), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU 64(SI), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU 80(SI), X11
	PANDN X11, X5
	MOVOU (CX), X6
	MOVOU 16(CX), X7
	MOVOU 32(CX), X8
	MOVOU 48(CX), X9
	MOVOU 64(CX), X10
	MOVOU 80(CX), X11
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	MOVOU X4, 64(DI)
	MOVOU X5, 80(DI)
	RET

// func p256NegCond(val *p256Element, cond int)
// Requires: CMOV
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVQ val+0(FP), DI
	MOVQ cond+8(FP), R14

	// acc = poly
	MOVQ $-1, R8
	MOVQ p256const0<>+0(SB), R9
	MOVQ $+0, R10
	MOVQ p256const1<>+0(SB), R11

	// Load the original value
	MOVQ (DI), R13
	MOVQ 8(DI), SI
	MOVQ 16(DI), CX
	MOVQ 24(DI), R15

	// Speculatively subtract
	SUBQ R13, R8
	SBBQ SI, R9
	SBBQ CX, R10
	SBBQ R15, R11

	// If condition is 0, keep original value
	TESTQ R14, R14
	CMOVQEQ R13, R8
	CMOVQEQ SI, R9
	CMOVQEQ CX, R10
	CMOVQEQ R15, R11

	// Store result
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET

DATA p256const0<>+0(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8
DATA p256const1<>+0(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8

// func p256Sqr(res *p256Element, in *p256Element, n int)
// Requires: CMOV
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

sqrLoop:
	// y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, CX
	XORQ R15, R15

	// *2
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0x00, R15

	// Missing products
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI
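
	// Montgomery reduction follows, one limb per step. Because the lowest
	// limb of p256 is 2^64-1, the per-step multiplier is simply the low
	// accumulator limb itself: adding that multiple of p256 clears the low
	// limb, adds limb<<32 and limb>>32 to the next two limbs, and adds the
	// 128-bit product limb*0xffffffff00000001 at the top.
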
	// First reduction step
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Second reduction step
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9

	// Third reduction step
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10

	// Last reduction step
	XORQ R14, R14
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R8
	ADCQ R15, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11

	// Add bits [511:256] of the sqr result
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0x00, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Subtract p256
	SUBQ $-1, R8
	SBBQ p256const0<>+0(SB), R9
	SBBQ $0x00, R10
	SBBQ p256const1<>+0(SB), R11
	SBBQ $0x00, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	MOVQ DI, SI
	DECQ BX
	JNE sqrLoop
	RET

// func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
// Requires: CMOV
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	XORQ R13, R13

	// First reduction step
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	ADCQ $0x00, R13
	XORQ R8, R8

	// x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// Second reduction step
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8
	XORQ R9, R9

	// x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// Third reduction step
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9
	XORQ R10, R10

	// x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Last reduction step
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10
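
	// The reduced value is at most 2*p256-1 across the five limbs
	// (R12, R13, R8, R9 and the carry in R10), so a single conditional
	// subtraction of p256 brings it into [0, p256): the difference is kept
	// unless the subtraction borrows, in which case CMOVQCS restores the
	// saved copy.
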
	// Copy result [255:0]
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Subtract p256
	SUBQ $-1, R12
	SBBQ p256const0<>+0(SB), R13
	SBBQ $0x00, R8
	SBBQ p256const1<>+0(SB), R9
	SBBQ $0x00, R10
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET

// func p256FromMont(res *p256Element, in *p256Element)
// Requires: CMOV
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	XORQ R12, R12

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ R8, AX
	MOVQ R8, R15
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R8, R9
	ADCQ R15, R10
	ADCQ AX, R11
	ADCQ DX, R12
	XORQ R13, R13

	// Second stage
	MOVQ R9, AX
	MOVQ R9, R15
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R9, R10
	ADCQ R15, R11
	ADCQ AX, R12
	ADCQ DX, R13
	XORQ R8, R8

	// Third stage
	MOVQ R10, AX
	MOVQ R10, R15
	SHLQ $0x20, R10
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R10, R11
	ADCQ R15, R12
	ADCQ AX, R13
	ADCQ DX, R8
	XORQ R9, R9

	// Last stage
	MOVQ R11, AX
	MOVQ R11, R15
	SHLQ $0x20, R11
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, R15
	ADDQ R11, R12
	ADCQ R15, R13
	ADCQ AX, R8
	ADCQ DX, R9
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15
	SUBQ $-1, R12
	SBBQ p256const0<>+0(SB), R13
	SBBQ $0x00, R8
	SBBQ p256const1<>+0(SB), R9
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET

// func p256Select(res *P256Point, table *p256Table, idx int)
// Requires: SSE2
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX
	PXOR X15, X15
	PCMPEQL X14, X14
	PSUBL X14, X15
	MOVL AX, X14
	PSHUFD $0x00, X14, X14
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $0x00000010, AX
	MOVOU X15, X13

loop_select:
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12
	MOVOU (DI), X6
	MOVOU 16(DI), X7
	MOVOU 32(DI), X8
	MOVOU 48(DI), X9
	MOVOU 64(DI), X10
	MOVOU 80(DI), X11
	ADDQ $0x60, DI
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5
	DECQ AX
	JNE loop_select
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	MOVOU X4, 64(DX)
	MOVOU X5, 80(DX)
	RET

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Requires: SSE2
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVQ idx+16(FP), AX
	MOVQ table+8(FP), DI
	MOVQ res+0(FP), DX
	PXOR X15, X15
	PCMPEQL X14, X14
	PSUBL X14, X15
	MOVL AX, X14
	PSHUFD $0x00, X14, X14
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $0x00000010, AX
	MOVOU X15, X13

loop_select_base:
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12
	MOVOU (DI), X4
	MOVOU 16(DI), X5
	MOVOU 32(DI), X6
	MOVOU 48(DI), X7
	MOVOU 64(DI), X8
	MOVOU 80(DI), X9
	MOVOU 96(DI), X10
	MOVOU 112(DI), X11
	ADDQ $0x80, DI
	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11
	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3
	DECQ AX
	JNE loop_select_base
	MOVOU X0, (DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	RET

// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
// Requires: CMOV
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in1+8(FP), SI
	MOVQ in2+16(FP), CX

	// x * y[0]
	MOVQ (CX), R14
	MOVQ (SI), AX
	MULQ R14
	MOVQ AX, R8
	MOVQ DX, R9
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	XORQ R13, R13
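
	// Reduction modulo the group order n (p256ord) cannot use the shift
	// trick of p256Mul above, since n has no special low-limb structure.
	// Each step instead multiplies the low accumulator limb by
	// p256ordK0 = -n^-1 mod 2^64 to obtain the Montgomery multiplier, then
	// accumulates that multiple of n.
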
	// First reduction step
	MOVQ R8, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ DX, R12
	ADCQ $0x00, R13

	// x * y[1]
	MOVQ 8(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// Second reduction step
	MOVQ R9, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ DX, R13
	ADCQ $0x00, R8

	// x * y[2]
	MOVQ 16(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// Third reduction step
	MOVQ R10, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ DX, R8
	ADCQ $0x00, R9

	// x * y[3]
	MOVQ 24(CX), R14
	MOVQ (SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 8(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Last reduction step
	MOVQ R11, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+16(SB), AX
	MULQ R14
	ADDQ R15, R13
	ADCQ $0x00, DX
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+24(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ DX, R9
	ADCQ $0x00, R10

	// Copy result [255:0]
	MOVQ R12, SI
	MOVQ R13, R11
	MOVQ R8, R14
	MOVQ R9, R15

	// Subtract p256ord
	SUBQ p256ord<>+0(SB), R12
	SBBQ p256ord<>+8(SB), R13
	SBBQ p256ord<>+16(SB), R8
	SBBQ p256ord<>+24(SB), R9
	SBBQ $0x00, R10
	CMOVQCS SI, R12
	CMOVQCS R11, R13
	CMOVQCS R14, R8
	CMOVQCS R15, R9
	MOVQ R12, (DI)
	MOVQ R13, 8(DI)
	MOVQ R8, 16(DI)
	MOVQ R9, 24(DI)
	RET

DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8
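
// p256ord is the order n of the P-256 base point, least-significant limb
// first; p256ordK0 above is -n^-1 mod 2^64, the Montgomery reduction
// multiplier for n.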
DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32

// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
// Requires: CMOV
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
	MOVQ res+0(FP), DI
	MOVQ in+8(FP), SI
	MOVQ n+16(FP), BX

ordSqrLoop:
	// y[1:] * y[0]
	MOVQ (SI), R14
	MOVQ 8(SI), AX
	MULQ R14
	MOVQ AX, R9
	MOVQ DX, R10
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12

	// y[2:] * y[1]
	MOVQ 8(SI), R14
	MOVQ 16(SI), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ R15, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// y[3] * y[2]
	MOVQ 16(SI), R14
	MOVQ 24(SI), AX
	MULQ R14
	ADDQ AX, R13
	ADCQ $0x00, DX
	MOVQ DX, CX
	XORQ R15, R15

	// *2
	ADDQ R9, R9
	ADCQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ CX, CX
	ADCQ $0x00, R15

	// Missing products
	MOVQ (SI), AX
	MULQ AX
	MOVQ AX, R8
	MOVQ DX, R14
	MOVQ 8(SI), AX
	MULQ AX
	ADDQ R14, R9
	ADCQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 16(SI), AX
	MULQ AX
	ADDQ R14, R11
	ADCQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ 24(SI), AX
	MULQ AX
	ADDQ R14, R13
	ADCQ AX, CX
	ADCQ DX, R15
	MOVQ R15, SI

	// First reduction step
	MOVQ R8, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	MOVQ R14, R15
	ADCQ DX, R10
	ADCQ $0x00, R15
	SUBQ R14, R10
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R8
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R11
	ADCQ $0x00, R8
	SUBQ AX, R11
	SBBQ DX, R8

	// Second reduction step
	MOVQ R9, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	MOVQ R14, R15
	ADCQ DX, R11
	ADCQ $0x00, R15
	SUBQ R14, R11
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R9
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R8
	ADCQ $0x00, R9
	SUBQ AX, R8
	SBBQ DX, R9

	// Third reduction step
	MOVQ R10, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	MOVQ R14, R15
	ADCQ DX, R8
	ADCQ $0x00, R15
	SUBQ R14, R8
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R10
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R9
	ADCQ $0x00, R10
	SUBQ AX, R9
	SBBQ DX, R10

	// Last reduction step
	MOVQ R11, AX
	MULQ p256ordK0<>+0(SB)
	MOVQ AX, R14
	MOVQ p256ord<>+0(SB), AX
	MULQ R14
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ p256ord<>+8(SB), AX
	MULQ R14
	ADDQ R15, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ R14, R15
	ADCQ DX, R9
	ADCQ $0x00, R15
	SUBQ R14, R9
	SBBQ $0x00, R15
	MOVQ R14, AX
	MOVQ R14, DX
	MOVQ R14, R11
	SHLQ $0x20, AX
	SHRQ $0x20, DX
	ADDQ R15, R10
	ADCQ $0x00, R11
	SUBQ AX, R10
	SBBQ DX, R11
	XORQ R14, R14

	// Add bits [511:256] of the sqr result
	ADCQ R12, R8
	ADCQ R13, R9
	ADCQ CX, R10
	ADCQ SI, R11
	ADCQ $0x00, R14
	MOVQ R8, R12
	MOVQ R9, R13
	MOVQ R10, CX
	MOVQ R11, R15

	// Subtract p256ord
	SUBQ p256ord<>+0(SB), R8
	SBBQ p256ord<>+8(SB), R9
	SBBQ p256ord<>+16(SB), R10
	SBBQ p256ord<>+24(SB), R11
	SBBQ $0x00, R14
	CMOVQCS R12, R8
	CMOVQCS R13, R9
	CMOVQCS CX, R10
	CMOVQCS R15, R11
	MOVQ R8, (DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	MOVQ DI, SI
	DECQ BX
	JNE ordSqrLoop
	RET
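
// The internal helpers below use a register calling convention instead of
// the stack: the first operand and the result live in R10-R13, and the
// second operand of p256SubInternal and p256MulInternal in R14, R15, DI, SI
// (p256SqrInternal squares R10-R13 in place). Callers treat all other
// general-purpose registers as clobbered.
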
// func p256SubInternal()
// Requires: CMOV
TEXT p256SubInternal(SB), NOSPLIT, $0
	XORQ AX, AX
	SUBQ R14, R10
	SBBQ R15, R11
	SBBQ DI, R12
	SBBQ SI, R13
	SBBQ $0x00, AX
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9
	ADDQ $-1, R10
	ADCQ p256const0<>+0(SB), R11
	ADCQ $0x00, R12
	ADCQ p256const1<>+0(SB), R13
	ANDQ $0x01, AX
	CMOVQEQ BX, R10
	CMOVQEQ CX, R11
	CMOVQEQ R8, R12
	CMOVQEQ R9, R13
	RET

// func p256MulInternal()
// Requires: CMOV
TEXT p256MulInternal(SB), NOSPLIT, $8
	MOVQ R10, AX
	MULQ R14
	MOVQ AX, BX
	MOVQ DX, CX
	MOVQ R10, AX
	MULQ R15
	ADDQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ DI
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ SI
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R11, AX
	MULQ R14
	ADDQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R15
	ADDQ BP, R8
	ADCQ $0x00, DX
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ DI
	ADDQ BP, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ SI
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, R11
	MOVQ R12, AX
	MULQ R14
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ R15
	ADDQ BP, R9
	ADCQ $0x00, DX
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ DI
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R12, AX
	MULQ SI
	ADDQ BP, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, R12
	MOVQ R13, AX
	MULQ R14
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ R15
	ADDQ BP, R10
	ADCQ $0x00, DX
	ADDQ AX, R10
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ DI
	ADDQ BP, R11
	ADCQ $0x00, DX
	ADDQ AX, R11
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R13, AX
	MULQ SI
	ADDQ BP, R12
	ADCQ $0x00, DX
	ADDQ AX, R12
	ADCQ $0x00, DX
	MOVQ DX, R13

	// First reduction step
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $0x20, BX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BX

	// Second reduction step
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $0x20, CX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0x00, DX
	MOVQ DX, CX

	// Third reduction step
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Last reduction step
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ $0x00000000, BP

	// Add bits [511:256] of the result
	ADCQ BX, R10
	ADCQ CX, R11
	ADCQ R8, R12
	ADCQ R9, R13
	ADCQ $0x00, BP

	// Copy result
	MOVQ R10, BX
	MOVQ R11, CX
	MOVQ R12, R8
	MOVQ R13, R9

	// Subtract p256
	SUBQ $-1, R10
	SBBQ p256const0<>+0(SB), R11
	SBBQ $0x00, R12
	SBBQ p256const1<>+0(SB), R13
	SBBQ $0x00, BP

	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS BX, R10
	CMOVQCS CX, R11
	CMOVQCS R8, R12
	CMOVQCS R9, R13
	RET

// func p256SqrInternal()
// Requires: CMOV
TEXT p256SqrInternal(SB), NOSPLIT, $8
	MOVQ R10, AX
	MULQ R11
	MOVQ AX, CX
	MOVQ DX, R8
	MOVQ R10, AX
	MULQ R12
	ADDQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ R10, AX
	MULQ R13
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, R14
	MOVQ R11, AX
	MULQ R12
	ADDQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BP
	MOVQ R11, AX
	MULQ R13
	ADDQ BP, R14
	ADCQ $0x00, DX
	ADDQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R15
	MOVQ R12, AX
	MULQ R13
	ADDQ AX, R15
	ADCQ $0x00, DX
	MOVQ DX, DI
	XORQ SI, SI

	// *2
	ADDQ CX, CX
	ADCQ R8, R8
	ADCQ R9, R9
	ADCQ R14, R14
	ADCQ R15, R15
	ADCQ DI, DI
	ADCQ $0x00, SI
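
	// The doubled terms above are the off-diagonal products a_i*a_j
	// (i < j), each computed only once; the diagonal squares a_i*a_i below
	// complete (a_0 + a_1*2^64 + a_2*2^128 + a_3*2^192)^2.
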
	// Missing products
	MOVQ R10, AX
	MULQ AX
	MOVQ AX, BX
	MOVQ DX, R10
	MOVQ R11, AX
	MULQ AX
	ADDQ R10, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R12, AX
	MULQ AX
	ADDQ R10, R9
	ADCQ AX, R14
	ADCQ $0x00, DX
	MOVQ DX, R10
	MOVQ R13, AX
	MULQ AX
	ADDQ R10, R15
	ADCQ AX, DI
	ADCQ DX, SI

	// First reduction step
	MOVQ BX, AX
	MOVQ BX, BP
	SHLQ $0x20, BX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ BX, CX
	ADCQ BP, R8
	ADCQ AX, R9
	ADCQ $0x00, DX
	MOVQ DX, BX

	// Second reduction step
	MOVQ CX, AX
	MOVQ CX, BP
	SHLQ $0x20, CX
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ CX, R8
	ADCQ BP, R9
	ADCQ AX, BX
	ADCQ $0x00, DX
	MOVQ DX, CX

	// Third reduction step
	MOVQ R8, AX
	MOVQ R8, BP
	SHLQ $0x20, R8
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R8, R9
	ADCQ BP, BX
	ADCQ AX, CX
	ADCQ $0x00, DX
	MOVQ DX, R8

	// Last reduction step
	MOVQ R9, AX
	MOVQ R9, BP
	SHLQ $0x20, R9
	MULQ p256const1<>+0(SB)
	SHRQ $0x20, BP
	ADDQ R9, BX
	ADCQ BP, CX
	ADCQ AX, R8
	ADCQ $0x00, DX
	MOVQ DX, R9
	MOVQ $0x00000000, BP

	// Add bits [511:256] of the result
	ADCQ BX, R14
	ADCQ CX, R15
	ADCQ R8, DI
	ADCQ R9, SI
	ADCQ $0x00, BP

	// Copy result
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ DI, R12
	MOVQ SI, R13

	// Subtract p256
	SUBQ $-1, R10
	SBBQ p256const0<>+0(SB), R11
	SBBQ $0x00, R12
	SBBQ p256const1<>+0(SB), R13
	SBBQ $0x00, BP

	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS R14, R10
	CMOVQCS R15, R11
	CMOVQCS DI, R12
	CMOVQCS SI, R13
	RET

// func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
// Requires: CMOV, SSE2
TEXT ·p256PointAddAffineAsm(SB), $512-48
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), R15
	MOVQ zero+40(FP), DI
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU X0, 96(SP)
	MOVOU X1, 112(SP)

	// Store pointer to result
	MOVQ AX, 480(SP)
	MOVL R15, 488(SP)
	MOVL DI, 492(SP)

	// Negate y2in based on sign
	MOVQ 32(CX), R10
	MOVQ 40(CX), R11
	MOVQ 48(CX), R12
	MOVQ 56(CX), R13
	MOVQ $-1, BX
	MOVQ p256const0<>+0(SB), CX
	MOVQ $0x00000000, R8
	MOVQ p256const1<>+0(SB), R9
	XORQ AX, AX

	// Speculatively subtract
	SUBQ R10, BX
	SBBQ R11, CX
	SBBQ R12, R8
	SBBQ R13, R9
	SBBQ $0x00, AX
	MOVQ BX, R14
	MOVQ CX, R15
	MOVQ R8, DI
	MOVQ R9, SI

	// Add in case the operand was > p256
	ADDQ $-1, BX
	ADCQ p256const0<>+0(SB), CX
	ADCQ $0x00, R8
	ADCQ p256const1<>+0(SB), R9
	ADCQ $0x00, AX
	CMOVQNE R14, BX
	CMOVQNE R15, CX
	CMOVQNE DI, R8
	CMOVQNE SI, R9

	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ R10, BX
	CMOVQEQ R11, CX
	CMOVQEQ R12, R8
	CMOVQEQ R13, R9

	// Store result
	MOVQ BX, 128(SP)
	MOVQ CX, 136(SP)
	MOVQ R8, 144(SP)
	MOVQ R9, 152(SP)
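
	// What follows is a straight-line mixed Jacobian-affine point
	// addition. The internal helpers consume the whole register file, so
	// every intermediate value is spilled to the 512-byte stack frame
	// between calls.
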
	// Begin point add
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 288(SP)
	MOVQ R11, 296(SP)
	MOVQ R12, 304(SP)
	MOVQ R13, 312(SP)
	MOVQ 96(SP), R14
	MOVQ 104(SP), R15
	MOVQ 112(SP), DI
	MOVQ 120(SP), SI
	CALL p256MulInternal(SB)
	MOVQ (SP), R14
	MOVQ 8(SP), R15
	MOVQ 16(SP), DI
	MOVQ 24(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)
	MOVQ 64(SP), R14
	MOVQ 72(SP), R15
	MOVQ 80(SP), DI
	MOVQ 88(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 224(SP)
	MOVQ R11, 232(SP)
	MOVQ R12, 240(SP)
	MOVQ R13, 248(SP)
	MOVQ 288(SP), R10
	MOVQ 296(SP), R11
	MOVQ 304(SP), R12
	MOVQ 312(SP), R13
	CALL p256MulInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 256(SP)
	MOVQ R11, 264(SP)
	MOVQ R12, 272(SP)
	MOVQ R13, 280(SP)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 352(SP)
	MOVQ R11, 360(SP)
	MOVQ R12, 368(SP)
	MOVQ R13, 376(SP)
	CALL p256SqrInternal(SB)
	MOVQ R10, 416(SP)
	MOVQ R11, 424(SP)
	MOVQ R12, 432(SP)
	MOVQ R13, 440(SP)
	MOVQ 320(SP), R10
	MOVQ 328(SP), R11
	MOVQ 336(SP), R12
	MOVQ 344(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 384(SP)
	MOVQ R11, 392(SP)
	MOVQ R12, 400(SP)
	MOVQ R13, 408(SP)
	MOVQ 320(SP), R14
	MOVQ 328(SP), R15
	MOVQ 336(SP), DI
	MOVQ 344(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 448(SP)
	MOVQ R11, 456(SP)
	MOVQ R12, 464(SP)
	MOVQ R13, 472(SP)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 256(SP)
	MOVQ R11, 264(SP)
	MOVQ R12, 272(SP)
	MOVQ R13, 280(SP)
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 384(SP), R14
	MOVQ 392(SP), R15
	MOVQ 400(SP), DI
	MOVQ 408(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ 416(SP), R10
	MOVQ 424(SP), R11
	MOVQ 432(SP), R12
	MOVQ 440(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 448(SP), R14
	MOVQ 456(SP), R15
	MOVQ 464(SP), DI
	MOVQ 472(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 160(SP)
	MOVQ R11, 168(SP)
	MOVQ R12, 176(SP)
	MOVQ R13, 184(SP)
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 320(SP), R10
	MOVQ 328(SP), R11
	MOVQ 336(SP), R12
	MOVQ 344(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 352(SP), R14
	MOVQ 360(SP), R15
	MOVQ 368(SP), DI
	MOVQ 376(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 256(SP), R14
	MOVQ 264(SP), R15
	MOVQ 272(SP), DI
	MOVQ 280(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 192(SP)
	MOVQ R11, 200(SP)
	MOVQ R12, 208(SP)
	MOVQ R13, 216(SP)

	// Load stored values from stack
	MOVQ 480(SP), AX
	MOVL 488(SP), BX
	MOVL 492(SP), CX

	// The result is not valid if (sel == 0), conditional choose
	MOVOU 160(SP), X0
	MOVOU 176(SP), X1
	MOVOU 192(SP), X2
	MOVOU 208(SP), X3
	MOVOU 224(SP), X4
	MOVOU 240(SP), X5
	MOVL BX, X6
	MOVL CX, X7
	PXOR X8, X8
	PCMPEQL X9, X9
	PSHUFD $0x00, X6, X6
	PSHUFD $0x00, X7, X7
	PCMPEQL X8, X6
	PCMPEQL X8, X7
	MOVOU X6, X15
	PANDN X9, X15
	MOVOU (SP), X9
	MOVOU 16(SP), X10
	MOVOU 32(SP), X11
	MOVOU 48(SP), X12
	MOVOU 64(SP), X13
	MOVOU 80(SP), X14
	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5
	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14
	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15
	MOVOU 96(SP), X9
	MOVOU 112(SP), X10
	MOVOU 128(SP), X11
	MOVOU 144(SP), X12
	MOVOU p256one<>+0(SB), X13
	MOVOU p256one<>+16(SB), X14
	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5
	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14
	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Finally output the result
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, 32(AX)
	MOVOU X3, 48(AX)
	MOVOU X4, 64(AX)
	MOVOU X5, 80(AX)
	MOVQ $0x00000000, 480(SP)
	RET

DATA p256one<>+0(SB)/8, $0x0000000000000001
DATA p256one<>+8(SB)/8, $0xffffffff00000000
DATA p256one<>+16(SB)/8, $0xffffffffffffffff
DATA p256one<>+24(SB)/8, $0x00000000fffffffe
GLOBL p256one<>(SB), RODATA, $32

// func p256IsZero()
// Requires: CMOV
TEXT p256IsZero(SB), NOSPLIT, $0
	// AX contains a flag that is set if the input is zero.
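	// It is 1 when the four-limb value in R10-R13 equals 0 or p256 (both
	// encode zero mod p256), and 0 otherwise.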
	XORQ AX, AX
	MOVQ $0x00000001, R15

	// Check whether [acc4..acc7] are all zero.
	MOVQ R10, R14
	ORQ R11, R14
	ORQ R12, R14
	ORQ R13, R14

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ R15, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	XORQ $-1, R10
	XORQ p256const0<>+0(SB), R11
	XORQ p256const1<>+0(SB), R13
	ORQ R11, R10
	ORQ R12, R10
	ORQ R13, R10

	// Set the zero flag if so.
	CMOVQEQ R15, AX
	RET

// func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
// Requires: CMOV, SSE2
TEXT ·p256PointAddAsm(SB), $680-32
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU 32(CX), X2
	MOVOU 48(CX), X3
	MOVOU 64(CX), X4
	MOVOU 80(CX), X5
	MOVOU X0, 96(SP)
	MOVOU X1, 112(SP)
	MOVOU X2, 128(SP)
	MOVOU X3, 144(SP)
	MOVOU X4, 160(SP)
	MOVOU X5, 176(SP)

	// Store pointer to result
	MOVQ AX, 640(SP)

	// Begin point add
	MOVQ 160(SP), R10
	MOVQ 168(SP), R11
	MOVQ 176(SP), R12
	MOVQ 184(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 448(SP)
	MOVQ R11, 456(SP)
	MOVQ R12, 464(SP)
	MOVQ R13, 472(SP)
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 352(SP)
	MOVQ R11, 360(SP)
	MOVQ R12, 368(SP)
	MOVQ R13, 376(SP)
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 416(SP)
	MOVQ R11, 424(SP)
	MOVQ R12, 432(SP)
	MOVQ R13, 440(SP)
	MOVQ 64(SP), R14
	MOVQ 72(SP), R15
	MOVQ 80(SP), DI
	MOVQ 88(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 384(SP)
	MOVQ R11, 392(SP)
	MOVQ R12, 400(SP)
	MOVQ R13, 408(SP)
	MOVQ 352(SP), R14
	MOVQ 360(SP), R15
	MOVQ 368(SP), DI
	MOVQ 376(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 512(SP)
	MOVQ R11, 520(SP)
	MOVQ R12, 528(SP)
	MOVQ R13, 536(SP)
	CALL p256IsZero(SB)
	MOVQ AX, 648(SP)
	MOVQ 448(SP), R10
	MOVQ 456(SP), R11
	MOVQ 464(SP), R12
	MOVQ 472(SP), R13
	MOVQ (SP), R14
	MOVQ 8(SP), R15
	MOVQ 16(SP), DI
	MOVQ 24(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 288(SP)
	MOVQ R11, 296(SP)
	MOVQ R12, 304(SP)
	MOVQ R13, 312(SP)
	MOVQ 416(SP), R10
	MOVQ 424(SP), R11
	MOVQ 432(SP), R12
	MOVQ 440(SP), R13
	MOVQ 96(SP), R14
	MOVQ 104(SP), R15
	MOVQ 112(SP), DI
	MOVQ 120(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)
	MOVQ 288(SP), R14
	MOVQ 296(SP), R15
	MOVQ 304(SP), DI
	MOVQ 312(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 480(SP)
	MOVQ R11, 488(SP)
	MOVQ R12, 496(SP)
	MOVQ R13, 504(SP)
	CALL p256IsZero(SB)
	ANDQ 648(SP), AX
	MOVQ AX, 648(SP)
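
	// 648(SP) is now 1 iff both differences above were zero, i.e. in1 and
	// in2 represent the same point, where these addition formulas
	// degenerate; the flag is returned in ret+24(FP) so the caller can
	// fall back to point doubling.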
	MOVQ 512(SP), R10
	MOVQ 520(SP), R11
	MOVQ 528(SP), R12
	MOVQ 536(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 576(SP)
	MOVQ R11, 584(SP)
	MOVQ R12, 592(SP)
	MOVQ R13, 600(SP)
	MOVQ 480(SP), R10
	MOVQ 488(SP), R11
	MOVQ 496(SP), R12
	MOVQ 504(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 544(SP)
	MOVQ R11, 552(SP)
	MOVQ R12, 560(SP)
	MOVQ R13, 568(SP)
	MOVQ 480(SP), R14
	MOVQ 488(SP), R15
	MOVQ 496(SP), DI
	MOVQ 504(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 608(SP)
	MOVQ R11, 616(SP)
	MOVQ R12, 624(SP)
	MOVQ R13, 632(SP)
	MOVQ 352(SP), R14
	MOVQ 360(SP), R15
	MOVQ 368(SP), DI
	MOVQ 376(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 384(SP)
	MOVQ R11, 392(SP)
	MOVQ R12, 400(SP)
	MOVQ R13, 408(SP)
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 480(SP), R14
	MOVQ 488(SP), R15
	MOVQ 496(SP), DI
	MOVQ 504(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 256(SP)
	MOVQ R11, 264(SP)
	MOVQ R12, 272(SP)
	MOVQ R13, 280(SP)
	MOVQ 544(SP), R10
	MOVQ 552(SP), R11
	MOVQ 560(SP), R12
	MOVQ 568(SP), R13
	MOVQ 288(SP), R14
	MOVQ 296(SP), R15
	MOVQ 304(SP), DI
	MOVQ 312(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 320(SP)
	MOVQ R11, 328(SP)
	MOVQ R12, 336(SP)
	MOVQ R13, 344(SP)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ 576(SP), R10
	MOVQ 584(SP), R11
	MOVQ 592(SP), R12
	MOVQ 600(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 608(SP), R14
	MOVQ 616(SP), R15
	MOVQ 624(SP), DI
	MOVQ 632(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 192(SP)
	MOVQ R11, 200(SP)
	MOVQ R12, 208(SP)
	MOVQ R13, 216(SP)
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 320(SP), R10
	MOVQ 328(SP), R11
	MOVQ 336(SP), R12
	MOVQ 344(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 512(SP), R14
	MOVQ 520(SP), R15
	MOVQ 528(SP), DI
	MOVQ 536(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 384(SP), R14
	MOVQ 392(SP), R15
	MOVQ 400(SP), DI
	MOVQ 408(SP), SI
	CALL p256SubInternal(SB)
	MOVQ R10, 224(SP)
	MOVQ R11, 232(SP)
	MOVQ R12, 240(SP)
	MOVQ R13, 248(SP)
	MOVOU 192(SP), X0
	MOVOU 208(SP), X1
	MOVOU 224(SP), X2
	MOVOU 240(SP), X3
	MOVOU 256(SP), X4
	MOVOU 272(SP), X5

	// Finally output the result
	MOVQ 640(SP), AX
	MOVQ $0x00000000, 640(SP)
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, 32(AX)
	MOVOU X3, 48(AX)
	MOVOU X4, 64(AX)
	MOVOU X5, 80(AX)
	MOVQ 648(SP), AX
	MOVQ AX, ret+24(FP)
	RET

// func p256PointDoubleAsm(res *P256Point, in *P256Point)
// Requires: CMOV, SSE2
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX
	MOVOU (BX), X0
	MOVOU 16(BX), X1
	MOVOU 32(BX), X2
	MOVOU 48(BX), X3
	MOVOU 64(BX), X4
	MOVOU 80(BX), X5
	MOVOU X0, (SP)
	MOVOU X1, 16(SP)
	MOVOU X2, 32(SP)
	MOVOU X3, 48(SP)
	MOVOU X4, 64(SP)
	MOVOU X5, 80(SP)

	// Store pointer to result
	MOVQ AX, 224(SP)
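
	// Jacobian doubling specialized for a = -3: below,
	// 3*(x - z^2)*(x + z^2) stands in for 3*x^2 + a*z^4, saving a
	// squaring. Intermediates are again spilled to the stack frame.
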
	// Begin point double
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 160(SP)
	MOVQ R11, 168(SP)
	MOVQ R12, 176(SP)
	MOVQ R13, 184(SP)
	MOVQ (SP), R14
	MOVQ 8(SP), R15
	MOVQ 16(SP), DI
	MOVQ 24(SP), SI
	XORQ AX, AX
	ADDQ R14, R10
	ADCQ R15, R11
	ADCQ DI, R12
	ADCQ SI, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, 128(SP)
	MOVQ R15, 136(SP)
	MOVQ DI, 144(SP)
	MOVQ SI, 152(SP)
	MOVQ 64(SP), R10
	MOVQ 72(SP), R11
	MOVQ 80(SP), R12
	MOVQ 88(SP), R13
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256MulInternal(SB)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ 224(SP), AX

	// Store z
	MOVQ R14, 64(AX)
	MOVQ R15, 72(AX)
	MOVQ DI, 80(AX)
	MOVQ SI, 88(AX)
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 160(SP), R14
	MOVQ 168(SP), R15
	MOVQ 176(SP), DI
	MOVQ 184(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 128(SP)
	MOVQ R11, 136(SP)
	MOVQ R12, 144(SP)
	MOVQ R13, 152(SP)

	// Multiply by 3
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ 128(SP), R10
	MOVQ 136(SP), R11
	MOVQ 144(SP), R12
	MOVQ 152(SP), R13
	XORQ AX, AX
	ADDQ R14, R10
	ADCQ R15, R11
	ADCQ DI, R12
	ADCQ SI, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, 128(SP)
	MOVQ R15, 136(SP)
	MOVQ DI, 144(SP)
	MOVQ SI, 152(SP)

	// ////////////////////////
	MOVQ 32(SP), R10
	MOVQ 40(SP), R11
	MOVQ 48(SP), R12
	MOVQ 56(SP), R13
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, R10
	MOVQ R15, R11
	MOVQ DI, R12
	MOVQ SI, R13
	CALL p256SqrInternal(SB)
	MOVQ R10, 96(SP)
	MOVQ R11, 104(SP)
	MOVQ R12, 112(SP)
	MOVQ R13, 120(SP)
	CALL p256SqrInternal(SB)

	// Divide by 2
	XORQ AX, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	ADDQ $-1, R10
	ADCQ p256const0<>+0(SB), R11
	ADCQ $0x00, R12
	ADCQ p256const1<>+0(SB), R13
	ADCQ $0x00, AX
	TESTQ $0x00000001, R14
	CMOVQEQ R14, R10
	CMOVQEQ R15, R11
	CMOVQEQ DI, R12
	CMOVQEQ SI, R13
	ANDQ R14, AX
	SHRQ $0x01, R11, R10
	SHRQ $0x01, R12, R11
	SHRQ $0x01, R13, R12
	SHRQ $0x01, AX, R13
	MOVQ R10, 32(SP)
	MOVQ R11, 40(SP)
	MOVQ R12, 48(SP)
	MOVQ R13, 56(SP)

	// /////////////////////////
	MOVQ (SP), R10
	MOVQ 8(SP), R11
	MOVQ 16(SP), R12
	MOVQ 24(SP), R13
	MOVQ 96(SP), R14
	MOVQ 104(SP), R15
	MOVQ 112(SP), DI
	MOVQ 120(SP), SI
	CALL p256MulInternal(SB)
	MOVQ R10, 96(SP)
	MOVQ R11, 104(SP)
	MOVQ R12, 112(SP)
	MOVQ R13, 120(SP)
	XORQ AX, AX
	ADDQ R10, R10
	ADCQ R11, R11
	ADCQ R12, R12
	ADCQ R13, R13
	ADCQ $+0, AX
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	SUBQ $-1, R14
	SBBQ p256const0<>+0(SB), R15
	SBBQ $+0, DI
	SBBQ p256const1<>+0(SB), SI
	SBBQ $+0, AX
	CMOVQCS R10, R14
	CMOVQCS R11, R15
	CMOVQCS R12, DI
	CMOVQCS R13, SI
	MOVQ R14, 192(SP)
	MOVQ R15, 200(SP)
	MOVQ DI, 208(SP)
	MOVQ SI, 216(SP)
	MOVQ 128(SP), R10
	MOVQ 136(SP), R11
	MOVQ 144(SP), R12
	MOVQ 152(SP), R13
	CALL p256SqrInternal(SB)
	MOVQ 192(SP), R14
	MOVQ 200(SP), R15
	MOVQ 208(SP), DI
	MOVQ 216(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 224(SP), AX

	// Store x
	MOVQ R10, (AX)
	MOVQ R11, 8(AX)
	MOVQ R12, 16(AX)
	MOVQ R13, 24(AX)
	MOVQ R10, R14
	MOVQ R11, R15
	MOVQ R12, DI
	MOVQ R13, SI
	MOVQ 96(SP), R10
	MOVQ 104(SP), R11
	MOVQ 112(SP), R12
	MOVQ 120(SP), R13
	CALL p256SubInternal(SB)
	MOVQ 128(SP), R14
	MOVQ 136(SP), R15
	MOVQ 144(SP), DI
	MOVQ 152(SP), SI
	CALL p256MulInternal(SB)
	MOVQ 32(SP), R14
	MOVQ 40(SP), R15
	MOVQ 48(SP), DI
	MOVQ 56(SP), SI
	CALL p256SubInternal(SB)
	MOVQ 224(SP), AX

	// Store y
	MOVQ R10, 32(AX)
	MOVQ R11, 40(AX)
	MOVQ R12, 48(AX)
	MOVQ R13, 56(AX)

	// ///////////////////////
	MOVQ $0x00000000, 224(SP)
	RET