// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// ChaCha8 is ChaCha with 8 rounds.
// See https://cr.yp.to/chacha/chacha-20080128.pdf.
// See chacha8_generic.go for additional details.

// ROL rotates the uint32s in register R left by N bits, using temporary T.
#define ROL(N, R, T) \
	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
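// In Go terms, each 32-bit lane x becomes roughly x = x<<N | x>>(32-N)
// (a sketch; see chacha8_generic.go for the reference implementation).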

// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
#else
#define ROL16(R, T) ROL(16, R, T)
#endif

// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
#ifdef GOAMD64_v2
#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
#else
#define ROL8(R, T) ROL(8, R, T)
#endif
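// Rotating by a multiple of 8 bits only permutes bytes within each 32-bit lane,
// so on GOAMD64_v2, where PSHUFB (SSSE3) is guaranteed to be available, a single
// shuffle with the ·rol16<>/·rol8<> masks at the bottom of this file replaces the
// shift/shift/xor sequence.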

// QR is the ChaCha quarter-round on A, B, C, and D. T is an available temporary.
#define QR(A, B, C, D, T) \
	PADDD B, A; PXOR A, D; ROL16(D, T); \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B; \
	PADDD B, A; PXOR A, D; ROL8(D, T); \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
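// In scalar Go terms the quarter-round is roughly (cf. chacha8_generic.go):
//
//	a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//	c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//	a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//	c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//
// applied here to four independent blocks at once, one per 32-bit stripe.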

// REPLREG replicates the register R into 4 uint32s in XR.
#define REPLREG(R, XR) \
	MOVQ R, XR; \
	PSHUFD $0, XR, XR

// REPL replicates the uint32 constant val into 4 uint32s in XR. It smashes DX.
#define REPL(val, XR) \
	MOVL $val, DX; \
	REPLREG(DX, XR)
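// REPLREG, REPL, and SEED (below) all broadcast one uint32 v into a full
// X register, roughly xr = [4]uint32{v, v, v, v} in Go terms: MOVQ places
// v in lane 0 and PSHUFD $0 copies lane 0 into all four lanes.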

// SEED copies the off'th uint32 of the seed into the register XR,
// replicating it into all four stripes of the register.
#define SEED(off, reg, XR) \
	MOVL (4*off)(AX), reg; \
	REPLREG(reg, XR) \

// block runs 4 ChaCha8 block transformations in the four stripes of the X registers.

// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
	// seed in AX
	// blocks in BX
	// counter in CX
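
	// The ChaCha state is a 4x4 matrix of uint32s. Each entry is held four
	// times over, once per output block, in the four 32-bit stripes of an
	// X register, laid out roughly as:
	//
	//	X0  X1  X2  X3    constants
	//	X4  X5  X6  X7    seed[0..3]
	//	X8  X9  X10 X11   seed[4..7]
	//	X12 X13 X14 X15   counter, 0, 0, 0
	//
	// (X15's value lives in blocks[15] memory part of the time, so that X15
	// can double as a temporary; see below.)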

	// Load initial constants into top row.
	REPL(0x61707865, X0)
	REPL(0x3320646e, X1)
	REPL(0x79622d32, X2)
	REPL(0x6b206574, X3)

	// Load counter into bottom left cell.
	// Each stripe gets a different counter: counter+0, +1, +2, +3.
	// (PINSRD is not available in GOAMD64_v1,
	// so just do it in memory on all systems.
	// This is not on the critical path.)
	MOVL CX, 0(SP)
	INCL CX
	MOVL CX, 4(SP)
	INCL CX
	MOVL CX, 8(SP)
	INCL CX
	MOVL CX, 12(SP)
	MOVOU 0(SP), X12
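	// X12 now holds counter+0..counter+3, one per stripe. (The 16-byte
	// frame declared in the TEXT line exists for this scratch store.)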

	// Load seed words into next two rows and into DI, SI, R8..R13
	SEED(0, DI, X4)
	SEED(1, SI, X5)
	SEED(2, R8, X6)
	SEED(3, R9, X7)
	SEED(4, R10, X8)
	SEED(5, R11, X9)
	SEED(6, R12, X10)
	SEED(7, R13, X11)

	// Zeros for remaining two matrix entries.
	// We have just enough XMM registers to hold the state,
	// without one for the temporary, so we flush and restore
	// some values to and from memory to provide a temporary.
	// The initial temporary is X15, so zero its memory instead
	// of X15 itself.
	MOVL $0, DX
	MOVQ DX, X13
	MOVQ DX, X14
	MOVOU X14, (15*16)(BX)

	// 4 iterations. Each iteration is 8 quarter-rounds.
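	// Each iteration is one ChaCha double round: quarter-rounds down the
	// four columns, then down the four diagonals, so 4 iterations give the
	// 8 rounds of ChaCha8. In scalar Go terms, roughly:
	//
	//	QR(&x0, &x4, &x8, &x12); QR(&x1, &x5, &x9, &x13)
	//	QR(&x2, &x6, &x10, &x14); QR(&x3, &x7, &x11, &x15)
	//	QR(&x0, &x5, &x10, &x15); QR(&x1, &x6, &x11, &x12)
	//	QR(&x2, &x7, &x8, &x13); QR(&x3, &x4, &x9, &x14)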
	MOVL $4, DX
loop:
	QR(X0, X4, X8, X12, X15)
	MOVOU X4, (4*16)(BX) // save X4
	QR(X1, X5, X9, X13, X15)
	MOVOU (15*16)(BX), X15 // reload X15; temp now X4
	QR(X2, X6, X10, X14, X4)
	QR(X3, X7, X11, X15, X4)

	QR(X0, X5, X10, X15, X4)
	MOVOU X15, (15*16)(BX) // save X15
	QR(X1, X6, X11, X12, X4)
	MOVOU (4*16)(BX), X4 // reload X4; temp now X15
	QR(X2, X7, X8, X13, X15)
	QR(X3, X4, X9, X14, X15)

	DECL DX
	JNZ loop

	// Store interlaced blocks back to output buffer,
	// adding original seed along the way.
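	// (Interlaced means blocks[i][j] is word i of block j, so each X register,
	// which holds word i for all four blocks, stores to the 16-byte slot at
	// offset i*16.)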

	// First the top and bottom rows.
	MOVOU X0, (0*16)(BX)
	MOVOU X1, (1*16)(BX)
	MOVOU X2, (2*16)(BX)
	MOVOU X3, (3*16)(BX)
	MOVOU X12, (12*16)(BX)
	MOVOU X13, (13*16)(BX)
	MOVOU X14, (14*16)(BX)
	// X15 has already been stored.

	// Now we have X0-X3, X12-X15 available for temporaries.
	// Add seed rows back to output. We left seed in DI, SI, R8..R13 above.
	REPLREG(DI, X0)
	REPLREG(SI, X1)
	REPLREG(R8, X2)
	REPLREG(R9, X3)
	REPLREG(R10, X12)
	REPLREG(R11, X13)
	REPLREG(R12, X14)
	REPLREG(R13, X15)
	PADDD X0, X4
	PADDD X1, X5
	PADDD X2, X6
	PADDD X3, X7
	PADDD X12, X8
	PADDD X13, X9
	PADDD X14, X10
	PADDD X15, X11
	MOVOU X4, (4*16)(BX)
	MOVOU X5, (5*16)(BX)
	MOVOU X6, (6*16)(BX)
	MOVOU X7, (7*16)(BX)
	MOVOU X8, (8*16)(BX)
	MOVOU X9, (9*16)(BX)
	MOVOU X10, (10*16)(BX)
	MOVOU X11, (11*16)(BX)

	MOVL $0, AX
	MOVQ AX, X15 // must be 0 on return

	RET

// rotate left 16 indexes for PSHUFB
GLOBL ·rol16<>(SB), NOPTR|RODATA, $16
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A

// rotate left 8 indexes for PSHUFB
GLOBL ·rol8<>(SB), NOPTR|RODATA, $16
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B
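
// Within each 32-bit lane, PSHUFB writes output byte i from the input byte
// selected by mask byte i, so the masks above pick bytes (2,3,0,1) and
// (3,0,1,2) of each lane, i.e. left rotations by 16 and 8 bits.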