Text file
src/hash/crc32/crc32_amd64.s
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// Strategy: consume 1/2/4 leading bytes as needed to 8-byte align the
// pointer, then run CRC32Q over aligned 8-byte chunks, then mop up the
// 4/2/1-byte tail. AX holds the running CRC throughout; the caller is
// responsible for any pre/post inversion of the CRC value.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX    // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX    // BX = SI mod 8 (misalignment of the pointer)
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	// For BX in 1..7, (BX-1)^7 == 8-BX, avoiding a NEG/ADD pair.
	SUBQ $1, BX
	XORQ $7, BX

	// Bit 0 of the byte count set => one byte to reach alignment.
	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	// Bit 1 set => two more bytes.
	BTQ $1, BX
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	// Bit 2 set => four more bytes.
	BTQ $2, BX
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// CX < 8 here, so bits 2/1/0 of CX select exactly the remaining tail.
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET
88
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// Interleaving three independent CRC streams lets consecutive CRC32Q
// instructions overlap — presumably to hide the instruction's latency
// (NOTE(review): perf rationale inferred from the interleaved structure).
// No bounds are checked here: the caller must guarantee that each of
// a, b, c holds at least 24*rounds bytes.
//
// func castagnoliSSE42Triple(
//     crc1, crc2, crc3 uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	// AX/CX/DX carry the three running CRCs.
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11    // loop counter; one round = 24 bytes/buffer

loop:
	// Three groups of three independent CRC32Q ops: 24 bytes per buffer.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
132
// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
//
// Roles (as consumed by ieeeCLMUL below):
//   r2r1   — folding-constant pair for the 64-byte main loop (loopback64)
//   r4r3   — folding-constant pair for the 16-byte folds (remain64/remain16)
//   rupoly — polynomial pair used in the final Barrett-style reduction
//   r5     — constant for folding the result down toward 32 bits
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
151
// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// ieeeCLMUL computes the IEEE CRC-32 of p, folded into crc, using
// carry-less multiplication (PCLMULQDQ).
//
// Structure:
//   1. Load 64 bytes into X1..X4 and XOR the initial CRC into X1.
//   2. Fold 64 bytes per iteration with r2r1 while >= 64 bytes remain.
//   3. Fold X1..X4 into X1 with r4r3, then fold any remaining 16-byte
//      blocks into X1.
//   4. Reduce the 128-bit remainder to 32 bits with r5 and a Barrett-style
//      reduction using rupoly; the result lands in dword 1 of X1.
//
// Fix: removed a stray "ADDQ $0x40, DI" from the main loop. DI is never
// initialized or read anywhere in this function (SI is the data pointer,
// CX the length), so the instruction was dead code left over from the
// code this was ported from.
//
// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
	MOVL crc+0(FP), X0    // Initial CRC value
	MOVQ p+8(FP), SI    // data pointer
	MOVQ p_len+16(FP), CX    // len(p)

	// Load the first 64 bytes; mix the initial CRC into the first vector.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left
	JB remain64

	MOVOA r2r1<>+0(SB), X0    // folding constants for 64-byte steps
loopback64:
	// Save the current vectors; each is folded as
	// (lo64 * r1) xor (hi64 * r2) xor next-64-bytes.
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	/* Load next early */
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left?
	JGE loopback64

	/* Fold result into a single register (X1) */
remain64:
	MOVOA r4r3<>+0(SB), X0    // folding constants for 16-byte steps

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	/* If there is less than 16 bytes left we are done */
	CMPQ CX, $16
	JB finish

	/* Encode 16 bytes */
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	/* Fold final result into 32 bits and return it */
	PCMPEQB X3, X3    // X3 = all ones; shifted below into a 32-bit mask
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5<>+0(SB), X0

	/* Creates 32 bit mask. Note that we don't care about upper half. */
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	// Barrett-style reduction with the (polynomial, mu) pair in rupoly.
	MOVOA rupoly<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	// The 32-bit CRC is in dword 1 of X1.
	PEXTRD $1, X1, AX
	MOVL AX, ret+32(FP)

	RET
280
View as plain text