// Listing of src/runtime/memmove_amd64.s (Go runtime); embedded numbers are the original source line numbers.
1 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
2 // https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
3 //
4 // Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
5 // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
6 // Portions Copyright 2009 The Go Authors. All rights reserved.
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 // THE SOFTWARE.
25
26 //go:build !plan9
27
28 #include "go_asm.h"
29 #include "textflag.h"
30
31 // See memmove Go doc for important implementation constraints.
32
33 // func memmove(to, from unsafe.Pointer, n uintptr)
34 // ABIInternal for performance.
35 TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
36 // AX = to
37 // BX = from
38 // CX = n
	// ABIInternal: the three arguments arrive in registers (AX/BX/CX), not on the stack.
39 MOVQ AX, DI
40 MOVQ BX, SI
41 MOVQ CX, BX
	// Registers now follow the REP MOVS* convention: DI = to, SI = from; BX holds the byte count.
42
43 // REP instructions have a high startup cost, so we handle small sizes
44 // with some straightline code. The REP MOVSQ instruction is really fast
45 // for large sizes. The cutover is approximately 2K.
46 tail:
47 // move_129through256 or smaller work whether or not the source and the
48 // destination memory regions overlap because they load all data into
49 // registers before writing it back. move_256through2048 on the other
50 // hand can be used only when the memory regions don't overlap or the copy
51 // direction is forward.
52 //
53 // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	// Size dispatch: binary-ish ladder of unsigned compares on BX, smallest sizes first.
54 TESTQ BX, BX
55 JEQ move_0
56 CMPQ BX, $2
57 JBE move_1or2
58 CMPQ BX, $4
59 JB move_3
60 JBE move_4
61 CMPQ BX, $8
62 JB move_5through7
63 JE move_8
64 CMPQ BX, $16
65 JBE move_9through16
66 CMPQ BX, $32
67 JBE move_17through32
68 CMPQ BX, $64
69 JBE move_33through64
70 CMPQ BX, $128
71 JBE move_65through128
72 CMPQ BX, $256
73 JBE move_129through256
74
	// n > 256 from here on. Prefer the AVX path when the runtime enabled it.
75 TESTB $1, runtime·useAVXmemmove(SB)
76 JNZ avxUnaligned
77
78 /*
79 * check and set for backwards
80 */
	// Unsigned compare: if from <= to the regions might overlap in the
	// forward direction, so take the backward-checking path.
81 CMPQ SI, DI
82 JLS back
83
84 /*
85 * forward copy loop
86 */
87 forward:
88 CMPQ BX, $2048
89 JLS move_256through2048
90
91 // If REP MOVSB isn't fast, don't use it
92 CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
93 JNE fwdBy8
94
95 // Check alignment
	// If either pointer is not 8-byte aligned, ERMS REP MOVSB handles the
	// unaligned case better than qword moves.
96 MOVL SI, AX
97 ORL DI, AX
98 TESTL $7, AX
99 JEQ fwdBy8
100
101 // Do 1 byte at a time
102 MOVQ BX, CX
103 REP; MOVSB
104 RET
105
106 fwdBy8:
107 // Do 8 bytes at a time
	// CX = number of qwords; BX keeps the 0..7 leftover bytes for the tail dispatcher.
108 MOVQ BX, CX
109 SHRQ $3, CX
110 ANDQ $7, BX
111 REP; MOVSQ
112 JMP tail
113
114 back:
115 /*
116 * check overlap
117 */
	// If from+n <= to the regions do not actually overlap, so the forward
	// path is still safe (and faster).
118 MOVQ SI, CX
119 ADDQ BX, CX
120 CMPQ CX, DI
121 JLS forward
122 /*
123 * whole thing backwards has
124 * adjusted addresses
125 */
126 ADDQ BX, DI
127 ADDQ BX, SI
128 STD
	// Direction flag is now set: REP MOVSQ below decrements SI/DI.
	// CLD after the loop restores the ABI-required forward direction.
129
130 /*
131 * copy
132 */
133 MOVQ BX, CX
134 SHRQ $3, CX
135 ANDQ $7, BX
136
137 SUBQ $8, DI
138 SUBQ $8, SI
139 REP; MOVSQ
140
141 CLD
	// Undo the trailing-qword bias and step back over the 0..7 leftover
	// bytes so the tail dispatcher can finish them (still overlap-safe:
	// the small cases load everything into registers first).
142 ADDQ $8, DI
143 ADDQ $8, SI
144 SUBQ BX, DI
145 SUBQ BX, SI
146 JMP tail
147
148 move_1or2:
	// Load both ends into registers before any store, so overlapping
	// source/destination still copies correctly.
149 MOVB (SI), AX
150 MOVB -1(SI)(BX*1), CX
151 MOVB AX, (DI)
152 MOVB CX, -1(DI)(BX*1)
153 RET
154 move_0:
155 RET
156 move_4:
157 MOVL (SI), AX
158 MOVL AX, (DI)
159 RET
160 move_3:
161 MOVW (SI), AX
162 MOVB 2(SI), CX
163 MOVW AX, (DI)
164 MOVB CX, 2(DI)
165 RET
166 move_5through7:
	// Two possibly-overlapping 4-byte moves cover every length in [5,7].
167 MOVL (SI), AX
168 MOVL -4(SI)(BX*1), CX
169 MOVL AX, (DI)
170 MOVL CX, -4(DI)(BX*1)
171 RET
172 move_8:
173 // We need a separate case for 8 to make sure we write pointers atomically.
174 MOVQ (SI), AX
175 MOVQ AX, (DI)
176 RET
177 move_9through16:
	// Two possibly-overlapping 8-byte moves cover every length in [9,16].
178 MOVQ (SI), AX
179 MOVQ -8(SI)(BX*1), CX
180 MOVQ AX, (DI)
181 MOVQ CX, -8(DI)(BX*1)
182 RET
183 move_17through32:
	// Same overlapping-ends trick, using 16-byte SSE registers.
184 MOVOU (SI), X0
185 MOVOU -16(SI)(BX*1), X1
186 MOVOU X0, (DI)
187 MOVOU X1, -16(DI)(BX*1)
188 RET
189 move_33through64:
190 MOVOU (SI), X0
191 MOVOU 16(SI), X1
192 MOVOU -32(SI)(BX*1), X2
193 MOVOU -16(SI)(BX*1), X3
194 MOVOU X0, (DI)
195 MOVOU X1, 16(DI)
196 MOVOU X2, -32(DI)(BX*1)
197 MOVOU X3, -16(DI)(BX*1)
198 RET
199 move_65through128:
200 MOVOU (SI), X0
201 MOVOU 16(SI), X1
202 MOVOU 32(SI), X2
203 MOVOU 48(SI), X3
204 MOVOU -64(SI)(BX*1), X4
205 MOVOU -48(SI)(BX*1), X5
206 MOVOU -32(SI)(BX*1), X6
207 MOVOU -16(SI)(BX*1), X7
208 MOVOU X0, (DI)
209 MOVOU X1, 16(DI)
210 MOVOU X2, 32(DI)
211 MOVOU X3, 48(DI)
212 MOVOU X4, -64(DI)(BX*1)
213 MOVOU X5, -48(DI)(BX*1)
214 MOVOU X6, -32(DI)(BX*1)
215 MOVOU X7, -16(DI)(BX*1)
216 RET
217 move_129through256:
	// All 256 bytes are staged in X0..X15 before any store, so this case
	// is safe for any overlap.
218 MOVOU (SI), X0
219 MOVOU 16(SI), X1
220 MOVOU 32(SI), X2
221 MOVOU 48(SI), X3
222 MOVOU 64(SI), X4
223 MOVOU 80(SI), X5
224 MOVOU 96(SI), X6
225 MOVOU 112(SI), X7
226 MOVOU -128(SI)(BX*1), X8
227 MOVOU -112(SI)(BX*1), X9
228 MOVOU -96(SI)(BX*1), X10
229 MOVOU -80(SI)(BX*1), X11
230 MOVOU -64(SI)(BX*1), X12
231 MOVOU -48(SI)(BX*1), X13
232 MOVOU -32(SI)(BX*1), X14
233 MOVOU -16(SI)(BX*1), X15
234 MOVOU X0, (DI)
235 MOVOU X1, 16(DI)
236 MOVOU X2, 32(DI)
237 MOVOU X3, 48(DI)
238 MOVOU X4, 64(DI)
239 MOVOU X5, 80(DI)
240 MOVOU X6, 96(DI)
241 MOVOU X7, 112(DI)
242 MOVOU X8, -128(DI)(BX*1)
243 MOVOU X9, -112(DI)(BX*1)
244 MOVOU X10, -96(DI)(BX*1)
245 MOVOU X11, -80(DI)(BX*1)
246 MOVOU X12, -64(DI)(BX*1)
247 MOVOU X13, -48(DI)(BX*1)
248 MOVOU X14, -32(DI)(BX*1)
249 MOVOU X15, -16(DI)(BX*1)
250 // X15 must be zero on return
251 PXOR X15, X15
252 RET
253 move_256through2048:
	// Forward-only loop: copies 256 bytes per iteration through X0..X15.
254 SUBQ $256, BX
255 MOVOU (SI), X0
256 MOVOU 16(SI), X1
257 MOVOU 32(SI), X2
258 MOVOU 48(SI), X3
259 MOVOU 64(SI), X4
260 MOVOU 80(SI), X5
261 MOVOU 96(SI), X6
262 MOVOU 112(SI), X7
263 MOVOU 128(SI), X8
264 MOVOU 144(SI), X9
265 MOVOU 160(SI), X10
266 MOVOU 176(SI), X11
267 MOVOU 192(SI), X12
268 MOVOU 208(SI), X13
269 MOVOU 224(SI), X14
270 MOVOU 240(SI), X15
271 MOVOU X0, (DI)
272 MOVOU X1, 16(DI)
273 MOVOU X2, 32(DI)
274 MOVOU X3, 48(DI)
275 MOVOU X4, 64(DI)
276 MOVOU X5, 80(DI)
277 MOVOU X6, 96(DI)
278 MOVOU X7, 112(DI)
279 MOVOU X8, 128(DI)
280 MOVOU X9, 144(DI)
281 MOVOU X10, 160(DI)
282 MOVOU X11, 176(DI)
283 MOVOU X12, 192(DI)
284 MOVOU X13, 208(DI)
285 MOVOU X14, 224(DI)
286 MOVOU X15, 240(DI)
287 CMPQ BX, $256
288 LEAQ 256(SI), SI
289 LEAQ 256(DI), DI
	// Signed JGE is required: BX may have gone negative after the SUBQ
	// above; an unsigned branch would treat that as a huge count.
290 JGE move_256through2048
291 // X15 must be zero on return
292 PXOR X15, X15
	// 0..255 bytes may remain (BX restored to n mod 256 range via the tail cases).
293 JMP tail
294
295 avxUnaligned:
296 // There are two implementations of move algorithm.
297 // The first one for non-overlapped memory regions. It uses forward copying.
298 // The second one for overlapped regions. It uses backward copying
299 MOVQ DI, CX
300 SUBQ SI, CX
301 // Now CX contains distance between SRC and DEST
302 CMPQ CX, BX
303 // If the distance lesser than region length it means that regions are overlapped
	// Unsigned carry trick: when DI < SI the difference wraps to a huge
	// value, CF stays clear, and the forward copy is (correctly) chosen.
304 JC copy_backward
305
306 // Non-temporal copy would be better for big sizes.
307 CMPQ BX, $0x100000
308 JAE gobble_big_data_fwd
309
310 // Memory layout on the source side
311 // SI CX
312 // |<---------BX before correction--------->|
313 // | |<--BX corrected-->| |
314 // | | |<--- AX --->|
315 // |<-R11->| |<-128 bytes->|
316 // +----------------------------------------+
317 // | Head | Body | Tail |
318 // +-------+------------------+-------------+
319 // ^ ^ ^
320 // | | |
321 // Save head into Y4 Save tail into X5..X12
322 // |
323 // SI+R11, where R11 = ((DI & -32) + 32) - DI
324 // Algorithm:
325 // 1. Unaligned save of the tail's 128 bytes
326 // 2. Unaligned save of the head's 32 bytes
327 // 3. Destination-aligned copying of body (128 bytes per iteration)
328 // 4. Put head on the new place
329 // 5. Put the tail on the new place
330 // It can be important to satisfy processor's pipeline requirements for
331 // small sizes as the cost of unaligned memory region copying is
332 // comparable with the cost of main loop. So code is slightly messed there.
333 // There is more clean implementation of that algorithm for bigger sizes
334 // where the cost of unaligned part copying is negligible.
335 // You can see it after gobble_big_data_fwd label.
336 LEAQ (SI)(BX*1), CX
337 MOVQ DI, R10
338 // CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
339 MOVOU -0x80(CX), X5
340 MOVOU -0x70(CX), X6
341 MOVQ $0x80, AX
342 // Align destination address
343 ANDQ $-32, DI
344 ADDQ $32, DI
345 // Continue tail saving.
346 MOVOU -0x60(CX), X7
347 MOVOU -0x50(CX), X8
348 // Make R11 delta between aligned and unaligned destination addresses.
349 MOVQ DI, R11
350 SUBQ R10, R11
351 // Continue tail saving.
352 MOVOU -0x40(CX), X9
353 MOVOU -0x30(CX), X10
354 // Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
355 SUBQ R11, BX
356 // Continue tail saving.
357 MOVOU -0x20(CX), X11
358 MOVOU -0x10(CX), X12
359 // The tail will be put on its place after main body copying.
360 // It's time for the unaligned heading part.
361 VMOVDQU (SI), Y4
362 // Adjust source address to point past head.
363 ADDQ R11, SI
364 SUBQ AX, BX
365 // Aligned memory copying there
366 gobble_128_loop:
367 VMOVDQU (SI), Y0
368 VMOVDQU 0x20(SI), Y1
369 VMOVDQU 0x40(SI), Y2
370 VMOVDQU 0x60(SI), Y3
371 ADDQ AX, SI
	// Destination stores use VMOVDQA: DI was 32-byte aligned above.
372 VMOVDQA Y0, (DI)
373 VMOVDQA Y1, 0x20(DI)
374 VMOVDQA Y2, 0x40(DI)
375 VMOVDQA Y3, 0x60(DI)
376 ADDQ AX, DI
377 SUBQ AX, BX
378 JA gobble_128_loop
379 // Now we can store unaligned parts.
	// BX becomes the end-of-destination address so the saved tail can be
	// stored with negative offsets from it.
380 ADDQ AX, BX
381 ADDQ DI, BX
382 VMOVDQU Y4, (R10)
	// VZEROUPPER before the SSE stores avoids AVX->SSE transition penalties.
383 VZEROUPPER
384 MOVOU X5, -0x80(BX)
385 MOVOU X6, -0x70(BX)
386 MOVOU X7, -0x60(BX)
387 MOVOU X8, -0x50(BX)
388 MOVOU X9, -0x40(BX)
389 MOVOU X10, -0x30(BX)
390 MOVOU X11, -0x20(BX)
391 MOVOU X12, -0x10(BX)
392 RET
393
394 gobble_big_data_fwd:
395 // There is forward copying for big regions.
396 // It uses non-temporal mov instructions.
397 // Details of this algorithm are commented previously for small sizes.
398 LEAQ (SI)(BX*1), CX
399 MOVOU -0x80(SI)(BX*1), X5
400 MOVOU -0x70(CX), X6
401 MOVOU -0x60(CX), X7
402 MOVOU -0x50(CX), X8
403 MOVOU -0x40(CX), X9
404 MOVOU -0x30(CX), X10
405 MOVOU -0x20(CX), X11
406 MOVOU -0x10(CX), X12
407 VMOVDQU (SI), Y4
408 MOVQ DI, R8
409 ANDQ $-32, DI
410 ADDQ $32, DI
411 MOVQ DI, R10
412 SUBQ R8, R10
413 SUBQ R10, BX
414 ADDQ R10, SI
415 LEAQ (DI)(BX*1), CX
416 SUBQ $0x80, BX
417 gobble_mem_fwd_loop:
418 PREFETCHNTA 0x1C0(SI)
419 PREFETCHNTA 0x280(SI)
420 // Prefetch values were chosen empirically.
421 // Approach for prefetch usage as in 9.5.6 of [1]
422 // [1] 64-ia-32-architectures-optimization-manual.pdf
423 // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
424 VMOVDQU (SI), Y0
425 VMOVDQU 0x20(SI), Y1
426 VMOVDQU 0x40(SI), Y2
427 VMOVDQU 0x60(SI), Y3
428 ADDQ $0x80, SI
	// Non-temporal stores bypass the cache; DI is 32-byte aligned as required.
429 VMOVNTDQ Y0, (DI)
430 VMOVNTDQ Y1, 0x20(DI)
431 VMOVNTDQ Y2, 0x40(DI)
432 VMOVNTDQ Y3, 0x60(DI)
433 ADDQ $0x80, DI
434 SUBQ $0x80, BX
435 JA gobble_mem_fwd_loop
436 // NT instructions don't follow the normal cache-coherency rules.
437 // We need SFENCE there to make copied data available timely.
438 SFENCE
439 VMOVDQU Y4, (R8)
440 VZEROUPPER
441 MOVOU X5, -0x80(CX)
442 MOVOU X6, -0x70(CX)
443 MOVOU X7, -0x60(CX)
444 MOVOU X8, -0x50(CX)
445 MOVOU X9, -0x40(CX)
446 MOVOU X10, -0x30(CX)
447 MOVOU X11, -0x20(CX)
448 MOVOU X12, -0x10(CX)
449 RET
450
451 copy_backward:
	// AX preserves the original destination start; the saved head
	// (X5..X12, 128 bytes from the front of the region) is stored there last.
452 MOVQ DI, AX
453 // Backward copying is about the same as the forward one.
454 // Firstly we load unaligned tail in the beginning of region.
455 MOVOU (SI), X5
456 MOVOU 0x10(SI), X6
457 ADDQ BX, DI
458 MOVOU 0x20(SI), X7
459 MOVOU 0x30(SI), X8
460 LEAQ -0x20(DI), R10
461 MOVQ DI, R11
462 MOVOU 0x40(SI), X9
463 MOVOU 0x50(SI), X10
464 ANDQ $0x1F, R11
465 MOVOU 0x60(SI), X11
466 MOVOU 0x70(SI), X12
	// R11 = DI & 0x1F, so the XOR clears those bits: DI is aligned down to 32 bytes.
467 XORQ R11, DI
468 // Let's point SI to the end of region
469 ADDQ BX, SI
470 // and load unaligned head into X4.
471 VMOVDQU -0x20(SI), Y4
472 SUBQ R11, SI
473 SUBQ R11, BX
474 // If there is enough data for non-temporal moves go to special loop
475 CMPQ BX, $0x100000
476 JA gobble_big_data_bwd
477 SUBQ $0x80, BX
478 gobble_mem_bwd_loop:
479 VMOVDQU -0x20(SI), Y0
480 VMOVDQU -0x40(SI), Y1
481 VMOVDQU -0x60(SI), Y2
482 VMOVDQU -0x80(SI), Y3
483 SUBQ $0x80, SI
484 VMOVDQA Y0, -0x20(DI)
485 VMOVDQA Y1, -0x40(DI)
486 VMOVDQA Y2, -0x60(DI)
487 VMOVDQA Y3, -0x80(DI)
488 SUBQ $0x80, DI
489 SUBQ $0x80, BX
490 JA gobble_mem_bwd_loop
491 // Let's store unaligned data
492 VMOVDQU Y4, (R10)
493 VZEROUPPER
494 MOVOU X5, (AX)
495 MOVOU X6, 0x10(AX)
496 MOVOU X7, 0x20(AX)
497 MOVOU X8, 0x30(AX)
498 MOVOU X9, 0x40(AX)
499 MOVOU X10, 0x50(AX)
500 MOVOU X11, 0x60(AX)
501 MOVOU X12, 0x70(AX)
502 RET
503
504 gobble_big_data_bwd:
505 SUBQ $0x80, BX
506 gobble_big_mem_bwd_loop:
507 PREFETCHNTA -0x1C0(SI)
508 PREFETCHNTA -0x280(SI)
509 VMOVDQU -0x20(SI), Y0
510 VMOVDQU -0x40(SI), Y1
511 VMOVDQU -0x60(SI), Y2
512 VMOVDQU -0x80(SI), Y3
513 SUBQ $0x80, SI
514 VMOVNTDQ Y0, -0x20(DI)
515 VMOVNTDQ Y1, -0x40(DI)
516 VMOVNTDQ Y2, -0x60(DI)
517 VMOVNTDQ Y3, -0x80(DI)
518 SUBQ $0x80, DI
519 SUBQ $0x80, BX
520 JA gobble_big_mem_bwd_loop
	// SFENCE makes the non-temporal stores globally visible before return.
521 SFENCE
522 VMOVDQU Y4, (R10)
523 VZEROUPPER
524 MOVOU X5, (AX)
525 MOVOU X6, 0x10(AX)
526 MOVOU X7, 0x20(AX)
527 MOVOU X8, 0x30(AX)
528 MOVOU X9, 0x40(AX)
529 MOVOU X10, 0x50(AX)
530 MOVOU X11, 0x60(AX)
531 MOVOU X12, 0x70(AX)
532 RET
533
// (end of listing)