src/runtime/memmove_amd64.s
// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX
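	// DI and SI are the implicit destination and source registers of the
	// REP MOVSx instructions used below, and CX is their count register,
	// so the length is parked in BX until a REP count is actually needed.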

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	MOVB	runtime·memmoveBits(SB), AX
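	// runtime·memmoveBits is a bit set combining avxSupported and
	// repmovsPreferred, chosen at startup from the detected CPU features.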
	// If only avxSupported is set, we have AVX but do not want to use REP MOVSx.
	CMPB	AX, $const_avxSupported
	JEQ	avxUnaligned
	/*
	 * check and set for backwards
	 */
	CMPQ	SI, DI
	JLS	back

	/*
	 * forward copy loop
	 */
forward:
	CMPQ	BX, $2048
	JL	check_avx
	// REP MOVSx is slow if destination address is unaligned.
	TESTQ	$15, DI
	JNZ	check_avx
	TESTB	$const_repmovsPreferred, AX
	JNZ	fwdBy8
	// For backward copy, REP MOVSx performs worse than AVX.
check_avx:
	TESTB	$const_avxSupported, AX
	JNZ	avxUnaligned

	CMPQ	BX, $2048
	JLS	move_256through2048
	// Check alignment
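	// OR the low bits of both pointers together so a single test checks
	// that source and destination are both 8-byte aligned.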
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Load the last (possibly partially overlapping) word up front and store
	// it at the very end, after REP MOVSQ has copied the leading quadwords.
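	// For example, with n == 20 the REP below copies (20-1)/8 == 2 quadwords
	// (16 bytes) and the saved word is then stored at to+12, covering the
	// remaining 4 bytes with a harmless overlapping write.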
	MOVQ	-8(SI)(BX*1), AX
	LEAQ	-8(DI)(BX*1), DX
	// Do 8 bytes at a time
	LEAQ	-1(BX), CX
	SHRQ	$3, CX
	REP;	MOVSQ
	MOVQ	AX, (DX)
	RET

back:
	/*
	 * check overlap
	 */
	MOVQ	SI, CX
	ADDQ	BX, CX
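	// CX = from+n. If from+n <= to, the regions do not overlap after all,
	// so the faster forward path is safe.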
	CMPQ	CX, DI
	JLS	forward

	TESTB	$const_avxSupported, AX
	JNZ	avxUnaligned
	/*
	 * copy the whole thing backwards,
	 * using adjusted (end-of-region) addresses
	 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD
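	// STD sets the direction flag so REP MOVSQ below walks both pointers
	// downward; CLD restores the normal direction afterwards.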

	/*
	 * copy
	 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
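	// CX holds the number of whole quadwords; BX keeps the n&7 leftover
	// bytes, which are finished by jumping back to tail below.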

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
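	// The first and last quadword may overlap when 8 < n < 16; both are
	// loaded before either store, so overlapping to/from regions are still
	// copied correctly. The larger cases below use the same pattern.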
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
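	// (Under the amd64 ABIInternal calling convention X15 is the fixed zero
	// register, so it has to be cleared before returning to Go code.)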
	PXOR	X15, X15
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
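	// Fewer than 256 bytes remain (BX < 256); they are finished by the
	// straightline cases reached through tail below.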
	// X15 must be zero on return
	PXOR	X15, X15
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it copies forward.
	// The second one is for overlapping regions; it copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
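	// The compare above is unsigned, so to < from also takes the forward
	// path: DI-SI wraps around to a huge value that is never below BX in
	// practice.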

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put head on the new place
	// 5. Put the tail on the new place
	// It can be important to satisfy the processor's pipeline requirements for
	// small sizes, as the cost of copying the unaligned regions is comparable
	// with the cost of the main loop, so the code is slightly interleaved here.
	// There is a cleaner implementation of this algorithm for bigger sizes,
	// where the cost of copying the unaligned parts is negligible.
	// You can see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly.
	// We will use negative offsets from here on.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value, since the unaligned head has already
	// been accounted for.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in place after the main body is copied.
	// Now it is time for the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying loop
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
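	// BX now equals to+n, the end of the destination; the 128-byte tail
	// saved in X5..X12 is stored into the region just below it.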
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of the algorithm are commented above for the small-size case.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
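	// CX now points to the end of the destination; the saved tail in
	// X5..X12 is written just below it after the non-temporal loop.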
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// The prefetch distances were chosen empirically.
	// The prefetch usage follows section 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying works much like the forward case.
	// First we load the unaligned 128 bytes at the beginning of the region;
	// they are stored last, since we copy backward.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
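	// R11 holds the low 5 bits of the destination end, so XORing them away
	// rounds DI down to a 32-byte boundary.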
	// Point SI at the end of the region
	ADDQ	BX, SI
	// and load the unaligned last 32 bytes into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET