Text file src/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX
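	// The arguments are moved out of the incoming ABIInternal registers
	// because REP MOVSQ/MOVSB implicitly take the destination in DI, the
	// source in SI, and the count in CX; keeping the length in BX leaves
	// CX free to serve as the REP count register below.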

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straight-line code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 and smaller work whether or not the source and the
	// destination memory regions overlap, because they load all data into
	// registers before writing any of it back. move_256through2048, on the
	// other hand, can be used only when the regions don't overlap or the
	// destination starts below the source, so that a forward pass never
	// overwrites unread source bytes.
	//
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse; not worth doing.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
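	// A single comparison above can feed two conditional branches:
	// CMPQ BX, $4 sends n == 3 to move_3 via JB and n == 4 to move_4 via
	// JBE, and CMPQ BX, $8 similarly splits n < 8 from n == 8.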

	MOVB	runtime·memmoveBits(SB), AX
	// We have AVX but we don't want to use REP MOVSx.
	CMPB	AX, $const_avxSupported
	JEQ	avxUnaligned
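	// memmoveBits is a byte of CPU-feature flags set up during runtime
	// startup; avxSupported and repmovsPreferred are independent bits in it.
	// The exact compare above therefore means "AVX is available and REP
	// MOVSx is not preferred", so the AVX path is taken unconditionally.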
/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back
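	// JLS is an unsigned "lower or same" branch: when from <= to, a forward
	// copy could overwrite source bytes that have not been read yet, so jump
	// to back, which checks whether the regions actually overlap.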

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JL	check_avx
	// REP MOVSx is slow if the destination address is unaligned.
	TESTQ	$15, DI
	JNZ	check_avx
	TESTB	$const_repmovsPreferred, AX
	JNZ	fwdBy8
	// For backward copy, REP MOVSx performs worse than AVX.
check_avx:
	TESTB	$const_avxSupported, AX
	JNZ	avxUnaligned

	CMPQ	BX, $2048
	JLS	move_256through2048
	// Check that both src and dst are 8-byte aligned by ORing them
	// together and testing the low bits once.
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Loading the last (possibly partially overlapping) word and writing
	// it at the end.
	MOVQ	-8(SI)(BX*1), AX
	LEAQ	-8(DI)(BX*1), DX
	// Do 8 bytes at a time
	LEAQ	-1(BX), CX
	SHRQ	$3, CX
	REP;	MOVSQ
	MOVQ	AX, (DX)
	RET
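	// Example: for n = 4100, CX = (4100-1)>>3 = 512, so REP MOVSQ copies
	// 4096 bytes; the quadword covering bytes [4092,4100) was loaded into AX
	// before the loop and is stored at DX afterwards, finishing the last 4
	// bytes (and harmlessly rewriting 4 bytes that were already copied).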

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
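	// CX = SI + BX is one past the end of the source. If it is not above DI,
	// the source region lies entirely below the destination, the regions
	// cannot overlap, and the forward path is safe even though from <= to.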

	TESTB	$const_avxSupported, AX
	JNZ	avxUnaligned
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail
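	// With STD set, REP MOVSQ copies CX = n/8 quadwords from high addresses
	// to low. The adjustments above then move DI and SI back to the start of
	// the buffers, leaving BX = n%8 head bytes for the tail code, which is
	// overlap-safe because it loads everything before storing.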

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
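	// This is the load-everything-then-store pattern that makes these small
	// moves overlap-safe: both the first and the last (possibly overlapping)
	// 8 bytes are read into registers before either store happens.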
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
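	// (Under the amd64 ABIInternal register conventions, X15 is the fixed
	// zero register, so it has to be cleared again before returning.)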
	PXOR	X15, X15
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	// X15 must be zero on return
	PXOR	X15, X15
	JMP	tail
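	// BX is decremented by 256 at the top of each iteration, so on exit it
	// holds the 0..255 bytes still to copy, which tail finishes. For
	// example, n = 600 runs two 256-byte iterations and leaves BX = 88,
	// which tail dispatches to move_65through128.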

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first handles non-overlapping memory regions and copies forward.
	// The second handles overlapping regions and copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// Keeping the processor's pipeline busy matters for small sizes, since
	// the cost of copying the unaligned parts is comparable to the cost of
	// the main loop, so the steps below are slightly interleaved. A cleaner
	// implementation of the same algorithm, where the cost of the unaligned
	// parts is negligible, follows the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so the tail is addressed with negative offsets.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
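	// For example, if the original destination is 0x100b, ANDQ $-32 and
	// ADDQ $32 give DI = 0x1020, so R11 = 0x15 (21 bytes of unaligned head).
	// Those 21 bytes are covered by the 32-byte store of Y4 at the end,
	// which also rewrites the first few body bytes with identical data.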
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy count, now that the unaligned head has been accounted for.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be stored in its place after the main body is copied.
	// Now save the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
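	// BX is reduced by another 128 (AX) so the aligned loop below stops
	// before the region that the saved tail in X5..X12 will cover; whatever
	// the loop leaves uncovered (at most 128 bytes) is written by those tail
	// stores, possibly rewriting some body bytes with identical data.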
	// Aligned copying of the body.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
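	// BX now points one past the last destination byte, so the tail saved in
	// X5..X12 is stored to the final 128 bytes of the destination below,
	// while Y4 restores the unaligned head at the original destination R10.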
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The algorithm is the one described above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
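	// As in the small-size path, CX was latched to one past the end of the
	// destination before this 128-byte reservation, so the X5..X12 stores
	// after the loop land on the last 128 destination bytes.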
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically.
	// The prefetching approach follows section 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// Non-temporal stores don't follow the normal cache-coherency rules, so
	// SFENCE is needed here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying works much like the forward case.
	// First, load the unaligned data at the beginning of the region
	// (the tail of a backward copy).
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
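	// R11 holds the low five bits of DI (the end of the destination); since
	// exactly those bits are set in DI, XORing them off rounds DI down to a
	// 32-byte boundary, the same effect as ANDQ $-32 here.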
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
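	// As in the forward non-temporal path, the SFENCE below makes the
	// weakly-ordered non-temporal stores visible before returning.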
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
