// Listing of src/runtime/memmove_amd64.s (Go runtime); embedded numbers are the original source line numbers.
1 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
2 // https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
3 //
4 // Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
5 // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
6 // Portions Copyright 2009 The Go Authors. All rights reserved.
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 // THE SOFTWARE.
25
26 //go:build !plan9
27
28 #include "go_asm.h"
29 #include "textflag.h"
30
31 // See memmove Go doc for important implementation constraints.
32
33 // func memmove(to, from unsafe.Pointer, n uintptr)
34 // ABIInternal for performance.
35 TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
36 // AX = to
37 // BX = from
38 // CX = n
	// ABIInternal: the three arguments arrive in registers (AX/BX/CX), not on the stack.
39 MOVQ AX, DI
40 MOVQ BX, SI
41 MOVQ CX, BX
	// Registers now follow the REP MOVS* convention: DI = to, SI = from; BX holds the byte count.
42
43 // REP instructions have a high startup cost, so we handle small sizes
44 // with some straightline code. The REP MOVSQ instruction is really fast
45 // for large sizes. The cutover is approximately 2K.
46 tail:
47 // move_129through256 or smaller work whether or not the source and the
48 // destination memory regions overlap because they load all data into
49 // registers before writing it back. move_256through2048 on the other
50 // hand can be used only when the memory regions don't overlap or the copy
51 // direction is forward.
52 //
53 // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	// Size dispatch: binary-ish ladder of unsigned compares on BX, smallest sizes first.
54 TESTQ BX, BX
55 JEQ move_0
56 CMPQ BX, $2
57 JBE move_1or2
58 CMPQ BX, $4
59 JB move_3
60 JBE move_4
61 CMPQ BX, $8
62 JB move_5through7
63 JE move_8
64 CMPQ BX, $16
65 JBE move_9through16
66 CMPQ BX, $32
67 JBE move_17through32
68 CMPQ BX, $64
69 JBE move_33through64
70 CMPQ BX, $128
71 JBE move_65through128
72 CMPQ BX, $256
73 JBE move_129through256
74
	// n > 256 from here on. Prefer the AVX path when the runtime enabled it.
75 TESTB $1, runtime·useAVXmemmove(SB)
76 JNZ avxUnaligned
77
78 /*
79 * check and set for backwards
80 */
	// Unsigned compare: if from <= to the regions might overlap in the
	// forward direction, so take the backward-checking path.
81 CMPQ SI, DI
82 JLS back
83
84 /*
85 * forward copy loop
86 */
87 forward:
88 CMPQ BX, $2048
89 JLS move_256through2048
90
91 // If REP MOVSB isn't fast, don't use it
92 CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
93 JNE fwdBy8
94
95 // Check alignment
	// If either pointer is not 8-byte aligned, ERMS REP MOVSB handles the
	// unaligned case better than qword moves.
96 MOVL SI, AX
97 ORL DI, AX
98 TESTL $7, AX
99 JEQ fwdBy8
100
101 // Do 1 byte at a time
102 MOVQ BX, CX
103 REP; MOVSB
104 RET
105
106 fwdBy8:
107 // Do 8 bytes at a time
	// CX = number of qwords; BX keeps the 0..7 leftover bytes for the tail dispatcher.
108 MOVQ BX, CX
109 SHRQ $3, CX
110 ANDQ $7, BX
111 REP; MOVSQ
112 JMP tail
113
114 back:
115 /*
116 * check overlap
117 */
	// If from+n <= to the regions do not actually overlap, so the forward
	// path is still safe (and faster).
118 MOVQ SI, CX
119 ADDQ BX, CX
120 CMPQ CX, DI
121 JLS forward
122 /*
123 * whole thing backwards has
124 * adjusted addresses
125 */
126 ADDQ BX, DI
127 ADDQ BX, SI
128 STD
	// Direction flag is now set: REP MOVSQ below decrements SI/DI.
	// CLD after the loop restores the ABI-required forward direction.
129
130 /*
131 * copy
132 */
133 MOVQ BX, CX
134 SHRQ $3, CX
135 ANDQ $7, BX
136
137 SUBQ $8, DI
138 SUBQ $8, SI
139 REP; MOVSQ
140
141 CLD
	// Undo the trailing-qword bias and step back over the 0..7 leftover
	// bytes so the tail dispatcher can finish them (still overlap-safe:
	// the small cases load everything into registers first).
142 ADDQ $8, DI
143 ADDQ $8, SI
144 SUBQ BX, DI
145 SUBQ BX, SI
146 JMP tail
147
148 move_1or2:
	// Load both ends into registers before any store, so overlapping
	// source/destination still copies correctly.
149 MOVB (SI), AX
150 MOVB -1(SI)(BX*1), CX
151 MOVB AX, (DI)
152 MOVB CX, -1(DI)(BX*1)
153 RET
154 move_0:
155 RET
156 move_4:
157 MOVL (SI), AX
158 MOVL AX, (DI)
159 RET
160 move_3:
161 MOVW (SI), AX
162 MOVB 2(SI), CX
163 MOVW AX, (DI)
164 MOVB CX, 2(DI)
165 RET
166 move_5through7:
	// Two possibly-overlapping 4-byte moves cover every length in [5,7].
167 MOVL (SI), AX
168 MOVL -4(SI)(BX*1), CX
169 MOVL AX, (DI)
170 MOVL CX, -4(DI)(BX*1)
171 RET
172 move_8:
173 // We need a separate case for 8 to make sure we write pointers atomically.
174 MOVQ (SI), AX
175 MOVQ AX, (DI)
176 RET
177 move_9through16:
	// Two possibly-overlapping 8-byte moves cover every length in [9,16].
178 MOVQ (SI), AX
179 MOVQ -8(SI)(BX*1), CX
180 MOVQ AX, (DI)
181 MOVQ CX, -8(DI)(BX*1)
182 RET
183 move_17through32:
	// Same overlapping-ends trick, using 16-byte SSE registers.
184 MOVOU (SI), X0
185 MOVOU -16(SI)(BX*1), X1
186 MOVOU X0, (DI)
187 MOVOU X1, -16(DI)(BX*1)
188 RET
189 move_33through64:
190 MOVOU (SI), X0
191 MOVOU 16(SI), X1
192 MOVOU -32(SI)(BX*1), X2
193 MOVOU -16(SI)(BX*1), X3
194 MOVOU X0, (DI)
195 MOVOU X1, 16(DI)
196 MOVOU X2, -32(DI)(BX*1)
197 MOVOU X3, -16(DI)(BX*1)
198 RET
199 move_65through128:
200 MOVOU (SI), X0
201 MOVOU 16(SI), X1
202 MOVOU 32(SI), X2
203 MOVOU 48(SI), X3
204 MOVOU -64(SI)(BX*1), X4
205 MOVOU -48(SI)(BX*1), X5
206 MOVOU -32(SI)(BX*1), X6
207 MOVOU -16(SI)(BX*1), X7
208 MOVOU X0, (DI)
209 MOVOU X1, 16(DI)
210 MOVOU X2, 32(DI)
211 MOVOU X3, 48(DI)
212 MOVOU X4, -64(DI)(BX*1)
213 MOVOU X5, -48(DI)(BX*1)
214 MOVOU X6, -32(DI)(BX*1)
215 MOVOU X7, -16(DI)(BX*1)
216 RET
217 move_129through256:
	// All 256 bytes are staged in X0..X15 before any store, so this case
	// is safe for any overlap.
218 MOVOU (SI), X0
219 MOVOU 16(SI), X1
220 MOVOU 32(SI), X2
221 MOVOU 48(SI), X3
222 MOVOU 64(SI), X4
223 MOVOU 80(SI), X5
224 MOVOU 96(SI), X6
225 MOVOU 112(SI), X7
226 MOVOU -128(SI)(BX*1), X8
227 MOVOU -112(SI)(BX*1), X9
228 MOVOU -96(SI)(BX*1), X10
229 MOVOU -80(SI)(BX*1), X11
230 MOVOU -64(SI)(BX*1), X12
231 MOVOU -48(SI)(BX*1), X13
232 MOVOU -32(SI)(BX*1), X14
233 MOVOU -16(SI)(BX*1), X15
234 MOVOU X0, (DI)
235 MOVOU X1, 16(DI)
236 MOVOU X2, 32(DI)
237 MOVOU X3, 48(DI)
238 MOVOU X4, 64(DI)
239 MOVOU X5, 80(DI)
240 MOVOU X6, 96(DI)
241 MOVOU X7, 112(DI)
242 MOVOU X8, -128(DI)(BX*1)
243 MOVOU X9, -112(DI)(BX*1)
244 MOVOU X10, -96(DI)(BX*1)
245 MOVOU X11, -80(DI)(BX*1)
246 MOVOU X12, -64(DI)(BX*1)
247 MOVOU X13, -48(DI)(BX*1)
248 MOVOU X14, -32(DI)(BX*1)
249 MOVOU X15, -16(DI)(BX*1)
250 // X15 must be zero on return
251 PXOR X15, X15
252 RET
253 move_256through2048:
	// Forward-only loop: copies 256 bytes per iteration through X0..X15.
254 SUBQ $256, BX
255 MOVOU (SI), X0
256 MOVOU 16(SI), X1
257 MOVOU 32(SI), X2
258 MOVOU 48(SI), X3
259 MOVOU 64(SI), X4
260 MOVOU 80(SI), X5
261 MOVOU 96(SI), X6
262 MOVOU 112(SI), X7
263 MOVOU 128(SI), X8
264 MOVOU 144(SI), X9
265 MOVOU 160(SI), X10
266 MOVOU 176(SI), X11
267 MOVOU 192(SI), X12
268 MOVOU 208(SI), X13
269 MOVOU 224(SI), X14
270 MOVOU 240(SI), X15
271 MOVOU X0, (DI)
272 MOVOU X1, 16(DI)
273 MOVOU X2, 32(DI)
274 MOVOU X3, 48(DI)
275 MOVOU X4, 64(DI)
276 MOVOU X5, 80(DI)
277 MOVOU X6, 96(DI)
278 MOVOU X7, 112(DI)
279 MOVOU X8, 128(DI)
280 MOVOU X9, 144(DI)
281 MOVOU X10, 160(DI)
282 MOVOU X11, 176(DI)
283 MOVOU X12, 192(DI)
284 MOVOU X13, 208(DI)
285 MOVOU X14, 224(DI)
286 MOVOU X15, 240(DI)
287 CMPQ BX, $256
288 LEAQ 256(SI), SI
289 LEAQ 256(DI), DI
	// Signed JGE is required: BX may have gone negative after the SUBQ
	// above; an unsigned branch would treat that as a huge count.
290 JGE move_256through2048
291 // X15 must be zero on return
292 PXOR X15, X15
	// 0..255 bytes may remain (BX restored to n mod 256 range via the tail cases).
293 JMP tail
294
295 avxUnaligned:
296 // There are two implementations of move algorithm.
297 // The first one for non-overlapped memory regions. It uses forward copying.
298 // The second one for overlapped regions. It uses backward copying
299 MOVQ DI, CX
300 SUBQ SI, CX
301 // Now CX contains distance between SRC and DEST
302 CMPQ CX, BX
303 // If the distance lesser than region length it means that regions are overlapped
	// Unsigned carry trick: when DI < SI the difference wraps to a huge
	// value, CF stays clear, and the forward copy is (correctly) chosen.
304 JC copy_backward
305
306 // Non-temporal copy would be better for big sizes.
307 CMPQ BX, $0x100000
308 JAE gobble_big_data_fwd
309
310 // Memory layout on the source side
311 // SI CX
312 // |<---------BX before correction--------->|
313 // | |<--BX corrected-->| |
314 // | | |<--- AX --->|
315 // |<-R11->| |<-128 bytes->|
316 // +----------------------------------------+
317 // | Head | Body | Tail |
318 // +-------+------------------+-------------+
319 // ^ ^ ^
320 // | | |
321 // Save head into Y4 Save tail into X5..X12
322 // |
323 // SI+R11, where R11 = ((DI & -32) + 32) - DI
324 // Algorithm:
325 // 1. Unaligned save of the tail's 128 bytes
326 // 2. Unaligned save of the head's 32 bytes
327 // 3. Destination-aligned copying of body (128 bytes per iteration)
328 // 4. Put head on the new place
329 // 5. Put the tail on the new place
330 // It can be important to satisfy processor's pipeline requirements for
331 // small sizes as the cost of unaligned memory region copying is
332 // comparable with the cost of main loop. So code is slightly messed there.
333 // There is more clean implementation of that algorithm for bigger sizes
334 // where the cost of unaligned part copying is negligible.
335 // You can see it after gobble_big_data_fwd label.
336 LEAQ (SI)(BX*1), CX
337 MOVQ DI, R10
338 // CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
339 MOVOU -0x80(CX), X5
340 MOVOU -0x70(CX), X6
341 MOVQ $0x80, AX
342 // Align destination address
343 ANDQ $-32, DI
344 ADDQ $32, DI
345 // Continue tail saving.
346 MOVOU -0x60(CX), X7
347 MOVOU -0x50(CX), X8
348 // Make R11 delta between aligned and unaligned destination addresses.
349 MOVQ DI, R11
350 SUBQ R10, R11
351 // Continue tail saving.
352 MOVOU -0x40(CX), X9
353 MOVOU -0x30(CX), X10
354 // Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
355 SUBQ R11, BX
356 // Continue tail saving.
357 MOVOU -0x20(CX), X11
358 MOVOU -0x10(CX), X12
359 // The tail will be put on its place after main body copying.
360 // It's time for the unaligned heading part.
361 VMOVDQU (SI), Y4
362 // Adjust source address to point past head.
363 ADDQ R11, SI
364 SUBQ AX, BX
365 // Aligned memory copying there
366 gobble_128_loop:
367 VMOVDQU (SI), Y0
368 VMOVDQU 0x20(SI), Y1
369 VMOVDQU 0x40(SI), Y2
370 VMOVDQU 0x60(SI), Y3
371 ADDQ AX, SI
	// Destination stores use VMOVDQA: DI was 32-byte aligned above.
372 VMOVDQA Y0, (DI)
373 VMOVDQA Y1, 0x20(DI)
374 VMOVDQA Y2, 0x40(DI)
375 VMOVDQA Y3, 0x60(DI)
376 ADDQ AX, DI
377 SUBQ AX, BX
378 JA gobble_128_loop
379 // Now we can store unaligned parts.
	// BX becomes the end-of-destination address so the saved tail can be
	// stored with negative offsets from it.
380 ADDQ AX, BX
381 ADDQ DI, BX
382 VMOVDQU Y4, (R10)
	// VZEROUPPER before the SSE stores avoids AVX->SSE transition penalties.
383 VZEROUPPER
384 MOVOU X5, -0x80(BX)
385 MOVOU X6, -0x70(BX)
386 MOVOU X7, -0x60(BX)
387 MOVOU X8, -0x50(BX)
388 MOVOU X9, -0x40(BX)
389 MOVOU X10, -0x30(BX)
390 MOVOU X11, -0x20(BX)
391 MOVOU X12, -0x10(BX)
392 RET
393
394 gobble_big_data_fwd:
395 // There is forward copying for big regions.
396 // It uses non-temporal mov instructions.
397 // Details of this algorithm are commented previously for small sizes.
398 LEAQ (SI)(BX*1), CX
399 MOVOU -0x80(SI)(BX*1), X5
400 MOVOU -0x70(CX), X6
401 MOVOU -0x60(CX), X7
402 MOVOU -0x50(CX), X8
403 MOVOU -0x40(CX), X9
404 MOVOU -0x30(CX), X10
405 MOVOU -0x20(CX), X11
406 MOVOU -0x10(CX), X12
407 VMOVDQU (SI), Y4
408 MOVQ DI, R8
409 ANDQ $-32, DI
410 ADDQ $32, DI
411 MOVQ DI, R10
412 SUBQ R8, R10
413 SUBQ R10, BX
414 ADDQ R10, SI
415 LEAQ (DI)(BX*1), CX
416 SUBQ $0x80, BX
417 gobble_mem_fwd_loop:
418 PREFETCHNTA 0x1C0(SI)
419 PREFETCHNTA 0x280(SI)
420 // Prefetch values were chosen empirically.
421 // Approach for prefetch usage as in 9.5.6 of [1]
422 // [1] 64-ia-32-architectures-optimization-manual.pdf
423 // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
424 VMOVDQU (SI), Y0
425 VMOVDQU 0x20(SI), Y1
426 VMOVDQU 0x40(SI), Y2
427 VMOVDQU 0x60(SI), Y3
428 ADDQ $0x80, SI
	// Non-temporal stores bypass the cache; DI is 32-byte aligned as required.
429 VMOVNTDQ Y0, (DI)
430 VMOVNTDQ Y1, 0x20(DI)
431 VMOVNTDQ Y2, 0x40(DI)
432 VMOVNTDQ Y3, 0x60(DI)
433 ADDQ $0x80, DI
434 SUBQ $0x80, BX
435 JA gobble_mem_fwd_loop
436 // NT instructions don't follow the normal cache-coherency rules.
437 // We need SFENCE there to make copied data available timely.
438 SFENCE
439 VMOVDQU Y4, (R8)
440 VZEROUPPER
441 MOVOU X5, -0x80(CX)
442 MOVOU X6, -0x70(CX)
443 MOVOU X7, -0x60(CX)
444 MOVOU X8, -0x50(CX)
445 MOVOU X9, -0x40(CX)
446 MOVOU X10, -0x30(CX)
447 MOVOU X11, -0x20(CX)
448 MOVOU X12, -0x10(CX)
449 RET
450
451 copy_backward:
	// AX preserves the original destination start; the saved head
	// (X5..X12, 128 bytes from the front of the region) is stored there last.
452 MOVQ DI, AX
453 // Backward copying is about the same as the forward one.
454 // Firstly we load unaligned tail in the beginning of region.
455 MOVOU (SI), X5
456 MOVOU 0x10(SI), X6
457 ADDQ BX, DI
458 MOVOU 0x20(SI), X7
459 MOVOU 0x30(SI), X8
460 LEAQ -0x20(DI), R10
461 MOVQ DI, R11
462 MOVOU 0x40(SI), X9
463 MOVOU 0x50(SI), X10
464 ANDQ $0x1F, R11
465 MOVOU 0x60(SI), X11
466 MOVOU 0x70(SI), X12
	// R11 = DI & 0x1F, so the XOR clears those bits: DI is aligned down to 32 bytes.
467 XORQ R11, DI
468 // Let's point SI to the end of region
469 ADDQ BX, SI
470 // and load unaligned head into X4.
471 VMOVDQU -0x20(SI), Y4
472 SUBQ R11, SI
473 SUBQ R11, BX
474 // If there is enough data for non-temporal moves go to special loop
475 CMPQ BX, $0x100000
476 JA gobble_big_data_bwd
477 SUBQ $0x80, BX
478 gobble_mem_bwd_loop:
479 VMOVDQU -0x20(SI), Y0
480 VMOVDQU -0x40(SI), Y1
481 VMOVDQU -0x60(SI), Y2
482 VMOVDQU -0x80(SI), Y3
483 SUBQ $0x80, SI
484 VMOVDQA Y0, -0x20(DI)
485 VMOVDQA Y1, -0x40(DI)
486 VMOVDQA Y2, -0x60(DI)
487 VMOVDQA Y3, -0x80(DI)
488 SUBQ $0x80, DI
489 SUBQ $0x80, BX
490 JA gobble_mem_bwd_loop
491 // Let's store unaligned data
492 VMOVDQU Y4, (R10)
493 VZEROUPPER
494 MOVOU X5, (AX)
495 MOVOU X6, 0x10(AX)
496 MOVOU X7, 0x20(AX)
497 MOVOU X8, 0x30(AX)
498 MOVOU X9, 0x40(AX)
499 MOVOU X10, 0x50(AX)
500 MOVOU X11, 0x60(AX)
501 MOVOU X12, 0x70(AX)
502 RET
503
504 gobble_big_data_bwd:
505 SUBQ $0x80, BX
506 gobble_big_mem_bwd_loop:
507 PREFETCHNTA -0x1C0(SI)
508 PREFETCHNTA -0x280(SI)
509 VMOVDQU -0x20(SI), Y0
510 VMOVDQU -0x40(SI), Y1
511 VMOVDQU -0x60(SI), Y2
512 VMOVDQU -0x80(SI), Y3
513 SUBQ $0x80, SI
514 VMOVNTDQ Y0, -0x20(DI)
515 VMOVNTDQ Y1, -0x40(DI)
516 VMOVNTDQ Y2, -0x60(DI)
517 VMOVNTDQ Y3, -0x80(DI)
518 SUBQ $0x80, DI
519 SUBQ $0x80, BX
520 JA gobble_big_mem_bwd_loop
	// SFENCE makes the non-temporal stores globally visible before return.
521 SFENCE
522 VMOVDQU Y4, (R10)
523 VZEROUPPER
524 MOVOU X5, (AX)
525 MOVOU X6, 0x10(AX)
526 MOVOU X7, 0x20(AX)
527 MOVOU X8, 0x30(AX)
528 MOVOU X9, 0x40(AX)
529 MOVOU X10, 0x50(AX)
530 MOVOU X11, 0x60(AX)
531 MOVOU X12, 0x70(AX)
532 RET
533
// (end of listing)