// src/runtime/memclr_amd64.s
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"
// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
//
// Register contract (Go internal ABI): on entry AX = ptr, BX = n.
// Throughout the body: DI = current destination, BX = bytes remaining,
// AX = 0 (scalar zero for the byte/word/long/quad stores and for REP
// STOSQ, which stores AX at DI). X15 is the Go internal ABI's fixed
// zero register, so the SSE paths store it without initializing it here.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX	// AX = 0

	// MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
	// Dispatch on the remaining length. Each small case below clears
	// with a pair of possibly-overlapping stores (one anchored at the
	// front, one at the back), so no loop is needed for n <= 256.
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256

	// n > 256: choose a bulk-clearing strategy from CPU features.
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	skip_erms

	// If the size is less than 2kb, do not use ERMS as it has a big start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
	CMPQ	BX, $2048
	JAE	loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

	// SSE-only fallback: clear 256 bytes per iteration with 16-byte
	// stores of the always-zero X15, then let the tail cases finish
	// the final < 256 bytes.
loop:
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail
#endif

loop_preheader_avx2:
	VPXOR	X0, X0, X0	// Y0 = 0 (VPXOR on X0 zeroes the full YMM register)
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000	// >= 32MB: bypass the cache with non-temporal stores
	JAE	loop_preheader_avx2_huge

	// Clear 128 bytes per iteration with 32-byte AVX stores.
loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// Backfill: four overlapping stores clear the last 128 bytes ending
	// at DI+BX (BX < 128 here). Always in-bounds because n > 256 on
	// entry to this path, so at least one loop iteration ran.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER	// avoid AVX->SSE transition penalties in the caller
	RET

loop_preheader_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	loop_erms
#endif

	VPXOR	X0, X0, X0	// Y0 = 0, in case we take the huge (non-temporal) path
	// At this point both ERMS and AVX2 is supported. While REP STOS can use a no-RFO
	// write protocol, ERMS could show the same or slower performance comparing to
	// Non-Temporal Stores when the size is bigger than LLC depending on hardware.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

loop_erms:
	// STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
	// for a memory subsystem as the GC requires this.
	MOVQ	BX, CX
	SHRQ	$3, CX	// CX = number of 8-byte words to store
	ANDQ	$7, BX	// BX = leftover bytes (< 8), cleared via tail
	REP;	STOSQ	// store CX copies of AX (= 0) at DI, advancing DI
	JMP	tail

loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	VMOVDQU	Y0, 0(DI)	// unaligned head store covers the bytes skipped by aligning up
	MOVQ	DI, SI		// SI = original DI
	ADDQ	$32, DI
	ANDQ	$~31, DI	// DI = DI rounded up to a 32-byte boundary
	SUBQ	DI, SI		// SI = -(bytes skipped), in [-32, -1]
	ADDQ	SI, BX		// shrink BX by the bytes already covered by the head store
	// Clear 128 bytes per iteration with cache-bypassing stores
	// (aligned, as MOVNTDQ requires).
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Backfill the final BX (< 128) bytes with ordinary stores, as in loop_avx2.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

// Small cases: two stores, one at ptr and one ending at ptr+n, which
// overlap when n is below the case's maximum. For n == 1 both byte
// stores hit the same address.
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
219