// src/runtime/memclr_amd64.s
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"
#include "asm_amd64.h"
// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// ABIInternal for performance.
//
// Register contract (Go internal ABI): on entry AX = ptr, BX = n.
// Throughout the body: DI = current destination, BX = bytes remaining,
// AX = 0 (scalar zero for the byte/word/long/quad stores and for REP
// STOSQ, which stores AX at DI). X15 is the Go internal ABI's fixed
// zero register, so the SSE paths store it without initializing it here.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
	// AX = ptr
	// BX = n
	MOVQ	AX, DI	// DI = ptr
	XORQ	AX, AX	// AX = 0

	// MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
tail:
	// Dispatch on the remaining length. Each small case below clears
	// with a pair of possibly-overlapping stores (one anchored at the
	// front, one at the back), so no loop is needed for n <= 256.
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256

	// n > 256: choose a bulk-clearing strategy from CPU features.
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	skip_erms

	// If the size is less than 2kb, do not use ERMS as it has a big start-up cost.
	// Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
	// in the Intel Optimization Guide shows better performance for ERMSB starting
	// from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
	CMPQ	BX, $2048
	JAE	loop_preheader_erms

skip_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

	// SSE-only fallback: clear 256 bytes per iteration with 16-byte
	// stores of the always-zero X15, then let the tail cases finish
	// the final < 256 bytes.
loop:
	MOVOU	X15, 0(DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, 128(DI)
	MOVOU	X15, 144(DI)
	MOVOU	X15, 160(DI)
	MOVOU	X15, 176(DI)
	MOVOU	X15, 192(DI)
	MOVOU	X15, 208(DI)
	MOVOU	X15, 224(DI)
	MOVOU	X15, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail
#endif

loop_preheader_avx2:
	VPXOR	X0, X0, X0	// Y0 = 0 (VPXOR on X0 zeroes the full YMM register)
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000	// >= 32MB: bypass the cache with non-temporal stores
	JAE	loop_preheader_avx2_huge

	// Clear 128 bytes per iteration with 32-byte AVX stores.
loop_avx2:
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// Backfill: four overlapping stores clear the last 128 bytes ending
	// at DI+BX (BX < 128 here). Always in-bounds because n > 256 on
	// entry to this path, so at least one loop iteration ran.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER	// avoid AVX->SSE transition penalties in the caller
	RET

loop_preheader_erms:
#ifndef hasAVX2
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE	loop_erms
#endif

	VPXOR	X0, X0, X0	// Y0 = 0, in case we take the huge (non-temporal) path
	// At this point both ERMS and AVX2 is supported. While REP STOS can use a no-RFO
	// write protocol, ERMS could show the same or slower performance comparing to
	// Non-Temporal Stores when the size is bigger than LLC depending on hardware.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge

loop_erms:
	// STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
	// for a memory subsystem as the GC requires this.
	MOVQ	BX, CX
	SHRQ	$3, CX	// CX = number of 8-byte words to store
	ANDQ	$7, BX	// BX = leftover bytes (< 8), cleared via tail
	REP;	STOSQ	// store CX copies of AX (= 0) at DI, advancing DI
	JMP	tail

loop_preheader_avx2_huge:
	// Align to 32 byte boundary
	VMOVDQU	Y0, 0(DI)	// unaligned head store covers the bytes skipped by aligning up
	MOVQ	DI, SI		// SI = original DI
	ADDQ	$32, DI
	ANDQ	$~31, DI	// DI = DI rounded up to a 32-byte boundary
	SUBQ	DI, SI		// SI = -(bytes skipped), in [-32, -1]
	ADDQ	SI, BX		// shrink BX by the bytes already covered by the head store
	// Clear 128 bytes per iteration with cache-bypassing stores
	// (aligned, as MOVNTDQ requires).
loop_avx2_huge:
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Backfill the final BX (< 128) bytes with ordinary stores, as in loop_avx2.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

// Small cases: two stores, one at ptr and one ending at ptr+n, which
// overlap when n is below the case's maximum. For n == 1 both byte
// stores hit the same address.
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X15, (DI)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X15, (DI)
	MOVOU	X15, 16(DI)
	MOVOU	X15, 32(DI)
	MOVOU	X15, 48(DI)
	MOVOU	X15, 64(DI)
	MOVOU	X15, 80(DI)
	MOVOU	X15, 96(DI)
	MOVOU	X15, 112(DI)
	MOVOU	X15, -128(DI)(BX*1)
	MOVOU	X15, -112(DI)(BX*1)
	MOVOU	X15, -96(DI)(BX*1)
	MOVOU	X15, -80(DI)(BX*1)
	MOVOU	X15, -64(DI)(BX*1)
	MOVOU	X15, -48(DI)(BX*1)
	MOVOU	X15, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
219