1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "asm_amd64.h"
7 #include "textflag.h"
8
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Register arguments on entry:
//   AX = a
//   BX = b
//   CX = size
// When a and b are the same pointer the answer is 1 without touching memory.
// Otherwise the arguments are moved into the registers memeqbody reads
// (a in SI, b in DI, size in BX) and control jumps there; memeqbody's RET
// then returns directly to our caller.
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
	CMPQ	AX, BX
	JEQ	same
	MOVQ	AX, SI	// a
	MOVQ	BX, DI	// b; copied out before BX is overwritten below
	MOVQ	CX, BX	// size
	JMP	memeqbody<>(SB)
same:
	MOVQ	$1, AX	// return 1
	RET
23
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Register arguments on entry:
//   AX = a
//   BX = b
//   DX = closure pointer; the size to compare is read from 8(DX)
// When a and b are the same pointer the answer is 1 without touching memory.
// Otherwise the arguments are moved into the registers memeqbody reads
// (a in SI, b in DI, size in BX) and control jumps there; memeqbody's RET
// then returns directly to our caller.
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	CMPQ	AX, BX
	JEQ	same
	MOVQ	AX, SI	// a
	MOVQ	BX, DI	// b; copied out before BX is overwritten below
	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
	JMP	memeqbody<>(SB)
same:
	MOVQ	$1, AX	// return 1
	RET
38
// memeqbody compares count bytes starting at a with count bytes starting at b.
//
// Input:
//   SI = a
//   DI = b
//   BX = count
// Output:
//   AX = result (nonzero low byte when the ranges match, zero when they differ)
// Overwrites SI, DI, BX, CX, DX and the flags, plus X0-X7 on the SSE path
// and Y0-Y6 on the AVX2 path.
TEXT memeqbody<>(SB),NOSPLIT,$0-0
	// Pick a strategy by length: <8 bytes, 8..63 bytes, or 64 and more.
	CMPQ	BX, $8
	JB	tiny
	CMPQ	BX, $64
	JB	words
#ifndef hasAVX2
	// hasAVX2 is not defined for this build, so consult the CPU feature
	// flag at run time. When hasAVX2 is defined this check and the SSE
	// loop are left out and control falls straight into the AVX2 loop.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	blocks_avx2

	// SSE loop: 64 bytes per iteration as four 16-byte lanes.
	PCALIGN $16
blocks_sse:
	CMPQ	BX, $64
	JB	words
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	// Per-byte equality masks for each lane, then AND them all into X0.
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	// All 16 mask bits set means every byte of the 64 matched.
	CMPL	DX, $0xffff
	JEQ	blocks_sse
	XORQ	AX, AX	// mismatch: return 0
	RET
#endif

	// AVX2 loop: 64 bytes per iteration as two 32-byte lanes.
	PCALIGN $16
blocks_avx2:
	CMPQ	BX, $64
	JB	avx2_done
	VMOVDQU	(SI), Y0
	VMOVDQU	(DI), Y1
	VMOVDQU	32(SI), Y2
	VMOVDQU	32(DI), Y3
	VPCMPEQB	Y1, Y0, Y4
	VPCMPEQB	Y2, Y3, Y5
	VPAND	Y4, Y5, Y6
	VPMOVMSKB Y6, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	// All 32 mask bits set means every byte of the 64 matched.
	CMPL	DX, $0xffffffff
	JEQ	blocks_avx2
	VZEROUPPER
	XORQ	AX, AX	// mismatch: return 0
	RET

avx2_done:
	// Leaving the ymm loop with fewer than 64 bytes left; clear the upper
	// ymm halves before continuing into the scalar code below.
	VZEROUPPER

	// Word loop: 8 bytes per iteration through general-purpose registers.
	PCALIGN $16
words:
	CMPQ	BX, $8
	JBE	tail
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	words
	XORQ	AX, AX	// mismatch: return 0
	RET

	// 0 to 8 bytes remain. This path is only reached when the original
	// count was at least 8, so the 8 bytes ending at SI+BX / DI+BX lie
	// inside the buffers; they may overlap bytes already compared.
tail:
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	-8(DI)(BX*1), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

	// Fewer than 8 bytes in total.
tiny:
	// Zero length: ZF is set here, so SETEQ at done yields 1.
	TESTQ	BX, BX
	JEQ	done

	// CX = -(8*count). Shift instructions use only the low 6 bits, giving
	// an effective shift of 64 - 8*count, which is 8..56 for count 1..7.
	LEAQ	0(BX*8), CX
	NEGQ	CX

	// Fetch the bytes of a into SI.
	CMPB	SI, $0xf8
	JA	a_near_end

	// Low address byte is at most 0xf8: an 8-byte load starting at SI
	// won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	a_loaded
a_near_end:
	// Load the 8 bytes that end at the last wanted byte instead, then
	// shift the wanted bytes down to the low end of the register.
	MOVQ	-8(SI)(BX*1), SI
	SHRQ	CX, SI
a_loaded:

	// Fetch the bytes of b into DI the same way.
	CMPB	DI, $0xf8
	JA	b_near_end
	MOVQ	(DI), DI
	JMP	b_loaded
b_near_end:
	MOVQ	-8(DI)(BX*1), DI
	SHRQ	CX, DI
b_loaded:

	// The difference is shifted left so that only the low count bytes
	// survive; bytes beyond count picked up by a direct load are dropped.
	// The shift amount is nonzero, so SHLQ sets ZF for SETEQ below.
	SUBQ	SI, DI
	SHLQ	CX, DI
done:
	SETEQ	AX
	RET
166
View as plain text