1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// FilterNilAVX512 compacts the non-zero 64-bit words of a buffer towards its
// front, in place and in their original order, and returns how many there are.
//
// Frame: no locals, 20 bytes of arguments and results:
//	bufp+0(FP)  8 bytes  address of the first 64-bit word
//	n+8(FP)     4 bytes  number of words, treated as a signed 32-bit value
//	ret+16(FP)  4 bytes  number of non-zero words kept
//
// Words at index >= ret (and < n) hold stale data after the call: the vector
// path always stores a full 64-byte block at the write position.
//
// Register roles:
//	SI = buffer base
//	AX = read index  (words consumed so far)
//	DX = write index (words kept so far; the result)
//	CX = words still to be consumed
//	BX = scratch
//	Z0 = all zeros, Z1 = current block of 8 words, K1 = non-zero lane mask
TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20
	MOVQ	bufp+0(FP), SI
	MOVL	n+8(FP), CX
	XORL	AX, AX				// read index = 0 (32-bit write clears the upper half too)
	XORL	DX, DX				// write index = 0

	CMPL	CX, $8
	JLT	tail				// fewer than 8 words: no full vector block to process
	VPXOR	X0, X0, X0			// VEX-encoded write to X0 zeroes all of Z0

block_loop:
	// Invariant: CX >= 8 and DX <= AX, so the 64-byte store below stays
	// inside the block just loaded or the already-consumed prefix.
	VMOVDQU64	(SI)(AX*8), Z1		// Z1 = buf[AX : AX+8]
	VPCMPUQ	$4, Z1, Z0, K1			// predicate 4 is "not equal": K1 bit i set iff lane i != 0
	VPCOMPRESSQ	Z1, K1, Z1		// pack the selected lanes into the low lanes of Z1
	VMOVDQU64	Z1, (SI)(DX*8)		// buf[DX : DX+8] = Z1 (all 8 lanes are written)

	KMOVW	K1, BX
	POPCNTL	BX, BX				// BX = number of non-zero lanes in this block
	ADDL	BX, DX
	ADDL	$8, AX
	SUBL	$8, CX
	CMPL	CX, $8
	JGE	block_loop			// another full block remains

tail:
	TESTL	CX, CX
	JLE	done				// nothing left (also taken when n <= 0)

tail_loop:
	MOVQ	(SI)(AX*8), BX			// BX = buf[AX]
	TESTQ	BX, BX
	JEQ	tail_advance			// zero word: drop it
	MOVQ	BX, (SI)(DX*8)			// buf[DX] = buf[AX]
	INCL	DX

tail_advance:
	INCL	AX
	DECL	CX
	JGT	tail_loop			// keep going while words remain

done:
	VZEROUPPER
	MOVL	DX, ret+16(FP)
	RET
67
View as plain text