1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// func Count(b []byte, c byte) int
//
// Entry stub: places the slice data pointer, the slice length, the byte to
// look for and the address of the result slot into the registers that
// countbytebody expects, then branches (not calls) to it. Because this is a
// plain branch, the RET at the end of countbytebody returns to our caller.
TEXT ·Count(SB),NOSPLIT,$0-40
	MOVD	$ret+32(FP), R8		// R8 = address of the result slot
	MOVBU	c+24(FP), R1		// R1 = byte to count, zero-extended
	MOVD	b_len+8(FP), R2		// R2 = number of bytes in b
	MOVD	b_base+0(FP), R0	// R0 = pointer to the first byte of b
	B	countbytebody<>(SB)
14
// func CountString(s string, c byte) int
//
// Entry stub: places the string data pointer, the string length, the byte to
// look for and the address of the result slot into the registers that
// countbytebody expects, then branches (not calls) to it. Because this is a
// plain branch, the RET at the end of countbytebody returns to our caller.
TEXT ·CountString(SB),NOSPLIT,$0-32
	MOVD	$ret+24(FP), R8		// R8 = address of the result slot
	MOVBU	c+16(FP), R1		// R1 = byte to count, zero-extended
	MOVD	s_len+8(FP), R2		// R2 = number of bytes in s
	MOVD	s_base+0(FP), R0	// R0 = pointer to the first byte of s
	B	countbytebody<>(SB)
21
// countbytebody counts how many bytes in a buffer equal a given byte and
// stores the total through a pointer.
//
// Inputs:
//   R0: pointer to the data
//   R1: byte to count (callers pass it zero-extended)
//   R2: length of the data in bytes
//   R8: address where the 64-bit result is stored
//
// Registers written: R0, R2, R3, R5, R6, R9, R11, V0-V8 and the condition
// flags.
//
// Strategy:
//   - length == 0: store 0 and return.
//   - length < 32: compare one byte at a time.
//   - otherwise: compare byte-by-byte until R0 is a multiple of 32, then
//     process whole 32-byte chunks with vector compares, and finish any
//     remaining (< 32) bytes one at a time.
TEXT countbytebody<>(SB),NOSPLIT,$0
	// R11 is the running match count.
	MOVD	ZR, R11
	// Nothing to scan: go straight to storing the zero count.
	CBZ	R2, store_result
	// Inputs shorter than 32 bytes are handled entirely by the byte loop.
	CMP	$32, R2
	BLO	tail_loop
	// Z is set when R0 is already a multiple of 32; no head work then.
	ANDS	$31, R0, R9
	BEQ	vector_setup
	// R3 = next 32-byte boundary above R0; the head loop stops there.
	BIC	$31, R0, R3
	ADD	$32, R3, R3
	PCALIGN	$16
align_head:
	// Consume one byte, advancing R0 and shrinking the remaining length.
	MOVBU.P	1(R0), R5
	SUB	$1, R2
	CMP	R5, R1
	CINC	EQ, R11, R11
	// Keep going until R0 reaches the boundary computed above.
	CMP	R0, R3
	BNE	align_head
vector_setup:
	// R9 = remaining length rounded down to a multiple of 32.
	BIC	$31, R2, R9
	// No complete 32-byte chunk is left: only the byte loop has work.
	// (R2 is non-zero whenever this point is reached.)
	CBZ	R9, tail_loop
	// V0 = the searched byte replicated into all 16 byte lanes.
	VMOV	R1, V0.B16
	// V5 = 0x01 in every byte lane, used to turn compare masks into 0/1.
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// Zero V7 (per-chunk sum) and V8 (vector-side accumulator).
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	// R3 = address just past the last complete chunk.
	ADD	R9, R0, R3
	// R2 = bytes left over for the byte loop after all chunks.
	SUB	R9, R2
	PCALIGN	$16
vector_loop:
	// Load 32 bytes into V1/V2 and advance R0 past them.
	VLD1.P	(R0), [V1.B16, V2.B16]
	// Sets the flags consumed by the BNE below; none of the vector
	// instructions in between modify the condition flags.
	CMP	R0, R3
	// Lanes equal to the searched byte become 0xff, all others 0x00.
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Keep only bit 0 of each lane so a match contributes exactly 1.
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Pairwise add folds the 32 lanes of V3/V4 into the 16 lanes of V6.
	VADDP	V4.B16, V3.B16, V6.B16
	// Sum the 16 lanes of V6 into V7: the match count for this chunk.
	VUADDLV	V6.B16, V7
	// Add the chunk count to the accumulator in V8.
	VADD	V7, V8
	BNE	vector_loop
	// Fold the vector-side total into the scalar count.
	VMOV	V8.D[0], R6
	ADD	R6, R11
	// No leftover bytes: skip the byte loop.
	CBZ	R2, store_result
tail_loop:
	// Byte-at-a-time loop for the final R2 (< 32, non-zero) bytes.
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	CINC	EQ, R11, R11
	SUB	$1, R2
	CBNZ	R2, tail_loop
store_result:
	// Write the total to the caller-provided result address.
	MOVD	R11, (R8)
	RET
93
View as plain text