// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
9 TEXT ·Count(SB),NOSPLIT,$0-40
10 #ifndef hasPOPCNT
11 CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
12 JEQ 2(PC)
13 JMP ·countGeneric(SB)
14 #endif
15 MOVQ b_base+0(FP), SI
16 MOVQ b_len+8(FP), BX
17 MOVB c+24(FP), AL
18 LEAQ ret+32(FP), R8
19 JMP countbody<>(SB)
20
21 TEXT ·CountString(SB),NOSPLIT,$0-32
22 #ifndef hasPOPCNT
23 CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
24 JEQ 2(PC)
25 JMP ·countGenericString(SB)
26 #endif
27 MOVQ s_base+0(FP), SI
28 MOVQ s_len+8(FP), BX
29 MOVB c+16(FP), AL
30 LEAQ ret+24(FP), R8
31 JMP countbody<>(SB)
32
33 // input:
34 // SI: data
35 // BX: data len
36 // AL: byte sought
37 // R8: address to put result
38 // This function requires the POPCNT instruction.
39 TEXT countbody<>(SB),NOSPLIT,$0
40 // Shuffle X0 around so that each byte contains
41 // the character we're looking for.
42 MOVD AX, X0
43 PUNPCKLBW X0, X0
44 PUNPCKLBW X0, X0
45 PSHUFL $0, X0, X0
46
47 CMPQ BX, $16
48 JLT small
49
50 MOVQ $0, R12 // Accumulator
51
52 MOVQ SI, DI
53
54 CMPQ BX, $64
55 JAE avx2
56 sse:
57 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
58 JMP sseloopentry
59
60 PCALIGN $16
61 sseloop:
62 // Move the next 16-byte chunk of the data into X1.
63 MOVOU (DI), X1
64 // Compare bytes in X0 to X1.
65 PCMPEQB X0, X1
66 // Take the top bit of each byte in X1 and put the result in DX.
67 PMOVMSKB X1, DX
68 // Count number of matching bytes
69 POPCNTL DX, DX
70 // Accumulate into R12
71 ADDQ DX, R12
72 // Advance to next block.
73 ADDQ $16, DI
74 sseloopentry:
75 CMPQ DI, AX
76 JBE sseloop
77
78 // Get the number of bytes to consider in the last 16 bytes
79 ANDQ $15, BX
80 JZ end
81
82 // Create mask to ignore overlap between previous 16 byte block
83 // and the next.
84 MOVQ $16,CX
85 SUBQ BX, CX
86 MOVQ $0xFFFF, R10
87 SARQ CL, R10
88 SALQ CL, R10
89
90 // Process the last 16-byte chunk. This chunk may overlap with the
91 // chunks we've already searched so we need to mask part of it.
92 MOVOU (AX), X1
93 PCMPEQB X0, X1
94 PMOVMSKB X1, DX
95 // Apply mask
96 ANDQ R10, DX
97 POPCNTL DX, DX
98 ADDQ DX, R12
99 end:
100 MOVQ R12, (R8)
101 RET
102
103 // handle for lengths < 16
104 small:
105 TESTQ BX, BX
106 JEQ endzero
107
108 // Check if we'll load across a page boundary.
109 LEAQ 16(SI), AX
110 TESTW $0xff0, AX
111 JEQ endofpage
112
113 // We must ignore high bytes as they aren't part of our slice.
114 // Create mask.
115 MOVB BX, CX
116 MOVQ $1, R10
117 SALQ CL, R10
118 SUBQ $1, R10
119
120 // Load data
121 MOVOU (SI), X1
122 // Compare target byte with each byte in data.
123 PCMPEQB X0, X1
124 // Move result bits to integer register.
125 PMOVMSKB X1, DX
126 // Apply mask
127 ANDQ R10, DX
128 POPCNTL DX, DX
129 // Directly return DX, we don't need to accumulate
130 // since we have <16 bytes.
131 MOVQ DX, (R8)
132 RET
133 endzero:
134 MOVQ $0, (R8)
135 RET
136
137 endofpage:
138 // We must ignore low bytes as they aren't part of our slice.
139 MOVQ $16,CX
140 SUBQ BX, CX
141 MOVQ $0xFFFF, R10
142 SARQ CL, R10
143 SALQ CL, R10
144
145 // Load data into the high end of X1.
146 MOVOU -16(SI)(BX*1), X1
147 // Compare target byte with each byte in data.
148 PCMPEQB X0, X1
149 // Move result bits to integer register.
150 PMOVMSKB X1, DX
151 // Apply mask
152 ANDQ R10, DX
153 // Directly return DX, we don't need to accumulate
154 // since we have <16 bytes.
155 POPCNTL DX, DX
156 MOVQ DX, (R8)
157 RET
158
159 avx2:
160 #ifndef hasAVX2
161 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
162 JNE sse
163 #endif
164 MOVD AX, X0
165 LEAQ -64(SI)(BX*1), R11
166 LEAQ (SI)(BX*1), R13
167 VPBROADCASTB X0, Y1
168 PCALIGN $32
169 avx2_loop:
170 VMOVDQU (DI), Y2
171 VMOVDQU 32(DI), Y4
172 VPCMPEQB Y1, Y2, Y3
173 VPCMPEQB Y1, Y4, Y5
174 VPMOVMSKB Y3, DX
175 VPMOVMSKB Y5, CX
176 POPCNTL DX, DX
177 POPCNTL CX, CX
178 ADDQ DX, R12
179 ADDQ CX, R12
180 ADDQ $64, DI
181 CMPQ DI, R11
182 JLE avx2_loop
183
184 // If last block is already processed,
185 // skip to the end.
186 //
187 // This check is NOT an optimization; if the input length is a
188 // multiple of 64, we must not go through the last leg of the
189 // function because the bit shift count passed to SALQ below would
190 // be 64, which is outside of the 0-63 range supported by those
191 // instructions.
192 //
193 // Tests in the bytes and strings packages with input lengths that
194 // are multiples of 64 will break if this condition were removed.
195 CMPQ DI, R13
196 JEQ endavx
197
198 // Load address of the last 64 bytes.
199 // There is an overlap with the previous block.
200 MOVQ R11, DI
201 VMOVDQU (DI), Y2
202 VMOVDQU 32(DI), Y4
203 VPCMPEQB Y1, Y2, Y3
204 VPCMPEQB Y1, Y4, Y5
205 VPMOVMSKB Y3, DX
206 VPMOVMSKB Y5, CX
207 // Exit AVX mode.
208 VZEROUPPER
209 SALQ $32, CX
210 ORQ CX, DX
211
212 // Create mask to ignore overlap between previous 64 byte block
213 // and the next.
214 ANDQ $63, BX
215 MOVQ $64, CX
216 SUBQ BX, CX
217 MOVQ $0xFFFFFFFFFFFFFFFF, R10
218 SALQ CL, R10
219 // Apply mask
220 ANDQ R10, DX
221 POPCNTQ DX, DX
222 ADDQ DX, R12
223 MOVQ R12, (R8)
224 RET
225 endavx:
226 // Exit AVX mode.
227 VZEROUPPER
228 MOVQ R12, (R8)
229 RET
230