// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliUpdate updates the non-inverted crc with the given data.

// func castagnoliUpdate(crc uint32, p []byte) uint32
TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36
	MOVWU	crc+0(FP), R4		// a0 = CRC value
	MOVV	p+8(FP), R5		// a1 = data pointer
	MOVV	p_len+16(FP), R6	// a2 = len(p)

	SGT	$8, R6, R12
	BNE	R12, less_than_8
	AND	$7, R5, R12
	BEQ	R12, aligned

	// Process the first few bytes to 8-byte align the input.
	// t0 = 8 - t0. We need to process this many bytes to align.
	SUB	$1, R12
	XOR	$7, R12

	AND	$1, R12, R13
	BEQ	R13, align_2

	MOVB	(R5), R13
	CRCCWBW	R4, R13, R4
	ADDV	$1, R5
	ADDV	$-1, R6

align_2:
	AND	$2, R12, R13
	BEQ	R13, align_4

	MOVH	(R5), R13
	CRCCWHW	R4, R13, R4
	ADDV	$2, R5
	ADDV	$-2, R6

align_4:
	AND	$4, R12, R13
	BEQ	R13, aligned

	MOVW	(R5), R13
	CRCCWWW	R4, R13, R4
	ADDV	$4, R5
	ADDV	$-4, R6

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	SGT	$8, R6, R12
	BNE	R12, less_than_8

	MOVV	(R5), R13
	CRCCWVW	R4, R13, R4
	ADDV	$8, R5
	ADDV	$-8, R6
	JMP	aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	AND	$4, R6, R12
	BEQ	R12, less_than_4

	MOVW	(R5), R13
	CRCCWWW	R4, R13, R4
	ADDV	$4, R5
	ADDV	$-4, R6

less_than_4:
	AND	$2, R6, R12
	BEQ	R12, less_than_2

	MOVH	(R5), R13
	CRCCWHW	R4, R13, R4
	ADDV	$2, R5
	ADDV	$-2, R6

less_than_2:
	BEQ	R6, done

	MOVB	(R5), R13
	CRCCWBW	R4, R13, R4

done:
	MOVW	R4, ret+32(FP)
	RET

// ieeeUpdate updates the non-inverted crc with the given data.

// func ieeeUpdate(crc uint32, p []byte) uint32
TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36
	MOVWU	crc+0(FP), R4		// a0 = CRC value
	MOVV	p+8(FP), R5		// a1 = data pointer
	MOVV	p_len+16(FP), R6	// a2 = len(p)

	SGT	$8, R6, R12
	BNE	R12, less_than_8
	AND	$7, R5, R12
	BEQ	R12, aligned

	// Process the first few bytes to 8-byte align the input.
	// t0 = 8 - t0. We need to process this many bytes to align.
	SUB	$1, R12
	XOR	$7, R12

	AND	$1, R12, R13
	BEQ	R13, align_2

	MOVB	(R5), R13
	CRCWBW	R4, R13, R4
	ADDV	$1, R5
	ADDV	$-1, R6

align_2:
	AND	$2, R12, R13
	BEQ	R13, align_4

	MOVH	(R5), R13
	CRCWHW	R4, R13, R4
	ADDV	$2, R5
	ADDV	$-2, R6

align_4:
	AND	$4, R12, R13
	BEQ	R13, aligned

	MOVW	(R5), R13
	CRCWWW	R4, R13, R4
	ADDV	$4, R5
	ADDV	$-4, R6

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	SGT	$8, R6, R12
	BNE	R12, less_than_8

	MOVV	(R5), R13
	CRCWVW	R4, R13, R4
	ADDV	$8, R5
	ADDV	$-8, R6
	JMP	aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	AND	$4, R6, R12
	BEQ	R12, less_than_4

	MOVW	(R5), R13
	CRCWWW	R4, R13, R4
	ADDV	$4, R5
	ADDV	$-4, R6

less_than_4:
	AND	$2, R6, R12
	BEQ	R12, less_than_2

	MOVH	(R5), R13
	CRCWHW	R4, R13, R4
	ADDV	$2, R5
	ADDV	$-2, R6

less_than_2:
	BEQ	R6, done

	MOVB	(R5), R13
	CRCWBW	R4, R13, R4

done:
	MOVW	R4, ret+32(FP)
	RET
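
// The routines above are hardware-accelerated updates of a non-inverted CRC.
// As an illustrative sketch only (kept in a comment so it does not affect this
// file), and assuming the CRC*/CRCC* instructions each perform a reflected
// CRC-32 / CRC-32C step with no pre- or post-inversion, the per-byte update
// they replace corresponds roughly to the Go code below. The function name
// referenceUpdate and the explicit poly parameter are hypothetical; poly would
// be 0xEDB88320 for IEEE and 0x82F63B78 for Castagnoli, the reversed
// polynomials used by package crc32.
//
//	func referenceUpdate(crc uint32, p []byte, poly uint32) uint32 {
//		for _, b := range p {
//			// Fold the next byte into the low bits of the crc.
//			crc ^= uint32(b)
//			// Shift out 8 bits, reducing by the reversed polynomial.
//			for i := 0; i < 8; i++ {
//				if crc&1 != 0 {
//					crc = crc>>1 ^ poly
//				} else {
//					crc >>= 1
//				}
//			}
//		}
//		return crc
//	}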