1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8
// addMulVVW1024 is the entry point for 16-word operands (16 x 64 = 1024 bits).
// It only places the word count in R0 and jumps to addMulVVWx, which reads
// z, x and y through FP and stores the result c before returning.
// $0-32 declares a zero-byte frame and 32 bytes of arguments plus result
// (z, x, y and c at offsets 0, 8, 16 and 24).
9 // func addMulVVW1024(z, x *uint, y uint) (c uint)
10 TEXT ·addMulVVW1024(SB), $0-32
11 MOVD $16, R0 // R0 = number of words addMulVVWx will process
12 JMP addMulVVWx(SB) // jump, not call: addMulVVWx stores c and executes RET
13
// addMulVVW1536 is the entry point for 24-word operands (24 x 64 = 1536 bits).
// It only places the word count in R0 and jumps to addMulVVWx, which reads
// z, x and y through FP and stores the result c before returning.
// $0-32 declares a zero-byte frame and 32 bytes of arguments plus result
// (z, x, y and c at offsets 0, 8, 16 and 24).
14 // func addMulVVW1536(z, x *uint, y uint) (c uint)
15 TEXT ·addMulVVW1536(SB), $0-32
16 MOVD $24, R0 // R0 = number of words addMulVVWx will process
17 JMP addMulVVWx(SB) // jump, not call: addMulVVWx stores c and executes RET
18
// addMulVVW2048 is the entry point for 32-word operands (32 x 64 = 2048 bits).
// It only places the word count in R0 and jumps to addMulVVWx, which reads
// z, x and y through FP and stores the result c before returning.
// $0-32 declares a zero-byte frame and 32 bytes of arguments plus result
// (z, x, y and c at offsets 0, 8, 16 and 24).
19 // func addMulVVW2048(z, x *uint, y uint) (c uint)
20 TEXT ·addMulVVW2048(SB), $0-32
21 MOVD $32, R0 // R0 = number of words addMulVVWx will process
22 JMP addMulVVWx(SB) // jump, not call: addMulVVWx stores c and executes RET
23
// addMulVVWx is the shared body behind addMulVVW1024, addMulVVW1536 and
// addMulVVW2048. It is entered by JMP with the word count already in R0
// (16, 24 or 32 from the entry points in this file) and computes
// z += x * y over R0 64-bit words, storing the final carry word in c.
//
// It is declared NOFRAME with a $0 frame and is reached by a jump rather than
// a call, so the FP-relative names below address the z, x, y and c slots of
// the entry point that jumped here.
//
// R0 is decremented by 4 per iteration and only tested against zero, so it
// must be a multiple of 4.
//
// Register use:
//   R0       words remaining
//   R1, R2   running pointers into z and x
//   R3       y
//   R4       carry word passed between iterations, and the final result
//   R5-R8    four words of x
//   R9-R12   four words of z
//   R13-R17, R19-R21  partial products (R20 accumulates the next carry word)
24 TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
25 MOVD z+0(FP), R1 // R1 = z, updated in place
26 MOVD x+8(FP), R2 // R2 = x, read only
27 MOVD y+16(FP), R3 // R3 = y, the single-word multiplier
28 MOVD $0, R4 // no carry into the first block
29
30 // The main loop of this code operates on a block of 4 words every iteration
31 // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
32 // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
33 // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
//
// Each 128-bit product x[i]*y is split into a low half (MUL) and a high half
// (UMULH). The block is summed with two carry chains:
//   chain 1: z0 += R4,        z1 += lo(x1*y), z2 += lo(x2*y), z3 += lo(x3*y)
//   chain 2: z0 += lo(x0*y),  z1 += hi(x0*y), z2 += hi(x1*y), z3 += hi(x2*y)
// and the carry out of each chain is added to hi(x3*y) to form the next R4.
// MUL, UMULH, LDP and STP do not modify the condition flags, so they are
// interleaved between the ADDS/ADCS instructions without breaking a chain.
34 loop:
35 CBZ R0, done // all words processed
36
37 LDP.P 16(R2), (R5, R6) // R5, R6 = x[0], x[1]; post-increment R2 by 16 bytes
38 LDP.P 16(R2), (R7, R8) // R7, R8 = x[2], x[3]; post-increment R2 by 16 bytes
39
40 LDP (R1), (R9, R10) // R9, R10 = z[0], z[1]; R1 is advanced by the stores below
41 ADDS R4, R9 // chain 1 start: z0 += carry word from previous block
42 MUL R6, R3, R14 // R14 = lo(x1*y)
43 ADCS R14, R10 // z1 += lo(x1*y) + C
44 MUL R7, R3, R15 // R15 = lo(x2*y)
45 LDP 16(R1), (R11, R12) // R11, R12 = z[2], z[3]
46 ADCS R15, R11 // z2 += lo(x2*y) + C
47 MUL R8, R3, R16 // R16 = lo(x3*y)
48 ADCS R16, R12 // z3 += lo(x3*y) + C
49 UMULH R8, R3, R20 // R20 = hi(x3*y), the base of the next carry word
50 ADC $0, R20 // R20 += carry out of chain 1
51
52 MUL R5, R3, R13 // R13 = lo(x0*y)
53 ADDS R13, R9 // chain 2 start: z0 += lo(x0*y)
54 UMULH R5, R3, R17 // R17 = hi(x0*y)
55 ADCS R17, R10 // z1 += hi(x0*y) + C
56 UMULH R6, R3, R21 // R21 = hi(x1*y)
57 STP.P (R9, R10), 16(R1) // store z[0], z[1]; post-increment R1 by 16 bytes
58 ADCS R21, R11 // z2 += hi(x1*y) + C
59 UMULH R7, R3, R19 // R19 = hi(x2*y)
60 ADCS R19, R12 // z3 += hi(x2*y) + C
61 STP.P (R11, R12), 16(R1) // store z[2], z[3]; post-increment R1 by 16 bytes
62 ADC $0, R20, R4 // R4 = R20 + carry out of chain 2: carry word for next block
63
64 SUB $4, R0 // four words consumed
65 B loop
66
67 done:
68 MOVD R4, c+24(FP) // return the final carry word
69 RET
70
View as plain text