1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego && (ppc64 || ppc64le)
6
7 #include "textflag.h"
8
// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Fixed-size entry point: 4 passes of the shared 4-word loop, i.e. 16
// 64-bit words (1024 bits). The arguments are left untouched in the
// argument frame; addMulVVWx<> reads them from there after the tail jump.
TEXT ·addMulVVW1024(SB), $0-32
	MOVD	$4, R6			// loop count handed to addMulVVWx<> = 16 words / 4
	BR	addMulVVWx<>(SB)	// tail jump: the shared body stores c and returns
13
// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Fixed-size entry point: 6 passes of the shared 4-word loop, i.e. 24
// 64-bit words (1536 bits). The arguments are left untouched in the
// argument frame; addMulVVWx<> reads them from there after the tail jump.
TEXT ·addMulVVW1536(SB), $0-32
	MOVD	$6, R6			// loop count handed to addMulVVWx<> = 24 words / 4
	BR	addMulVVWx<>(SB)	// tail jump: the shared body stores c and returns
18
// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Fixed-size entry point: 8 passes of the shared 4-word loop, i.e. 32
// 64-bit words (2048 bits). The arguments are left untouched in the
// argument frame; addMulVVWx<> reads them from there after the tail jump.
TEXT ·addMulVVW2048(SB), $0-32
	MOVD	$8, R6			// loop count handed to addMulVVWx<> = 32 words / 4
	BR	addMulVVWx<>(SB)	// tail jump: the shared body stores c and returns
23
// addMulVVWx<> is the shared body behind the fixed-size addMulVVW entry
// points in this file. It computes z += x * y across 4*R6 64-bit words,
// updating z in place, and stores the carry out of the last word in c.
//
// It is reached by a tail jump, not a call, so it reads the arguments
// from the entry point's argument frame:
//	z+0(FP)   pointer to the words that are read and overwritten
//	x+8(FP)   pointer to the words that are multiplied by y
//	y+16(FP)  the single multiplier word
//	c+24(FP)  result slot for the final carry word
//
// R6 must hold the iteration count (word count / 4) and must be nonzero:
// the loop body runs before the counter is tested, so there is no
// zero-length path. Any new caller has to keep that guarantee or this
// function needs to change.
//
// Register use:
//	R3, R4    running pointers into z and x
//	R5        y
//	R9        carry word passed from one word to the next
//	CTR       iterations remaining
//	R10, R11, R14-R21  scratch

// MULADD_WORD folds one word into the running sum:
//	(R9, lo) = xw*R5 + zw + R9
// xw and zw hold the loaded x and z words, lo receives the low result
// word that is later stored back to z, and hi is scratch for the high
// product word. hi absorbs both carries without overflowing because
// (2^64-1)^2 + 2*(2^64-1) = 2^128-1 still fits in two words.
#define MULADD_WORD(xw, zw, lo, hi) \
	MULLD	R5, xw, lo; \
	MULHDU	R5, xw, hi; \
	ADDC	zw, lo; \
	ADDZE	hi; \
	ADDC	R9, lo; \
	ADDZE	hi, R9

TEXT addMulVVWx<>(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3
	MOVD	x+8(FP), R4
	MOVD	y+16(FP), R5

	MOVD	$0, R9		// carry in to the first word is zero
	MOVD	R6, CTR		// CTR counts the remaining 4-word groups
	PCALIGN	$16

next4:
	// Load all four x/z word pairs before any store to z.
	MOVD	0(R4), R14	// x[i]
	MOVD	0(R3), R15	// z[i]
	MOVD	8(R4), R16	// x[i+1]
	MOVD	8(R3), R17	// z[i+1]
	MOVD	16(R4), R18	// x[i+2]
	MOVD	16(R3), R19	// z[i+2]
	MOVD	24(R4), R20	// x[i+3]
	MOVD	24(R3), R21	// z[i+3]

	// Each step reuses the x/z registers of the previous, already
	// consumed word as its lo/hi outputs.
	MULADD_WORD(R14, R15, R10, R11)	// word i   -> R10
	MULADD_WORD(R16, R17, R14, R15)	// word i+1 -> R14
	MULADD_WORD(R18, R19, R16, R17)	// word i+2 -> R16
	MULADD_WORD(R20, R21, R18, R19)	// word i+3 -> R18

	MOVD	R10, 0(R3)	// z[i]
	MOVD	R14, 8(R3)	// z[i+1]
	MOVD	R16, 16(R3)	// z[i+2]
	MOVD	R18, 24(R3)	// z[i+3]

	ADD	$32, R3		// advance both pointers by four words
	ADD	$32, R4
	BDNZ	next4		// decrement CTR, loop while it is nonzero

	MOVD	R9, c+24(FP)	// carry out of the most significant word
	RET

#undef MULADD_WORD
83
View as plain text