src/hash/crc32/crc32_ppc64le.s
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22

#define const1 V24
#define const2 V25

#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29

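// MAX_SIZE bounds the bytes folded per pass of the outer vector
// loop. REFLECT selects the bit-reflected form of the algorithm;
// both supported polynomials (IEEE and Castagnoli) are reflected.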
#define MAX_SIZE 32*1024
#define REFLECT

TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ crc+0(FP), R3 // incoming crc
	MOVD table8+8(FP), R4 // *Table
	MOVD p+16(FP), R5
	MOVD p_len+24(FP), R6 // p len

	CMP $0,R6 // len == 0?
	BNE start
	MOVW R3,ret+40(FP) // return crc
	RET

start:
	NOR R3,R3,R7 // ^crc
	MOVWZ R7,R7 // 32 bits
	CMP R6,$16
	MOVD R6,CTR
	BLT short
	SRAD $3,R6,R8 // 8 byte chunks
	MOVD R8,CTR

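	// Slicing-by-8, 8 bytes per iteration. In Go terms (a sketch;
	// tab is the 8x256 word slicing table passed in R4):
	//   crc ^= binary.LittleEndian.Uint32(p[0:4])
	//   crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^
	//         tab[3][p[4]] ^ tab[4][crc>>24] ^ tab[5][crc>>16&0xFF] ^
	//         tab[6][crc>>8&0xFF] ^ tab[7][crc&0xFF]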
loop:
	MOVWZ 0(R5),R8 // p[0:4] (little endian)
	MOVWZ 4(R5),R9 // p[4:8]
	MOVD R4,R10 // &tab[0]
	XOR R7,R8,R7 // crc ^= p[0:4]
	RLDICL $40,R9,$56,R17 // p[7]
	SLD $2,R17,R17 // p[7]*4
	RLDICL $40,R7,$56,R8 // crc>>24
	SLD $2,R8,R8 // (crc>>24)*4
	RLDICL $48,R9,$56,R18 // p[6]
	SLD $2,R18,R18 // p[6]*4
	MOVWZ (R10)(R17),R21 // tab[0][p[7]]
	ADD $1024,R10,R10 // &tab[1]
	RLDICL $56,R9,$56,R19 // p[5]
	SLD $2,R19,R19 // p[5]*4
	MOVWZ (R10)(R18),R22 // tab[1][p[6]]
	ADD $1024,R10,R10 // &tab[2]
	XOR R21,R22,R21 // xor done R22
	CLRLSLDI $56,R9,$2,R20 // p[4]*4
	MOVWZ (R10)(R19),R23 // tab[2][p[5]]
	ADD $1024,R10,R10 // &tab[3]
	XOR R21,R23,R21 // xor done R23
	MOVWZ (R10)(R20),R24 // tab[3][p[4]]
	ADD $1024,R10,R10 // &tab[4]
	XOR R21,R24,R21 // xor done R24
	MOVWZ (R10)(R8),R25 // tab[4][crc>>24]
	RLDICL $48,R7,$56,R24 // crc>>16&0xFF
	XOR R21,R25,R21 // xor done R25
	ADD $1024,R10,R10 // &tab[5]
	SLD $2,R24,R24 // (crc>>16&0xFF)*4
	MOVWZ (R10)(R24),R26 // tab[5][crc>>16&0xFF]
	XOR R21,R26,R21 // xor done R26
	RLDICL $56,R7,$56,R25 // crc>>8&0xFF
	ADD $1024,R10,R10 // &tab[6]
	SLD $2,R25,R25 // (crc>>8&0xFF)*4
	MOVBZ R7,R26 // crc&0xFF
	MOVWZ (R10)(R25),R27 // tab[6][crc>>8&0xFF]
	ADD $1024,R10,R10 // &tab[7]
	SLD $2,R26,R26 // (crc&0xFF)*4
	XOR R21,R27,R21 // xor done R27
	ADD $8,R5 // p = p[8:]
	MOVWZ (R10)(R26),R28 // tab[7][crc&0xFF]
	XOR R21,R28,R21 // xor done R28
	MOVWZ R21,R7 // crc for next round
	BDNZ loop
	ANDCC $7,R6,R8 // any leftover bytes?
	BEQ done // none --> done
	MOVD R8,CTR // byte count
	PCALIGN $16 // align short loop
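	// Remaining bytes one at a time:
	//   crc = tab[0][byte(crc)^p[i]] ^ crc>>8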
short:
	MOVBZ 0(R5),R8 // get next byte v
	XOR R8,R7,R8 // byte(crc)^v -> R8
	RLDIC $2,R8,$54,R8 // ((crc^v)&0xFF)*4, the table byte offset
	SRD $8,R7,R14 // crc>>8
	MOVWZ (R4)(R8),R10 // tab[0][(crc^v)&0xFF]
	ADD $1,R5
	XOR R10,R14,R7 // loop crc in R7
	BDNZ short
done:
	NOR R7,R7,R7 // ^crc
	MOVW R7,ret+40(FP) // return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ crc+0(FP), R3 // incoming crc
	MOVWZ ctab+4(FP), R14 // crc poly id
	MOVD p+8(FP), R4
	MOVD p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes
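
	// The vector kernel folds 128 bytes per iteration with VPMSUMD,
	// a 64x64->128 bit carryless multiply: each 16 byte lane is
	// multiplied by a precomputed power of x mod the polynomial,
	// which shifts it forward over the data still to come, and the
	// partial products are xor-folded together. A Barrett reduction
	// at the end brings the result back to 32 bits.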

	// defines for index loads

	MOVD $16,off16
	MOVD $32,off32
	MOVD $48,off48
	MOVD $64,off64
	MOVD $80,off80
	MOVD $96,off96
	MOVD $112,off112
	MOVD $0,R15 // warm-up flag: 0 means first pass

	MOVD R3,R10 // save initial crc

	NOR R3,R3,R3 // ^crc
	MOVWZ R3,R3 // 32 bits
	VXOR zeroes,zeroes,zeroes // clear the V reg
	VSPLTISW $-1,V0 // all ones
	VSLDOI $4,V29,V0,mask_32bit // mask of the low 32 bits
	VSLDOI $8,V29,V0,mask_64bit // mask of the low 64 bits

	VXOR V8,V8,V8
	MTVSRD R3,VS40 // crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI $8,zeroes,V8,V8 // move the crc into the low doubleword
#else
	VSLDOI $4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD $·byteswapcons(SB),R3
	LVX (R3),byteswap
#endif

	CMPU R5,$256 // length of bytes
	BLT short

	RLDICR $0,R5,$56,R6 // R6 = len rounded down to a multiple of 128

	// First step for larger sizes
l1:	MOVD $32768,R7 // MAX_SIZE
	MOVD R7,R9
	CMP R6,R7 // compare R6, R7 (MAX_SIZE)
	BGT top // R6 > MAX_SIZE: process MAX_SIZE this pass
	MOVD R6,R7 // R6 <= MAX_SIZE: process all of it
top:
	SUB R7,R6,R6 // R6 = bytes remaining after this pass

	// mainloop does 128 bytes at a time
	SRD $7,R7 // R7 = number of 128 byte iterations

	// Determine the offset into the constants table to start with.
	// Each constant is 16 bytes and is applied to 128 bytes of data,
	// so a full 32KB pass uses 32768/128*16 = 4096 bytes of constants.
	SLD $4,R7,R8 // 16 bytes of constants per iteration
	SRD $3,R9,R9 // MAX_SIZE/8 = 4096, size of the constants region
	SUB R8,R9,R8 // offset of the first constant to use

	// The last iteration is reduced in a separate step
	ADD $-1,R7
	MOVD R7,CTR

	// Determine which constant table (depends on poly)
	CMP R14,$1
	BNE castTable
	MOVD $·IEEEConst(SB),R3
	BR startConst
castTable:
	MOVD $·CastConst(SB),R3

startConst:
	ADD R3,R8,R3 // starting point in constants table

	VXOR V0,V0,V0 // clear the V regs
	VXOR V1,V1,V1
	VXOR V2,V2,V2
	VXOR V3,V3,V3
	VXOR V4,V4,V4
	VXOR V5,V5,V5
	VXOR V6,V6,V6
	VXOR V7,V7,V7

	LVX (R3),const1 // loading constant values

	CMP R15,$1 // identify warm up pass
	BEQ next

	// First warm up pass: load the bytes to process
	LVX (R4),V16
	LVX (R4+off16),V17
	LVX (R4+off32),V18
	LVX (R4+off48),V19
	LVX (R4+off64),V20
	LVX (R4+off80),V21
	LVX (R4+off96),V22
	LVX (R4+off112),V23
	ADD $128,R4 // bump up to next 128 bytes in buffer

	VXOR V16,V8,V16 // xor in initial CRC in V8

next:
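	// BC 18,0,target is bdz: decrement CTR, branch if it hits zero.
	// OR $0,R2,R2 is a no-op (ori 2,2,0); the original vpmsum code
	// uses these to pad instruction dispatch groups on POWER8.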
	BC 18,0,first_warm_up_done

	ADD $16,R3 // bump up to next constants
	LVX (R3),const2 // table values

	VPMSUMD V16,const1,V8 // second warm up pass
	LVX (R4),V16 // load from buffer
	OR $0,R2,R2

	VPMSUMD V17,const1,V9 // vpmsumd with constants
	LVX (R4+off16),V17 // load next from buffer
	OR $0,R2,R2

	VPMSUMD V18,const1,V10 // vpmsumd with constants
	LVX (R4+off32),V18 // load next from buffer
	OR $0,R2,R2

	VPMSUMD V19,const1,V11 // vpmsumd with constants
	LVX (R4+off48),V19 // load next from buffer
	OR $0,R2,R2

	VPMSUMD V20,const1,V12 // vpmsumd with constants
	LVX (R4+off64),V20 // load next from buffer
	OR $0,R2,R2

	VPMSUMD V21,const1,V13 // vpmsumd with constants
	LVX (R4+off80),V21 // load next from buffer
	OR $0,R2,R2

	VPMSUMD V22,const1,V14 // vpmsumd with constants
	LVX (R4+off96),V22 // load next from buffer
	OR $0,R2,R2

	VPMSUMD V23,const1,V15 // vpmsumd with constants
	LVX (R4+off112),V23 // load next from buffer

	ADD $128,R4 // bump up to next 128 bytes in buffer

	BC 18,0,first_cool_down

cool_top:
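	// Main loop: fold the previous pass's partial products into the
	// accumulators V0-V7 while multiplying the next 128 bytes by the
	// current pair of folding constants.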
	LVX (R3),const1 // constants
	ADD $16,R3 // inc to next constants
	OR $0,R2,R2

	VXOR V0,V8,V0 // xor in previous vpmsumd
	VPMSUMD V16,const2,V8 // vpmsumd with constants
	LVX (R4),V16 // buffer
	OR $0,R2,R2

	VXOR V1,V9,V1 // xor in previous
	VPMSUMD V17,const2,V9 // vpmsumd with constants
	LVX (R4+off16),V17 // next in buffer
	OR $0,R2,R2

	VXOR V2,V10,V2 // xor in previous
	VPMSUMD V18,const2,V10 // vpmsumd with constants
	LVX (R4+off32),V18 // next in buffer
	OR $0,R2,R2

	VXOR V3,V11,V3 // xor in previous
	VPMSUMD V19,const2,V11 // vpmsumd with constants
	LVX (R4+off48),V19 // next in buffer
	LVX (R3),const2 // get next constant
	OR $0,R2,R2

	VXOR V4,V12,V4 // xor in previous
	VPMSUMD V20,const1,V12 // vpmsumd with constants
	LVX (R4+off64),V20 // next in buffer
	OR $0,R2,R2

	VXOR V5,V13,V5 // xor in previous
	VPMSUMD V21,const1,V13 // vpmsumd with constants
	LVX (R4+off80),V21 // next in buffer
	OR $0,R2,R2

	VXOR V6,V14,V6 // xor in previous
	VPMSUMD V22,const1,V14 // vpmsumd with constants
	LVX (R4+off96),V22 // next in buffer
	OR $0,R2,R2

	VXOR V7,V15,V7 // xor in previous
	VPMSUMD V23,const1,V15 // vpmsumd with constants
	LVX (R4+off112),V23 // next in buffer

	ADD $128,R4 // bump up buffer pointer
	BDNZ cool_top // loop while CTR != 0

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX (R3),const1
	ADD $16,R3

	VXOR V0,V8,V0
	VPMSUMD V16,const1,V8
	OR $0,R2,R2

	VXOR V1,V9,V1
	VPMSUMD V17,const1,V9
	OR $0,R2,R2

	VXOR V2,V10,V2
	VPMSUMD V18,const1,V10
	OR $0,R2,R2

	VXOR V3,V11,V3
	VPMSUMD V19,const1,V11
	OR $0,R2,R2

	VXOR V4,V12,V4
	VPMSUMD V20,const1,V12
	OR $0,R2,R2

	VXOR V5,V13,V5
	VPMSUMD V21,const1,V13
	OR $0,R2,R2

	VXOR V6,V14,V6
	VPMSUMD V22,const1,V14
	OR $0,R2,R2

	VXOR V7,V15,V7
	VPMSUMD V23,const1,V15
	OR $0,R2,R2

second_cool_down:
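	// Fold the last partial products into the accumulators, then
	// pre-load the next 128 bytes of input and xor them in so the
	// next pass (if any) starts already warmed up.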

	VXOR V0,V8,V0
	VXOR V1,V9,V1
	VXOR V2,V10,V2
	VXOR V3,V11,V3
	VXOR V4,V12,V4
	VXOR V5,V13,V5
	VXOR V6,V14,V6
	VXOR V7,V15,V7

#ifdef REFLECT
	VSLDOI $4,V0,zeroes,V0 // shift each partial product left 32 bits
	VSLDOI $4,V1,zeroes,V1
	VSLDOI $4,V2,zeroes,V2
	VSLDOI $4,V3,zeroes,V3
	VSLDOI $4,V4,zeroes,V4
	VSLDOI $4,V5,zeroes,V5
	VSLDOI $4,V6,zeroes,V6
	VSLDOI $4,V7,zeroes,V7
#endif

	// load the next 128 bytes and xor them into the accumulators
	LVX (R4),V8
	LVX (R4+off16),V9
	LVX (R4+off32),V10
	LVX (R4+off48),V11
	LVX (R4+off64),V12
	LVX (R4+off80),V13
	LVX (R4+off96),V14
	LVX (R4+off112),V15

	ADD $128,R4

	VXOR V0,V8,V16
	VXOR V1,V9,V17
	VXOR V2,V10,V18
	VXOR V3,V11,V19
	VXOR V4,V12,V20
	VXOR V5,V13,V21
	VXOR V6,V14,V22
	VXOR V7,V15,V23

	MOVD $1,R15 // warm up is done from now on
	CMP $0,R6 // more chunks to process?
	ADD $128,R6 // put back the 128 bytes consumed from the next chunk

	BNE l1
	ANDCC $127,R5 // R5 = len & 127, the tail bytes
	SUBC R5,$128,R6 // R6 = 128 - tail
	ADD R3,R6,R3 // skip constants for the chunks that don't exist

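	// Main passes done: V16-V23 hold the folded data. Multiply each
	// lane by its final reduction constant (VPMSUMW operates on the
	// 32 bit word lanes) and fold the results to a single vector.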
	SRD $4,R5,R7 // tail in 16 byte chunks
	MOVD R7,CTR
	LVX (R3),V0
	LVX (R3+off16),V1
	LVX (R3+off32),V2
	LVX (R3+off48),V3
	LVX (R3+off64),V4
	LVX (R3+off80),V5
	LVX (R3+off96),V6
	LVX (R3+off112),V7

	ADD $128,R3

	VPMSUMW V16,V0,V0
	VPMSUMW V17,V1,V1
	VPMSUMW V18,V2,V2
	VPMSUMW V19,V3,V3
	VPMSUMW V20,V4,V4
	VPMSUMW V21,V5,V5
	VPMSUMW V22,V6,V6
	VPMSUMW V23,V7,V7

	// now reduce the tail (up to 7 more 16 byte chunks)

	CMP $0,R7 // anything left?
	BEQ next1

	LVX (R4),V16
	LVX (R3),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off16),V16
	LVX (R3+off16),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off32),V16
	LVX (R3+off32),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off48),V16
	LVX (R3+off48),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off64),V16
	LVX (R3+off64),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off80),V16
	LVX (R3+off80),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off96),V16
	LVX (R3+off96),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0

next1:
	VXOR V0,V1,V0
	VXOR V2,V3,V2
	VXOR V4,V5,V4
	VXOR V6,V7,V6
	VXOR V0,V2,V0
	VXOR V4,V6,V4
	VXOR V0,V4,V0

barrett_reduction:
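	// Barrett reduction brings the 64 bit remainder in V0 down to
	// the final 32 bit crc. A sketch of the reflected variant used
	// here, with mu = floor(x^64/poly) in const1 and the polynomial
	// in const2:
	//   T1 = (V0 & 0xffffffff) * mu
	//   T2 = (T1 & 0xffffffff) * poly
	//   crc = (V0 ^ T2) >> 32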

	CMP R14,$1
	BNE barcstTable
	MOVD $·IEEEBarConst(SB),R3
	BR startbarConst
barcstTable:
	MOVD $·CastBarConst(SB),R3

startbarConst:
	LVX (R3),const1
	LVX (R3+off16),const2

	VSLDOI $8,V0,V0,V1
	VXOR V0,V1,V0 // xor the two 64 bit halves together

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL V0,V1,V0 // shift left one bit
#endif

	VAND V0,mask_64bit,V0

#ifndef REFLECT

	VPMSUMD V0,const1,V1 // multiply by mu
	VSLDOI $8,zeroes,V1,V1
	VPMSUMD V1,const2,V1 // multiply by the polynomial
	VXOR V0,V1,V0
	VSLDOI $8,V0,zeroes,V0

#else

	VAND V0,mask_32bit,V1
	VPMSUMD V1,const1,V1 // multiply by mu
	VAND V1,mask_32bit,V1
	VPMSUMD V1,const2,V1 // multiply by the polynomial
	VXOR V0,V1,V0
	VSLDOI $4,V0,zeroes,V0

#endif

	MFVSRD VS32,R3 // VS32 = V0

	NOR R3,R3,R3 // return ^crc
	MOVW R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX (R3),const1
	ADD $16,R3

	VPMSUMD V16,const1,V8
	VPMSUMD V17,const1,V9
	VPMSUMD V18,const1,V10
	VPMSUMD V19,const1,V11
	VPMSUMD V20,const1,V12
	VPMSUMD V21,const1,V13
	VPMSUMD V22,const1,V14
	VPMSUMD V23,const1,V15

	BR second_cool_down

short:
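	// Lengths under 256 bytes: start at an offset into the tail of
	// the constants table so each 16 byte chunk is multiplied by
	// the constant matching its distance from the end of the
	// buffer, then fold everything into V19/V20.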
	CMP $0,R5
	BEQ zero

	// compute the starting point in the short constants

	CMP R14,$1
	BNE castshTable
	MOVD $·IEEEConst(SB),R3
	ADD $4080,R3 // offset of the short constants
	BR startshConst
castshTable:
	MOVD $·CastConst(SB),R3
	ADD $4080,R3 // offset of the short constants

startshConst:
	SUBC R5,$256,R6 // R6 = 256 - len
	ADD R3,R6,R3

	SRD $4,R5,R7 // number of 16 byte chunks
	MOVD R7,CTR

	VXOR V19,V19,V19 // clear the accumulators
	VXOR V20,V20,V20

	LVX (R4),V0
	LVX (R3),V16
	VXOR V0,V8,V0 // xor in the initial crc held in V8
	VPMSUMW V0,V16,V0
	BC 18,0,v0

	LVX (R4+off16),V1
	LVX (R3+off16),V17
	VPMSUMW V1,V17,V1
	BC 18,0,v1

	LVX (R4+off32),V2
	LVX (R3+off32),V16
	VPMSUMW V2,V16,V2
	BC 18,0,v2

	LVX (R4+off48),V3
	LVX (R3+off48),V17
	VPMSUMW V3,V17,V3
	BC 18,0,v3

	LVX (R4+off64),V4
	LVX (R3+off64),V16
	VPMSUMW V4,V16,V4
	BC 18,0,v4

	LVX (R4+off80),V5
	LVX (R3+off80),V17
	VPMSUMW V5,V17,V5
	BC 18,0,v5

	LVX (R4+off96),V6
	LVX (R3+off96),V16
	VPMSUMW V6,V16,V6
	BC 18,0,v6

	LVX (R4+off112),V7
	LVX (R3+off112),V17
	VPMSUMW V7,V17,V7
	BC 18,0,v7

	ADD $128,R3
	ADD $128,R4

	LVX (R4),V8
	LVX (R3),V16
	VPMSUMW V8,V16,V8
	BC 18,0,v8

	LVX (R4+off16),V9
	LVX (R3+off16),V17
	VPMSUMW V9,V17,V9
	BC 18,0,v9

	LVX (R4+off32),V10
	LVX (R3+off32),V16
	VPMSUMW V10,V16,V10
	BC 18,0,v10

	LVX (R4+off48),V11
	LVX (R3+off48),V17
	VPMSUMW V11,V17,V11
	BC 18,0,v11

	LVX (R4+off64),V12
	LVX (R3+off64),V16
	VPMSUMW V12,V16,V12
	BC 18,0,v12

	LVX (R4+off80),V13
	LVX (R3+off80),V17
	VPMSUMW V13,V17,V13
	BC 18,0,v13

	LVX (R4+off96),V14
	LVX (R3+off96),V16
	VPMSUMW V14,V16,V14
	BC 18,0,v14

	LVX (R4+off112),V15
	LVX (R3+off112),V17
	VPMSUMW V15,V17,V15

	VXOR V19,V15,V19
v14:	VXOR V20,V14,V20
v13:	VXOR V19,V13,V19
v12:	VXOR V20,V12,V20
v11:	VXOR V19,V11,V19
v10:	VXOR V20,V10,V20
v9:	VXOR V19,V9,V19
v8:	VXOR V20,V8,V20
v7:	VXOR V19,V7,V19
v6:	VXOR V20,V6,V20
v5:	VXOR V19,V5,V19
v4:	VXOR V20,V4,V20
v3:	VXOR V19,V3,V19
v2:	VXOR V20,V2,V20
v1:	VXOR V19,V1,V19
v0:	VXOR V20,V0,V20

	VXOR V19,V20,V0

	BR barrett_reduction

zero:
	// Zero length input: return the incoming crc unchanged.
	MOVW R10,ret+32(FP)
	RET
