crc32c/
hw_aarch64.rs

1use crate::hw_tables;
2use crate::util::{self, U64Le};
3use std::arch::aarch64 as simd;
4
5pub unsafe fn crc32c(crci: u32, buffer: &[u8]) -> u32 {
6    let mut crc0 = !crci;
7    let (begin, middle, end) = util::split(buffer);
8
9    // We're effectively cheating by using the software implementation
10    // for now. The bit-flips simulate going back-and-forth between
11    // the inner computations of the software implementation
12    //
13    // This needs a little more optimization, and to use the typical
14    // crc32cb instruction rather than using the software implementation.
15    crc0 = crc_u8(crc0, begin);
16
17    // Most CPUs have a latency of 3 on these instructions,
18    // meaning we must use 3 of them at a time, to leverage
19    // hardware parallelism.
20    //
21    // TODO: validate that this is true on ARM
22    //
23    // First do chunks of size LONG * 3.
24    let chunk_size = (hw_tables::LONG * 3) / 8;
25    let last_chunk = middle.len() / chunk_size * chunk_size;
26
27    let (middle_first, middle_last) = middle.split_at(last_chunk);
28
29    crc0 = crc_u64_parallel3(crc0, chunk_size, &hw_tables::LONG_TABLE, middle_first);
30
31    // Now do chunks of size SHORT * 3.
32    let chunk_size = (hw_tables::SHORT * 3) / 8;
33    let last_chunk = middle_last.len() / chunk_size * chunk_size;
34
35    let (middle_last_first, middle_last_last) = middle_last.split_at(last_chunk);
36
37    crc0 = crc_u64_parallel3(crc0, chunk_size, &hw_tables::SHORT_TABLE, middle_last_first);
38
39    // Now the last part, less than SHORT * 3 but still a multiple of 8-bytes.
40    crc0 = crc_u64(crc0, middle_last_last);
41
42    !crc_u8(crc0, end)
43}
44
45#[inline]
46#[target_feature(enable = "crc")]
47unsafe fn crc_u8(crc: u32, buffer: &[u8]) -> u32 {
48    buffer
49        .iter()
50        .fold(crc, |crc, &next| simd::__crc32cb(crc, next))
51}
52
53#[inline(always)]
54unsafe fn crc_u64(crc: u32, words: &[U64Le]) -> u32 {
55    words
56        .iter()
57        .fold(crc, |crc, &next| crc_u64_append(crc, next.get()))
58}
59
60#[inline(always)]
61unsafe fn crc_u64_append(crc: u32, next: u64) -> u32 {
62    simd::__crc32cd(crc, next)
63}
64
65#[inline(always)]
66unsafe fn crc_u64_parallel3(
67    crc: u32,
68    chunk_size: usize,
69    table: &hw_tables::CrcTable,
70    buffer: &[U64Le],
71) -> u32 {
72    buffer.chunks(chunk_size).fold(crc, |mut crc0, chunk| {
73        let mut crc1 = 0;
74        let mut crc2 = 0;
75
76        // Divide it in three.
77        let block_size = chunk_size / 3;
78
79        let mut blocks = chunk.chunks(block_size);
80        let a = blocks.next().unwrap();
81        let b = blocks.next().unwrap();
82        let c = blocks.next().unwrap();
83
84        for i in 0..block_size {
85            crc0 = crc_u64_append(crc0, a[i].get());
86            crc1 = crc_u64_append(crc1, b[i].get());
87            crc2 = crc_u64_append(crc2, c[i].get());
88        }
89
90        crc0 = table.shift_u32(crc0) ^ crc1;
91        crc0 = table.shift_u32(crc0) ^ crc2;
92
93        crc0
94    })
95}