blake2b_simd/
many.rs

1//! Interfaces for hashing multiple inputs at once, using SIMD more
2//! efficiently.
3//!
4//! The throughput of these interfaces is comparable to BLAKE2bp, about twice
5//! the throughput of regular BLAKE2b when AVX2 is available.
6//!
7//! These interfaces can accept any number of inputs, and the implementation
8//! does its best to parallelize them. In general, the more inputs you can pass
9//! in at once the better. If you need to batch your inputs in smaller groups,
10//! see the [`degree`](fn.degree.html) function for a good batch size.
11//!
12//! The implementation keeps working in parallel even when inputs are of
13//! different lengths, by managing a working set of jobs whose input isn't yet
14//! exhausted. However, if one or two inputs are much longer than the others,
15//! and they're encountered only at the end, there might not be any remaining
16//! work to parallelize them with. In this case, sorting the inputs
17//! longest-first can improve parallelism.
18//!
19//! # Example
20//!
21//! ```
22//! use blake2b_simd::{blake2b, State, many::update_many};
23//!
24//! let mut states = [
25//!     State::new(),
26//!     State::new(),
27//!     State::new(),
28//!     State::new(),
29//! ];
30//!
31//! let inputs = [
32//!     &b"foo"[..],
33//!     &b"bar"[..],
34//!     &b"baz"[..],
35//!     &b"bing"[..],
36//! ];
37//!
38//! update_many(states.iter_mut().zip(inputs.iter()));
39//!
40//! for (state, input) in states.iter_mut().zip(inputs.iter()) {
41//!     assert_eq!(blake2b(input), state.finalize());
42//! }
43//! ```
44
45use crate::guts::{self, Finalize, Implementation, Job, LastNode, Stride};
46use crate::state_words_to_bytes;
47use crate::Count;
48use crate::Hash;
49use crate::Params;
50use crate::State;
51use crate::Word;
52use crate::BLOCKBYTES;
53use arrayvec::ArrayVec;
54use core::fmt;
55
56/// The largest possible value of [`degree`](fn.degree.html) on the target
57/// platform.
58///
59/// Note that this constant reflects the parallelism degree supported by this
60/// crate, so it will change over time as support is added or removed. For
61/// example, when Rust stabilizes AVX-512 support and this crate adds an
62/// AVX-512 implementation, this constant will double on x86 targets. If that
63/// implementation is an optional feature (e.g. because it's nightly-only), the
64/// value of this constant will depend on that optional feature also.
65pub const MAX_DEGREE: usize = guts::MAX_DEGREE;
66
67/// The parallelism degree of the implementation, detected at runtime. If you
68/// hash your inputs in small batches, making the batch size a multiple of
69/// `degree` will generally give good performance.
70///
71/// For example, an x86 processor that supports AVX2 can compute four BLAKE2b
72/// hashes in parallel, so `degree` returns 4 on that machine. If you call
73/// [`hash_many`] with only three inputs, that's not enough to use the AVX2
74/// implementation, and your average throughput will be lower. Likewise if you
75/// call it with five inputs of equal length, the first four will be hashed in
76/// parallel with AVX2, but the last one will have to be hashed by itself, and
77/// again your average throughput will be lower.
78///
79/// As noted in the module level docs, performance is more complicated if your
80/// inputs are of different lengths. When parallelizing long and short inputs
81/// together, the longer ones will have bytes left over, and the implementation
82/// will try to parallelize those leftover bytes with subsequent inputs. The
83/// more inputs available in that case, the more the implementation will be
84/// able to parallelize.
85///
86/// If you need a constant batch size, for example to collect inputs in an
87/// array, see [`MAX_DEGREE`].
88///
89/// [`hash_many`]: fn.hash_many.html
90/// [`MAX_DEGREE`]: constant.MAX_DEGREE.html
91pub fn degree() -> usize {
92    guts::Implementation::detect().degree()
93}
94
95type JobsVec<'a, 'b> = ArrayVec<Job<'a, 'b>, { guts::MAX_DEGREE }>;
96
97#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
98#[inline(always)]
99fn fill_jobs_vec<'a, 'b>(
100    jobs_iter: &mut impl Iterator<Item = Job<'a, 'b>>,
101    vec: &mut JobsVec<'a, 'b>,
102    target_len: usize,
103) {
104    while vec.len() < target_len {
105        if let Some(job) = jobs_iter.next() {
106            vec.push(job);
107        } else {
108            break;
109        }
110    }
111}
112
113#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
114#[inline(always)]
115fn evict_finished<'a, 'b>(vec: &mut JobsVec<'a, 'b>, num_jobs: usize) {
116    // Iterate backwards so that removal doesn't cause an out-of-bounds panic.
117    for i in (0..num_jobs).rev() {
118        // Note that is_empty() is only valid because we know all these jobs
119        // have been run at least once. Otherwise we could confuse the empty
120        // input for a finished job, which would be incorrect.
121        //
122        // Avoid a panic branch here in release mode.
123        debug_assert!(vec.len() > i);
124        if vec.len() > i && vec[i].input.is_empty() {
125            // Note that calling pop_at() repeatedly has some overhead, because
126            // later elements need to be shifted up. However, the JobsVec is
127            // small, and this approach guarantees that jobs are encountered in
128            // order.
129            vec.pop_at(i);
130        }
131    }
132}
133
134pub(crate) fn compress_many<'a, 'b, I>(
135    jobs: I,
136    imp: Implementation,
137    finalize: Finalize,
138    stride: Stride,
139) where
140    I: IntoIterator<Item = Job<'a, 'b>>,
141{
142    // Fuse is important for correctness, since each of these blocks tries to
143    // advance the iterator, even if a previous block emptied it.
144    #[allow(unused_mut)]
145    let mut jobs_iter = jobs.into_iter().fuse();
146    #[allow(unused_mut)]
147    let mut jobs_vec = JobsVec::new();
148
149    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
150    if imp.degree() >= 4 {
151        loop {
152            fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 4);
153            if jobs_vec.len() < 4 {
154                break;
155            }
156            let jobs_array = arrayref::array_mut_ref!(jobs_vec, 0, 4);
157            imp.compress4_loop(jobs_array, finalize, stride);
158            evict_finished(&mut jobs_vec, 4);
159        }
160    }
161
162    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
163    if imp.degree() >= 2 {
164        loop {
165            fill_jobs_vec(&mut jobs_iter, &mut jobs_vec, 2);
166            if jobs_vec.len() < 2 {
167                break;
168            }
169            let jobs_array = arrayref::array_mut_ref!(jobs_vec, 0, 2);
170            imp.compress2_loop(jobs_array, finalize, stride);
171            evict_finished(&mut jobs_vec, 2);
172        }
173    }
174
175    for job in jobs_vec.into_iter().chain(jobs_iter) {
176        let Job {
177            input,
178            words,
179            count,
180            last_node,
181        } = job;
182        imp.compress1_loop(input, words, count, last_node, finalize, stride);
183    }
184}
185
186/// Update any number of `State` objects at once.
187///
188/// # Example
189///
190/// ```
191/// use blake2b_simd::{blake2b, State, many::update_many};
192///
193/// let mut states = [
194///     State::new(),
195///     State::new(),
196///     State::new(),
197///     State::new(),
198/// ];
199///
200/// let inputs = [
201///     &b"foo"[..],
202///     &b"bar"[..],
203///     &b"baz"[..],
204///     &b"bing"[..],
205/// ];
206///
207/// update_many(states.iter_mut().zip(inputs.iter()));
208///
209/// for (state, input) in states.iter_mut().zip(inputs.iter()) {
210///     assert_eq!(blake2b(input), state.finalize());
211/// }
212/// ```
213pub fn update_many<'a, 'b, I, T>(pairs: I)
214where
215    I: IntoIterator<Item = (&'a mut State, &'b T)>,
216    T: 'b + AsRef<[u8]> + ?Sized,
217{
218    // Get the guts::Implementation from the first state, if any.
219    let mut peekable_pairs = pairs.into_iter().peekable();
220    let implementation = if let Some((state, _)) = peekable_pairs.peek() {
221        state.implementation
222    } else {
223        // No work items, just short circuit.
224        return;
225    };
226
227    // Adapt the pairs iterator into a Jobs iterator, but skip over the Jobs
228    // where there's not actually any work to do (e.g. because there's not much
229    // input and it's all just going in the State buffer).
230    let jobs = peekable_pairs.flat_map(|(state, input_t)| {
231        let mut input = input_t.as_ref();
232        // For each pair, if the State has some input in its buffer, try to
233        // finish that buffer. If there wasn't enough input to do that --
234        // or if the input was empty to begin with -- skip this pair.
235        state.compress_buffer_if_possible(&mut input);
236        if input.is_empty() {
237            return None;
238        }
239        // Now we know the buffer is empty and there's more input. Make sure we
240        // buffer the final block, because update() doesn't finalize.
241        let mut last_block_start = input.len() - 1;
242        last_block_start -= last_block_start % BLOCKBYTES;
243        let (blocks, last_block) = input.split_at(last_block_start);
244        state.buf[..last_block.len()].copy_from_slice(last_block);
245        state.buflen = last_block.len() as u8;
246        // Finally, if the full blocks slice is non-empty, prepare that job for
247        // compression, and bump the State count.
248        if blocks.is_empty() {
249            None
250        } else {
251            let count = state.count;
252            state.count = state.count.wrapping_add(blocks.len() as Count);
253            Some(Job {
254                input: blocks,
255                words: &mut state.words,
256                count,
257                last_node: state.last_node,
258            })
259        }
260    });
261
262    // Run all the Jobs in the iterator.
263    compress_many(jobs, implementation, Finalize::No, Stride::Serial);
264}
265
266/// A job for the [`hash_many`] function. After calling [`hash_many`] on a
267/// collection of `HashManyJob` objects, you can call [`to_hash`] on each job
268/// to get the result.
269///
270/// [`hash_many`]: fn.hash_many.html
271/// [`to_hash`]: struct.HashManyJob.html#method.to_hash
272#[derive(Clone)]
273pub struct HashManyJob<'a> {
274    words: [Word; 8],
275    count: Count,
276    last_node: LastNode,
277    hash_length: u8,
278    input: &'a [u8],
279    finished: bool,
280    implementation: guts::Implementation,
281}
282
283impl<'a> HashManyJob<'a> {
284    /// Construct a new `HashManyJob` from a set of hashing parameters and an
285    /// input.
286    #[inline]
287    pub fn new(params: &Params, input: &'a [u8]) -> Self {
288        let mut words = params.to_words();
289        let mut count = 0;
290        let mut finished = false;
291        // If we have key bytes, compress them into the state words. If there's
292        // no additional input, this compression needs to finalize and set
293        // finished=true.
294        if params.key_length > 0 {
295            let mut finalization = Finalize::No;
296            if input.is_empty() {
297                finalization = Finalize::Yes;
298                finished = true;
299            }
300            params.implementation.compress1_loop(
301                &params.key_block,
302                &mut words,
303                0,
304                params.last_node,
305                finalization,
306                Stride::Serial,
307            );
308            count = BLOCKBYTES as Count;
309        }
310        Self {
311            words,
312            count,
313            last_node: params.last_node,
314            hash_length: params.hash_length,
315            input,
316            finished,
317            implementation: params.implementation,
318        }
319    }
320
321    /// Get the hash from a finished job. If you call this before calling
322    /// [`hash_many`], it will panic in debug mode.
323    ///
324    /// [`hash_many`]: fn.hash_many.html
325    #[inline]
326    pub fn to_hash(&self) -> Hash {
327        debug_assert!(self.finished, "job hasn't been run yet");
328        Hash {
329            bytes: state_words_to_bytes(&self.words),
330            len: self.hash_length,
331        }
332    }
333}
334
335impl<'a> fmt::Debug for HashManyJob<'a> {
336    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
337        // NB: Don't print the words. Leaking them would allow length extension.
338        write!(
339            f,
340            "HashManyJob {{ count: {}, hash_length: {}, last_node: {}, input_len: {} }}",
341            self.count,
342            self.hash_length,
343            self.last_node.yes(),
344            self.input.len(),
345        )
346    }
347}
348
349/// Hash any number of complete inputs all at once.
350///
351/// This is slightly more efficient than using `update_many` with `State`
352/// objects, because it doesn't need to do any buffering.
353///
354/// Running `hash_many` on the same `HashManyJob` object more than once has no
355/// effect.
356///
357/// # Example
358///
359/// ```
360/// use blake2b_simd::{blake2b, Params, many::{HashManyJob, hash_many}};
361///
362/// let inputs = [
363///     &b"foo"[..],
364///     &b"bar"[..],
365///     &b"baz"[..],
366///     &b"bing"[..],
367/// ];
368///
369/// let mut params = Params::new();
370/// params.hash_length(16);
371///
372/// let mut jobs = [
373///     HashManyJob::new(&params, inputs[0]),
374///     HashManyJob::new(&params, inputs[1]),
375///     HashManyJob::new(&params, inputs[2]),
376///     HashManyJob::new(&params, inputs[3]),
377/// ];
378///
379/// hash_many(jobs.iter_mut());
380///
381/// for (input, job) in inputs.iter().zip(jobs.iter()) {
382///     let expected = params.hash(input);
383///     assert_eq!(expected, job.to_hash());
384/// }
385/// ```
386pub fn hash_many<'a, 'b, I>(hash_many_jobs: I)
387where
388    'b: 'a,
389    I: IntoIterator<Item = &'a mut HashManyJob<'b>>,
390{
391    // Get the guts::Implementation from the first job, if any.
392    let mut peekable_jobs = hash_many_jobs.into_iter().peekable();
393    let implementation = if let Some(job) = peekable_jobs.peek() {
394        job.implementation
395    } else {
396        // No work items, just short circuit.
397        return;
398    };
399
400    // In the jobs iterator, skip HashManyJobs that have already been run. This
401    // is less because we actually expect callers to call hash_many twice
402    // (though they're allowed to if they want), and more because
403    // HashManyJob::new might need to finalize if there are key bytes but no
404    // input. Tying the job lifetime to the Params reference is an alternative,
405    // but I've found it too constraining in practice. We could also put key
406    // bytes in every HashManyJob, but that would add unnecessary storage and
407    // zeroing for all callers.
408    let unfinished_jobs = peekable_jobs.into_iter().filter(|j| !j.finished);
409    let jobs = unfinished_jobs.map(|j| {
410        j.finished = true;
411        Job {
412            input: j.input,
413            words: &mut j.words,
414            count: j.count,
415            last_node: j.last_node,
416        }
417    });
418    compress_many(jobs, implementation, Finalize::Yes, Stride::Serial);
419}
420
421#[cfg(test)]
422mod test {
423    use super::*;
424    use crate::guts;
425    use crate::paint_test_input;
426    use crate::BLOCKBYTES;
427    use arrayvec::ArrayVec;
428
429    #[test]
430    fn test_degree() {
431        assert!(degree() <= MAX_DEGREE);
432
433        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
434        #[cfg(feature = "std")]
435        {
436            if is_x86_feature_detected!("avx2") {
437                assert!(degree() >= 4);
438            }
439            if is_x86_feature_detected!("sse4.1") {
440                assert!(degree() >= 2);
441            }
442        }
443    }
444
445    #[test]
446    fn test_hash_many() {
447        // Use a length of inputs that will exercise all of the power-of-two loops.
448        const LEN: usize = 2 * guts::MAX_DEGREE - 1;
449
450        // Rerun LEN inputs LEN different times, with the empty input starting in a
451        // different spot each time.
452        let mut input = [0; LEN * BLOCKBYTES];
453        paint_test_input(&mut input);
454        for start_offset in 0..LEN {
455            let mut inputs: [&[u8]; LEN] = [&[]; LEN];
456            for i in 0..LEN {
457                let chunks = (i + start_offset) % LEN;
458                inputs[i] = &input[..chunks * BLOCKBYTES];
459            }
460
461            let mut params: ArrayVec<Params, LEN> = ArrayVec::new();
462            for i in 0..LEN {
463                let mut p = Params::new();
464                p.node_offset(i as u64);
465                if i % 2 == 1 {
466                    p.last_node(true);
467                    p.key(b"foo");
468                }
469                params.push(p);
470            }
471
472            let mut jobs: ArrayVec<HashManyJob, LEN> = ArrayVec::new();
473            for i in 0..LEN {
474                jobs.push(HashManyJob::new(&params[i], inputs[i]));
475            }
476
477            hash_many(&mut jobs);
478
479            // Check the outputs.
480            for i in 0..LEN {
481                let expected = params[i].hash(inputs[i]);
482                assert_eq!(expected, jobs[i].to_hash());
483            }
484        }
485    }
486
487    #[test]
488    fn test_update_many() {
489        // Use a length of inputs that will exercise all of the power-of-two loops.
490        const LEN: usize = 2 * guts::MAX_DEGREE - 1;
491
492        // Rerun LEN inputs LEN different times, with the empty input starting in a
493        // different spot each time.
494        let mut input = [0; LEN * BLOCKBYTES];
495        paint_test_input(&mut input);
496        for start_offset in 0..LEN {
497            let mut inputs: [&[u8]; LEN] = [&[]; LEN];
498            for i in 0..LEN {
499                let chunks = (i + start_offset) % LEN;
500                inputs[i] = &input[..chunks * BLOCKBYTES];
501            }
502
503            let mut params: ArrayVec<Params, LEN> = ArrayVec::new();
504            for i in 0..LEN {
505                let mut p = Params::new();
506                p.node_offset(i as u64);
507                if i % 2 == 1 {
508                    p.last_node(true);
509                    p.key(b"foo");
510                }
511                params.push(p);
512            }
513
514            let mut states: ArrayVec<State, LEN> = ArrayVec::new();
515            for i in 0..LEN {
516                states.push(params[i].to_state());
517            }
518
519            // Run each input twice through, to exercise buffering.
520            update_many(states.iter_mut().zip(inputs.iter()));
521            update_many(states.iter_mut().zip(inputs.iter()));
522
523            // Check the outputs.
524            for i in 0..LEN {
525                let mut reference_state = params[i].to_state();
526                // Again, run the input twice.
527                reference_state.update(inputs[i]);
528                reference_state.update(inputs[i]);
529                assert_eq!(reference_state.finalize(), states[i].finalize());
530                assert_eq!(2 * inputs[i].len() as Count, states[i].count());
531            }
532        }
533    }
534}