1use alloc::vec::Vec;
23use p3_maybe_rayon::prelude::*;
4use tracing::instrument;
56use crate::field::Field;
7use crate::{FieldAlgebra, FieldArray, PackedValue};
89/// Batch multiplicative inverses with Montgomery's trick
10/// This is Montgomery's trick. At a high level, we invert the product of the given field
11/// elements, then derive the individual inverses from that via multiplication.
12///
13/// The usual Montgomery trick involves calculating an array of cumulative products,
14/// resulting in a long dependency chain. To increase instruction-level parallelism, we
15/// compute WIDTH separate cumulative product arrays that only meet at the end.
16///
17/// # Panics
18/// This will panic if any of the inputs is zero.
19#[instrument(level = "debug", skip_all)]
20pub fn batch_multiplicative_inverse<F: Field>(x: &[F]) -> Vec<F> {
21// How many elements to invert in one thread.
22const CHUNK_SIZE: usize = 1024;
2324let n = x.len();
25let mut result = F::zero_vec(n);
2627 x.par_chunks(CHUNK_SIZE)
28 .zip(result.par_chunks_mut(CHUNK_SIZE))
29 .for_each(|(x, result)| {
30 batch_multiplicative_inverse_helper(x, result);
31 });
3233 result
34}
3536/// Like `batch_multiplicative_inverse`, but writes the result to the given output buffer.
37fn batch_multiplicative_inverse_helper<F: Field>(x: &[F], result: &mut [F]) {
38// Higher WIDTH increases instruction-level parallelism, but too high a value will cause us
39 // to run out of registers.
40const WIDTH: usize = 4;
4142let n = x.len();
43assert_eq!(result.len(), n);
44if n % WIDTH != 0 {
45// There isn't a very clean way to do this with FieldArray; for now just do it in serial.
46 // Another simple (though suboptimal) workaround would be to make two separate calls, one
47 // for the packed part and one for the remainder.
48return batch_multiplicative_inverse_general(x, result, |x| x.inverse());
49 }
5051let x_packed = FieldArray::<F, 4>::pack_slice(x);
52let result_packed = FieldArray::<F, 4>::pack_slice_mut(result);
5354 batch_multiplicative_inverse_general(x_packed, result_packed, |x_packed| x_packed.inverse());
55}
5657/// A simple single-threaded implementation of Montgomery's trick. Since not all `FieldAlgebra`s
58/// support inversion, this takes a custom inversion function.
59pub(crate) fn batch_multiplicative_inverse_general<F, Inv>(x: &[F], result: &mut [F], inv: Inv)
60where
61F: FieldAlgebra + Copy,
62 Inv: Fn(F) -> F,
63{
64let n = x.len();
65assert_eq!(result.len(), n);
66if n == 0 {
67return;
68 }
6970 result[0] = F::ONE;
71for i in 1..n {
72 result[i] = result[i - 1] * x[i - 1];
73 }
7475let product = result[n - 1] * x[n - 1];
76let mut inv = inv(product);
7778for i in (0..n).rev() {
79 result[i] *= inv;
80 inv *= x[i];
81 }
82}