p3_field/batch_inverse.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
use alloc::vec::Vec;
use p3_maybe_rayon::prelude::*;
use tracing::instrument;
use crate::field::Field;
use crate::{AbstractField, FieldArray, PackedValue};
/// Batch multiplicative inverses with Montgomery's trick
/// This is Montgomery's trick. At a high level, we invert the product of the given field
/// elements, then derive the individual inverses from that via multiplication.
///
/// The usual Montgomery trick involves calculating an array of cumulative products,
/// resulting in a long dependency chain. To increase instruction-level parallelism, we
/// compute WIDTH separate cumulative product arrays that only meet at the end.
///
/// # Panics
/// This will panic if any of the inputs is zero.
#[instrument(level = "debug", skip_all)]
pub fn batch_multiplicative_inverse<F: Field>(x: &[F]) -> Vec<F> {
// How many elements to invert in one thread.
const CHUNK_SIZE: usize = 1024;
let n = x.len();
let mut result = F::zero_vec(n);
x.par_chunks(CHUNK_SIZE)
.zip(result.par_chunks_mut(CHUNK_SIZE))
.for_each(|(x, result)| {
batch_multiplicative_inverse_helper(x, result);
});
result
}
/// Like `batch_multiplicative_inverse`, but writes the result to the given output buffer.
fn batch_multiplicative_inverse_helper<F: Field>(x: &[F], result: &mut [F]) {
// Higher WIDTH increases instruction-level parallelism, but too high a value will cause us
// to run out of registers.
const WIDTH: usize = 4;
let n = x.len();
assert_eq!(result.len(), n);
if n % WIDTH != 0 {
// There isn't a very clean way to do this with FieldArray; for now just do it in serial.
// Another simple (though suboptimal) workaround would be to make two separate calls, one
// for the packed part and one for the remainder.
return batch_multiplicative_inverse_general(x, result, |x| x.inverse());
}
let x_packed = FieldArray::<F, 4>::pack_slice(x);
let result_packed = FieldArray::<F, 4>::pack_slice_mut(result);
batch_multiplicative_inverse_general(x_packed, result_packed, |x_packed| x_packed.inverse());
}
/// A simple single-threaded implementation of Montgomery's trick. Since not all `AbstractField`s
/// support inversion, this takes a custom inversion function.
pub(crate) fn batch_multiplicative_inverse_general<F, Inv>(x: &[F], result: &mut [F], inv: Inv)
where
F: AbstractField + Copy,
Inv: Fn(F) -> F,
{
let n = x.len();
assert_eq!(result.len(), n);
if n == 0 {
return;
}
result[0] = F::ONE;
for i in 1..n {
result[i] = result[i - 1] * x[i - 1];
}
let product = result[n - 1] * x[n - 1];
let mut inv = inv(product);
for i in (0..n).rev() {
result[i] *= inv;
inv *= x[i];
}
}