p3_goldilocks/
goldilocks.rs

1use alloc::vec;
2use alloc::vec::Vec;
3use core::fmt::{Debug, Display, Formatter};
4use core::hash::{Hash, Hasher};
5use core::iter::{Product, Sum};
6use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
7use core::{array, fmt};
8
9use num_bigint::BigUint;
10use p3_challenger::UniformSamplingField;
11use p3_field::exponentiation::exp_10540996611094048183;
12use p3_field::integers::QuotientMap;
13use p3_field::op_assign_macros::{
14    impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign,
15};
16use p3_field::{
17    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField,
18    PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64,
19    quotient_map_large_iint, quotient_map_large_uint, quotient_map_small_int,
20};
21use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner};
22use rand::Rng;
23use rand::distr::{Distribution, StandardUniform};
24use serde::{Deserialize, Serialize};
25
26/// The Goldilocks prime
27pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001;
28
29/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
30///
31/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64.
32#[derive(Copy, Clone, Default, Serialize, Deserialize)]
33#[repr(transparent)] // Important for reasoning about memory layout
34#[must_use]
35pub struct Goldilocks {
36    /// Not necessarily canonical.
37    pub(crate) value: u64,
38}
39
40impl Goldilocks {
41    pub(crate) const fn new(value: u64) -> Self {
42        Self { value }
43    }
44
45    /// Convert a constant u64 array into a constant Goldilocks array.
46    ///
47    /// This is a const version of `.map(Goldilocks::new)`.
48    #[inline]
49    pub(crate) const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
50        let mut output = [Self::ZERO; N];
51        let mut i = 0;
52        while i < N {
53            output[i].value = input[i];
54            i += 1;
55        }
56        output
57    }
58
59    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
60    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
61
62    /// A list of generators for the two-adic subgroups of the goldilocks field.
63    ///
64    /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
65    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
66        0x0000000000000001,
67        0xffffffff00000000,
68        0x0001000000000000,
69        0xfffffffeff000001,
70        0xefffffff00000001,
71        0x00003fffffffc000,
72        0x0000008000000000,
73        0xf80007ff08000001,
74        0xbf79143ce60ca966,
75        0x1905d02a5c411f4e,
76        0x9d8f2ad78bfed972,
77        0x0653b4801da1c8cf,
78        0xf2c35199959dfcb6,
79        0x1544ef2335d17997,
80        0xe0ee099310bba1e2,
81        0xf6b2cffe2306baac,
82        0x54df9630bf79450e,
83        0xabd0a6e8aa3d8a0e,
84        0x81281a7b05f9beac,
85        0xfbd41c6b8caa3302,
86        0x30ba2ecd5e93e76d,
87        0xf502aef532322654,
88        0x4b2a18ade67246b5,
89        0xea9d5a1336fbc98b,
90        0x86cdcc31c307e171,
91        0x4bbaf5976ecfefd8,
92        0xed41d05b78d6e286,
93        0x10d78dd8915a171d,
94        0x59049500004a4485,
95        0xdfa8c93ba46d2666,
96        0x7e9bd009b86a0845,
97        0x400a7f755588e659,
98        0x185629dcda58878c,
99    ]);
100
101    /// A list of powers of two from 0 to 95.
102    ///
103    /// Note that 2^{96} = -1 mod P so all powers of two can be simply
104    /// derived from this list.
105    const POWERS_OF_TWO: [Self; 96] = {
106        let mut powers_of_two = [Self::ONE; 96];
107
108        let mut i = 1;
109        while i < 64 {
110            powers_of_two[i] = Self::new(1 << i);
111            i += 1;
112        }
113        let mut var = Self::new(1 << 63);
114        while i < 96 {
115            var = const_add(var, var);
116            powers_of_two[i] = var;
117            i += 1;
118        }
119        powers_of_two
120    };
121}
122
123impl PartialEq for Goldilocks {
124    fn eq(&self, other: &Self) -> bool {
125        self.as_canonical_u64() == other.as_canonical_u64()
126    }
127}
128
129impl Eq for Goldilocks {}
130
131impl Packable for Goldilocks {}
132
133impl Hash for Goldilocks {
134    fn hash<H: Hasher>(&self, state: &mut H) {
135        state.write_u64(self.as_canonical_u64());
136    }
137}
138
139impl Ord for Goldilocks {
140    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
141        self.as_canonical_u64().cmp(&other.as_canonical_u64())
142    }
143}
144
145impl PartialOrd for Goldilocks {
146    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
147        Some(self.cmp(other))
148    }
149}
150
151impl Display for Goldilocks {
152    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
153        Display::fmt(&self.as_canonical_u64(), f)
154    }
155}
156
157impl Debug for Goldilocks {
158    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
159        Debug::fmt(&self.as_canonical_u64(), f)
160    }
161}
162
163impl Distribution<Goldilocks> for StandardUniform {
164    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
165        loop {
166            let next_u64 = rng.next_u64();
167            let is_canonical = next_u64 < Goldilocks::ORDER_U64;
168            if is_canonical {
169                return Goldilocks::new(next_u64);
170            }
171        }
172    }
173}
174
175impl UniformSamplingField for Goldilocks {
176    const MAX_SINGLE_SAMPLE_BITS: usize = 24;
177    const SAMPLING_BITS_M: [u64; 64] = {
178        let prime: u64 = P;
179        let mut a = [0u64; 64];
180        let mut k = 0;
181        while k < 64 {
182            if k == 0 {
183                a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always.
184            } else {
185                // Create a mask to zero out the last k bits
186                let mask = !((1u64 << k) - 1);
187                a[k] = prime & mask;
188            }
189            k += 1;
190        }
191        a
192    };
193}
194
195impl PrimeCharacteristicRing for Goldilocks {
196    type PrimeSubfield = Self;
197
198    const ZERO: Self = Self::new(0);
199    const ONE: Self = Self::new(1);
200    const TWO: Self = Self::new(2);
201    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
202
203    #[inline]
204    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
205        f
206    }
207
208    #[inline]
209    fn from_bool(b: bool) -> Self {
210        Self::new(b.into())
211    }
212
213    #[inline]
214    fn halve(&self) -> Self {
215        Self::new(halve_u64::<P>(self.value))
216    }
217
218    #[inline]
219    fn mul_2exp_u64(&self, exp: u64) -> Self {
220        // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P.
221        if exp < 96 {
222            *self * Self::POWERS_OF_TWO[exp as usize]
223        } else if exp < 192 {
224            -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
225        } else {
226            self.mul_2exp_u64(exp % 192)
227        }
228    }
229
230    #[inline]
231    fn div_2exp_u64(&self, mut exp: u64) -> Self {
232        // In the goldilocks field, 2^192 = 1 mod P.
233        // Thus 2^{-n} = 2^{192 - n} mod P.
234        exp %= 192;
235        self.mul_2exp_u64(192 - exp)
236    }
237
238    #[inline]
239    fn sum_array<const N: usize>(input: &[Self]) -> Self {
240        assert_eq!(N, input.len());
241        // Benchmarking shows that for N <= 3 it's faster to sum the elements directly
242        // but for N > 3 it's faster to use the .sum() methods which passes through u128's
243        // allowing for delayed reductions.
244        match N {
245            0 => Self::ZERO,
246            1 => input[0],
247            2 => input[0] + input[1],
248            3 => input[0] + input[1] + input[2],
249            _ => input.iter().copied().sum(),
250        }
251    }
252
253    #[inline]
254    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
255        // The constant OFFSET has 2 important properties:
256        // 1. It is a multiple of P.
257        // 2. It is greater than the maximum possible value of the sum of the products of two u64s.
258        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
259        assert!((N as u32) <= (1 << 31));
260        match N {
261            0 => Self::ZERO,
262            1 => lhs[0] * rhs[0],
263            2 => {
264                // We unroll the N = 2 case as it is slightly faster and this is an important case
265                // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension.
266                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
267                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
268
269                // We know that long_prod_0, long_prod_1 < OFFSET.
270                // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET.
271                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
272                // Compiler really likes defining sum_corr here instead of in the if/else.
273                let sum_corr = sum.wrapping_sub(OFFSET);
274                if over {
275                    reduce128(sum_corr)
276                } else {
277                    reduce128(sum)
278                }
279            }
280            _ => {
281                let (lo_plus_hi, hi) = lhs
282                    .iter()
283                    .zip(rhs)
284                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
285                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
286                        // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits.
287                        let val_hi = (val >> 96) as u64;
288                        // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi.
289                        // As N <= 2^32, acc_hi cannot overflow.
290                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
291                    });
292                // First, remove the hi part from lo_plus_hi.
293                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
294                // As 2^{96} = -1 mod P, we simply need to reduce lo - hi.
295                // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow.
296                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
297                reduce128(sum)
298            }
299        }
300    }
301
302    #[inline]
303    fn zero_vec(len: usize) -> Vec<Self> {
304        // SAFETY:
305        // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment
306        // and memory layout making `flatten_to_base` safe. This this will create
307        // a vector Goldilocks elements with value set to 0.
308        unsafe { flatten_to_base(vec![0u64; len]) }
309    }
310}
311
312/// Degree of the smallest permutation polynomial for Goldilocks.
313///
314/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
315impl InjectiveMonomial<7> for Goldilocks {}
316
317impl PermutationMonomial<7> for Goldilocks {
318    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
319    ///
320    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
321    fn injective_exp_root_n(&self) -> Self {
322        exp_10540996611094048183(*self)
323    }
324}
325
326impl RawDataSerializable for Goldilocks {
327    impl_raw_serializable_primefield64!();
328}
329
330impl Field for Goldilocks {
331    // TODO: Add cfg-guarded Packing for NEON
332
333    #[cfg(all(
334        target_arch = "x86_64",
335        target_feature = "avx2",
336        not(target_feature = "avx512f")
337    ))]
338    type Packing = crate::PackedGoldilocksAVX2;
339
340    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
341    type Packing = crate::PackedGoldilocksAVX512;
342    #[cfg(not(any(
343        all(
344            target_arch = "x86_64",
345            target_feature = "avx2",
346            not(target_feature = "avx512f")
347        ),
348        all(target_arch = "x86_64", target_feature = "avx512f"),
349    )))]
350    type Packing = Self;
351
352    // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator()
353    const GENERATOR: Self = Self::new(7);
354
355    fn is_zero(&self) -> bool {
356        self.value == 0 || self.value == Self::ORDER_U64
357    }
358
359    fn try_inverse(&self) -> Option<Self> {
360        if self.is_zero() {
361            return None;
362        }
363
364        Some(gcd_inversion(*self))
365    }
366
367    #[inline]
368    fn order() -> BigUint {
369        P.into()
370    }
371}
372
373// We use macros to implement QuotientMap<Int> for all integer types except for u64 and i64.
374quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
375quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
376quotient_map_large_uint!(
377    Goldilocks,
378    u64,
379    Goldilocks::ORDER_U64,
380    "`[0, 2^64 - 2^32]`",
381    "`[0, 2^64 - 1]`",
382    [u128]
383);
384quotient_map_large_iint!(
385    Goldilocks,
386    i64,
387    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
388    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
389    [(i128, u128)]
390);
391
392impl QuotientMap<u64> for Goldilocks {
393    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
394    ///
395    /// No reduction is needed as the internal value is allowed
396    /// to be any u64.
397    #[inline]
398    fn from_int(int: u64) -> Self {
399        Self::new(int)
400    }
401
402    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
403    ///
404    /// Return `None` if the given integer is greater than `p = 2^64 - 2^32 + 1`.
405    #[inline]
406    fn from_canonical_checked(int: u64) -> Option<Self> {
407        (int < Self::ORDER_U64).then(|| Self::new(int))
408    }
409
410    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
411    ///
412    /// # Safety
413    /// In this case this function is actually always safe as the internal
414    /// value is allowed to be any u64.
415    #[inline(always)]
416    unsafe fn from_canonical_unchecked(int: u64) -> Self {
417        Self::new(int)
418    }
419}
420
421impl QuotientMap<i64> for Goldilocks {
422    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
423    ///
424    /// We simply need to deal with the sign.
425    #[inline]
426    fn from_int(int: i64) -> Self {
427        if int >= 0 {
428            Self::new(int as u64)
429        } else {
430            Self::new(Self::ORDER_U64.wrapping_add_signed(int))
431        }
432    }
433
434    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
435    ///
436    /// Returns none if the input does not lie in the range `(-(2^63 - 2^31), 2^63 - 2^31)`.
437    #[inline]
438    fn from_canonical_checked(int: i64) -> Option<Self> {
439        const POS_BOUND: i64 = (P >> 1) as i64;
440        const NEG_BOUND: i64 = -POS_BOUND;
441        match int {
442            0..=POS_BOUND => Some(Self::new(int as u64)),
443            NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
444            _ => None,
445        }
446    }
447
448    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
449    ///
450    /// # Safety
451    /// In this case this function is actually always safe as the internal
452    /// value is allowed to be any u64.
453    #[inline(always)]
454    unsafe fn from_canonical_unchecked(int: i64) -> Self {
455        Self::from_int(int)
456    }
457}
458
459impl PrimeField for Goldilocks {
460    fn as_canonical_biguint(&self) -> BigUint {
461        self.as_canonical_u64().into()
462    }
463}
464
465impl PrimeField64 for Goldilocks {
466    const ORDER_U64: u64 = P;
467
468    #[inline]
469    fn as_canonical_u64(&self) -> u64 {
470        let mut c = self.value;
471        // We only need one condition subtraction, since 2 * ORDER would not fit in a u64.
472        if c >= Self::ORDER_U64 {
473            c -= Self::ORDER_U64;
474        }
475        c
476    }
477}
478
479impl TwoAdicField for Goldilocks {
480    const TWO_ADICITY: usize = 32;
481
482    fn two_adic_generator(bits: usize) -> Self {
483        assert!(bits <= Self::TWO_ADICITY);
484        Self::TWO_ADIC_GENERATORS[bits]
485    }
486}
487
488/// A const version of the addition function.
489///
490/// Useful for constructing constants values in const contexts. Outside of
491/// const contexts, Add should be used instead.
492#[inline]
493const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
494    let (sum, over) = lhs.value.overflowing_add(rhs.value);
495    let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
496    if over {
497        sum += Goldilocks::NEG_ORDER;
498    }
499    Goldilocks::new(sum)
500}
501
502impl Add for Goldilocks {
503    type Output = Self;
504
505    #[inline]
506    fn add(self, rhs: Self) -> Self {
507        let (sum, over) = self.value.overflowing_add(rhs.value);
508        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
509        if over {
510            // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not
511            // sufficient for double-overflow.
512            // This assume does two things:
513            //  1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip
514            //     this check.
515            //  2. Hints to the compiler how rare this double-overflow is (thus handled better with
516            //     a branch).
517            unsafe {
518                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
519            }
520            branch_hint();
521            sum += Self::NEG_ORDER; // Cannot overflow.
522        }
523        Self::new(sum)
524    }
525}
526
527impl Sub for Goldilocks {
528    type Output = Self;
529
530    #[inline]
531    fn sub(self, rhs: Self) -> Self {
532        let (diff, under) = self.value.overflowing_sub(rhs.value);
533        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
534        if under {
535            // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not
536            // sufficient for double-underflow.
537            // This assume does two things:
538            //  1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER,
539            //     then it can skip this check.
540            //  2. Hints to the compiler how rare this double-underflow is (thus handled better
541            //     with a branch).
542            unsafe {
543                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
544            }
545            branch_hint();
546            diff -= Self::NEG_ORDER; // Cannot underflow.
547        }
548        Self::new(diff)
549    }
550}
551
552impl Neg for Goldilocks {
553    type Output = Self;
554
555    #[inline]
556    fn neg(self) -> Self::Output {
557        Self::new(Self::ORDER_U64 - self.as_canonical_u64())
558    }
559}
560
561impl Mul for Goldilocks {
562    type Output = Self;
563
564    #[inline]
565    fn mul(self, rhs: Self) -> Self {
566        reduce128(u128::from(self.value) * u128::from(rhs.value))
567    }
568}
569
570impl_add_assign!(Goldilocks);
571impl_sub_assign!(Goldilocks);
572impl_mul_methods!(Goldilocks);
573impl_div_methods!(Goldilocks, Goldilocks);
574
575impl Sum for Goldilocks {
576    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
577        // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2.
578
579        // This sum will not overflow so long as iter.len() < 2^64.
580        let sum = iter.map(|x| x.value as u128).sum::<u128>();
581        reduce128(sum)
582    }
583}
584
585/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
586/// field order and `2^64`.
587#[inline]
588pub(crate) fn reduce128(x: u128) -> Goldilocks {
589    let (x_lo, x_hi) = split(x); // This is a no-op
590    let x_hi_hi = x_hi >> 32;
591    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
592
593    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
594    if borrow {
595        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
596        t0 -= Goldilocks::NEG_ORDER; // Cannot underflow.
597    }
598    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
599    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
600    Goldilocks::new(t2)
601}
602
603#[inline]
604#[allow(clippy::cast_possible_truncation)]
605const fn split(x: u128) -> (u64, u64) {
606    (x as u64, (x >> 64) as u64)
607}
608
609/// Fast addition modulo ORDER for x86-64.
610/// This function is marked unsafe for the following reasons:
611///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
612///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
613///     the registers, so its use is not recommended when either input will be used again.
614#[inline(always)]
615#[cfg(target_arch = "x86_64")]
616unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
617    unsafe {
618        let res_wrapped: u64;
619        let adjustment: u64;
620        core::arch::asm!(
621            "add {0}, {1}",
622            // Trick. The carry flag is set iff the addition overflowed.
623            // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
624            // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
625            // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
626            // on overflow; this happens be NEG_ORDER.
627            // Note that the CPU does not realize that the result of sbb x, x does not actually depend
628            // on x. We must write the result to a register that we know to be ready. We have a
629            // dependency on {1} anyway, so let's use it.
630            "sbb {1:e}, {1:e}",
631            inlateout(reg) x => res_wrapped,
632            inlateout(reg) y => adjustment,
633            options(pure, nomem, nostack),
634        );
635        assume(x != 0 || (res_wrapped == y && adjustment == 0));
636        assume(y != 0 || (res_wrapped == x && adjustment == 0));
637        // Add NEG_ORDER == subtract ORDER.
638        // Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
639        res_wrapped + adjustment
640    }
641}
642
643#[inline(always)]
644#[cfg(not(target_arch = "x86_64"))]
645unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
646    let (res_wrapped, carry) = x.overflowing_add(y);
647    // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
648    res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
649}
650
651/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm.
652///
653/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf.
654/// The key idea is to compute update factors which are incorrect by a known power of 2 which
655/// can be corrected at the end. These update factors can then be used to construct the inverse
656/// via a simple linear combination.
657///
658/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic.
659fn gcd_inversion(input: Goldilocks) -> Goldilocks {
660    // Initialise our values to the value we want to invert and the prime.
661    let (mut a, mut b) = (input.value, P);
662
663    // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`.
664    // This means we will need `126` iterations of the inner loop ensure `len(a) + len(b) ≤ 2`.
665    // We split the iterations into 2 rounds of length 63.
666    const ROUND_SIZE: usize = 63;
667
668    // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted
669    // version which doesn't do any computations involving g. But either the compiler works this out
670    // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks.
671    let (f00, _, f10, _) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
672    let (_, _, f11, g11) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
673
674    // The update factors are i64's except we need to interpret -2^63 as 2^63.
675    // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
676    let u = from_unusual_int(f00);
677    let v = from_unusual_int(f10);
678    let u_fac11 = from_unusual_int(f11);
679    let v_fac11 = from_unusual_int(g11);
680
681    // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}.
682    // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66.
683    (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
684}
685
686/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63.
687const fn from_unusual_int(int: i64) -> Goldilocks {
688    if (int >= 0) || (int == i64::MIN) {
689        Goldilocks::new(int as u64)
690    } else {
691        Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
692    }
693}
694
695#[cfg(test)]
696mod tests {
697    use p3_field::extension::BinomialExtensionField;
698    use p3_field_testing::{
699        test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field,
700    };
701
702    use super::*;
703
704    type F = Goldilocks;
705    type EF = BinomialExtensionField<F, 5>;
706
707    #[test]
708    fn test_goldilocks() {
709        let f = F::new(100);
710        assert_eq!(f.as_canonical_u64(), 100);
711
712        // Over the Goldilocks field, the following set of equations hold
713        // p               = 0
714        // 2^64 - 2^32 + 1 = 0
715        // 2^64            = 2^32 - 1
716        let f = F::new(u64::MAX);
717        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
718
719        let f = F::from_u64(u64::MAX);
720        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
721
722        // Generator check
723        let expected_multiplicative_group_generator = F::new(7);
724        assert_eq!(F::GENERATOR, expected_multiplicative_group_generator);
725        assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64);
726
727        // Check on `reduce_u128`
728        let x = u128::MAX;
729        let y = reduce128(x);
730        // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1
731        // 2^128 - 1 = (2^64 - 1) * (2^64 + 1)
732        //           = (2^32 - 1 - 1) * (2^32 - 1 + 1)
733        //           = (2^32 - 2) * (2^32)
734        //           = 2^64 - 2 * 2^32
735        //           = 2^64 - 2^33
736        //           = 2^32 - 1 - 2^33
737        //           = - 2^32 - 1
738        let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE;
739        assert_eq!(y, expected_result);
740
741        let f = F::new(100);
742        assert_eq!(f.injective_exp_n().injective_exp_root_n(), f);
743        assert_eq!(y.injective_exp_n().injective_exp_root_n(), y);
744        assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO);
745    }
746
747    // Goldilocks has a redundant representation for both 0 and 1.
748    const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)];
749    const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)];
750
751    // Get the prime factorization of the order of the multiplicative group.
752    // i.e. the prime factorization of P - 1.
753    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] {
754        [
755            (BigUint::from(2u8), 32),
756            (BigUint::from(3u8), 1),
757            (BigUint::from(5u8), 1),
758            (BigUint::from(17u8), 1),
759            (BigUint::from(257u16), 1),
760            (BigUint::from(65537u32), 1),
761        ]
762    }
763
764    test_field!(
765        crate::Goldilocks,
766        &super::ZEROS,
767        &super::ONES,
768        &super::multiplicative_group_prime_factorization()
769    );
770    test_prime_field!(crate::Goldilocks);
771    test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES);
772    test_two_adic_field!(crate::Goldilocks);
773
774    test_field_dft!(
775        radix2dit,
776        crate::Goldilocks,
777        super::EF,
778        p3_dft::Radix2Dit<_>
779    );
780    test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers);
781    test_field_dft!(
782        parallel,
783        crate::Goldilocks,
784        super::EF,
785        p3_dft::Radix2DitParallel<crate::Goldilocks>
786    );
787}