12#ifndef RANLUXPP_MULMOD_H
13#define RANLUXPP_MULMOD_H
24static void multiply9x9(
const uint64_t *in1,
const uint64_t *in2, uint64_t *out)
27 unsigned nextCarry = 0;
29#if defined(__clang__) || defined(__INTEL_COMPILER) || defined(__CUDACC__)
31#elif defined(__GNUC__) && __GNUC__ >= 8
35 for (
int i = 0; i < 18; i++) {
36 uint64_t current = next;
37 unsigned carry = nextCarry;
42#if defined(__clang__) || defined(__INTEL_COMPILER) || defined(__CUDACC__)
44#elif defined(__GNUC__) && __GNUC__ >= 8
48 for (
int j = 0; j < 9; j++) {
53 uint64_t fac1 = in1[j];
54 uint64_t fac2 = in2[k];
55#if defined(__SIZEOF_INT128__) && !defined(ROOT_NO_INT128)
56 unsigned __int128 prod = fac1;
59 uint64_t upper = prod >> 64;
60 uint64_t lower =
static_cast<uint64_t
>(prod);
62 uint64_t upper1 = fac1 >> 32;
63 uint64_t lower1 =
static_cast<uint32_t
>(fac1);
65 uint64_t upper2 = fac2 >> 32;
66 uint64_t lower2 =
static_cast<uint32_t
>(fac2);
70 uint64_t upper = upper1 * upper2;
71 uint64_t middle1 = upper1 * lower2;
72 uint64_t middle2 = lower1 * upper2;
73 uint64_t lower = lower1 * lower2;
78 uint64_t middle =
add_overflow(middle1, middle2, overflow);
89 uint64_t overflow_add = overflow * (uint64_t(1) << 32);
94 upper += overflow_add;
96 uint64_t middle_upper = middle >> 32;
97 uint64_t middle_lower = middle << 32;
114 upper += middle_upper;
118 current =
add_carry(current, lower, carry);
121 next =
add_carry(next, upper, nextCarry);
124 next =
add_carry(next, carry, nextCarry);
138static void mod_m(
const uint64_t *mul, uint64_t *out)
142 for (
int i = 0; i < 9; i++) {
170 uint64_t c_unsigned =
static_cast<uint64_t
>(
c);
173 int64_t t2 = t0 - (c_unsigned << 48);
177 int64_t
t1 = t2 >> 48;
186 for (
int i = 1; i < 3; i++) {
190 uint64_t out_i =
sub_carry(r_i, t0, carry);
197 uint64_t out_3 =
sub_carry(r_3, t2, carry);
200 for (
int i = 4; i < 9; i++) {
215static void mulmod(
const uint64_t *in1, uint64_t *inout)
217 uint64_t mul[2 * 9] = {0};
229static void powermod(
const uint64_t *base, uint64_t *res, uint64_t
n)
231 uint64_t fac[9] = {0};
234 for (
int i = 1; i < 9; i++) {
239 uint64_t mul[18] = {0};
static uint64_t sub_overflow(uint64_t a, uint64_t b, unsigned &overflow)
Compute a - b and set overflow accordingly.
static uint64_t sub_carry(uint64_t a, uint64_t b, unsigned &carry)
Compute a - b and increment carry if there was an overflow.
static uint64_t add_carry(uint64_t a, uint64_t b, unsigned &carry)
Compute a + b and increment carry if there was an overflow.
static int64_t compute_r(const uint64_t *upper, uint64_t *r)
Update r = r - (t1 + t2) + (t3 + t2) * b ** 10.
static uint64_t add_overflow(uint64_t a, uint64_t b, unsigned &overflow)
Compute a + b and set overflow accordingly.
static void mulmod(const uint64_t *in1, uint64_t *inout)
Combine multiply9x9 and mod_m with internal temporary storage.
static void powermod(const uint64_t *base, uint64_t *res, uint64_t n)
Compute base to the n modulo m.
static void mod_m(const uint64_t *mul, uint64_t *out)
Compute a value congruent to mul modulo m less than 2 ** 576.
static void multiply9x9(const uint64_t *in1, const uint64_t *in2, uint64_t *out)
Multiply two 576 bit numbers, stored as 9 numbers of 64 bits each.