ModularPower.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. /*
  2. * Copyright (c) 2020, Ali Mohammad Pur <mpfard@serenityos.org>
  3. * Copyright (c) 2020-2021, Dex♪ <dexes.ttp@gmail.com>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include "UnsignedBigIntegerAlgorithms.h"
  8. namespace Crypto {
  9. void UnsignedBigIntegerAlgorithms::destructive_modular_power_without_allocation(
  10. UnsignedBigInteger& ep,
  11. UnsignedBigInteger& base,
  12. UnsignedBigInteger const& m,
  13. UnsignedBigInteger& temp_1,
  14. UnsignedBigInteger& temp_2,
  15. UnsignedBigInteger& temp_3,
  16. UnsignedBigInteger& temp_4,
  17. UnsignedBigInteger& temp_multiply,
  18. UnsignedBigInteger& temp_quotient,
  19. UnsignedBigInteger& temp_remainder,
  20. UnsignedBigInteger& exp)
  21. {
  22. exp.set_to(1);
  23. while (!(ep < 1)) {
  24. if (ep.words()[0] % 2 == 1) {
  25. // exp = (exp * base) % m;
  26. multiply_without_allocation(exp, base, temp_1, temp_2, temp_3, temp_multiply);
  27. divide_without_allocation(temp_multiply, m, temp_1, temp_2, temp_3, temp_4, temp_quotient, temp_remainder);
  28. exp.set_to(temp_remainder);
  29. }
  30. // ep = ep / 2;
  31. divide_u16_without_allocation(ep, 2, temp_quotient, temp_remainder);
  32. ep.set_to(temp_quotient);
  33. // base = (base * base) % m;
  34. multiply_without_allocation(base, base, temp_1, temp_2, temp_3, temp_multiply);
  35. divide_without_allocation(temp_multiply, m, temp_1, temp_2, temp_3, temp_4, temp_quotient, temp_remainder);
  36. base.set_to(temp_remainder);
  37. // Note that not clamping here would cause future calculations (multiply, specifically) to allocate even more unused space
  38. // which would then persist through the temp bigints, and significantly slow down later loops.
  39. // To avoid that, we can clamp to a specific max size, or just clamp to the min needed amount of space.
  40. ep.clamp_to_trimmed_length();
  41. exp.clamp_to_trimmed_length();
  42. base.clamp_to_trimmed_length();
  43. }
  44. }
  45. /**
  46. * Compute (1/value) % 2^32.
  47. * This needs an odd input value
  48. * Algorithm from: Dumas, J.G. "On Newton–Raphson Iteration for Multiplicative Inverses Modulo Prime Powers".
  49. */
  50. ALWAYS_INLINE static u32 inverse_wrapped(u32 value)
  51. {
  52. VERIFY(value & 1);
  53. i64 b = static_cast<i64>(value);
  54. i64 k0 = (2 - b);
  55. i64 t = (b - 1);
  56. size_t i = 1;
  57. while (i < 32) {
  58. t = t * t;
  59. k0 = k0 * (t + 1);
  60. i <<= 1;
  61. }
  62. return static_cast<u32>(-k0);
  63. }
  64. /**
  65. * Computes z = x * y + c. z_carry contains the top bits, z contains the bottom bits.
  66. */
  67. ALWAYS_INLINE static void linear_multiplication_with_carry(u32 x, u32 y, u32 c, u32& z_carry, u32& z)
  68. {
  69. u64 result = static_cast<u64>(x) * static_cast<u64>(y) + static_cast<u64>(c);
  70. z_carry = static_cast<u32>(result >> 32);
  71. z = static_cast<u32>(result);
  72. }
  73. /**
  74. * Computes z = a + b. z_carry contains the top bit (1 or 0), z contains the bottom bits.
  75. */
  76. ALWAYS_INLINE static void addition_with_carry(u32 a, u32 b, u32& z_carry, u32& z)
  77. {
  78. u64 result = static_cast<u64>(a) + static_cast<u64>(b);
  79. z_carry = static_cast<u32>(result >> 32);
  80. z = static_cast<u32>(result);
  81. }
  82. /**
  83. * Computes a montgomery "fragment" for y_i. This computes "z[i] += x[i] * y_i" for all words while rippling the carry, and returns the carry.
  84. * Algorithm from: Gueron, "Efficient Software Implementations of Modular Exponentiation". (https://eprint.iacr.org/2011/239.pdf)
  85. */
  86. UnsignedBigInteger::Word UnsignedBigIntegerAlgorithms::montgomery_fragment(UnsignedBigInteger& z, size_t offset_in_z, UnsignedBigInteger const& x, UnsignedBigInteger::Word y_digit, size_t num_words)
  87. {
  88. UnsignedBigInteger::Word carry { 0 };
  89. for (size_t i = 0; i < num_words; ++i) {
  90. UnsignedBigInteger::Word a_carry;
  91. UnsignedBigInteger::Word a;
  92. linear_multiplication_with_carry(x.m_words[i], y_digit, z.m_words[offset_in_z + i], a_carry, a);
  93. UnsignedBigInteger::Word b_carry;
  94. UnsignedBigInteger::Word b;
  95. addition_with_carry(a, carry, b_carry, b);
  96. z.m_words[offset_in_z + i] = b;
  97. carry = a_carry + b_carry;
  98. }
  99. return carry;
  100. }
  101. /**
  102. * Computes the "almost montgomery" product : x * y * 2 ^ (-num_words * BITS_IN_WORD) % modulo
  103. * [Note : that means that the result z satisfies z * 2^(num_words * BITS_IN_WORD) % modulo = x * y % modulo]
  104. * assuming :
  105. * - x, y and modulo are all already padded to num_words
  106. * - k = inverse_wrapped(modulo) (optimization to not recompute K each time)
  107. * Algorithm from: Gueron, "Efficient Software Implementations of Modular Exponentiation". (https://eprint.iacr.org/2011/239.pdf)
  108. */
  109. void UnsignedBigIntegerAlgorithms::almost_montgomery_multiplication_without_allocation(
  110. UnsignedBigInteger const& x,
  111. UnsignedBigInteger const& y,
  112. UnsignedBigInteger const& modulo,
  113. UnsignedBigInteger& z,
  114. UnsignedBigInteger::Word k,
  115. size_t num_words,
  116. UnsignedBigInteger& result)
  117. {
  118. VERIFY(x.length() >= num_words);
  119. VERIFY(y.length() >= num_words);
  120. VERIFY(modulo.length() >= num_words);
  121. z.set_to(0);
  122. z.resize_with_leading_zeros(num_words * 2);
  123. UnsignedBigInteger::Word previous_double_carry { 0 };
  124. for (size_t i = 0; i < num_words; ++i) {
  125. // z[i->num_words+i] += x * y_i
  126. UnsignedBigInteger::Word carry_1 = montgomery_fragment(z, i, x, y.m_words[i], num_words);
  127. // z[i->num_words+i] += modulo * (z_i * k)
  128. UnsignedBigInteger::Word t = z.m_words[i] * k;
  129. UnsignedBigInteger::Word carry_2 = montgomery_fragment(z, i, modulo, t, num_words);
  130. // Compute the carry by combining all of the carrys of the previous computations
  131. // Put it "right after" the range that we computed above
  132. UnsignedBigInteger::Word temp_carry = previous_double_carry + carry_1;
  133. UnsignedBigInteger::Word overall_carry = temp_carry + carry_2;
  134. z.m_words[num_words + i] = overall_carry;
  135. // Detect if there was a "double carry" for this word by checking if our carry results are smaller than their components
  136. previous_double_carry = (temp_carry < carry_1 || overall_carry < carry_2) ? 1 : 0;
  137. }
  138. if (previous_double_carry == 0) {
  139. // Return the top num_words bytes of Z, which contains our result.
  140. shift_right_by_n_words(z, num_words, result);
  141. result.resize_with_leading_zeros(num_words);
  142. return;
  143. }
  144. // We have a carry, so we're "one bigger" than we need to be.
  145. // Subtract the modulo from the result (the top half of z), and write it to the bottom half of Z since we have space.
  146. // (With carry, of course.)
  147. UnsignedBigInteger::Word c { 0 };
  148. for (size_t i = 0; i < num_words; ++i) {
  149. UnsignedBigInteger::Word z_digit = z.m_words[num_words + i];
  150. UnsignedBigInteger::Word modulo_digit = modulo.m_words[i];
  151. UnsignedBigInteger::Word new_z_digit = z_digit - modulo_digit - c;
  152. z.m_words[i] = new_z_digit;
  153. // Detect if the subtraction underflowed - from "Hacker's Delight"
  154. c = ((modulo_digit & ~z_digit) | ((modulo_digit | ~z_digit) & new_z_digit)) >> (UnsignedBigInteger::BITS_IN_WORD - 1);
  155. }
  156. // Return the bottom num_words bytes of Z (with the carry bit handled)
  157. z.m_words.resize(num_words);
  158. result.set_to(z);
  159. result.resize_with_leading_zeros(num_words);
  160. }
  161. /**
  162. * Complexity: still O(N^3) with N the number of words in the largest word, but less complex than the classical mod power.
  163. * Note: the montgomery multiplications requires an inverse modulo over 2^32, which is only defined for odd numbers.
  164. */
  165. void UnsignedBigIntegerAlgorithms::montgomery_modular_power_with_minimal_allocations(
  166. UnsignedBigInteger const& base,
  167. UnsignedBigInteger const& exponent,
  168. UnsignedBigInteger const& modulo,
  169. UnsignedBigInteger& temp_z,
  170. UnsignedBigInteger& rr,
  171. UnsignedBigInteger& one,
  172. UnsignedBigInteger& z,
  173. UnsignedBigInteger& zz,
  174. UnsignedBigInteger& x,
  175. UnsignedBigInteger& temp_extra,
  176. UnsignedBigInteger& result)
  177. {
  178. VERIFY(modulo.is_odd());
  179. // Note: While this is a constexpr variable for clarity and could be changed in theory,
  180. // various optimized parts of the algorithm rely on this value being exactly 4.
  181. constexpr size_t window_size = 4;
  182. size_t num_words = modulo.trimmed_length();
  183. UnsignedBigInteger::Word k = inverse_wrapped(modulo.m_words[0]);
  184. one.set_to(1);
  185. // rr = ( 2 ^ (2 * modulo.length() * BITS_IN_WORD) ) % modulo
  186. shift_left_by_n_words(one, 2 * num_words, x);
  187. divide_without_allocation(x, modulo, temp_z, one, z, zz, temp_extra, rr);
  188. rr.resize_with_leading_zeros(num_words);
  189. // x = base [% modulo, if x doesn't already fit in modulo's words]
  190. x.set_to(base);
  191. if (x.trimmed_length() > num_words)
  192. divide_without_allocation(base, modulo, temp_z, one, z, zz, temp_extra, x);
  193. x.resize_with_leading_zeros(num_words);
  194. one.set_to(1);
  195. one.resize_with_leading_zeros(num_words);
  196. // Compute the montgomery powers from 0 to 2^window_size. powers[i] = x^i
  197. UnsignedBigInteger powers[1 << window_size];
  198. almost_montgomery_multiplication_without_allocation(one, rr, modulo, temp_z, k, num_words, powers[0]);
  199. almost_montgomery_multiplication_without_allocation(x, rr, modulo, temp_z, k, num_words, powers[1]);
  200. for (size_t i = 2; i < (1 << window_size); ++i)
  201. almost_montgomery_multiplication_without_allocation(powers[i - 1], powers[1], modulo, temp_z, k, num_words, powers[i]);
  202. z.set_to(powers[0]);
  203. z.resize_with_leading_zeros(num_words);
  204. zz.set_to(0);
  205. zz.resize_with_leading_zeros(num_words);
  206. ssize_t exponent_length = exponent.trimmed_length();
  207. for (ssize_t word_in_exponent = exponent_length - 1; word_in_exponent >= 0; --word_in_exponent) {
  208. UnsignedBigInteger::Word exponent_word = exponent.m_words[word_in_exponent];
  209. size_t bit_in_word = 0;
  210. while (bit_in_word < UnsignedBigInteger::BITS_IN_WORD) {
  211. if (word_in_exponent != exponent_length - 1 || bit_in_word != 0) {
  212. almost_montgomery_multiplication_without_allocation(z, z, modulo, temp_z, k, num_words, zz);
  213. almost_montgomery_multiplication_without_allocation(zz, zz, modulo, temp_z, k, num_words, z);
  214. almost_montgomery_multiplication_without_allocation(z, z, modulo, temp_z, k, num_words, zz);
  215. almost_montgomery_multiplication_without_allocation(zz, zz, modulo, temp_z, k, num_words, z);
  216. }
  217. auto power_index = exponent_word >> (UnsignedBigInteger::BITS_IN_WORD - window_size);
  218. auto& power = powers[power_index];
  219. almost_montgomery_multiplication_without_allocation(z, power, modulo, temp_z, k, num_words, zz);
  220. swap(z, zz);
  221. // Move to the next window
  222. exponent_word <<= window_size;
  223. bit_in_word += window_size;
  224. }
  225. }
  226. almost_montgomery_multiplication_without_allocation(z, one, modulo, temp_z, k, num_words, zz);
  227. if (zz < modulo) {
  228. result.set_to(zz);
  229. result.clamp_to_trimmed_length();
  230. return;
  231. }
  232. // Note : Since we were using "almost montgomery" multiplications, we aren't guaranteed to be under the modulo already.
  233. // So, if we're here, we need to respect the modulo.
  234. // We can, however, start by trying to subtract the modulo, just in case we're close.
  235. subtract_without_allocation(zz, modulo, result);
  236. if (modulo < zz) {
  237. // Note: This branch shouldn't happen in theory (as noted in https://github.com/rust-num/num-bigint/blob/master/src/biguint/monty.rs#L210)
  238. // Let's dbgln the values we used. That way, if we hit this branch, we can contribute these values for test cases.
  239. dbgln("Encountered the modulo branch during a montgomery modular power. Params : {} - {} - {}", base, exponent, modulo);
  240. // We just clobber all the other temporaries that we don't need for the division.
  241. // This is wasteful, but we're on the edgiest of cases already.
  242. divide_without_allocation(zz, modulo, temp_z, rr, z, x, temp_extra, result);
  243. }
  244. result.clamp_to_trimmed_length();
  245. return;
  246. }
  247. }