[PATCH] MPI helper modular exponentiation, Least Leak Intended

Fri Feb 14 08:44:13 CET 2025

Hello, again,

NIIBE Yutaka <gniibe at fsij.org> wrote:
> This month, I created the ticket for improvement of modular
> exponentiation implementation with a branch named gniibe/t7490:
>
> 	https://dev.gnupg.org/T7490
>
> It somehow works for me now.
>
> I'd like to merge the branch manually in steps.

Here is the last and main part of the changes for the modular
exponentiation implementation (it's by Montgomery exponentiation
algorithm).

I put the changes inline, so that people can easily reply and
comment.

FYI, the reference, Handbook of Applied Cryptography is available:

    https://cacr.uwaterloo.ca/hac/

diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h
index ffe8140a..decaadd8 100644
--- a/mpi/mpi-internal.h
+++ b/mpi/mpi-internal.h
@@ -261,6 +261,9 @@ mpi_limb_t _gcry_mpih_lshift( mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize,
 							   unsigned cnt);
 mpi_limb_t _gcry_mpih_rshift( mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize,
 							   unsigned cnt);
+/*-- mpih-pow.c --*/
+void _gcry_mpih_powm_lli (mpi_ptr_t rp, mpi_ptr_t bp, mpi_ptr_t mp,
+                          mpi_size_t n, mpi_ptr_t ep, mpi_size_t en);
 
 /*-- mpih-const-time.c --*/
 #define mpih_set_cond(w,u,s,o) _gcry_mpih_set_cond ((w),(u),(s),(o))
diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c
index defd675e..ef410a60 100644
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@@ -34,6 +34,21 @@
 #include "longlong.h"
 
 
+#ifndef USE_ALGORITHM_LLI_EXPONENTIATION
+/*
+ * When you don't need least-leak implementation, please add compilation option
+ * -DUSE_ALGORITHM_LLI_EXPONENTIATION=0
+ *
+ * For performance (by tests/benchmark rsa), it's comparable to leaky
+ * sliding window implementation on 64-bit architecture.  On 32-bit
+ * architecture, it's slower with 3072-bit and 4096-bit.  In future,
+ * it's good to have non-leaky Karatsuba multiplication, then, it's an
+ * option to use it.
+ *
+ */
+#define USE_ALGORITHM_LLI_EXPONENTIATION 1
+#endif
+
 /*
  * When you need old implementation, please add compilation option
  * -DUSE_ALGORITHM_SIMPLE_EXPONENTIATION
@@ -41,6 +56,16 @@
 #define USE_ALGORITHM_SIMPLE_EXPONENTIATION 1
  */
 
+const char *
+_gcry_mpi_get_powm_config (void)
+{
+#if USE_ALGORITHM_LLI_EXPONENTIATION
+  return "fixed-window";
+#else
+  return "sliding-window";
+#endif
+}
+
 #if defined(USE_ALGORITHM_SIMPLE_EXPONENTIATION)
 /****************
  * RES = BASE ^ EXPO mod MOD
@@ -541,6 +566,33 @@ _gcry_mpi_powm (gcry_mpi_t res,
       rp = res->d;
     }
 
+#if USE_ALGORITHM_LLI_EXPONENTIATION
+  if ((esec || bsec || msec) && (mod->d[0] & 1))
+    {
+      mpi_ptr_t bp1 = NULL;
+
+      if (bsize < msize)
+	{
+	  bp1 = mpi_alloc_limb_space (msize, 1);
+	  MPN_ZERO (bp1, msize);
+	  MPN_COPY (bp1, bp, bsize);
+	}
+
+      _gcry_mpih_powm_lli (rp, bp1?bp1:bp, mod->d, msize, ep, esize);
+      if (bp1)
+	_gcry_mpi_free_limb_space (bp1, msize);
+
+      rsign = 0;
+      negative_result = (ep[0] & 1) && bsign;
+      if (negative_result)
+	rsign = msign;
+
+      res->nlimbs = msize;
+      res->sign = rsign;
+      goto leave;
+    }
+#endif
+
   /* Main processing.  */
   {
     mpi_size_t i, j, k;
diff --git a/mpi/mpih-pow.c b/mpi/mpih-pow.c
new file mode 100644
index 00000000..ad7d1e8d
--- /dev/null
+++ b/mpi/mpih-pow.c
@@ -0,0 +1,283 @@
+/* mpih-pow.c  -  MPI helper functions
+ * Copyright (C) 2025  g10 Code GmbH
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#if BITS_PER_MPI_LIMB <= 32
+#define MAX_SCRATCH_SPACE 256*2
+#define MAX_WINDOW 8
+#else
+#define MAX_SCRATCH_SPACE 256
+#define MAX_WINDOW 4
+#endif
+
+/*
+ * Compute -N^(-1) mod 2^BITS_PER_MPI_LIMB
+ *
+ *  (1) Compute N^(-1) mod 2^BITS_PER_MPI_LIMB
+ *  (2) Then, negate the value of (1)
+ *
+ * For computing N^(-1) mod (power of two), see:
+ *
+ *   Jeffrey Hurchalla
+ *   An Improved Integer Modular Multiplicative Inverse (modulo 2^w)
+ *   https://arxiv.org/abs/2204.04342
+ *
+ */
+static mpi_limb_t
+compute_minv (mpi_limb_t n)
+{
+  mpi_limb_t x, y;
+
+  gcry_assert (n%2 == 1);
+
+  x = (3*n)^2;
+  y = 1 - n*x;
+  x = x*(1 + y);
+  y *= y;
+  x = x*(1 + y);
+  y *= y;
+  x = x*(1 + y);
+#if BITS_PER_MPI_LIMB == 32
+  return -x;
+#elif BITS_PER_MPI_LIMB == 64
+  y *= y;
+  x = x*(1 + y);
+  return -x;
+#else
+# error "Please implement multiplicative inverse mod power of 2"
+#endif
+}
+
+
+/*
+ * Compute T * R^(-1) mod M (where R is represented by MINV)
+ *
+ * Reference:
+ *   Handbook of Applied Cryptography
+ *       Algorithm 14.82: Montgomery reduction
+ */
+static void
+mont_reduc (mpi_ptr_t rp, mpi_ptr_t tp,
+            mpi_ptr_t mp, mpi_size_t n, mpi_limb_t minv)
+{
+  mpi_size_t i;
+  mpi_limb_t cy0;
+  mpi_limb_t cy1 = 0;
+
+  for (i = 0; i < n; i++)
+    {
+      mpi_limb_t ui = tp[i] * minv;
+
+      cy0 = _gcry_mpih_addmul_1 (tp + i, mp, n, ui);
+      cy1 += _gcry_mpih_add_1_lli (tp + n + i, n - i, cy0);
+    }
+
+  cy0 = _gcry_mpih_sub_n (rp, tp + n, mp, n);
+  _gcry_mpih_set_cond (rp, tp + n, n,
+                       mpih_limb_is_not_zero (cy0)
+                       & mpih_limb_is_zero (cy1));
+}
+
+/*
+ * Compute X * Y * R^(-1) mod M (where R is represented by MINV)
+ *
+ * Reference:
+ *   Handbook of Applied Cryptography
+ *       Algorithm 14.86: Montgomery multiplication
+ *
+ * RP should have space of 2*N limbs.
+ *
+ */
+static void
+mont_mul (mpi_ptr_t rp, mpi_ptr_t xp, mpi_ptr_t yp, mpi_ptr_t mp,
+          mpi_size_t n, mpi_limb_t minv, mpi_ptr_t scratch_2n)
+{
+  _gcry_mpih_mul_lli (scratch_2n, xp, n, yp, n);
+  mont_reduc (rp, scratch_2n, mp, n, minv);
+}
+
+/* Determine the window size for computing exponentiation.  */
+static int
+window_size (mpi_size_t esize)
+{
+  int W;
+
+#if BITS_PER_MPI_LIMB <= 32
+  if (esize > 24)
+    W = 5;
+  else if (esize > 16)
+    W = 4;
+  else if (esize > 12)
+    W = 3;
+  else if (esize > 8)
+    W = 2;
+  else
+    W = 1;
+  return W;
+#else
+  if (esize > 8)
+    W = 4;
+  else if (esize > 6)
+    W = 3;
+  else if (esize > 4)
+    W = 2;
+  else
+    W = 1;
+  return W;
+#endif
+  return W;
+}
+
+/*
+ * Compute X ^ E mod M
+ *
+ * where X is in BP, M is in MP.
+ * The size of limbs for BP and MP are N.
+ * E is in EP, size of limbs EN.
+ *
+ * Result will be in RP with size N.
+ *
+ * Reference:
+ *   Handbook of Applied Cryptography
+ *       Algorithm 14.82: Left-to-right k-ary exponentiation
+ *       Algorithm 14.94: Montgomery exponentiation
+ *
+ */
+void
+_gcry_mpih_powm_lli (mpi_ptr_t rp, mpi_ptr_t bp, mpi_ptr_t mp, mpi_size_t n,
+                     mpi_ptr_t ep, mpi_size_t en)
+{
+  mpi_size_t scratch_size;
+#define temp0   (scratch)
+#define temp2   (scratch+n*2)
+#define a       (scratch+n*4)
+#define precomp (scratch+n*5)
+  mpi_ptr_t scratch;
+  mpi_limb_t minv;
+  mpi_size_t i;
+  int mod_shift_cnt;
+  int windowsize = window_size (en);
+  mpi_limb_t wmask = (((mpi_limb_t) 1 << windowsize) - 1);
+#define temp1   (precomp+n)
+#define x_tilde (precomp+n)
+
+  scratch_size = (5 + (1 << windowsize))*n;
+  scratch = _gcry_mpi_alloc_limb_space (scratch_size, 1);
+
+  minv = compute_minv (mp[0]);
+
+  gcry_assert (mp[0]*(-minv) == 1);
+
+  MPN_ZERO (temp0, n);
+
+  /* TEMP0 := R mod m */
+  count_leading_zeros (mod_shift_cnt, mp[n-1]);
+  if (mod_shift_cnt)
+    {
+      _gcry_mpih_lshift (temp2, mp, n, mod_shift_cnt);
+      temp0[n] = (mpi_limb_t)1 << mod_shift_cnt;
+    }
+  else
+    {
+      MPN_COPY (temp2, mp, n);
+      temp0[n] = 1;
+    }
+  _gcry_mpih_divrem (temp1, 0, temp0, n+1, temp2, n);
+  if (mod_shift_cnt)
+    _gcry_mpih_rshift (temp0, temp0, n, mod_shift_cnt);
+  /* PRECOMP[0] := R mod m */
+  MPN_COPY (precomp, temp0, n);
+
+  /* TEMP0 := (R mod m)^2 */
+  _gcry_mpih_sqr_n_basecase (temp0, precomp, n);
+
+  /* TEMP0 := R^2 mod m */
+  if (mod_shift_cnt)
+    _gcry_mpih_lshift (temp0, temp0, n*2, mod_shift_cnt);
+  _gcry_mpih_divrem (temp1, 0, temp0, n*2, temp2, n);
+  if (mod_shift_cnt)
+    _gcry_mpih_rshift (temp0, temp0, n, mod_shift_cnt);
+  /* x~ := Mont(x, R^2 mod m) */
+  mont_mul (x_tilde, bp, temp0, mp, n, minv, temp2);
+
+  /* PRECOMP[i] := x~ ^ i */
+  for (i = 0; i < (1 << windowsize) - 2; i += 2)
+    {
+      _gcry_mpih_sqr_n_basecase (temp0, precomp+n*(i/2+1), n);
+      mont_reduc (precomp+n*(i+2), temp0, mp, n, minv);
+      mont_mul (precomp+n*(i+3), x_tilde, precomp+n*(i+2), mp, n, minv, temp2);
+    }
+
+  MPN_COPY (a, precomp, n);
+  i = en * BITS_PER_MPI_LIMB;
+  do
+    {
+      mpi_limb_t e;
+      int w;
+
+      if (i < windowsize)
+        {
+          e = ep[0] & (((mpi_limb_t) 1 << i) - 1);
+          w = i;
+          i = 0;
+        }
+      else
+        {
+          mpi_limb_t v;
+          mpi_size_t shift;
+          mpi_size_t j;
+          int nbits_in_v;
+
+          i -= windowsize;
+          j = i / BITS_PER_MPI_LIMB;
+          shift = i % BITS_PER_MPI_LIMB;
+          v = ep[j] >> shift;
+          nbits_in_v = BITS_PER_MPI_LIMB - shift;
+          if (nbits_in_v < windowsize)
+            v += ep[j + 1] << nbits_in_v;
+          e = v & wmask;
+          w = windowsize;
+        }
+
+      do
+        {
+          _gcry_mpih_sqr_n_basecase (temp0, a, n);
+          mont_reduc (a, temp0, mp, n, minv);
+        }
+      while (--w);
+
+      _gcry_mpih_table_lookup (temp0, precomp, n, (1 << windowsize), e);
+      mont_mul (a, a, temp0, mp, n, minv, temp2);
+    }
+  while (i);
+
+  MPN_ZERO (temp0, n);
+  temp0[0] = 1;
+  mont_mul (a, a, temp0, mp, n, minv, temp2);
+
+  MPN_COPY (rp, a, n);
+  _gcry_mpi_free_limb_space (scratch, scratch_size);
+}
diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h
index d52a1b1b..1eb58cb9 100644
--- a/src/gcrypt-int.h
+++ b/src/gcrypt-int.h
@@ -474,6 +474,7 @@ void _gcry_mpi_mul_2exp (gcry_mpi_t w, gcry_mpi_t u, unsigned long cnt);
 void _gcry_mpi_div (gcry_mpi_t q, gcry_mpi_t r,
                    gcry_mpi_t dividend, gcry_mpi_t divisor, int round);
 void _gcry_mpi_mod (gcry_mpi_t r, gcry_mpi_t dividend, gcry_mpi_t divisor);
+const char *_gcry_mpi_get_powm_config (void);
 void _gcry_mpi_powm (gcry_mpi_t w,
                     const gcry_mpi_t b, const gcry_mpi_t e,
                     const gcry_mpi_t m);
diff --git a/src/global.c b/src/global.c
index a5a48cb5..74cf16d3 100644
--- a/src/global.c
+++ b/src/global.c
@@ -378,6 +378,9 @@ print_config (const char *what, gpgrt_stream_t fp)
   if (!what || !strcmp (what, "mpi-asm"))
     gpgrt_fprintf (fp, "mpi-asm:%s:\n", _gcry_mpi_get_hw_config ());
 
+  if (!what || !strcmp (what, "mpi-pow"))
+    gpgrt_fprintf (fp, "mpi-pow:%s\n", _gcry_mpi_get_powm_config ());
+
   if (!what || !strcmp (what, "hwflist"))
     {
       unsigned int hwfeatures, afeature;
--