[PATCH 6/8] mpi/ec: small optimization for ec_mulm_448

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Apr 26 23:00:44 CEST 2021


* mpi/ec.c (ec_addm_448, ec_subm_448): Do sub_n (resp. add_n) into
'n' before set_cond to remove need to clear 'n'.
(ec_mulm_448): Use memcpy where possible; use mpih_rshift where
possible; use mpih_lshift for doubling a3; remove one addition
at end.
--

Benchmarks on AMD Ryzen 7 5800X:

Before:
 Ed448          |  nanosecs/iter   cycles/iter  auto Mhz
         keygen |         893096       4343326      4863
           sign |         988422       4795694      4852
         verify |        1899706       9215952      4851

After (~5% faster):
 Ed448          |  nanosecs/iter   cycles/iter  auto Mhz
         keygen |         822078       3987952      4851
           sign |         947327       4595433      4851
         verify |        1776259       8616675      4851

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 mpi/ec.c | 76 ++++++++++++++++----------------------------------------
 1 file changed, 22 insertions(+), 54 deletions(-)

diff --git a/mpi/ec.c b/mpi/ec.c
index e1d4b32c..4fabf9b4 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -464,14 +464,13 @@ ec_addm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize)
     log_bug ("addm_448: different sizes\n");
 
-  memset (n, 0, sizeof n);
   up = u->d;
   vp = v->d;
   wp = w->d;
 
   cy = _gcry_mpih_add_n (wp, up, vp, wsize);
-  mpih_set_cond (n, ctx->p->d, wsize, (cy != 0UL));
-  _gcry_mpih_sub_n (wp, wp, n, wsize);
+  _gcry_mpih_sub_n (n, wp, ctx->p->d, wsize);
+  mpih_set_cond (wp, n, wsize, (cy != 0UL));
 }
 
 static void
@@ -485,14 +484,13 @@ ec_subm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize)
     log_bug ("subm_448: different sizes\n");
 
-  memset (n, 0, sizeof n);
   up = u->d;
   vp = v->d;
   wp = w->d;
 
   borrow = _gcry_mpih_sub_n (wp, up, vp, wsize);
-  mpih_set_cond (n, ctx->p->d, wsize, (borrow != 0UL));
-  _gcry_mpih_add_n (wp, wp, n, wsize);
+  _gcry_mpih_add_n (n, wp, ctx->p->d, wsize);
+  mpih_set_cond (wp, n, wsize, (borrow != 0UL));
 }
 
 static void
@@ -506,10 +504,6 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   mpi_limb_t b0[LIMB_SIZE_HALF_448];
   mpi_limb_t b1[LIMB_SIZE_HALF_448];
   mpi_limb_t cy;
-  int i;
-#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
-  mpi_limb_t b1_rest, a3_rest;
-#endif
 
   if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize)
     log_bug ("mulm_448: different sizes\n");
@@ -520,61 +514,37 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
 
   _gcry_mpih_mul_n (n, up, vp, wsize);
 
-  for (i = 0; i < (wsize + 1)/ 2; i++)
-    {
-      b0[i] = n[i];
-      b1[i] = n[i+wsize/2];
-      a2[i] = n[i+wsize];
-      a3[i] = n[i+wsize+wsize/2];
-    }
+  memcpy (b0, n, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
+  memcpy (a2, n + wsize, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
 
 #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
   b0[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL<<32)-1;
   a2[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL<<32)-1;
-
-  b1_rest = 0;
-  a3_rest = 0;
-
-  for (i = (wsize + 1)/ 2 -1; i >= 0; i--)
-    {
-      mpi_limb_t b1v, a3v;
-      b1v = b1[i];
-      a3v = a3[i];
-      b1[i] = (b1_rest<<32) | (b1v >> 32);
-      a3[i] = (a3_rest<<32) | (a3v >> 32);
-      b1_rest = b1v & (((mpi_limb_t)1UL <<32)-1);
-      a3_rest = a3v & (((mpi_limb_t)1UL <<32)-1);
-    }
+  _gcry_mpih_rshift (b1, n + wsize/2, LIMB_SIZE_HALF_448, 32);
+  _gcry_mpih_rshift (a3, n + wsize + wsize/2, LIMB_SIZE_HALF_448, 32);
+#else
+  memcpy (b1, n + wsize/2, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
+  memcpy (a3, n + wsize + wsize/2, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
 #endif
 
   cy = _gcry_mpih_add_n (b0, b0, a2, LIMB_SIZE_HALF_448);
-  cy += _gcry_mpih_add_n (b0, b0, a3, LIMB_SIZE_HALF_448);
-  for (i = 0; i < (wsize + 1)/ 2; i++)
-    wp[i] = b0[i];
+  cy += _gcry_mpih_add_n (wp, b0, a3, LIMB_SIZE_HALF_448);
 #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
+  cy = wp[LIMB_SIZE_HALF_448-1] >> 32;
   wp[LIMB_SIZE_HALF_448-1] &= (((mpi_limb_t)1UL <<32)-1);
 #endif
+  memset (b0, 0, LIMB_SIZE_HALF_448 * BYTES_PER_MPI_LIMB);
+  b0[0] = cy;
 
-#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
-  cy = b0[LIMB_SIZE_HALF_448-1] >> 32;
-#endif
-
-  cy = _gcry_mpih_add_1 (b1, b1, LIMB_SIZE_HALF_448, cy);
+  cy = _gcry_mpih_add_n (b1, b1, b0, LIMB_SIZE_HALF_448);
+  cy += _gcry_mpih_lshift (a3, a3, LIMB_SIZE_HALF_448, 1);
   cy += _gcry_mpih_add_n (b1, b1, a2, LIMB_SIZE_HALF_448);
   cy += _gcry_mpih_add_n (b1, b1, a3, LIMB_SIZE_HALF_448);
-  cy += _gcry_mpih_add_n (b1, b1, a3, LIMB_SIZE_HALF_448);
 #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
-  b1_rest = 0;
-  for (i = (wsize + 1)/ 2 -1; i >= 0; i--)
-    {
-      mpi_limb_t b1v = b1[i];
-      b1[i] = (b1_rest<<32) | (b1v >> 32);
-      b1_rest = b1v & (((mpi_limb_t)1UL <<32)-1);
-    }
-  wp[LIMB_SIZE_HALF_448-1] |= (b1_rest << 32);
+  cy = _gcry_mpih_rshift (b1, b1, LIMB_SIZE_HALF_448, 32);
+  wp[LIMB_SIZE_HALF_448-1] |= cy;
 #endif
-  for (i = 0; i < wsize / 2; i++)
-    wp[i+(wsize + 1) / 2] = b1[i];
+  memcpy (wp + LIMB_SIZE_HALF_448, b1, (wsize / 2) * BYTES_PER_MPI_LIMB);
 
 #if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2)
   cy = b1[LIMB_SIZE_HALF_448-1];
@@ -590,10 +560,8 @@ ec_mulm_448 (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
   n[0] = cy;
   _gcry_mpih_add_n (wp, wp, n, wsize);
 
-  memset (n, 0, wsize * BYTES_PER_MPI_LIMB);
-  cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
-  mpih_set_cond (n, ctx->p->d, wsize, (cy != 0UL));
-  _gcry_mpih_add_n (wp, wp, n, wsize);
+  cy = _gcry_mpih_sub_n (n, wp, ctx->p->d, wsize);
+  mpih_set_cond (wp, n, wsize, (cy == 0UL));
 }
 
 static void
-- 
2.30.2




More information about the Gcrypt-devel mailing list