[PATCH 5/5] mpi: optimize mpi_rshift and mpi_lshift to avoid extra MPI copying

Jussi Kivilinna jussi.kivilinna at iki.fi
Sat Apr 22 09:35:39 CEST 2023


* mpi/mpi-bit.c (_gcry_mpi_rshift): Refactor so that _gcry_mpih_rshift
does the copying together with the shifting when a copy is needed, and
so that the same code path is used for both the in-place and the
copying operation.
(_gcry_mpi_lshift): Refactor so that _gcry_mpih_lshift does the
copying together with the shifting when a copy is needed, and so that
the same code path is used for both the in-place and the copying
operation.
--

Benchmark on AMD Ryzen 9 7900X:

 Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 rshift3            |     0.039 ns/B     24662 MiB/s     0.182 c/B      4700
 lshift3            |     0.108 ns/B      8832 MiB/s     0.508 c/B      4700
 rshift65           |     0.137 ns/B      6968 MiB/s     0.643 c/B      4700
 lshift65           |     0.109 ns/B      8776 MiB/s     0.511 c/B      4700

 After:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 rshift3            |     0.038 ns/B     25049 MiB/s     0.179 c/B      4700
 lshift3            |     0.039 ns/B     24709 MiB/s     0.181 c/B      4700
 rshift65           |     0.038 ns/B     24942 MiB/s     0.180 c/B      4700
 lshift65           |     0.040 ns/B     23671 MiB/s     0.189 c/B      4700

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 mpi/mpi-bit.c | 138 +++++++++++++++++++-------------------------------
 1 file changed, 51 insertions(+), 87 deletions(-)

diff --git a/mpi/mpi-bit.c b/mpi/mpi-bit.c
index e2170401..7313a9d4 100644
--- a/mpi/mpi-bit.c
+++ b/mpi/mpi-bit.c
@@ -251,10 +251,11 @@ _gcry_mpi_rshift_limbs( gcry_mpi_t a, unsigned int count )
 void
 _gcry_mpi_rshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
 {
-  mpi_size_t xsize;
-  unsigned int i;
   unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
   unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+  unsigned int i;
+  mpi_size_t alimbs;
+  mpi_ptr_t xp, ap;
 
   if (mpi_is_immutable (x))
     {
@@ -262,75 +263,42 @@ _gcry_mpi_rshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
       return;
     }
 
-  if ( x == a )
-    {
-      /* In-place operation.  */
-      if ( nlimbs >= x->nlimbs )
-        {
-          x->nlimbs = 0;
-          return;
-        }
+  alimbs = a->nlimbs;
 
-      if (nlimbs)
-        {
-          for (i=0; i < x->nlimbs - nlimbs; i++ )
-            x->d[i] = x->d[i+nlimbs];
-          x->d[i] = 0;
-          x->nlimbs -= nlimbs;
-
-        }
-      if ( x->nlimbs && nbits )
-        _gcry_mpih_rshift ( x->d, x->d, x->nlimbs, nbits );
-    }
-  else if ( nlimbs )
+  if (x != a)
     {
-      /* Copy and shift by more or equal bits than in a limb. */
-      xsize = a->nlimbs;
+      RESIZE_IF_NEEDED (x, alimbs);
+      x->nlimbs = alimbs;
+      x->flags = a->flags;
       x->sign = a->sign;
-      RESIZE_IF_NEEDED (x, xsize);
-      x->nlimbs = xsize;
-      for (i=0; i < a->nlimbs; i++ )
-        x->d[i] = a->d[i];
-      x->nlimbs = i;
-
-      if ( nlimbs >= x->nlimbs )
-        {
-          x->nlimbs = 0;
-          return;
-        }
+    }
+
+  /* Shift count covers every limb of A: the result is zero.  */
+  if (nlimbs >= alimbs)
+    {
+      x->nlimbs = 0;
+      return;
+    }
+
+  xp = x->d;
+  ap = a->d;
 
+  if (alimbs && nbits)
+    {
+      _gcry_mpih_rshift (xp, ap + nlimbs, alimbs - nlimbs, nbits);
       if (nlimbs)
-        {
-          for (i=0; i < x->nlimbs - nlimbs; i++ )
-            x->d[i] = x->d[i+nlimbs];
-          x->d[i] = 0;
-          x->nlimbs -= nlimbs;
-        }
-
-      if ( x->nlimbs && nbits )
-        _gcry_mpih_rshift ( x->d, x->d, x->nlimbs, nbits );
+	xp[alimbs - nlimbs] = 0;
+      x->nlimbs -= nlimbs;
     }
-  else
+  else if (nlimbs || (x != a))
     {
-      /* Copy and shift by less than bits in a limb.  */
-      xsize = a->nlimbs;
-      x->sign = a->sign;
-      RESIZE_IF_NEEDED (x, xsize);
-      x->nlimbs = xsize;
-
-      if ( xsize )
-        {
-          if (nbits )
-            _gcry_mpih_rshift (x->d, a->d, x->nlimbs, nbits );
-          else
-            {
-              /* The rshift helper function is not specified for
-                 NBITS==0, thus we do a plain copy here. */
-              for (i=0; i < x->nlimbs; i++ )
-                x->d[i] = a->d[i];
-            }
-        }
+      for (i = 0; i < alimbs - nlimbs; i++ )
+	xp[i] = ap[i + nlimbs];
+      if (nlimbs)
+	xp[i] = 0;
+      x->nlimbs -= nlimbs;
     }
+
   MPN_NORMALIZE (x->d, x->nlimbs);
 }
 
@@ -368,6 +336,9 @@ _gcry_mpi_lshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
 {
   unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
   unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+  mpi_size_t alimbs;
+  mpi_ptr_t xp, ap;
+  int i;
 
   if (mpi_is_immutable (x))
     {
@@ -378,34 +349,27 @@ _gcry_mpi_lshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
   if (x == a && !n)
     return;  /* In-place shift with an amount of zero.  */
 
-  if ( x != a )
-    {
-      /* Copy A to X.  */
-      unsigned int alimbs = a->nlimbs;
-      int asign  = a->sign;
-      mpi_ptr_t xp, ap;
-
-      RESIZE_IF_NEEDED (x, alimbs+nlimbs+1);
-      xp = x->d;
-      ap = a->d;
-      MPN_COPY (xp, ap, alimbs);
-      x->nlimbs = alimbs;
-      x->flags = a->flags;
-      x->sign = asign;
-    }
+  /* Note: X and A may be the same MPI (in-place operation).  */
+
+  alimbs = a->nlimbs;
 
-  if (nlimbs && !nbits)
+  RESIZE_IF_NEEDED (x, alimbs + nlimbs + 1);
+  xp = x->d;
+  ap = a->d;
+  if (nbits && alimbs)
     {
-      /* Shift a full number of limbs.  */
-      _gcry_mpi_lshift_limbs (x, nlimbs);
+      x->nlimbs = alimbs + nlimbs + 1;
+      xp[alimbs + nlimbs] = _gcry_mpih_lshift (xp + nlimbs, ap, alimbs, nbits);
     }
-  else if (n)
+  else
     {
-      /* We use a very dump approach: Shift left by the number of
-         limbs plus one and than fix it up by an rshift.  */
-      _gcry_mpi_lshift_limbs (x, nlimbs+1);
-      mpi_rshift (x, x, BITS_PER_MPI_LIMB - nbits);
+      x->nlimbs = alimbs + nlimbs;
+      for (i = alimbs - 1; i >= 0; i--)
+	xp[i + nlimbs] = ap[i];
     }
-
+  for (i = 0; i < nlimbs; i++)
+    xp[i] = 0;
+  x->flags = a->flags;
+  x->sign = a->sign;
   MPN_NORMALIZE (x->d, x->nlimbs);
 }
-- 
2.39.2




More information about the Gcrypt-devel mailing list