From jussi.kivilinna at iki.fi Sat Oct 1 08:15:12 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 09:15:12 +0300 Subject: Unintended Variable Length Array in ec-nist.c In-Reply-To: <20220930113606.GE7424@yoink.cs.uwaterloo.ca> References: <87a66lvdql.fsf@akagi.fsij.org> <3a5400f0-39c7-d7af-e5c7-63259b8d8ec6@iki.fi> <87k05ls2hz.fsf@akagi.fsij.org> <20220930113606.GE7424@yoink.cs.uwaterloo.ca> Message-ID: On 30.9.2022 14.36, Ian Goldberg via Gcrypt-devel wrote: > On Fri, Sep 30, 2022 at 05:14:16PM +0900, NIIBE Yutaka wrote: >> Thank you for your quick response. >> >> Jussi Kivilinna wrote: >>> How about instead define arrays with wanted size and define 'wsize' with >>> sizeof the array. This would avoid having macros. For example like this: >>> >>> index 69b05a6d..0de41e48 100644 >>> --- a/mpi/ec-nist.c >>> +++ b/mpi/ec-nist.c >>> @@ -94,9 +94,9 @@ _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx) >>> }; >>> const mpi_limb64_t zero = LIMB_TO64(0); >>> mpi_ptr_t wp; >>> - mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64; >>> - mpi_limb64_t s[wsize + 1]; >>> - mpi_limb64_t o[wsize + 1]; >>> + mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1]; >>> + mpi_limb64_t o[sizeof(s)]; > > Note that sizeof(s) is the number of *bytes* of s, not the number of > *elements* of s, so the above new code will declare o to be much larger > than the old code did. Thanks, I somehow missed that. The next line in my example used the DIM macro, which gives the number of elements in an array. + const mpi_size_t wsize = DIM(s) - 1; Just need to change to use DIM for array definitions too. -Jussi From jussi.kivilinna at iki.fi Sat Oct 1 09:48:21 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:21 +0300 Subject: [PATCH 3/5] tests/bench-slope: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-3-jussi.kivilinna@iki.fi> * tests/bench-slope.c (bench_set_cipher_key): New. (bench_encrypt_init, bench_xts_encrypt_init): Use 'bench_set_cipher_key' to remove VLA usage. 
-- Signed-off-by: Jussi Kivilinna --- tests/bench-slope.c | 59 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/tests/bench-slope.c b/tests/bench-slope.c index aaddaa85..1cad6813 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -982,6 +982,35 @@ struct bench_cipher_mode }; +static void +bench_set_cipher_key (gcry_cipher_hd_t hd, int keylen) +{ + char *key; + int err, i; + + key = malloc (keylen); + if (!key) + { + fprintf (stderr, PGM ": couldn't allocate %d bytes\n", keylen); + gcry_cipher_close (hd); + exit (1); + } + + for (i = 0; i < keylen; i++) + key[i] = 0x33 ^ (11 - i); + + err = gcry_cipher_setkey (hd, key, keylen); + free (key); + if (err) + { + fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hd); + exit (1); + } +} + + static int bench_encrypt_init (struct bench_obj *obj) { @@ -1010,20 +1039,7 @@ bench_encrypt_init (struct bench_obj *obj) if (keylen) { - char key[keylen]; - int i; - - for (i = 0; i < keylen; i++) - key[i] = 0x33 ^ (11 - i); - - err = gcry_cipher_setkey (hd, key, keylen); - if (err) - { - fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n", - gpg_strerror (err)); - gcry_cipher_close (hd); - exit (1); - } + bench_set_cipher_key (hd, keylen); } else { @@ -1119,20 +1135,7 @@ bench_xts_encrypt_init (struct bench_obj *obj) keylen = gcry_cipher_get_algo_keylen (mode->algo) * 2; if (keylen) { - char key[keylen]; - int i; - - for (i = 0; i < keylen; i++) - key[i] = 0x33 ^ (11 - i); - - err = gcry_cipher_setkey (hd, key, keylen); - if (err) - { - fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n", - gpg_strerror (err)); - gcry_cipher_close (hd); - exit (1); - } + bench_set_cipher_key (hd, keylen); } else { -- 2.34.1 From jussi.kivilinna at iki.fi Sat Oct 1 09:48:23 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:23 +0300 Subject: [PATCH 5/5] =?UTF-8?q?t-rsa-testparm:=20fix=20'function=20declara?= =?UTF-8?q?tion=20isn=E2=80=99t=20a=20prototype'=20warning?= In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-5-jussi.kivilinna@iki.fi> * cipher/t-rsa-testparm.c (check_rsa_testparm): Define parameters as void. -- Signed-off-by: Jussi Kivilinna --- tests/t-rsa-testparm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/t-rsa-testparm.c b/tests/t-rsa-testparm.c index 65617855..d62d9abf 100644 --- a/tests/t-rsa-testparm.c +++ b/tests/t-rsa-testparm.c @@ -32,7 +32,7 @@ static void -check_rsa_testparm () +check_rsa_testparm (void) { gpg_error_t err; gcry_sexp_t keyspec = NULL; -- 2.34.1 From jussi.kivilinna at iki.fi Sat Oct 1 09:48:19 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:19 +0300 Subject: [PATCH 1/5] mpi/ec: remove VLA usage Message-ID: <20221001074823.201798-1-jussi.kivilinna@iki.fi> * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod) (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod) (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays on stack. * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays on stack. 
-- Signed-off-by: Jussi Kivilinna --- mpi/ec-nist.c | 40 ++++++++++++++++++++-------------------- mpi/ec.c | 6 +++--- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c index 69b05a6d..14e3c3ab 100644 --- a/mpi/ec-nist.c +++ b/mpi/ec-nist.c @@ -94,9 +94,9 @@ _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64; - mpi_limb64_t s[wsize + 1]; - mpi_limb64_t o[wsize + 1]; + mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1]; + mpi_limb64_t o[DIM(s)]; + const mpi_size_t wsize = DIM(s) - 1; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t s_is_negative; @@ -186,10 +186,10 @@ _gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = (224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_limb64_t s[(224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64]; + mpi_limb64_t d[DIM(s)]; + const mpi_size_t wsize = DIM(s); mpi_size_t psize = ctx->p->nlimbs; - mpi_limb64_t s[wsize]; - mpi_limb64_t d[wsize]; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t s_is_negative; @@ -345,12 +345,12 @@ _gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_limb64_t s[(256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1]; + mpi_limb64_t t[DIM(s)]; + mpi_limb64_t d[DIM(s)]; + mpi_limb64_t e[DIM(s)]; + const mpi_size_t wsize = DIM(s) - 1; mpi_size_t psize = ctx->p->nlimbs; - mpi_limb64_t s[wsize + 1]; - mpi_limb64_t t[wsize + 1]; - mpi_limb64_t d[wsize + 1]; - mpi_limb64_t e[wsize + 1]; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t mask3; @@ -595,15 +595,15 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = (384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; - mpi_size_t psize = ctx->p->nlimbs; + mpi_limb64_t s[(384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1]; + mpi_limb64_t t[DIM(s)]; + mpi_limb64_t d[DIM(s)]; + mpi_limb64_t x[DIM(s)]; #if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN) - mpi_limb_t wp_shr32[wsize * LIMBS_PER_LIMB64]; + mpi_limb_t wp_shr32[(DIM(s) - 1) * LIMBS_PER_LIMB64]; #endif - mpi_limb64_t s[wsize + 1]; - mpi_limb64_t t[wsize + 1]; - mpi_limb64_t d[wsize + 1]; - mpi_limb64_t x[wsize + 1]; + const mpi_size_t wsize = DIM(s) - 1; + mpi_size_t psize = ctx->p->nlimbs; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t s_is_negative; @@ -791,8 +791,8 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx) void _gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx) { - mpi_size_t wsize = (521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB; - mpi_limb_t s[wsize]; + mpi_limb_t s[(521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB]; + const mpi_size_t wsize = DIM(s); mpi_limb_t cy; mpi_ptr_t wp; diff --git a/mpi/ec.c b/mpi/ec.c index c24921ee..0ad6769c 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -581,9 +581,9 @@ ec_pow2_448 (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx) static void ec_secp256k1_mod (gcry_mpi_t w, mpi_ec_t ctx) { - mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB; - mpi_limb_t n[wsize + 1]; - mpi_limb_t s[wsize + 1]; + mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1]; + mpi_limb_t n[sizeof(s)]; + const mpi_size_t wsize = DIM(s) - 1; mpi_limb_t cy, borrow; mpi_ptr_t wp; -- 2.34.1 From 
jussi.kivilinna at iki.fi Sat Oct 1 09:48:20 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:20 +0300 Subject: [PATCH 2/5] cipher-ccm: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-2-jussi.kivilinna@iki.fi> * cipher/cipher-ccm.c (do_cbc_mac): Avoid VLA for stack array. -- Signed-off-by: Jussi Kivilinna --- cipher/cipher-ccm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index f8b6865c..b325c966 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -38,9 +38,9 @@ static unsigned int do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen, int do_padding) { - const unsigned int blocksize = 16; gcry_cipher_encrypt_t enc_fn = c->spec->encrypt; - unsigned char tmp[blocksize]; + unsigned char tmp[16]; + const unsigned int blocksize = DIM(tmp); unsigned int burn = 0; unsigned int unused = c->u_mode.ccm.mac_unused; size_t nblocks; -- 2.34.1 From jussi.kivilinna at iki.fi Sat Oct 1 09:48:22 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:22 +0300 Subject: [PATCH 4/5] tests/benchmark: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-4-jussi.kivilinna@iki.fi> * cipher/benchmark.c (ccm_aead_init): Avoid VLA in stack array. -- Signed-off-by: Jussi Kivilinna --- tests/benchmark.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/benchmark.c b/tests/benchmark.c index a23cf74b..e9223f5a 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -719,15 +719,16 @@ mac_bench ( const char *algoname ) static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen) { - const int _L = 4; - const int noncelen = 15 - _L; - char nonce[noncelen]; + const char _L[4]; + char nonce[15 - sizeof(_L)]; u64 params[3]; gcry_error_t err = GPG_ERR_NO_ERROR; - memset (nonce, 0x33, noncelen); + (void)_L; - err = gcry_cipher_setiv (hd, nonce, noncelen); + memset (nonce, 0x33, sizeof(nonce)); + + err = gcry_cipher_setiv (hd, nonce, sizeof(nonce)); if (err) { fprintf (stderr, "gcry_cipher_setiv failed: %s\n", -- 2.34.1 From jcb62281 at gmail.com Mon Oct 3 02:19:30 2022 From: jcb62281 at gmail.com (Jacob Bachmeyer) Date: Sun, 02 Oct 2022 19:19:30 -0500 Subject: [PATCH 1/5] mpi/ec: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <633A2A92.3050900@gmail.com> Jussi Kivilinna wrote: > * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod) > (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod) > (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays on stack. > * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays on stack. > -- > > Signed-off-by: Jussi Kivilinna > --- > mpi/ec-nist.c | 40 ++++++++++++++++++++-------------------- > mpi/ec.c | 6 +++--- > 2 files changed, 23 insertions(+), 23 deletions(-) > > diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c > index 69b05a6d..14e3c3ab 100644 > [...] > + mpi_limb64_t o[DIM(s)]; > [...] > + mpi_limb64_t d[DIM(s)]; > [...] > + mpi_limb64_t t[DIM(s)]; > + mpi_limb64_t d[DIM(s)]; > + mpi_limb64_t e[DIM(s)]; > + const mpi_size_t wsize = DIM(s) - 1; > [...] 
> + mpi_limb64_t t[DIM(s)]; > + mpi_limb64_t d[DIM(s)]; > + mpi_limb64_t x[DIM(s)]; > [...] > + mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1]; > + mpi_limb_t n[sizeof(s)]; > + const mpi_size_t wsize = DIM(s) - 1; > mpi_limb_t cy, borrow; > mpi_ptr_t wp; > Am I misreading the patch (e.g. sizeof(mpi_limb_t) == 1?) or did you miss a spot at the end? -- Jacob From jussi.kivilinna at iki.fi Mon Oct 3 17:30:59 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 3 Oct 2022 18:30:59 +0300 Subject: [PATCH 1/5] mpi/ec: remove VLA usage In-Reply-To: <633A2A92.3050900@gmail.com> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> <633A2A92.3050900@gmail.com> Message-ID: <5090d5a3-7777-1bae-9f80-eaf256db4f3e@iki.fi> On 3.10.2022 3.19, Jacob Bachmeyer wrote: > Jussi Kivilinna wrote: >> * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod) >> (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod) >> (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays on stack. >> * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays on stack. >> -- >> >> Signed-off-by: Jussi Kivilinna >> --- >> mpi/ec-nist.c | 40 ++++++++++++++++++++-------------------- >> mpi/ec.c | 6 +++--- >> 2 files changed, 23 insertions(+), 23 deletions(-) >> >> diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c >> index 69b05a6d..14e3c3ab 100644 >> [...] >> + mpi_limb64_t o[DIM(s)]; >> [...] >> + mpi_limb64_t d[DIM(s)]; >> [...] >> + mpi_limb64_t t[DIM(s)]; >> + mpi_limb64_t d[DIM(s)]; >> + mpi_limb64_t e[DIM(s)]; >> + const mpi_size_t wsize = DIM(s) - 1; >> [...] >> + mpi_limb64_t t[DIM(s)]; >> + mpi_limb64_t d[DIM(s)]; >> + mpi_limb64_t x[DIM(s)]; >> [...] >> + mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1]; >> + mpi_limb_t n[sizeof(s)]; >> + const mpi_size_t wsize = DIM(s) - 1; >> mpi_limb_t cy, borrow; >> mpi_ptr_t wp; > > Am I misreading the patch (e.g. sizeof(mpi_limb_t) == 1?) or did you miss a spot at the end? > I missed it at first, but fixed before pushing to master. https://git.gnupg.org/cgi-bin/gitweb.cgi?p=libgcrypt.git;a=commitdiff;h=9978fc22045ca7623a6e0cbf704fb48ab1550419;hp=0cb29a5736cfcd6bce4ce2495cd0481f0bdb34a4 -Jussi From jussi.kivilinna at iki.fi Tue Oct 4 20:48:16 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 4 Oct 2022 21:48:16 +0300 Subject: [PATCH 2/3] mpi/longlong: fix generic smul_ppmm ifdef In-Reply-To: <20221004184817.140959-1-jussi.kivilinna@iki.fi> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> Message-ID: <20221004184817.140959-2-jussi.kivilinna@iki.fi> * mpi/longlong.h [!umul_ppmm] (smul_ppmm): Change ifdef from !defined(umul_ppmm) to !defined(smul_ppmm). -- Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpi/longlong.h b/mpi/longlong.h index 6a829f49..2921e9bd 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -1710,7 +1710,7 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); } while (0) #endif -#if !defined (umul_ppmm) +#if !defined (smul_ppmm) # define smul_ppmm(w1, w0, u, v) \ do { \ UWtype __w1; \ -- 2.34.1 From jussi.kivilinna at iki.fi Tue Oct 4 20:48:15 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 4 Oct 2022 21:48:15 +0300 Subject: [PATCH 1/3] mpi/longlong: provide generic implementation using double word type Message-ID: <20221004184817.140959-1-jussi.kivilinna@iki.fi> * configure.ac: Add check for 'unsigned __int128'. 
* mpi/longlong.h (UDWtype): Define for 32-bit or 64-bit when 'unsigned long long' or 'unsigned __int128' is available. (add_ssaaaa, sub_ddmmss, umul_ppmm, udiv_qrnnd) [UDWtype]: New. -- New generic longlong.h implementation by using 'unsigned long long' on 32-bit and 'unsigned __int128' on 64-bit (for new architectures like RISC-V). Signed-off-by: Jussi Kivilinna --- configure.ac | 1 + mpi/longlong.h | 75 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/configure.ac b/configure.ac index c39257b5..6f68a945 100644 --- a/configure.ac +++ b/configure.ac @@ -360,6 +360,7 @@ AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) +AC_CHECK_SIZEOF(unsigned __int128, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T diff --git a/mpi/longlong.h b/mpi/longlong.h index c299534c..6a829f49 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -20,18 +20,28 @@ along with this file; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +/* On 32-bit, use 64-bit 'unsigned long long' for UDWtype, if available. */ +#if !defined (UDWtype) && SIZEOF_UNSIGNED_LONG_LONG * 8 == W_TYPE_SIZE * 2 +# define UDWtype unsigned long long +#endif + +/* On 64-bit, use 128-bit 'unsigned __int128' for UDWtype, if available. */ +#if !defined (UDWtype) && SIZEOF_UNSIGNED___INT128 * 8 == W_TYPE_SIZE * 2 +# define UDWtype unsigned __int128 +#endif + /* You have to define the following before including this file: - UWtype -- An unsigned type, default type for operations (typically a "word") + UWtype -- An unsigned type, default type for operations (typically a "word"). UHWtype -- An unsigned type, at least half the size of UWtype. - UDWtype -- An unsigned type, at least twice as large a UWtype - W_TYPE_SIZE -- size in bits of UWtype + UDWtype -- An unsigned type, at least twice as large a UWtype. + W_TYPE_SIZE -- size in bits of UWtype. SItype, USItype -- Signed and unsigned 32 bit types. DItype, UDItype -- Signed and unsigned 64 bit types. - On a 32 bit machine UWtype should typically be USItype; - on a 64 bit machine, UWtype should typically be UDItype. + On a 32 bit machine UWtype should typically be USItype. + On a 64 bit machine, UWtype should typically be UDItype. */ #define __BITS4 (W_TYPE_SIZE / 4) @@ -1617,7 +1627,21 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); /* If this machine has no inline assembler, use C macros. */ -#if !defined (add_ssaaaa) +#if !defined (add_ssaaaa) && defined (UDWtype) +/* Use double word type when available. */ +# define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + UDWtype __audw = (ah); \ + UDWtype __budw = (bh); \ + __audw <<= W_TYPE_SIZE; \ + __audw |= (al); \ + __budw <<= W_TYPE_SIZE; \ + __budw |= (bl); \ + __audw += __budw; \ + (sh) = (UWtype)(__audw >> W_TYPE_SIZE); \ + (sl) = (UWtype)(__audw); \ + } while (0) +#elif !defined (add_ssaaaa) # define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ UWtype __x; \ @@ -1627,7 +1651,21 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); } while (0) #endif -#if !defined (sub_ddmmss) +#if !defined (sub_ddmmss) && defined (UDWtype) +/* Use double word type when available. 
*/ +# define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + UDWtype __audw = (ah); \ + UDWtype __budw = (bh); \ + __audw <<= W_TYPE_SIZE; \ + __audw |= (al); \ + __budw <<= W_TYPE_SIZE; \ + __budw |= (bl); \ + __audw -= __budw; \ + (sh) = (UWtype)(__audw >> W_TYPE_SIZE); \ + (sl) = (UWtype)(__audw); \ + } while (0) +#elif !defined (sub_ddmmss) # define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ UWtype __x; \ @@ -1637,7 +1675,15 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); } while (0) #endif -#if !defined (umul_ppmm) +#if !defined (umul_ppmm) && defined (UDWtype) +# define umul_ppmm(w1, w0, u, v) \ + do { \ + UDWtype __x = (u); \ + __x *= (v); \ + (w1) = (UWtype)(__x >> W_TYPE_SIZE); \ + (w0) = (UWtype)(__x); \ + } while (0) +#elif !defined (umul_ppmm) # define umul_ppmm(w1, w0, u, v) \ do { \ UWtype __x0, __x1, __x2, __x3; \ @@ -1712,6 +1758,19 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); (r) = __r0; \ } while (0) +/* Use double word type if available. */ +#if !defined (udiv_qrnnd) && defined (UDWtype) +# define udiv_qrnnd(q, r, nh, nl, d) \ + do { \ + UWtype __d = (d); \ + UDWtype __nudw = (nh); \ + __nudw <<= W_TYPE_SIZE; \ + __nudw |= (nl); \ + (q) = (UWtype)(__nudw / __d); \ + (r) = (UWtype)(__nudw % __d); \ + } while (0) +#endif + /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through __udiv_w_sdiv (defined in libgcc or elsewhere). */ #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) -- 2.34.1 From jussi.kivilinna at iki.fi Tue Oct 4 20:48:17 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 4 Oct 2022 21:48:17 +0300 Subject: [PATCH 3/3] mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros In-Reply-To: <20221004184817.140959-1-jussi.kivilinna@iki.fi> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> Message-ID: <20221004184817.140959-3-jussi.kivilinna@iki.fi> * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix for 'bsfq'. -- "rep;bsf" aka "tzcnt" is new instruction with well defined operation on zero input and as result is faster on new CPUs. On old CPUs, "tzcnt" functions as old "bsf" with undefined behaviour on zero input. Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpi/longlong.h b/mpi/longlong.h index 2921e9bd..706ac723 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -624,7 +624,7 @@ extern USItype __udiv_qrnnd (); # define count_trailing_zeros(count, x) \ do { \ UDItype __cbtmp; \ - __asm__ ("bsfq %1,%0" \ + __asm__ ("rep;bsfq %1,%0" \ : "=r" (__cbtmp) : "rm" ((UDItype)(x)) \ __CLOBBER_CC); \ (count) = __cbtmp; \ -- 2.34.1 From gniibe at fsij.org Thu Oct 6 09:09:07 2022 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 06 Oct 2022 16:09:07 +0900 Subject: [PATCH 3/3] mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros In-Reply-To: <20221004184817.140959-3-jussi.kivilinna@iki.fi> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> <20221004184817.140959-3-jussi.kivilinna@iki.fi> Message-ID: <87a6695sz0.fsf@jumper.gniibe.org> Hello, Jussi Kivilinna wrote: > * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix > for 'bsfq'. Is it also applicable to 80x86 (IA-32) (adding 'rep')? Besides, I have another issue/concern here. IIUC, longlong.h upstream is GCC. It would be good to import some other changes from the upstream. For example, in our version for PPC/POWER, we still have old two-syntax asm code, that's quite outdated. 
( https://dev.gnupg.org/T5980 ) -- From jussi.kivilinna at iki.fi Sat Oct 8 14:01:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Oct 2022 15:01:36 +0300 Subject: [PATCH 3/3] mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros In-Reply-To: <87a6695sz0.fsf@jumper.gniibe.org> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> <20221004184817.140959-3-jussi.kivilinna@iki.fi> <87a6695sz0.fsf@jumper.gniibe.org> Message-ID: <86aa299c-7e6c-235d-09d4-31393394df14@iki.fi> On 6.10.2022 10.09, NIIBE Yutaka wrote: > Hello, > > Jussi Kivilinna wrote: >> * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix >> for 'bsfq'. > > Is it also applicable to 80x86 (IA-32) (adding 'rep')? > Yes it is, I'll add 'rep' for i386 too. > > Besides, I have another issue/concern here. IIUC, longlong.h upstream > is GCC. It would be good to import some other changes from the > upstream. For example, in our version for PPC/POWER, we still have old > two-syntax asm code, that's quite outdated. ( https://dev.gnupg.org/T5980 ) I can take look into it. -Jussi From jussi.kivilinna at iki.fi Sat Oct 22 16:14:26 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 22 Oct 2022 17:14:26 +0300 Subject: [PATCH 2/2] hwf-x86: enable VPGATHER usage for AMD CPUs with AVX512 In-Reply-To: <20221022141426.293283-1-jussi.kivilinna@iki.fi> References: <20221022141426.293283-1-jussi.kivilinna@iki.fi> Message-ID: <20221022141426.293283-2-jussi.kivilinna@iki.fi> * src/hwf-x86.c (detect_x86_gnuc): Move model based checks and forced soft hwfeatures enablement at end; Enable VPGATHER for AMD CPUs with AVX512. -- AMD Zen4 is able to benefit from VPGATHER based table-lookup for Twofish. Benchmark on Ryzen 9 7900X: Before: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CTR enc | 1.79 ns/B 532.8 MiB/s 10.07 c/B 5625 CTR dec | 1.79 ns/B 532.6 MiB/s 10.07 c/B 5625 After (~10% faster): TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CTR enc | 1.61 ns/B 593.5 MiB/s 9.05 c/B 5631?2 CTR dec | 1.61 ns/B 590.8 MiB/s 9.08 c/B 5625 Signed-off-by: Jussi Kivilinna --- src/hwf-x86.c | 157 ++++++++++++++++++++++++++------------------------ 1 file changed, 83 insertions(+), 74 deletions(-) diff --git a/src/hwf-x86.c b/src/hwf-x86.c index b440827e..c6f493eb 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -187,6 +187,7 @@ detect_x86_gnuc (void) unsigned int fms, family, model; unsigned int result = 0; unsigned int avoid_vpgather = 0; + unsigned int is_amd_cpu = 0; (void)os_supports_avx_avx2_registers; (void)os_supports_avx512_registers; @@ -242,6 +243,7 @@ detect_x86_gnuc (void) else if (!strcmp (vendor_id.c, "AuthenticAMD")) { /* This is an AMD CPU. */ + is_amd_cpu = 1; } /* Detect Intel features, that might also be supported by other @@ -253,77 +255,6 @@ detect_x86_gnuc (void) family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20); model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12); - if ((result & HWF_INTEL_CPU) && family == 6) - { - /* These Intel Core processor models have SHLD/SHRD instruction that - * can do integer rotation faster actual ROL/ROR instructions. 
*/ - switch (model) - { - case 0x2A: - case 0x2D: - case 0x3A: - case 0x3C: - case 0x3F: - case 0x45: - case 0x46: - case 0x3D: - case 0x4F: - case 0x56: - case 0x47: - case 0x4E: - case 0x5E: - case 0x8E: - case 0x9E: - case 0x55: - case 0x66: - result |= HWF_INTEL_FAST_SHLD; - break; - } - - /* These Intel Core processors that have AVX2 have slow VPGATHER and - * should be avoided for table-lookup use. */ - switch (model) - { - case 0x3C: - case 0x3F: - case 0x45: - case 0x46: - /* Haswell */ - avoid_vpgather |= 1; - break; - } - } - else - { - /* Avoid VPGATHER for non-Intel CPUs as testing is needed to - * make sure it is fast enough. */ - - avoid_vpgather |= 1; - } - -#ifdef ENABLE_FORCE_SOFT_HWFEATURES - /* Soft HW features mark functionality that is available on all systems - * but not feasible to use because of slow HW implementation. */ - - /* SHLD is faster at rotating register than actual ROR/ROL instructions - * on older Intel systems (~sandy-bridge era). However, SHLD is very - * slow on almost anything else and later Intel processors have faster - * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled - * only for those Intel processors that benefit from the SHLD - * instruction. Enabled here unconditionally as requested. */ - result |= HWF_INTEL_FAST_SHLD; - - /* VPGATHER instructions are used for look-up table based - * implementations which require VPGATHER to be fast enough to beat - * regular parallelized look-up table implementations (see Twofish). - * So far, only Intel processors beginning with skylake have had - * VPGATHER fast enough to be enabled. AMD Zen3 comes close to - * being feasible, but not quite (where twofish-avx2 is few percent - * slower than twofish-3way). Enable VPGATHER here unconditionally - * as requested. */ - avoid_vpgather = 0; -#endif - #ifdef ENABLE_PCLMUL_SUPPORT /* Test bit 1 for PCLMUL. */ if (features & 0x00000002) @@ -392,9 +323,6 @@ detect_x86_gnuc (void) if (features & 0x00000020) if (os_supports_avx_avx2_registers) result |= HWF_INTEL_AVX2; - - if ((result & HWF_INTEL_AVX2) && !avoid_vpgather) - result |= HWF_INTEL_FAST_VPGATHER; #endif /*ENABLE_AVX_SUPPORT*/ /* Test bit 29 for SHA Extensions. */ @@ -446,6 +374,87 @@ detect_x86_gnuc (void) result |= HWF_INTEL_GFNI; } + if ((result & HWF_INTEL_CPU) && family == 6) + { + /* These Intel Core processor models have SHLD/SHRD instruction that + * can do integer rotation faster actual ROL/ROR instructions. */ + switch (model) + { + case 0x2A: + case 0x2D: + case 0x3A: + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + case 0x3D: + case 0x4F: + case 0x56: + case 0x47: + case 0x4E: + case 0x5E: + case 0x8E: + case 0x9E: + case 0x55: + case 0x66: + result |= HWF_INTEL_FAST_SHLD; + break; + } + + /* These Intel Core processors that have AVX2 have slow VPGATHER and + * should be avoided for table-lookup use. */ + switch (model) + { + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + /* Haswell */ + avoid_vpgather |= 1; + break; + } + } + else if (is_amd_cpu) + { + /* Non-AVX512 AMD CPUs (pre-Zen4) have slow VPGATHER and should be + * avoided for table-lookup use. */ + avoid_vpgather |= !(result & HWF_INTEL_AVX512); + } + else + { + /* Avoid VPGATHER for non-Intel/non-AMD CPUs as testing is needed to + * make sure it is fast enough. */ + avoid_vpgather |= 1; + } + +#ifdef ENABLE_FORCE_SOFT_HWFEATURES + /* Soft HW features mark functionality that is available on all systems + * but not feasible to use because of slow HW implementation. 
*/ + + /* Some implementations are disabled for non-Intel CPUs. Mark + * current CPU as Intel one to enable those implementations. */ + result |= HWF_INTEL_CPU; + + /* SHLD is faster at rotating register than actual ROR/ROL instructions + * on older Intel systems (~sandy-bridge era). However, SHLD is very + * slow on almost anything else and later Intel processors have faster + * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled + * only for those Intel processors that benefit from the SHLD + * instruction. Enabled here unconditionally as requested. */ + result |= HWF_INTEL_FAST_SHLD; + + /* VPGATHER instructions are used for look-up table based + * implementations which require VPGATHER to be fast enough to beat + * regular parallelized look-up table implementations (see Twofish). + * So far, only Intel processors beginning with Skylake and AMD + * processors starting with Zen4 have had VPGATHER fast enough to be + * enabled. Enable VPGATHER here unconditionally as requested. */ + avoid_vpgather = 0; +#endif + + if ((result & HWF_INTEL_AVX2) && !avoid_vpgather) + result |= HWF_INTEL_FAST_VPGATHER; + return result; } #endif /* HAS_X86_CPUID */ -- 2.37.2 From jussi.kivilinna at iki.fi Sat Oct 22 16:14:25 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 22 Oct 2022 17:14:25 +0300 Subject: [PATCH 1/2] sha512-avx512: enable only on Intel CPUs for now Message-ID: <20221022141426.293283-1-jussi.kivilinna@iki.fi> * cipher/sha512.c (sha512_init_common): Enable AVX512 implementation only for Intel CPUs. -- SHA512-AVX512 implementation is slightly slower than AVX2 variant on AMD Zen4 (AVX512 4.88 cpb, AVX2 4.35 cpb). This is likely because AVX512 implementation uses vector registers for round function unlike AVX2 where general purpose registers are used for round function. On Zen4, message expansion and round function then end up competing for narrower vector execution bandwidth and gives slower performance. Signed-off-by: Jussi Kivilinna --- cipher/sha512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/sha512.c b/cipher/sha512.c index 9ac412b3..492d021a 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -466,7 +466,7 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; #endif #ifdef USE_AVX512 - if ((features & HWF_INTEL_AVX512) != 0) + if ((features & HWF_INTEL_AVX512) && (features & HWF_INTEL_CPU)) ctx->bctx.bwrite = do_sha512_transform_amd64_avx512; #endif #ifdef USE_PPC_CRYPTO -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:07:14 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:07:14 +0300 Subject: [PATCH] mpi/longlong: update powerpc macros from GCC Message-ID: <20221023160714.1144288-1-jussi.kivilinna@iki.fi> * mpi/longlong.h [__powerpc__, __powerpc64__]: Update macros. -- Update longlong.h powerpc macros with more up to date versions from GCC's longlong.h. Note, GCC's version is licensed under LGPLv2.1+. 
Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 212 +++++++++++++++++++------------------------------ 1 file changed, 81 insertions(+), 131 deletions(-) diff --git a/mpi/longlong.h b/mpi/longlong.h index 9e94ef30..fb860cb6 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -979,180 +979,130 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); /*************************************** ************** PPC ****************** ***************************************/ -#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32 +/* Powerpc 32 bit support taken from GCC longlong.h. */ +#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 32 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ - if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "%r" ((USItype)(ah)), \ - "%r" ((USItype)(al)), \ - "rI" ((USItype)(bl))); \ - else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "%r" ((USItype)(ah)), \ - "%r" ((USItype)(al)), \ - "rI" ((USItype)(bl))); \ + if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ + else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "%r" ((USItype)(ah)), \ - "r" ((USItype)(bh)), \ - "%r" ((USItype)(al)), \ - "rI" ((USItype)(bl))); \ + __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ } while (0) # define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ - if (__builtin_constant_p (ah) && (ah) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(bh)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ - else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(bh)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ + if (__builtin_constant_p (ah) && (ah) == 0) \ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ + else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(ah)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ - else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(ah)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ + __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ + else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" 
(ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(ah)), \ - "r" ((USItype)(bh)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ + __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ } while (0) # define count_leading_zeros(count, x) \ - __asm__ ("{cntlz|cntlzw} %0,%1" \ - : "=r" ((count)) \ - : "r" ((USItype)(x))) + __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x)) # define COUNT_LEADING_ZEROS_0 32 -# if defined (_ARCH_PPC) -# define umul_ppmm(ph, pl, m0, m1) \ +# define umul_ppmm(ph, pl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ - __asm__ ("mulhwu %0,%1,%2" \ - : "=r" (ph) \ - : "%r" (__m0), \ - "r" (__m1)); \ - (pl) = __m0 * __m1; \ + __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ } while (0) -# define UMUL_TIME 15 -# define smul_ppmm(ph, pl, m0, m1) \ +# define UMUL_TIME 15 +# define smul_ppmm(ph, pl, m0, m1) \ do { \ SItype __m0 = (m0), __m1 = (m1); \ - __asm__ ("mulhw %0,%1,%2" \ - : "=r" ((SItype) ph) \ - : "%r" (__m0), \ - "r" (__m1)); \ - (pl) = __m0 * __m1; \ - } while (0) -# define SMUL_TIME 14 -# define UDIV_TIME 120 -# else -# define umul_ppmm(xh, xl, m0, m1) \ - do { \ - USItype __m0 = (m0), __m1 = (m1); \ - __asm__ ("mul %0,%2,%3" \ - : "=r" ((xh)), \ - "=q" ((xl)) \ - : "r" (__m0), \ - "r" (__m1)); \ - (xh) += ((((SItype) __m0 >> 31) & __m1) \ - + (((SItype) __m1 >> 31) & __m0)); \ + __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ } while (0) -# define UMUL_TIME 8 -# define smul_ppmm(xh, xl, m0, m1) \ - __asm__ ("mul %0,%2,%3" \ - : "=r" ((SItype)(xh)), \ - "=q" ((SItype)(xl)) \ - : "r" (m0), \ - "r" (m1)) -# define SMUL_TIME 4 -# define sdiv_qrnnd(q, r, nh, nl, d) \ - __asm__ ("div %0,%2,%4" \ - : "=r" ((SItype)(q)), "=q" ((SItype)(r)) \ - : "r" ((SItype)(nh)), "1" ((SItype)(nl)), "r" ((SItype)(d))) -# define UDIV_TIME 100 -# endif -#endif /* Power architecture variants. */ +# define SMUL_TIME 14 +# define UDIV_TIME 120 +#endif /* 32-bit POWER architecture variants. */ -/* Powerpc 64 bit support taken from gmp-4.1.2. */ +/* Powerpc 64 bit support taken from GCC longlong.h. */ /* We should test _IBMR2 here when we add assembly support for the system vendor compilers. 
*/ -#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 64 -#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ +#if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64 +# define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ - : "=r" (sh), "=&r" (sl) \ - : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ + __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ } while (0) -#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ +# define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (ah) && (ah) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ - : "=r" (sh), "=&r" (sl) \ - : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ + __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ } while (0) -#define count_leading_zeros(count, x) \ +# define count_leading_zeros(count, x) \ __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) -#define COUNT_LEADING_ZEROS_0 64 -#define umul_ppmm(ph, pl, m0, m1) \ +# define COUNT_LEADING_ZEROS_0 64 +# define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) -#define UMUL_TIME 15 -#define smul_ppmm(ph, pl, m0, m1) \ +# define UMUL_TIME 15 +# define smul_ppmm(ph, pl, m0, m1) \ do { \ DItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) -#define SMUL_TIME 14 /* ??? */ -#define UDIV_TIME 120 /* ??? 
*/ +# define SMUL_TIME 14 /* ??? */ +# define UDIV_TIME 120 /* ??? */ #endif /* 64-bit PowerPC. */ /*************************************** -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:04 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:04 +0300 Subject: [PATCH 4/8] sm4: fix lookup-table prefetching In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-4-jussi.kivilinna@iki.fi> * cipher/sm4.c (sm4_expand_key): Prefetch sbox table. (sm4_get_crypt_blk1_16_fn): Do not prefetch sbox table. (sm4_expand_key, _gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec) (_gcry_sm4_cfb_dec): Prefetch sbox table if table look-up implementation is used. -- Signed-off-by: Jussi Kivilinna --- cipher/sm4.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cipher/sm4.c b/cipher/sm4.c index 99a1e840..32a21dd9 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -701,6 +701,8 @@ sm4_expand_key (SM4_context *ctx, const byte *key) } #endif + prefetch_sbox_table (); + rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0]; rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1]; rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2]; @@ -1008,7 +1010,6 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx) else { (void)ctx; - prefetch_sbox_table (); return &sm4_crypt_blocks; } } @@ -1149,6 +1150,9 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr, unsigned int tmp_used = 16; size_t nburn; + if (crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_ctr_enc_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf, nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); @@ -1295,6 +1299,9 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv, unsigned int tmp_used = 16; size_t nburn; + if (crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_cbc_dec_128(ctx->rkey_dec, crypt_blk1_16, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); @@ -1441,6 +1448,9 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, unsigned int tmp_used = 16; size_t nburn; + if (crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_cfb_dec_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); @@ -1457,6 +1467,7 @@ static unsigned int sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, unsigned int num_blks, const u32 *rk) { + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; unsigned int stack_burn_size = 0; unsigned int nburn; @@ -1479,7 +1490,7 @@ sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, do { unsigned int curr_blks = num_blks > 16 ? 16 : num_blks; - nburn = ctx->crypt_blk1_16 (rk, outbuf, inbuf, curr_blks); + nburn = crypt_blk1_16 (rk, outbuf, inbuf, curr_blks); stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; outbuf += curr_blks * 16; inbuf += curr_blks * 16; @@ -1534,6 +1545,9 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, unsigned int tmp_used = 16; size_t nburn; + if (ctx->crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_xts_crypt_128(ctx, encrypt ? 
sm4_encrypt_blk1_32 : sm4_decrypt_blk1_32, outbuf, inbuf, nblocks, -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:06 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:06 +0300 Subject: [PATCH 6/8] twofish: accelerate XTS and ECB modes In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-6-jussi.kivilinna@iki.fi> * cipher/twofish-amd64.S (_gcry_twofish_amd64_blk3): New. * cipher/twofish-avx2-amd64.S (_gcry_twofish_avx2_blk16): New. (_gcry_twofish_xts_crypt, _gcry_twofish_ecb_crypt) (_gcry_twofish_avx2_blk16, _gcry_twofish_amd64_blk3) (twofish_crypt_blk1_16, twofish_encrypt_blk1_16) (twofish_decrypt_blk1_16): New. (twofish_setkey): Setup XTS and ECB bulk functions. -- Benchmark on AMD Ryzen 9 7900X: Before: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.52 ns/B 378.2 MiB/s 14.18 c/B 5625 ECB dec | 2.51 ns/B 380.2 MiB/s 14.11 c/B 5625 XTS enc | 2.65 ns/B 359.9 MiB/s 14.91 c/B 5625 XTS dec | 2.63 ns/B 362.0 MiB/s 14.60 c/B 5541 After: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.60 ns/B 594.8 MiB/s 9.02 c/B 5625 ECB dec | 1.60 ns/B 594.8 MiB/s 9.02 c/B 5625 XTS enc | 1.66 ns/B 573.9 MiB/s 9.35 c/B 5625 XTS dec | 1.67 ns/B 569.6 MiB/s 9.41 c/B 5619?2 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/twofish-amd64.S | 74 ++++++++++++++++++ cipher/twofish-avx2-amd64.S | 46 +++++++++++ cipher/twofish.c | 147 +++++++++++++++++++++++++++++++++++- 3 files changed, 264 insertions(+), 3 deletions(-) diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index a7a60553..8998d296 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -544,6 +544,80 @@ __twofish_dec_blk3: CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) +.align 8 +.globl _gcry_twofish_amd64_blk3 +ELF(.type _gcry_twofish_amd64_blk3, at function;) +_gcry_twofish_amd64_blk3: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %ecx: encrypt (0 or 1) + */ + CFI_STARTPROC(); + ENTER_SYSV_FUNC_PARAMS_0_4 + + subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); + movq %rbp, (0 * 8)(%rsp); + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); + + testl %ecx, %ecx; + movq %rdx, RX0; + movq %rsi, (6 * 8)(%rsp); + + movq (0 * 8)(RX0), RAB0; + movq (1 * 8)(RX0), RCD0; + movq (2 * 8)(RX0), RAB1; + movq (3 * 8)(RX0), RCD1; + movq (4 * 8)(RX0), RAB2; + movq (5 * 8)(RX0), RCD2; + + jz .Lblk1_3_dec; + call __twofish_enc_blk3; + jmp .Lblk1_3_end; + .Lblk1_3_dec: + call __twofish_dec_blk3; + +.Lblk1_3_end: + movq (6 * 8)(%rsp), RX0; + movq RCD0, (0 * 8)(RX0); + movq RAB0, (1 * 8)(RX0); + movq RCD1, (2 * 8)(RX0); + movq RAB1, (3 * 8)(RX0); + movq RCD2, (4 * 8)(RX0); + movq RAB2, (5 * 8)(RX0); + + movq (0 * 8)(%rsp), %rbp; + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); + addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); + + EXIT_SYSV_FUNC + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size 
_gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;) + .align 8 .globl _gcry_twofish_amd64_ctr_enc ELF(.type _gcry_twofish_amd64_ctr_enc, at function;) diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index 930ac792..0cb9a64c 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -468,6 +468,52 @@ __twofish_dec_blk16: CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) +.align 8 +.globl _gcry_twofish_avx2_blk16 +ELF(.type _gcry_twofish_avx2_blk16, at function;) +_gcry_twofish_avx2_blk16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + vzeroupper; + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RB0; + vmovdqu (2 * 32)(%rdx), RC0; + vmovdqu (3 * 32)(%rdx), RD0; + vmovdqu (4 * 32)(%rdx), RA1; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RC1; + vmovdqu (7 * 32)(%rdx), RD1; + + testl %ecx, %ecx; + jz .Lblk16_dec; + call __twofish_enc_blk16; + jmp .Lblk16_end; + .Lblk16_dec: + call __twofish_dec_blk16; + +.Lblk16_end: + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RB0, (1 * 32)(%rsi); + vmovdqu RC0, (2 * 32)(%rsi); + vmovdqu RD0, (3 * 32)(%rsi); + vmovdqu RA1, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RC1, (6 * 32)(%rsi); + vmovdqu RD1, (7 * 32)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;) + #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ diff --git a/cipher/twofish.c b/cipher/twofish.c index b300715b..92c463fc 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -101,7 +101,12 @@ static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); - +static void _gcry_twofish_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); /* Structure for an expanded Twofish key. s contains the key-dependent * S-boxes composed with the MDS matrix; w contains the eight "whitening" @@ -775,7 +780,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen, bulk_ops->cfb_dec = _gcry_twofish_cfb_dec; bulk_ops->ctr_enc = _gcry_twofish_ctr_enc; bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt; - bulk_ops->ocb_auth = _gcry_twofish_ocb_auth; + bulk_ops->ocb_auth = _gcry_twofish_ocb_auth; + bulk_ops->xts_crypt = _gcry_twofish_xts_crypt; + bulk_ops->ecb_crypt = _gcry_twofish_ecb_crypt; (void)hwfeatures; @@ -788,6 +795,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen, /* Assembler implementations of Twofish using AVX2. Process 16 block in parallel. */ +extern void _gcry_twofish_avx2_blk16 (const TWOFISH_context *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; + extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx, unsigned char *out, const unsigned char *in, @@ -835,6 +845,9 @@ extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in); /* These assembly implementations process three blocks in parallel. 
*/ +extern void _gcry_twofish_amd64_blk3(const TWOFISH_context *c, byte *out, + const byte *in, int encrypt); + extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in, byte *ctr); @@ -1501,7 +1514,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, blkn += 3; twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + c->u_mode.ocb.aad_sum, Ls); nblocks -= 3; abuf += 3 * TWOFISH_BLOCKSIZE; @@ -1527,6 +1540,134 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, } +static unsigned int +twofish_crypt_blk1_16(const void *context, byte *out, const byte *in, + unsigned int num_blks, int encrypt) +{ + const TWOFISH_context *ctx = context; + unsigned int burn, burn_stack_depth = 0; + +#ifdef USE_AVX2 + if (num_blks == 16 && ctx->use_avx2) + { + _gcry_twofish_avx2_blk16 (ctx, out, in, encrypt); + return 0; + } +#endif + +#ifdef USE_AMD64_ASM + while (num_blks >= 3) + { + _gcry_twofish_amd64_blk3 (ctx, out, in, encrypt); + burn = 8 * sizeof(void *); + burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth; + out += 3 * TWOFISH_BLOCKSIZE; + in += 3 * TWOFISH_BLOCKSIZE; + num_blks -= 3; + } +#endif + + while (num_blks >= 1) + { + if (encrypt) + burn = twofish_encrypt((void *)ctx, out, in); + else + burn = twofish_decrypt((void *)ctx, out, in); + + burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth; + out += TWOFISH_BLOCKSIZE; + in += TWOFISH_BLOCKSIZE; + num_blks--; + } + + return burn_stack_depth; +} + +static unsigned int +twofish_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 1); +} + +static unsigned int +twofish_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 0); +} + + +/* Bulk encryption/decryption of complete blocks in XTS mode. */ +static void +_gcry_twofish_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * 16]; + unsigned int tmp_used = 16; + size_t tmpbufsize = 15 * 16; + size_t nburn; + +#ifdef USE_AVX2 + if (ctx->use_avx2) + tmpbufsize = 16 * 16; +#endif + + nburn = bulk_xts_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16 + : twofish_decrypt_blk1_16, + outbuf, inbuf, nblocks, + tweak, tmpbuf, tmpbufsize / 16, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory(tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t fn_maxblocks = 15; + size_t nburn; + +#ifdef USE_AVX2 + if (ctx->use_avx2) + fn_maxblocks = 16; +#endif + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16 + : twofish_decrypt_blk1_16, + outbuf, inbuf, nblocks, fn_maxblocks); + burn_stack_depth = nburn > burn_stack_depth ? 
nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + /* Test a single encryption and decryption with each key size. */ -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:03 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:03 +0300 Subject: [PATCH 3/8] camellia: accelerate ECB (for benchmarking) In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-3-jussi.kivilinna@iki.fi> * cipher/bulkhelp.h (bulk_ecb_crypt_128): New. * cipher/camellia-glue.c (_gcry_camellia_ecb_crypt): New. (camellia_setkey): Select ECB bulk function with AESNI/AVX2, VAES/AVX2 and GFNI/AVX2. -- Benchmark on AMD Ryzen 9 7900X: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 3.27 ns/B 291.8 MiB/s 18.38 c/B 5625 ECB dec | 3.25 ns/B 293.3 MiB/s 18.29 c/B 5625 After (OCB for reference): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.146 ns/B 6533 MiB/s 0.803 c/B 5500 ECB dec | 0.149 ns/B 6384 MiB/s 0.822 c/B 5500 OCB enc | 0.170 ns/B 5608 MiB/s 0.957 c/B 5625 OCB dec | 0.175 ns/B 5452 MiB/s 0.984 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/bulkhelp.h | 19 +++++++++++++++++++ cipher/camellia-glue.c | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h index 444973ab..b86abc27 100644 --- a/cipher/bulkhelp.h +++ b/cipher/bulkhelp.h @@ -470,5 +470,24 @@ bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, return burn_depth; } +static inline unsigned int +bulk_ecb_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, + const byte *inbuf, size_t nblocks, size_t fn_max_nblocks) +{ + unsigned int burn_depth = 0; + unsigned int nburn; + + while (nblocks >= 1) + { + size_t curr_blks = nblocks > fn_max_nblocks ? fn_max_nblocks : nblocks; + nburn = crypt_fn (priv, outbuf, inbuf, curr_blks); + burn_depth = nburn > burn_depth ? 
nburn : burn_depth; + inbuf += curr_blks * 16; + outbuf += curr_blks * 16; + nblocks -= curr_blks; + } + + return burn_depth; +} #endif /*GCRYPT_BULKHELP_H*/ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index b2a50233..a81d586a 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -405,11 +405,14 @@ static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks, int encrypt); + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_camellia_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); static void _gcry_camellia_ctr32le_enc (void *context, unsigned char *ctr, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks); + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); @@ -474,10 +477,12 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) { bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; + bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt; bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; } #else (void)_gcry_camellia_xts_crypt; + (void)_gcry_camellia_ecb_crypt; (void)_gcry_camellia_ctr32le_enc; #endif @@ -1126,6 +1131,31 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_camellia_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + CAMELLIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64 + : camellia_decrypt_blk1_64, + outbuf, inbuf, nblocks, 64); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:05 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:05 +0300 Subject: [PATCH 5/8] sm4: accelerate ECB (for benchmarking) In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-5-jussi.kivilinna@iki.fi> * cipher/sm4.c (_gcry_sm4_ecb_crypt): New. (sm4_setkey): Setup ECB bulk function. 
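The new ECB path simply hands the whole request to the generic bulk_ecb_crypt_128 helper with a 32-block limit, so sm4_crypt_blk1_32 gets to pick the widest available implementation for each chunk. A minimal standalone sketch of that chunking pattern, with toy names rather than the library code:

#include <stddef.h>

/* Toy stand-in for a blk1_N function: handles 1..fn_max blocks per call
   and returns the stack burn depth it needs. */
typedef unsigned int (*crypt_fn_t) (void *priv, unsigned char *out,
                                    const unsigned char *in,
                                    unsigned int nblks);

/* Same shape as bulk_ecb_crypt_128: walk the input in chunks of at most
   fn_max blocks and keep the largest burn depth any chunk reported. */
static unsigned int
ecb_walk (void *priv, crypt_fn_t fn, unsigned char *out,
          const unsigned char *in, size_t nblocks, size_t fn_max)
{
  unsigned int burn = 0;

  while (nblocks)
    {
      size_t cur = nblocks > fn_max ? fn_max : nblocks;
      unsigned int nburn = fn (priv, out, in, (unsigned int) cur);

      burn = nburn > burn ? nburn : burn;
      in += cur * 16;       /* 16-byte block size */
      out += cur * 16;
      nblocks -= cur;
    }

  return burn;  /* e.g. 70 blocks with fn_max = 32 -> calls of 32, 32, 6 */
}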
-- Benchmark on AMD Ryzen 9 7900X: Before: SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 4.75 ns/B 200.6 MiB/s 26.74 c/B 5625 ECB dec | 4.79 ns/B 199.3 MiB/s 26.92 c/B 5625 After (OCB for reference): SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.252 ns/B 3782 MiB/s 1.42 c/B 5624 ECB dec | 0.253 ns/B 3770 MiB/s 1.42 c/B 5625 OCB enc | 0.277 ns/B 3446 MiB/s 1.56 c/B 5625 OCB dec | 0.281 ns/B 3399 MiB/s 1.54 c/B 5500 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/sm4.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cipher/sm4.c b/cipher/sm4.c index 32a21dd9..20852cfb 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -129,6 +129,9 @@ static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv, static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +static void _gcry_sm4_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); static void _gcry_sm4_ctr32le_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); @@ -796,6 +799,7 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, bulk_ops->cfb_dec = _gcry_sm4_cfb_dec; bulk_ops->ctr_enc = _gcry_sm4_ctr_enc; bulk_ops->xts_crypt = _gcry_sm4_xts_crypt; + bulk_ops->ecb_crypt = _gcry_sm4_ecb_crypt; bulk_ops->ctr32le_enc = _gcry_sm4_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt; bulk_ops->ocb_auth = _gcry_sm4_ocb_auth; @@ -1517,6 +1521,34 @@ sm4_decrypt_blk1_32 (const void *context, byte *out, const byte *in, return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_dec); } +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_sm4_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + SM4_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + if (ctx->crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? sm4_encrypt_blk1_32 + : sm4_decrypt_blk1_32, + outbuf, inbuf, nblocks, 32); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:07 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:07 +0300 Subject: [PATCH 7/8] serpent: fix compiler warning on 32-bit ARM In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-7-jussi.kivilinna@iki.fi> * cipher/serpent.c (_gcry_serpent_ocb_crypt) (_gcry_serpent_ocb_auth) [USE_NEON]: Cast "Ls" to 'const void **'. 
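The cast matters because of a C rule that is easy to trip over: void ** and const void ** are distinct pointer types with no implicit conversion between them, so passing the local Ls array to prototypes that take const void *Ls[8] needs an explicit conversion or the compiler emits an incompatible-pointer-type warning. A minimal, self-contained illustration with toy names (not the serpent.c code itself):

#include <stddef.h>

/* Callee takes a table of pointers it promises not to write through,
   in the style of the 'const void *Ls[8]' NEON prototypes. */
static size_t
count_nonnull (const void **tbl, size_t n)
{
  size_t r = 0, i;

  for (i = 0; i < n; i++)
    r += (tbl[i] != NULL);
  return r;
}

int
main (void)
{
  static int a, b;
  void *ls[2] = { &a, &b };

  /* count_nonnull (ls, 2);  <- 'incompatible pointer type' warning:
     void ** does not implicitly convert to const void ** in C. */
  return count_nonnull ((const void **) ls, 2) == 2 ? 0 : 1;
}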
-- Signed-off-by: Jussi Kivilinna --- cipher/serpent.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cipher/serpent.c b/cipher/serpent.c index 11eeb079..93c561c5 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -1369,10 +1369,10 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, if (encrypt) _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, (void **)Ls); + c->u_ctr.ctr, (const void **)Ls); else _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, (void **)Ls); + c->u_ctr.ctr, (const void **)Ls); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); @@ -1508,7 +1508,8 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8); _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, (void **)Ls); + c->u_mode.ocb.aad_sum, + (const void **)Ls); nblocks -= 8; abuf += 8 * sizeof(serpent_block_t); -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:02 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:02 +0300 Subject: [PATCH 2/8] rijndael-vaes: align asm functions In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-2-jussi.kivilinna@iki.fi> * cipher/rijndael-vaes-avx2-amd64.S: Align functions to 16 bytes. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-vaes-avx2-amd64.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index 655fdf55..13fe7ab0 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -86,6 +86,7 @@ **********************************************************************/ ELF(.type _gcry_vaes_avx2_cbc_dec_amd64, at function) .globl _gcry_vaes_avx2_cbc_dec_amd64 +.align 16 _gcry_vaes_avx2_cbc_dec_amd64: /* input: * %rdi: round keys @@ -392,6 +393,7 @@ ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_cfb_dec_amd64, at function) .globl _gcry_vaes_avx2_cfb_dec_amd64 +.align 16 _gcry_vaes_avx2_cfb_dec_amd64: /* input: * %rdi: round keys @@ -700,6 +702,7 @@ ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ctr_enc_amd64, at function) .globl _gcry_vaes_avx2_ctr_enc_amd64 +.align 16 _gcry_vaes_avx2_ctr_enc_amd64: /* input: * %rdi: round keys @@ -1112,6 +1115,7 @@ ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ctr32le_enc_amd64, at function) .globl _gcry_vaes_avx2_ctr32le_enc_amd64 +.align 16 _gcry_vaes_avx2_ctr32le_enc_amd64: /* input: * %rdi: round keys @@ -1396,6 +1400,7 @@ ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64, at function) .globl _gcry_vaes_avx2_ocb_crypt_amd64 +.align 16 _gcry_vaes_avx2_ocb_crypt_amd64: /* input: * %rdi: round keys @@ -2361,6 +2366,7 @@ ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) **********************************************************************/ ELF(.type 
_gcry_vaes_avx2_xts_crypt_amd64, at function) .globl _gcry_vaes_avx2_xts_crypt_amd64 +.align 16 _gcry_vaes_avx2_xts_crypt_amd64: /* input: * %rdi: round keys @@ -2878,6 +2884,7 @@ ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64, at function) .globl _gcry_vaes_avx2_ecb_crypt_amd64 +.align 16 _gcry_vaes_avx2_ecb_crypt_amd64: /* input: * %rdi: round keys -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:08 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:08 +0300 Subject: [PATCH 8/8] serpent: accelerate XTS and ECB modes In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-8-jussi.kivilinna@iki.fi> * cipher/serpent-armv7-neon.S (_gcry_serpent_neon_blk8): New. * cipher/serpent-avx2-amd64.S (_gcry_serpent_avx2_blk16): New. * cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_blk8): New. * cipher/serpent.c (_gcry_serpent_sse2_blk8) (_gcry_serpent_avx2_blk16, _gcry_serpent_neon_blk8) (_gcry_serpent_xts_crypt, _gcry_serpent_ecb_crypt) (serpent_crypt_blk1_16, serpent_encrypt_blk1_16) (serpent_decrypt_blk1_16): New. (serpent_setkey): Setup XTS and ECB bulk functions. -- Benchmark on AMD Ryzen 9 7900X: Before: SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 5.42 ns/B 176.0 MiB/s 30.47 c/B 5625 ECB dec | 4.82 ns/B 197.9 MiB/s 27.11 c/B 5625 XTS enc | 5.57 ns/B 171.3 MiB/s 31.31 c/B 5625 XTS dec | 4.99 ns/B 191.1 MiB/s 28.07 c/B 5625 After: SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.708 ns/B 1347 MiB/s 3.98 c/B 5625 ECB dec | 0.694 ns/B 1373 MiB/s 3.91 c/B 5625 XTS enc | 0.766 ns/B 1246 MiB/s 4.31 c/B 5625 XTS dec | 0.754 ns/B 1264 MiB/s 4.24 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/serpent-armv7-neon.S | 56 ++++++++++++++ cipher/serpent-avx2-amd64.S | 50 ++++++++++++ cipher/serpent-sse2-amd64.S | 65 ++++++++++++++++ cipher/serpent.c | 147 +++++++++++++++++++++++++++++++++++- 4 files changed, 317 insertions(+), 1 deletion(-) diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S index adff6394..4179ba2c 100644 --- a/cipher/serpent-armv7-neon.S +++ b/cipher/serpent-armv7-neon.S @@ -600,6 +600,62 @@ __serpent_dec_blk8: bx lr; .size __serpent_dec_blk8,.-__serpent_dec_blk8; +.align 3 +.globl _gcry_serpent_neon_blk8 +.type _gcry_serpent_neon_blk8,%function; +_gcry_serpent_neon_blk8: + /* input: + * r0: ctx, CTX + * r1: dst (8 blocks) + * r2: src (8 blocks) + * r3: encrypt + */ + + push {lr}; + vpush {RA4-RB2}; + + cmp r3, #0 + + vld1.8 {RA0, RA1}, [r2]!; + vld1.8 {RA2, RA3}, [r2]!; + vld1.8 {RB0, RB1}, [r2]!; + vld1.8 {RB2, RB3}, [r2]!; + + beq .Lblk8_dec; + bl __serpent_enc_blk8; + vst1.8 {RA4}, [r1]!; + vst1.8 {RA1, RA2}, [r1]!; + vst1.8 {RA0}, [r1]!; + vst1.8 {RB4}, [r1]!; + vst1.8 {RB1, RB2}, [r1]!; + vst1.8 {RB0}, [r1]!; + b .Lblk8_end; + .Lblk8_dec: + bl __serpent_dec_blk8; + vst1.8 {RA0, RA1}, [r1]!; + vst1.8 {RA2, RA3}, [r1]!; + vst1.8 {RB0, RB1}, [r1]!; + vst1.8 {RB2, RB3}, [r1]!; + +.Lblk8_end: + /* clear the used registers */ + veor RA0, RA0; + veor RA1, RA1; + veor RA2, RA2; + veor RA3, RA3; + + vpop {RA4-RB2}; + + veor RB3, RB3; + veor RB4, RB4; + veor RT0, RT0; + veor RT1, RT1; + veor RT2, RT2; + veor RT3, RT3; + + pop {pc}; +.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec; + .align 3 .globl 
_gcry_serpent_neon_ctr_enc .type _gcry_serpent_neon_ctr_enc,%function; diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index d3515a21..54ff61e4 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -583,6 +583,56 @@ __serpent_dec_blk16: CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) +.align 8 +.globl _gcry_serpent_avx2_blk16 +ELF(.type _gcry_serpent_avx2_blk16, at function;) +_gcry_serpent_avx2_blk16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + testl %ecx, %ecx; + jz .Lblk16_dec; + call __serpent_enc_blk16; + vmovdqu RA4, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA0, (3 * 32)(%rsi); + vmovdqu RB4, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB0, (7 * 32)(%rsi); + jmp .Lblk16_end; + .Lblk16_dec: + call __serpent_dec_blk16; + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + +.Lblk16_end: + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;) + #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index b5935095..01723a2a 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -605,6 +605,71 @@ __serpent_dec_blk8: CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) +.align 8 +.globl _gcry_serpent_sse2_blk8 +ELF(.type _gcry_serpent_sse2_blk8, at function;) +_gcry_serpent_sse2_blk8: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + movdqu (0 * 16)(%rdx), RA0; + movdqu (1 * 16)(%rdx), RA1; + movdqu (2 * 16)(%rdx), RA2; + movdqu (3 * 16)(%rdx), RA3; + movdqu (4 * 16)(%rdx), RB0; + movdqu (5 * 16)(%rdx), RB1; + movdqu (6 * 16)(%rdx), RB2; + movdqu (7 * 16)(%rdx), RB3; + + testl %ecx, %ecx; + jz .Lblk8_dec; + call __serpent_enc_blk8; + movdqu RA4, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA0, (3 * 16)(%rsi); + movdqu RB4, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB0, (7 * 16)(%rsi); + jmp .Lblk8_end; + .Lblk8_dec: + call __serpent_dec_blk8; + movdqu RA0, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA3, (3 * 16)(%rsi); + movdqu RB0, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB3, (7 * 16)(%rsi); + +.Lblk8_end: + /* clear the used registers */ + pxor RA0, RA0; + pxor RA1, RA1; + pxor RA2, RA2; + pxor RA3, RA3; + pxor RA4, RA4; + pxor RB0, RB0; + pxor RB1, RB1; + pxor RB2, RB2; + pxor RB3, RB3; + pxor RB4, RB4; + pxor RTMP0, RTMP0; + pxor RTMP1, RTMP1; + pxor RTMP2, RTMP2; + pxor RNOT, RNOT; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;) + .align 8 .globl _gcry_serpent_sse2_ctr_enc ELF(.type _gcry_serpent_sse2_ctr_enc, at 
function;) diff --git a/cipher/serpent.c b/cipher/serpent.c index 93c561c5..0a9ed27c 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -139,6 +139,9 @@ extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx, unsigned char *offset, unsigned char *checksum, const u64 Ls[8]) ASM_FUNC_ABI; + +extern void _gcry_serpent_sse2_blk8(const serpent_context_t *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; #endif #ifdef USE_AVX2 @@ -179,6 +182,9 @@ extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_serpent_avx2_blk16(const serpent_context_t *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; #endif #ifdef USE_NEON @@ -219,6 +225,9 @@ extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx, unsigned char *offset, unsigned char *checksum, const void *Ls[8]); + +extern void _gcry_serpent_neon_blk8(const serpent_context_t *c, byte *out, + const byte *in, int encrypt); #endif @@ -239,6 +248,12 @@ static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +static void _gcry_serpent_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); /* @@ -790,7 +805,9 @@ serpent_setkey (void *ctx, bulk_ops->cfb_dec = _gcry_serpent_cfb_dec; bulk_ops->ctr_enc = _gcry_serpent_ctr_enc; bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt; - bulk_ops->ocb_auth = _gcry_serpent_ocb_auth; + bulk_ops->ocb_auth = _gcry_serpent_ocb_auth; + bulk_ops->xts_crypt = _gcry_serpent_xts_crypt; + bulk_ops->ecb_crypt = _gcry_serpent_ecb_crypt; if (serpent_test_ret) ret = GPG_ERR_SELFTEST_FAILED; @@ -1538,6 +1555,134 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, return nblocks; } + +static unsigned int +serpent_crypt_blk1_16(const void *context, byte *out, const byte *in, + unsigned int num_blks, int encrypt) +{ + const serpent_context_t *ctx = context; + unsigned int burn, burn_stack_depth = 0; + +#ifdef USE_AVX2 + if (num_blks == 16 && ctx->use_avx2) + { + _gcry_serpent_avx2_blk16 (ctx, out, in, encrypt); + return 0; + } +#endif + +#ifdef USE_SSE2 + while (num_blks >= 8) + { + _gcry_serpent_sse2_blk8 (ctx, out, in, encrypt); + out += 8 * sizeof(serpent_block_t); + in += 8 * sizeof(serpent_block_t); + num_blks -= 8; + } +#endif + +#ifdef USE_NEON + if (ctx->use_neon) + { + while (num_blks >= 8) + { + _gcry_serpent_neon_blk8 (ctx, out, in, encrypt); + out += 8 * sizeof(serpent_block_t); + in += 8 * sizeof(serpent_block_t); + num_blks -= 8; + } + } +#endif + + while (num_blks >= 1) + { + if (encrypt) + serpent_encrypt_internal((void *)ctx, in, out); + else + serpent_decrypt_internal((void *)ctx, in, out); + + burn = 2 * sizeof(serpent_block_t); + burn_stack_depth = (burn > burn_stack_depth) ? 
burn : burn_stack_depth; + out += sizeof(serpent_block_t); + in += sizeof(serpent_block_t); + num_blks--; + } + + return burn_stack_depth; +} + +static unsigned int +serpent_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 1); +} + +static unsigned int +serpent_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 0); +} + + +/* Bulk encryption/decryption of complete blocks in XTS mode. */ +static void +_gcry_serpent_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * 16]; + unsigned int tmp_used = 16; + size_t nburn; + + nburn = bulk_xts_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16 + : serpent_decrypt_blk1_16, + outbuf, inbuf, nblocks, + tweak, tmpbuf, sizeof(tmpbuf) / 16, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory(tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16 + : serpent_decrypt_blk1_16, + outbuf, inbuf, nblocks, 16); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Serpent test. */ -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:01 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:01 +0300 Subject: [PATCH 1/8] rijndael: add ECB acceleration (for benchmarking purposes) Message-ID: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> * cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'. * cipher/cipher.c (do_ecb_crypt): Use bulk function if available. * cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label '.Ldeclast' to '.Lenclast'. (_gcry_aes_aesni_ecb_crypt): New. * cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce) (_gcry_aes_ecb_dec_armv8_ce): New. * cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce) (_gcry_aes_ecb_dec_armv8_ce): New. * cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce) (_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change return value from void to size_t. (ocb_crypt_fn_t, xts_crypt_fn_t): Remove. (_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove indirect function call; Return value from called function (allows tail call optimization). (_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows tail call optimization). (_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce) (_gcry_aes_armv8_ce_ecb_crypt): New. * cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ecb_crypt_amd64): New. * cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64) (_gcry_aes_vaes_ecb_crypt): New. 
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt) (_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New. (do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE. -- Benchmark on AMD Ryzen 9 7900X: Before (OCB for reference): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.128 ns/B 7460 MiB/s 0.720 c/B 5634?1 ECB dec | 0.134 ns/B 7103 MiB/s 0.753 c/B 5608 OCB enc | 0.029 ns/B 32930 MiB/s 0.163 c/B 5625 OCB dec | 0.029 ns/B 32738 MiB/s 0.164 c/B 5625 After: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.028 ns/B 33761 MiB/s 0.159 c/B 5625 ECB dec | 0.028 ns/B 33917 MiB/s 0.158 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/cipher-internal.h | 2 + cipher/cipher.c | 41 ++- cipher/rijndael-aesni.c | 160 ++++++++++- cipher/rijndael-armv8-aarch32-ce.S | 152 +++++++++- cipher/rijndael-armv8-aarch64-ce.S | 125 ++++++++- cipher/rijndael-armv8-ce.c | 124 +++++---- cipher/rijndael-vaes-avx2-amd64.S | 432 ++++++++++++++++++++++++++++- cipher/rijndael-vaes.c | 26 ++ cipher/rijndael.c | 12 + 9 files changed, 997 insertions(+), 77 deletions(-) diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 66b75955..4e022f38 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -161,6 +161,8 @@ typedef struct cipher_mode_ops not NULL. */ typedef struct cipher_bulk_ops { + void (*ecb_crypt)(void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg, diff --git a/cipher/cipher.c b/cipher/cipher.c index 6c335aec..026c1511 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -983,14 +983,11 @@ cipher_reset (gcry_cipher_hd_t c) static gcry_err_code_t -do_ecb_crypt (gcry_cipher_hd_t c, - unsigned char *outbuf, size_t outbuflen, - const unsigned char *inbuf, size_t inbuflen, - gcry_cipher_encrypt_t crypt_fn) +do_ecb_crypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen, int encrypt) { unsigned int blocksize = c->spec->blocksize; size_t n, nblocks; - unsigned int burn, nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -998,18 +995,32 @@ do_ecb_crypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; nblocks = inbuflen / blocksize; - burn = 0; - for (n=0; n < nblocks; n++ ) + if (nblocks == 0) + return 0; + + if (c->bulk.ecb_crypt) { - nburn = crypt_fn (&c->context.c, outbuf, inbuf); - burn = nburn > burn ? nburn : burn; - inbuf += blocksize; - outbuf += blocksize; + c->bulk.ecb_crypt (&c->context.c, outbuf, inbuf, nblocks, encrypt); } + else + { + gcry_cipher_encrypt_t crypt_fn = + encrypt ? c->spec->encrypt : c->spec->decrypt; + unsigned int burn = 0; + unsigned int nburn; - if (burn > 0) - _gcry_burn_stack (burn + 4 * sizeof(void *)); + for (n = 0; n < nblocks; n++) + { + nburn = crypt_fn (&c->context.c, outbuf, inbuf); + burn = nburn > burn ? 
nburn : burn; + inbuf += blocksize; + outbuf += blocksize; + } + + if (burn > 0) + _gcry_burn_stack (burn + 4 * sizeof(void *)); + } return 0; } @@ -1019,7 +1030,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt); + return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1); } static gcry_err_code_t @@ -1027,7 +1038,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt); + return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0); } diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 156af015..906737a6 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" - "jb .Ldeclast%=\n\t" + "jb .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" - "je .Ldeclast%=\n\t" + "je .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" - ".Ldeclast%=:\n\t" + ".Lenclast%=:\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) @@ -1717,6 +1717,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, } +void ASM_FUNC_ATTR +_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src, size_t nblocks, + int encrypt) +{ + aesni_prepare_2_7_variable; + + aesni_prepare (); + aesni_prepare_2_7(); + + if (!encrypt && !ctx->decryption_prepared) + { + do_aesni_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + +#ifdef __x86_64__ + if (nblocks >= 8) + { + const void *key = encrypt ? 
ctx->keyschenc : ctx->keyschdec; + aesni_prepare_8_15_variable; + + aesni_prepare_8_15(); + + for (; nblocks >= 8; nblocks -= 8) + { + asm volatile + ("movdqa (%[key]), %%xmm0\n\t" + "movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + "movdqu 4*16(%[src]), %%xmm8\n\t" + "movdqu 5*16(%[src]), %%xmm9\n\t" + "movdqu 6*16(%[src]), %%xmm10\n\t" + "movdqu 7*16(%[src]), %%xmm11\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm0, %%xmm9\n\t" + "pxor %%xmm0, %%xmm10\n\t" + "pxor %%xmm0, %%xmm11\n\t" + : /* No output */ + : [src] "r" (src), + [key] "r" (key) + : "memory"); + + if (encrypt) + { + do_aesni_enc_vec8 (ctx); + asm volatile + ("aesenclast %%xmm0, %%xmm1\n\t" + "aesenclast %%xmm0, %%xmm2\n\t" + "aesenclast %%xmm0, %%xmm3\n\t" + "aesenclast %%xmm0, %%xmm4\n\t" + "aesenclast %%xmm0, %%xmm8\n\t" + "aesenclast %%xmm0, %%xmm9\n\t" + "aesenclast %%xmm0, %%xmm10\n\t" + "aesenclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + else + { + do_aesni_dec_vec8 (ctx); + asm volatile + ("aesdeclast %%xmm0, %%xmm1\n\t" + "aesdeclast %%xmm0, %%xmm2\n\t" + "aesdeclast %%xmm0, %%xmm3\n\t" + "aesdeclast %%xmm0, %%xmm4\n\t" + "aesdeclast %%xmm0, %%xmm8\n\t" + "aesdeclast %%xmm0, %%xmm9\n\t" + "aesdeclast %%xmm0, %%xmm10\n\t" + "aesdeclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + "movdqu %%xmm8, 4*16(%[dst])\n\t" + "movdqu %%xmm9, 5*16(%[dst])\n\t" + "movdqu %%xmm10, 6*16(%[dst])\n\t" + "movdqu %%xmm11, 7*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 8*BLOCKSIZE; + src += 8*BLOCKSIZE; + } + + aesni_cleanup_8_15(); + } +#endif + + for (; nblocks >= 4; nblocks -= 4) + { + asm volatile + ("movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + : /* No output */ + : [src] "r" (src) + : "memory"); + + if (encrypt) + do_aesni_enc_vec4 (ctx); + else + do_aesni_dec_vec4 (ctx); + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 4*BLOCKSIZE; + src += 4*BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + asm volatile ("movdqu %[src], %%xmm0\n\t" + : + : [src] "m" (*src) + : "memory" ); + + if (encrypt) + do_aesni_enc (ctx); + else + do_aesni_dec (ctx); + + asm volatile ("movdqu %%xmm0, %[dst]\n\t" + : [dst] "=m" (*dst) + : + : "memory" ); + + dst += BLOCKSIZE; + src += BLOCKSIZE; + } + + aesni_cleanup (); + aesni_cleanup_2_7 (); +} + + void ASM_FUNC_ATTR _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 1eafa93e..6208652b 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -653,6 +653,149 @@ _gcry_aes_cbc_dec_armv8_ce: .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce; +/* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl 
_gcry_aes_ecb_enc_armv8_ce +.type _gcry_aes_ecb_enc_armv8_ce,%function; +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192e + bhi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc, ...) \ + .Lecb_entry_##bits##e_d: \ + cmp r3, #4; \ + blo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + sub r3, r3, #4; \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + cmp r3, #4; \ + \ + do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \ + vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \ + \ + bhs .Lecb_loop4_##bits##e_d; \ + cmp r3, #0; \ + beq .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + subs r3, r3, #1; \ + \ + do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \ + \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + bne .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc, r0, lr) + ECB_CRYPT(256, e, mc, r0, lr) + +.Lecb_done_e: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_enc_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce; + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +.type _gcry_aes_ecb_dec_armv8_ce,%function; +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192d + bhi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc, r0, lr) + ECB_CRYPT(256, d, imc, r0, lr) + +#undef ECB_CRYPT + +.Lecb_done_d: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_dec_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce; + + /* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, @@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce: /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce: /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce: /* - 
* void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 4fef0345..97d3d7eb 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -385,6 +385,119 @@ _gcry_aes_dec_armv8_ce: ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) +/* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_enc_armv8_ce +ELF(.type _gcry_aes_ecb_enc_armv8_ce,%function;) +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * x0: keysched + * x1: outbuf + * x2: inbuf + * x3: nblocks + * w4: nrounds + */ + CFI_STARTPROC(); + + cbz x3, .Lecb_enc_skip + + aes_preload_keys(x0, w4); + + b.eq .Lecb_entry_192e + b.hi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc) \ + .Lecb_entry_##bits##e_d: \ + cmp x3, #4; \ + b.lo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + sub x3, x3, #4; \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x3, #4; \ + do_aes_4_##bits(e_d, mc_imc, v0, v1, v2, v3); \ + st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lecb_loop4_##bits##e_d; \ + CLEAR_REG(v1); \ + CLEAR_REG(v2); \ + CLEAR_REG(v3); \ + cbz x3, .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + ld1 {v0.16b}, [x2], #16; /* load ciphertext */ \ + sub x3, x3, #1; \ + do_aes_one##bits(e_d, mc_imc, v0, v0, vk0); \ + st1 {v0.16b}, [x1], #16; /* store plaintext */ \ + \ + cbnz x3, .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc) + ECB_CRYPT(256, e, mc) + +.Lecb_done_e: + aes_clear_keys(w4) + + CLEAR_REG(v0) + +.Lecb_enc_skip: + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;) + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +ELF(.type _gcry_aes_ecb_dec_armv8_ce,%function;) +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * x0: keysched + * x1: outbuf + * x2: inbuf + * x3: nblocks + * w4: nrounds + */ + CFI_STARTPROC(); + + cbz x3, .Lecb_enc_skip + + aes_preload_keys(x0, w4); + + b.eq .Lecb_entry_192d + b.hi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc) + ECB_CRYPT(256, d, imc) + +#undef ECB_CRYPT + +.Lecb_done_d: + aes_clear_keys(w4) + + CLEAR_REG(v0) + +.Lecb_dec_skip: + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;) + + /* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, @@ -471,7 +584,8 @@ ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, - * unsigned char *iv, unsigned int nrounds); + * unsigned char *iv, + * size_t nblocks, unsigned int nrounds); */ .align 3 @@ -1136,7 +1250,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const void 
*keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1379,13 +1493,14 @@ _gcry_aes_ocb_enc_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1458,13 +1573,14 @@ _gcry_aes_ocb_dec_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) /* - * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1605,6 +1721,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index c9c37654..042b7d42 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -80,32 +80,32 @@ extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, unsigned char *iv, size_t nblocks, unsigned int nrounds); -extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); -extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); -extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, - const unsigned char *abuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); +extern size_t _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); +extern size_t _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); +extern size_t _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, @@ -116,17 +116,14 @@ extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, const unsigned char *inbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); - -typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, unsigned char *checksum, - unsigned char *L_table, size_t nblocks, - unsigned int nrounds, unsigned int blkn); - -typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, - const 
unsigned char *inbuf, - unsigned char *tweak, size_t nblocks, - unsigned int nrounds); +extern void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, unsigned int nrounds); void @@ -312,8 +309,6 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; - ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce - : _gcry_aes_ocb_dec_armv8_ce; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int nrounds = ctx->rounds; @@ -327,10 +322,16 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, c->u_mode.ocb.data_nblocks = blkn + nblocks; - crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, - c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); - - return 0; + if (encrypt) + return _gcry_aes_ocb_enc_armv8_ce (keysched, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], nblocks, nrounds, + (unsigned int)blkn); + else + return _gcry_aes_ocb_dec_armv8_ce (keysched, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], nblocks, nrounds, + (unsigned int)blkn); } size_t @@ -345,11 +346,9 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg, c->u_mode.ocb.aad_nblocks = blkn + nblocks; - _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], - nblocks, nrounds, (unsigned int)blkn); - - return 0; + return _gcry_aes_ocb_auth_armv8_ce (keysched, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], + nblocks, nrounds, (unsigned int)blkn); } void @@ -358,8 +357,6 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, size_t nblocks, int encrypt) { const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; - xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce - : _gcry_aes_xts_dec_armv8_ce; unsigned int nrounds = ctx->rounds; if ( !encrypt && !ctx->decryption_prepared ) @@ -368,7 +365,32 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, ctx->decryption_prepared = 1; } - crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); + if (encrypt) + _gcry_aes_xts_enc_armv8_ce (keysched, outbuf, inbuf, tweak, + nblocks, nrounds); + else + _gcry_aes_xts_dec_armv8_ce (keysched, outbuf, inbuf, tweak, + nblocks, nrounds); } +void +_gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf, + const void *inbuf, size_t nblocks, + int encrypt) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = encrypt ? 
ctx->keyschenc32 : ctx->keyschdec32; + unsigned int nrounds = ctx->rounds; + + if ( !encrypt && !ctx->decryption_prepared ) + { + _gcry_aes_armv8_ce_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + + if (encrypt) + _gcry_aes_ecb_enc_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds); + else + _gcry_aes_ecb_dec_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds); +} #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index e36e82a0..655fdf55 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -2357,7 +2357,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64: ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) /********************************************************************** - CTR-mode encryption + XTS-mode encryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_xts_crypt_amd64, at function) .globl _gcry_vaes_avx2_xts_crypt_amd64 @@ -2873,6 +2873,436 @@ _gcry_vaes_avx2_xts_crypt_amd64: CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) +/********************************************************************** + ECB-mode encryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64, at function) +.globl _gcry_vaes_avx2_ecb_crypt_amd64 +_gcry_vaes_avx2_ecb_crypt_amd64: + /* input: + * %rdi: round keys + * %esi: encrypt + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + */ + CFI_STARTPROC(); + + /* Process 16 blocks per loop. */ +.align 8 +.Lecb_blk16: + cmpq $16, %r8; + jb .Lecb_blk8; + + leaq -16(%r8), %r8; + + /* Load input and xor first key. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm8; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vmovdqu (4 * 16)(%rcx), %ymm2; + vmovdqu (6 * 16)(%rcx), %ymm3; + vmovdqu (8 * 16)(%rcx), %ymm4; + vmovdqu (10 * 16)(%rcx), %ymm5; + vmovdqu (12 * 16)(%rcx), %ymm6; + vmovdqu (14 * 16)(%rcx), %ymm7; + vpxor %ymm8, %ymm0, %ymm0; + vpxor %ymm8, %ymm1, %ymm1; + vpxor %ymm8, %ymm2, %ymm2; + vpxor %ymm8, %ymm3, %ymm3; + vpxor %ymm8, %ymm4, %ymm4; + vpxor %ymm8, %ymm5, %ymm5; + vpxor %ymm8, %ymm6, %ymm6; + vpxor %ymm8, %ymm7, %ymm7; + vbroadcasti128 (1 * 16)(%rdi), %ymm8; + leaq (16 * 16)(%rcx), %rcx; + + testl %esi, %esi; + jz .Lecb_dec_blk16; + /* AES rounds */ + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lecb_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, 
%ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lecb_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + .Lecb_enc_blk16_last: + vaesenclast %ymm8, %ymm0, %ymm0; + vaesenclast %ymm8, %ymm1, %ymm1; + vaesenclast %ymm8, %ymm2, %ymm2; + vaesenclast %ymm8, %ymm3, %ymm3; + vaesenclast %ymm8, %ymm4, %ymm4; + vaesenclast %ymm8, %ymm5, %ymm5; + vaesenclast %ymm8, %ymm6, %ymm6; + vaesenclast %ymm8, %ymm7, %ymm7; + jmp .Lecb_blk16_end; + + .align 8 + .Lecb_dec_blk16: + /* AES rounds */ + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lecb_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lecb_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + .Lecb_dec_blk16_last: + vaesdeclast %ymm8, %ymm0, %ymm0; + vaesdeclast %ymm8, %ymm1, %ymm1; + vaesdeclast %ymm8, %ymm2, %ymm2; + vaesdeclast %ymm8, %ymm3, %ymm3; + vaesdeclast %ymm8, %ymm4, %ymm4; + vaesdeclast %ymm8, %ymm5, %ymm5; + vaesdeclast %ymm8, %ymm6, %ymm6; + vaesdeclast %ymm8, %ymm7, %ymm7; + jmp .Lecb_blk16_end; + + .align 8 + .Lecb_blk16_end: + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Lecb_blk16; + + /* Handle trailing eight blocks. */ +.align 8 +.Lecb_blk8: + cmpq $8, %r8; + jmp .Lecb_blk4; + + leaq -8(%r8), %r8; + + /* Load input and xor first key. 
 */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vmovdqu (4 * 16)(%rcx), %ymm2;
+	vmovdqu (6 * 16)(%rcx), %ymm3;
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm4, %ymm1, %ymm1;
+	vpxor %ymm4, %ymm2, %ymm2;
+	vpxor %ymm4, %ymm3, %ymm3;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+	leaq (8 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk8;
+	/* AES rounds */
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_enc_blk8_last;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_enc_blk8_last;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_enc_blk8_last:
+	vaesenclast %ymm4, %ymm0, %ymm0;
+	vaesenclast %ymm4, %ymm1, %ymm1;
+	vaesenclast %ymm4, %ymm2, %ymm2;
+	vaesenclast %ymm4, %ymm3, %ymm3;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	vmovdqu %ymm2, (4 * 16)(%rdx);
+	vmovdqu %ymm3, (6 * 16)(%rdx);
+	leaq (8 * 16)(%rdx), %rdx;
+	jmp .Lecb_blk4;
+
+  .align 8
+  .Lecb_dec_blk8:
+	/* AES rounds */
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_dec_blk8_last;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_dec_blk8_last;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_dec_blk8_last:
+	vaesdeclast %ymm4, %ymm0, %ymm0;
+	vaesdeclast %ymm4, %ymm1, %ymm1;
+	vaesdeclast %ymm4, %ymm2, %ymm2;
+	vaesdeclast %ymm4, %ymm3, %ymm3;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	vmovdqu %ymm2, (4 * 16)(%rdx);
+	vmovdqu %ymm3, (6 * 16)(%rdx);
+	leaq (8 * 16)(%rdx), %rdx;
+
+	/* Handle trailing four blocks. */
+.align 8
+.Lecb_blk4:
+	cmpq $4, %r8;
+	jb .Lecb_blk1;
+
+	leaq -4(%r8), %r8;
+
+	/* Load input and xor first key. */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm4, %ymm1, %ymm1;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+	leaq (4 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk4;
+	/* AES rounds */
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_enc_blk4_last;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_enc_blk4_last;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_enc_blk4_last:
+	vaesenclast %ymm4, %ymm0, %ymm0;
+	vaesenclast %ymm4, %ymm1, %ymm1;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	leaq (4 * 16)(%rdx), %rdx;
+	jmp .Lecb_blk1;
+
+  .align 8
+  .Lecb_dec_blk4:
+	/* AES rounds */
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_dec_blk4_last;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_dec_blk4_last;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_dec_blk4_last:
+	vaesdeclast %ymm4, %ymm0, %ymm0;
+	vaesdeclast %ymm4, %ymm1, %ymm1;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	leaq (4 * 16)(%rdx), %rdx;
+
+	/* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lecb_blk1:
+	cmpq $1, %r8;
+	jb .Ldone_ecb;
+
+	leaq -1(%r8), %r8;
+
+	/* Load input. */
+	vmovdqu (%rcx), %xmm2;
+	leaq 16(%rcx), %rcx;
+
+	/* Xor first key. */
+	vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk1;
+	/* AES rounds. */
+	vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (10 * 16)(%rdi), %xmm1;
+	cmpl $12, %r9d;
+	jb .Lecb_enc_blk1_last;
+	vaesenc %xmm1, %xmm0, %xmm0;
+	vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (12 * 16)(%rdi), %xmm1;
+	jz .Lecb_enc_blk1_last;
+	vaesenc %xmm1, %xmm0, %xmm0;
+	vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (14 * 16)(%rdi), %xmm1;
+  .Lecb_enc_blk1_last:
+	vaesenclast %xmm1, %xmm0, %xmm0;
+	jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_dec_blk1:
+	/* AES rounds. */
+	vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (10 * 16)(%rdi), %xmm1;
+	cmpl $12, %r9d;
+	jb .Lecb_dec_blk1_last;
+	vaesdec %xmm1, %xmm0, %xmm0;
+	vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (12 * 16)(%rdi), %xmm1;
+	jz .Lecb_dec_blk1_last;
+	vaesdec %xmm1, %xmm0, %xmm0;
+	vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (14 * 16)(%rdi), %xmm1;
+  .Lecb_dec_blk1_last:
+	vaesdeclast %xmm1, %xmm0, %xmm0;
+	jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_blk1_end:
+	vmovdqu %xmm0, (%rdx);
+	leaq 16(%rdx), %rdx;
+
+	jmp .Lecb_blk1;
+
+.align 8
+.Ldone_ecb:
+	vzeroall;
+	ret_spec_stop
+	CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
+
 /**********************************************************************
   constants
  **********************************************************************/
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index dbcf9afa..978c86da 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -91,6 +91,32 @@ extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
					      unsigned int nrounds,
					      int encrypt) ASM_FUNC_ABI;
 
+extern void _gcry_vaes_avx2_ecb_crypt_amd64 (const void *keysched,
+					     int encrypt,
+					     void *outbuf_arg,
+					     const void *inbuf_arg,
+					     size_t nblocks,
+					     unsigned int nrounds) ASM_FUNC_ABI;
+
+
+void
+_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf,
+			  const void *inbuf, size_t nblocks,
+			  int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf,
+				   nblocks, nrounds);
+}
 
 void
 _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index f3060ea5..84cb7109 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -102,6 +102,9 @@ extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg
 extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
                                        void *outbuf_arg, const void *inbuf_arg,
                                        size_t nblocks, int encrypt);
+extern void _gcry_aes_aesni_ecb_crypt (void *context, void *outbuf_arg,
+                                       const void *inbuf_arg, size_t nblocks,
+                                       int encrypt);
 #endif
 
 #ifdef USE_VAES
@@ -125,6 +128,9 @@ extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
                                       void *outbuf_arg, const void *inbuf_arg,
                                       size_t nblocks, int encrypt);
+extern void _gcry_aes_vaes_ecb_crypt (void *context, void *outbuf_arg,
+                                      const void *inbuf_arg, size_t nblocks,
+                                      int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -227,6 +233,9 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
                                           void *outbuf_arg, const void *inbuf_arg,
                                           size_t nblocks, int encrypt);
+extern void _gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf_arg,
+                                          const void *inbuf_arg, size_t nblocks,
+                                          int encrypt);
 #endif /*USE_ARM_ASM*/
 
 #ifdef USE_PPC_CRYPTO
@@ -524,6 +533,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+      bulk_ops->ecb_crypt = _gcry_aes_aesni_ecb_crypt;
 
 #ifdef USE_VAES
       if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) &&
@@ -536,6 +546,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
           bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
           bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
           bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
+          bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
         }
 #endif
     }
@@ -591,6 +602,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
      bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
      bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
      bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
+     bulk_ops->ecb_crypt = _gcry_aes_armv8_ce_ecb_crypt;
     }
 #endif
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
-- 
2.37.2
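A note for readers following the series: with the SysV calling convention used
by these amd64 implementations, the arguments of
_gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf, nblocks,
nrounds) arrive in %rdi, %esi, %rdx, %rcx, %r8 and %r9d, which is why the
assembly above tests %esi to choose the encrypt or decrypt path and walks
%rcx/%rdx in 8-, 4- and 1-block steps while checking %r9d for the key-size
dependent extra rounds. Applications never call this bulk entry point
directly; it is reached through the ordinary cipher API once
bulk_ops->ecb_crypt has been installed in do_setkey. The standalone sketch
below is only an illustration, not part of the patch: it assumes the generic
ECB handler dispatches to bulk_ops->ecb_crypt when it is set (as this series
arranges), and it uses an arbitrary 13-block buffer so that the 8-, 4- and
1-block code paths are all exercised in one call.

/* Hypothetical example: in-place AES-128-ECB over 13 blocks via the
 * public libgcrypt API; key and data contents are arbitrary. */
#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  gcry_error_t err;
  unsigned char key[16];
  unsigned char buf[16 * 13];   /* 13 blocks: one 8-block, one 4-block, one single */

  if (!gcry_check_version (GCRYPT_VERSION))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  memset (key, 0x2b, sizeof key);
  memset (buf, 0xa5, sizeof buf);

  err = gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
  if (!err)
    err = gcry_cipher_setkey (hd, key, sizeof key);
  if (!err)
    err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0); /* in-place */
  if (err)
    {
      fprintf (stderr, "AES-ECB failed: %s\n", gpg_strerror (err));
      return 1;
    }
  gcry_cipher_close (hd);
  return 0;
}

The same flow works for the other key sizes; with GCRY_CIPHER_AES256 the
"cmpl $12, %r9d" checks in the assembly take the extra-round branches, since
nrounds is then 14 instead of 10.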