[PATCH 7/7] camellia-avx2: add bulk processing for XTS mode

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Apr 24 20:40:25 CEST 2022


* cipher/bulkhelp.h (bulk_xts_crypt_128): New.
* cipher/camellia-glue.c (_gcry_camellia_xts_crypt): New.
(camellia_setkey) [USE_AESNI_AVX2]: Set XTS bulk function if AVX2
implementation is available.
--

Benchmark on AMD Ryzen 5800X:

Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      3.79 ns/B     251.8 MiB/s     18.37 c/B      4850
        XTS dec |      3.77 ns/B     253.2 MiB/s     18.27 c/B      4850

After (6.8x faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |     0.554 ns/B      1720 MiB/s      2.69 c/B      4850
        XTS dec |     0.541 ns/B      1762 MiB/s      2.63 c/B      4850

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/bulkhelp.h      | 68 ++++++++++++++++++++++++++++++++++++++++++
 cipher/camellia-glue.c | 39 ++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h
index c9ecaba6..b1b4b2e1 100644
--- a/cipher/bulkhelp.h
+++ b/cipher/bulkhelp.h
@@ -325,4 +325,72 @@ bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
 }
 
 
+static inline unsigned int
+bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                    const byte *inbuf, size_t nblocks, byte *tweak,
+                    byte *tmpbuf, size_t tmpbuf_nblocks,
+                    unsigned int *num_used_tmpblocks)
+{
+  u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+  unsigned int tmp_used = 16; /* Bytes of TMPBUF used; never below 16. */
+  unsigned int burn_depth = 0; /* Largest CRYPT_FN return value seen. */
+  unsigned int nburn;
+  /* XTS-process NBLOCKS 16-byte blocks from INBUF to OUTBUF via CRYPT_FN. */
+  tweak_next_lo = buf_get_le64 (tweak + 0);
+  tweak_next_hi = buf_get_le64 (tweak + 8);
+  /* Work in chunks of up to TMPBUF_NBLOCKS blocks. */
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+      /* Pass 1: xor input with tweak into OUTBUF, save tweak in TMPBUF. */
+      for (i = 0; i < curr_blks; i++)
+        {
+          tweak_lo = tweak_next_lo;
+          tweak_hi = tweak_next_hi;
+
+          /* Next tweak: 128-bit shift left by 1; xor 0x87 if top bit set. */
+          carry = -(tweak_next_hi >> 63) & 0x87;
+          tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+          tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+          /* First xor of Xor-Encrypt/Decrypt-Xor; tweak kept for pass 2. */
+          tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo;
+          tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi;
+          buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo);
+          buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi);
+          buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+          buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+        }
+      /* Run the cipher in place on the whole chunk. */
+      nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      /* Pass 2: xor the cipher output with the saved tweaks. */
+      for (i = 0; i < curr_blks; i++)
+        {
+          /* Second xor of Xor-Encrypt/Decrypt-Xor. */
+          tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]);
+          tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]);
+          tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo;
+          tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi;
+          buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+          buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+        }
+
+      inbuf += curr_blks * 16;
+      outbuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+  /* Hand back the tweak for the block after the last one processed. */
+  buf_put_le64 (tweak + 0, tweak_next_lo);
+  buf_put_le64 (tweak + 8, tweak_next_hi);
+  /* NOTE: despite its name this receives a byte count (blocks * 16). */
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
 #endif /*GCRYPT_BULKHELP_H*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 20ab7f7d..eae1d9ff 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -351,6 +351,9 @@ static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
 static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
 				    void *outbuf_arg, const void *inbuf_arg,
 				    size_t nblocks);
+static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+                                      void *outbuf_arg, const void *inbuf_arg,
+                                      size_t nblocks, int encrypt);
 static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 					const void *inbuf_arg, size_t nblocks,
 					int encrypt);
@@ -407,6 +410,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
   bulk_ops->ctr_enc = _gcry_camellia_ctr_enc;
   bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt;
   bulk_ops->ocb_auth  = _gcry_camellia_ocb_auth;
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2)
+    bulk_ops->xts_crypt = _gcry_camellia_xts_crypt;
+#endif
 
   if (0)
     { }
@@ -900,6 +907,38 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
     _gcry_burn_stack(burn_stack_depth);
 }
 
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+                          void *outbuf_arg, const void *inbuf_arg,
+                          size_t nblocks, int encrypt)
+{
+  CAMELLIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process all blocks via the generic helper, up to 32 per chunk. */
+  if (nblocks)
+    {
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+      /* ENCRYPT picks the block function; helper also updates TWEAK. */
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_32
+                                              : camellia_decrypt_blk1_32,
+                                 outbuf, inbuf, nblocks, tweak, tmpbuf,
+                                 sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+      /* TMPBUF held tweak values; clear the first TMP_USED bytes. */
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
 static size_t
 _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-- 
2.34.1




More information about the Gcrypt-devel mailing list