[PATCH 7/7] camellia-avx2: add bulk processing for XTS mode
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Apr 24 20:40:25 CEST 2022
* cipher/bulkhelp.h (bulk_xts_crypt_128): New.
* cipher/camellia-glue.c (_gcry_camellia_xts_crypt): New.
(camellia_setkey) [USE_AESNI_AVX2]: Set XTS bulk function if AVX2
implementation is available.
--
Benchmark on AMD Ryzen 5800X:
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
XTS enc | 3.79 ns/B 251.8 MiB/s 18.37 c/B 4850
XTS dec | 3.77 ns/B 253.2 MiB/s 18.27 c/B 4850
After (6.8x faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
XTS enc | 0.554 ns/B 1720 MiB/s 2.69 c/B 4850
XTS dec | 0.541 ns/B 1762 MiB/s 2.63 c/B 4850
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/bulkhelp.h | 68 ++++++++++++++++++++++++++++++++++++++++++
cipher/camellia-glue.c | 39 ++++++++++++++++++++++++
2 files changed, 107 insertions(+)
diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h
index c9ecaba6..b1b4b2e1 100644
--- a/cipher/bulkhelp.h
+++ b/cipher/bulkhelp.h
@@ -325,4 +325,72 @@ bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
}
+static inline unsigned int
+bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+ const byte *inbuf, size_t nblocks, byte *tweak,
+ byte *tmpbuf, size_t tmpbuf_nblocks,
+ unsigned int *num_used_tmpblocks)
+{
+ u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+ unsigned int tmp_used = 16; /* Bytes of TMPBUF reported as used; never less than 16. */
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+ /* Generic XTS helper for ciphers with 16-byte blocks.
+  *
+  * Processes NBLOCKS blocks from INBUF to OUTBUF in chunks of at most
+  * TMPBUF_NBLOCKS blocks.  For each chunk: every input block is xored with
+  * its tweak and written to OUTBUF, CRYPT_FN is run in place over the chunk
+  * in OUTBUF (with PRIV as its first argument), and every block is then
+  * xored with the same tweak again.  TMPBUF keeps the tweaks of the current
+  * chunk between the two xor passes.
+  *
+  * TWEAK (16 bytes, read as two little-endian 64-bit words) supplies the
+  * tweak of the first block and is overwritten on return with the tweak
+  * that follows the last processed block.
+  *
+  * Returns the largest value returned by any CRYPT_FN call, or 0 when
+  * NBLOCKS is 0 (presumably a stack burn depth, judging by the variable
+  * names; confirm against bulk_crypt_fn_t).
+  *
+  * Note: despite its name, *NUM_USED_TMPBLOCKS receives a BYTE count
+  * (largest chunk size * 16, minimum 16), not a block count.
+  *
+  * Note: TMPBUF_NBLOCKS must be non-zero; with 0 the chunk size below is 0
+  * and the while loop never makes progress.
+  */
+ tweak_next_lo = buf_get_le64 (tweak + 0);
+ tweak_next_hi = buf_get_le64 (tweak + 8);
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+ /* Remember the largest number of TMPBUF bytes any chunk writes. */
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+ /* First pass: derive one tweak per block and xor it into the input. */
+ for (i = 0; i < curr_blks; i++)
+ {
+ tweak_lo = tweak_next_lo;
+ tweak_hi = tweak_next_hi;
+
+ /* Generate next tweak: shift the 128-bit value (hi:lo) left by one bit
+    and, if the top bit of the high word was set before the shift, xor
+    0x87 into the low word.  -(x >> 63) is all-ones or zero, so the
+    mask selects 0x87 or 0 without a branch. */
+ carry = -(tweak_next_hi >> 63) & 0x87;
+ tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+ tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+ /* Xor-Encrypt/Decrypt-Xor block, first xor: input ^ tweak is written to
+    OUTBUF, and the tweak itself is saved in TMPBUF (host byte order) so
+    the second pass can reuse it after CRYPT_FN has run. */
+ tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo;
+ tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi;
+ buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo);
+ buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi);
+ buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+ buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+ }
+ /* Run the cipher in place over the whole chunk and keep the maximum of its return values. */
+ nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+ /* Second pass: xor each cipher output block with its saved tweak. */
+ for (i = 0; i < curr_blks; i++)
+ {
+ /* Xor-Encrypt/Decrypt-Xor block, second xor: reload the tweak stored in
+    TMPBUF by the first pass and xor it into the block in OUTBUF. */
+ tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]);
+ tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]);
+ tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo;
+ tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi;
+ buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+ buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+ }
+
+ inbuf += curr_blks * 16;
+ outbuf += curr_blks * 16;
+ nblocks -= curr_blks;
+ }
+ /* Hand the first unused tweak back to the caller. */
+ buf_put_le64 (tweak + 0, tweak_next_lo);
+ buf_put_le64 (tweak + 8, tweak_next_hi);
+ /* This is a byte count (see tmp_used above), not a number of blocks. */
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
+
+
#endif /*GCRYPT_BULKHELP_H*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 20ab7f7d..eae1d9ff 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -351,6 +351,9 @@ static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
+static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt);
@@ -407,6 +410,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
bulk_ops->ctr_enc = _gcry_camellia_ctr_enc;
bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt;
bulk_ops->ocb_auth = _gcry_camellia_ocb_auth;
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2)
+ bulk_ops->xts_crypt = _gcry_camellia_xts_crypt;
+#endif
if (0)
{ }
@@ -900,6 +907,38 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+/* Bulk encryption/decryption of complete blocks in XTS mode.
+ *
+ * CONTEXT is used as a CAMELLIA_context.  TWEAK is passed through to
+ * bulk_xts_crypt_128, which reads the starting tweak from it and writes
+ * the follow-up tweak back.  NBLOCKS blocks of CAMELLIA_BLOCK_SIZE bytes
+ * are processed from INBUF_ARG to OUTBUF_ARG; a non-zero ENCRYPT selects
+ * camellia_encrypt_blk1_32, zero selects camellia_decrypt_blk1_32.
+ * Nothing is done when NBLOCKS is 0.
+ */
+static void
+_gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* Process all blocks through the generic XTS helper; this function has
+    no separate fast path ahead of it. */
+ if (nblocks)
+ {
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32]; /* Scratch space for up to 32 blocks per chunk. */
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; /* Overwritten by the helper with the number of tmpbuf bytes to wipe. */
+ size_t nburn;
+
+ nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_32
+ : camellia_decrypt_blk1_32,
+ outbuf, inbuf, nblocks, tweak, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+ &tmp_used);
+ /* NOTE(review): nburn is size_t while burn_stack_depth is int, so this
+    comparison mixes signed and unsigned operands. */
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+ /* Clear the part of tmpbuf that the helper reported as used. */
+ wipememory(tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
+
/* Bulk encryption/decryption of complete blocks in OCB mode. */
static size_t
_gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
--
2.34.1
More information about the Gcrypt-devel
mailing list