[PATCH 2/2] aria: add generic 2-way bulk processing
Jussi Kivilinna
jussi.kivilinna@iki.fi
Fri Jan 6 09:57:18 CET 2023
* cipher/aria.c (ARIA_context): Add 'bulk_prefetch_ready'.
(aria_crypt_2blks, aria_crypt_blocks, aria_enc_blocks, aria_dec_blocks)
(_gcry_aria_ctr_enc, _gcry_aria_cbc_enc, _gcry_aria_cbc_dec)
(_gcry_aria_cfb_enc, _gcry_aria_cfb_dec, _gcry_aria_ecb_crypt)
(_gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc, _gcry_aria_ocb_crypt)
(_gcry_aria_ocb_auth): New.
(aria_setkey): Setup 'bulk_ops' function pointers.
--
This patch adds a 2-way parallel generic ARIA implementation for a
modest performance increase.
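
The gain comes from instruction-level parallelism rather than SIMD:
aria_crypt_2blks keeps two independent blocks in flight through each
round, in the spirit of the simplified sketch below (placeholder round
function for illustration; not the actual ARIA rounds):

#include <stdint.h>

typedef uint32_t u32;

/* Placeholder round function standing in for ARIA's add-round-key,
 * substitution and diffusion steps (illustration only). */
static void
round_fn (u32 t[4], const u32 rk[4])
{
  t[0] ^= rk[0]; t[1] ^= rk[1]; t[2] ^= rk[2]; t[3] ^= rk[3];
}

/* 2-way interleaving: within each iteration the two round_fn calls
 * are data-independent, so an out-of-order core can overlap their
 * table-lookup latencies instead of running two full single-block
 * passes back to back. */
static void
crypt_2way (u32 a[4], u32 b[4], const u32 (*rk)[4], int rounds)
{
  int r;

  for (r = 0; r < rounds; r++)
    {
      round_fn (a, rk[r]);
      round_fn (b, rk[r]);
    }
}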
A benchmark on an AMD Ryzen 9 7900X (x86-64) shows a ~40% performance
improvement for the parallelizable modes:
 ARIA128      |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
      ECB enc |      2.62 ns/B     364.0 MiB/s     14.74 c/B      5625
      ECB dec |      2.61 ns/B     365.2 MiB/s     14.69 c/B      5625
      CBC enc |      3.62 ns/B     263.7 MiB/s     20.34 c/B      5625
      CBC dec |      2.63 ns/B     363.0 MiB/s     14.78 c/B      5625
      CFB enc |      3.59 ns/B     265.3 MiB/s     20.22 c/B      5625
      CFB dec |      2.63 ns/B     362.0 MiB/s     14.82 c/B      5625
      OFB enc |      3.98 ns/B     239.7 MiB/s     22.38 c/B      5625
      OFB dec |      4.00 ns/B     238.2 MiB/s     22.52 c/B      5625
      CTR enc |      2.64 ns/B     360.6 MiB/s     14.87 c/B      5624
      CTR dec |      2.65 ns/B     360.0 MiB/s     14.90 c/B      5625
      XTS enc |      2.68 ns/B     355.8 MiB/s     15.08 c/B      5625
      XTS dec |      2.67 ns/B     356.9 MiB/s     15.03 c/B      5625
      CCM enc |      6.24 ns/B     152.7 MiB/s     35.12 c/B      5625
      CCM dec |      6.25 ns/B     152.5 MiB/s     35.18 c/B      5625
     CCM auth |      3.59 ns/B     265.4 MiB/s     20.21 c/B      5625
      EAX enc |      6.23 ns/B     153.0 MiB/s     35.06 c/B      5625
      EAX dec |      6.23 ns/B     153.1 MiB/s     35.05 c/B      5625
     EAX auth |      3.59 ns/B     265.4 MiB/s     20.22 c/B      5625
      GCM enc |      2.68 ns/B     355.8 MiB/s     15.08 c/B      5625
      GCM dec |      2.69 ns/B     354.7 MiB/s     15.12 c/B      5625
     GCM auth |     0.031 ns/B     30832 MiB/s     0.174 c/B      5625
      OCB enc |      2.71 ns/B     351.4 MiB/s     15.27 c/B      5625
      OCB dec |      2.74 ns/B     347.6 MiB/s     15.43 c/B      5625
     OCB auth |      2.64 ns/B     360.8 MiB/s     14.87 c/B      5625
      SIV enc |      6.24 ns/B     152.9 MiB/s     35.08 c/B      5625
      SIV dec |      6.24 ns/B     152.8 MiB/s     35.10 c/B      5625
     SIV auth |      3.59 ns/B     266.0 MiB/s     20.17 c/B      5625
  GCM-SIV enc |      2.67 ns/B     356.7 MiB/s     15.04 c/B      5625
  GCM-SIV dec |      2.68 ns/B     355.7 MiB/s     15.08 c/B      5625
 GCM-SIV auth |     0.034 ns/B     28303 MiB/s     0.190 c/B      5625
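
Callers need no changes: the bulk paths are reached through the normal
libgcrypt API whenever a parallelizable mode is selected. A minimal
sketch, assuming ARIA is registered as GCRY_CIPHER_ARIA128 (patch 1/2):

#include <gcrypt.h>

/* CTR-mode encryption with ARIA-128; cipher.c dispatches multi-block
 * requests to the new bulk_ops->ctr_enc entry point internally. */
static gcry_error_t
aria_ctr_example (const unsigned char key[16], const unsigned char ctr[16],
                  unsigned char *out, const unsigned char *in, size_t len)
{
  gcry_cipher_hd_t hd;
  gcry_error_t err;

  err = gcry_cipher_open (&hd, GCRY_CIPHER_ARIA128, GCRY_CIPHER_MODE_CTR, 0);
  if (err)
    return err;

  if (!(err = gcry_cipher_setkey (hd, key, 16))
      && !(err = gcry_cipher_setctr (hd, ctr, 16)))
    err = gcry_cipher_encrypt (hd, out, len, in, len);

  gcry_cipher_close (hd);
  return err;
}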
Cc: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/aria.c | 479 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 477 insertions(+), 2 deletions(-)
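
The per-mode loops come from the shared helpers in cipher/bulkhelp.h
(bulk_ctr_enc_128 and friends). The sketch below shows the general idea
for CTR with a simplified stand-in, not the actual bulkhelp.h code:
counter blocks are batched into the 16-block temporary buffer so that
the crypt callback sees as many blocks at once as possible (the real
helper additionally tracks stack burn depth via the return value).

#include <stddef.h>
#include <string.h>

typedef unsigned char byte;

/* Callback shape matching aria_enc_blocks in the patch below. */
typedef unsigned int (*crypt_fn_t)(void *ctx, byte *out, const byte *in,
                                   size_t nblks);

/* Increment a 16-byte big-endian counter by one. */
static void
ctr_inc (byte ctr[16])
{
  int i;

  for (i = 15; i >= 0; i--)
    if (++ctr[i])
      break;
}

/* Simplified CTR bulk loop (a sketch, not bulk_ctr_enc_128 itself). */
static void
ctr_bulk (void *ctx, crypt_fn_t crypt, byte *out, const byte *in,
          size_t nblocks, byte ctr[16], byte tmpbuf[16 * 16])
{
  while (nblocks)
    {
      size_t n = nblocks > 16 ? 16 : nblocks;
      size_t i;

      for (i = 0; i < n; i++)
        {
          memcpy (tmpbuf + i * 16, ctr, 16);
          ctr_inc (ctr);
        }

      crypt (ctx, tmpbuf, tmpbuf, n); /* generate keystream blocks */

      for (i = 0; i < n * 16; i++)
        out[i] = in[i] ^ tmpbuf[i];

      out += n * 16;
      in += n * 16;
      nblocks -= n;
    }
}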
diff --git a/cipher/aria.c b/cipher/aria.c
index 893763a9..700ea409 100644
--- a/cipher/aria.c
+++ b/cipher/aria.c
@@ -66,8 +66,9 @@ typedef struct
u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
int rounds;
- /* The decryption key schedule is available */
- unsigned int decryption_prepared:1;
+ unsigned int decryption_prepared:1; /* The decryption key is set up. */
+ unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for
+ * current bulk operation. */
} ARIA_context;
@@ -506,6 +507,7 @@ aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2, u32 *t3)
*t2 ^= rk[2];
*t3 ^= rk[3];
}
+
/* Odd round Substitution & Diffusion */
static ALWAYS_INLINE void
aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
@@ -803,6 +805,469 @@ aria_decrypt(void *c, byte *outbuf, const byte *inbuf)
}
+static unsigned int
+aria_crypt_2blks(ARIA_context *ctx, byte *out, const byte *in,
+ u32 key[][ARIA_RD_KEY_WORDS])
+{
+ u32 ra0, ra1, ra2, ra3;
+ u32 rb0, rb1, rb2, rb3;
+ int rounds = ctx->rounds;
+ int rkidx = 0;
+
+ ra0 = buf_get_be32(in + 0);
+ ra1 = buf_get_be32(in + 4);
+ ra2 = buf_get_be32(in + 8);
+ ra3 = buf_get_be32(in + 12);
+ rb0 = buf_get_be32(in + 16);
+ rb1 = buf_get_be32(in + 20);
+ rb2 = buf_get_be32(in + 24);
+ rb3 = buf_get_be32(in + 28);
+
+ while (1)
+ {
+ aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+ aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+ rkidx++;
+
+ aria_subst_diff_odd(&ra0, &ra1, &ra2, &ra3);
+ aria_subst_diff_odd(&rb0, &rb1, &rb2, &rb3);
+ aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+ aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+ rkidx++;
+
+ if (rkidx >= rounds)
+ break;
+
+ aria_subst_diff_even(&ra0, &ra1, &ra2, &ra3);
+ aria_subst_diff_even(&rb0, &rb1, &rb2, &rb3);
+ }
+
+ aria_last_round(&ra0, &ra1, &ra2, &ra3);
+ aria_last_round(&rb0, &rb1, &rb2, &rb3);
+ aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+ aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+
+ buf_put_be32(out + 0, ra0);
+ buf_put_be32(out + 4, ra1);
+ buf_put_be32(out + 8, ra2);
+ buf_put_be32(out + 12, ra3);
+ buf_put_be32(out + 16, rb0);
+ buf_put_be32(out + 20, rb1);
+ buf_put_be32(out + 24, rb2);
+ buf_put_be32(out + 28, rb3);
+
+ return 4 * sizeof(void *) + 8 * sizeof(u32); /* stack burn depth */
+}
+
+static unsigned int
+aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in,
+ size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS])
+{
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ if (!ctx->bulk_prefetch_ready)
+ {
+ prefetch_sboxes();
+ ctx->bulk_prefetch_ready = 1;
+ }
+
+ while (num_blks >= 2)
+ {
+ nburn = aria_crypt_2blks (ctx, out, in, key);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+ out += 2 * 16;
+ in += 2 * 16;
+ num_blks -= 2;
+ }
+
+ while (num_blks)
+ {
+ nburn = aria_crypt (ctx, out, in, key);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+ out += 16;
+ in += 16;
+ num_blks--;
+ }
+
+ if (burn_depth)
+ burn_depth += sizeof(void *) * 5;
+ return burn_depth;
+}
+
+static unsigned int
+aria_enc_blocks (void *c, byte *out, const byte *in, size_t num_blks)
+{
+ ARIA_context *ctx = (ARIA_context *)c;
+
+ return aria_crypt_blocks (ctx, out, in, num_blks, ctx->enc_key);
+}
+
+static unsigned int
+aria_dec_blocks (void *c, byte *out, const byte *in, size_t num_blks)
+{
+ ARIA_context *ctx = (ARIA_context *)c;
+
+ return aria_crypt_blocks (ctx, out, in, num_blks, ctx->dec_key);
+}
+
+
+/* Bulk encryption of complete blocks in CTR mode. This function is only
+ intended for the bulk encryption feature of cipher.c. CTR is expected to be
+ of size 16. */
+static void
+_gcry_aria_ctr_enc(void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ ARIA_context *ctx = context;
+ byte *outbuf = outbuf_arg;
+ const byte *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ byte tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+
+ nburn = bulk_ctr_enc_128(ctx, aria_enc_blocks, outbuf, inbuf,
+ nblocks, ctr, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CBC mode. */
+static void
+_gcry_aria_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac)
+{
+ ARIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char *last_iv;
+ unsigned int burn_depth = 0;
+
+ prefetch_sboxes();
+
+ last_iv = iv;
+
+ for (; nblocks; nblocks--)
+ {
+ cipher_block_xor (outbuf, inbuf, last_iv, ARIA_BLOCK_SIZE);
+
+ burn_depth = aria_crypt (ctx, outbuf, outbuf, ctx->enc_key);
+
+ last_iv = outbuf;
+ inbuf += ARIA_BLOCK_SIZE;
+ if (!cbc_mac)
+ outbuf += ARIA_BLOCK_SIZE;
+ }
+
+ if (last_iv != iv)
+ cipher_block_cpy (iv, last_iv, ARIA_BLOCK_SIZE);
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+/* Bulk decryption of complete blocks in CBC mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_aria_cbc_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ ARIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ if (!ctx->decryption_prepared)
+ {
+ aria_set_decrypt_key (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+
+ nburn = bulk_cbc_dec_128(ctx, aria_dec_blocks, outbuf, inbuf,
+ nblocks, iv, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CFB mode. */
+static void
+_gcry_aria_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ ARIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+
+ prefetch_sboxes();
+
+ for (; nblocks; nblocks--)
+ {
+ /* Encrypt the IV. */
+ burn_depth = aria_crypt (ctx, iv, iv, ctx->enc_key);
+ /* XOR the input with the IV and store input into IV. */
+ cipher_block_xor_2dst(outbuf, iv, inbuf, ARIA_BLOCK_SIZE);
+ outbuf += ARIA_BLOCK_SIZE;
+ inbuf += ARIA_BLOCK_SIZE;
+ }
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_aria_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ ARIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+
+ nburn = bulk_cfb_dec_128(ctx, aria_enc_blocks, outbuf, inbuf,
+ nblocks, iv, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_aria_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ ARIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ aria_set_decrypt_key (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ bulk_crypt_fn_t crypt_blk1_16;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+ crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+ nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_16,
+ outbuf, inbuf, nblocks, 16);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ ARIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ aria_set_decrypt_key (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ bulk_crypt_fn_t crypt_blk1_16;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+ crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+ nburn = bulk_xts_crypt_128(ctx, crypt_blk1_16,
+ outbuf, inbuf, nblocks,
+ tweak, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV). */
+static void
+_gcry_aria_ctr32le_enc(void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ ARIA_context *ctx = context;
+ byte *outbuf = outbuf_arg;
+ const byte *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+
+ nburn = bulk_ctr32le_enc_128 (ctx, aria_enc_blocks, outbuf, inbuf,
+ nblocks, ctr, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+static size_t
+_gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ ARIA_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+ int burn_stack_depth = 0;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ aria_set_decrypt_key (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ bulk_crypt_fn_t crypt_blk1_16;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+ crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+ nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_16, outbuf, inbuf, nblocks,
+ &blkn, encrypt, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ c->u_mode.ocb.data_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+
+ return 0;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+static size_t
+_gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
+{
+ ARIA_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+ int burn_stack_depth = 0;
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE];
+ unsigned int tmp_used = ARIA_BLOCK_SIZE;
+ size_t nburn;
+
+ ctx->bulk_prefetch_ready = 0;
+
+ nburn = bulk_ocb_auth_128 (c, ctx, aria_enc_blocks, abuf, nblocks,
+ &blkn, tmpbuf,
+ sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory (tmpbuf, tmp_used);
+ }
+
+ c->u_mode.ocb.aad_nblocks = blkn;
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth);
+
+ return 0;
+}
+
+
static gcry_err_code_t
aria_setkey(void *c, const byte *key, unsigned keylen,
cipher_bulk_ops_t *bulk_ops)
@@ -827,6 +1292,16 @@ aria_setkey(void *c, const byte *key, unsigned keylen,
/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
+ bulk_ops->cbc_enc = _gcry_aria_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aria_cbc_dec;
+ bulk_ops->cfb_enc = _gcry_aria_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aria_cfb_dec;
+ bulk_ops->ctr_enc = _gcry_aria_ctr_enc;
+ bulk_ops->ctr32le_enc = _gcry_aria_ctr32le_enc;
+ bulk_ops->ecb_crypt = _gcry_aria_ecb_crypt;
+ bulk_ops->xts_crypt = _gcry_aria_xts_crypt;
+ bulk_ops->ocb_crypt = _gcry_aria_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aria_ocb_auth;
/* Setup context and encryption key. */
ctx->decryption_prepared = 0;
--
2.37.2