[PATCH 6/8] twofish: accelerate XTS and ECB modes
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Oct 23 18:16:06 CEST 2022
* cipher/twofish-amd64.S (_gcry_twofish_amd64_blk3): New.
* cipher/twofish-avx2-amd64.S (_gcry_twofish_avx2_blk16): New.
* cipher/twofish.c (_gcry_twofish_xts_crypt, _gcry_twofish_ecb_crypt)
(_gcry_twofish_avx2_blk16, _gcry_twofish_amd64_blk3)
(twofish_crypt_blk1_16, twofish_encrypt_blk1_16)
(twofish_decrypt_blk1_16): New.
(twofish_setkey): Setup XTS and ECB bulk functions.
--
Benchmark on AMD Ryzen 9 7900X:
Before:
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 2.52 ns/B 378.2 MiB/s 14.18 c/B 5625
ECB dec | 2.51 ns/B 380.2 MiB/s 14.11 c/B 5625
XTS enc | 2.65 ns/B 359.9 MiB/s 14.91 c/B 5625
XTS dec | 2.63 ns/B 362.0 MiB/s 14.60 c/B 5541
After:
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 1.60 ns/B 594.8 MiB/s 9.02 c/B 5625
ECB dec | 1.60 ns/B 594.8 MiB/s 9.02 c/B 5625
XTS enc | 1.66 ns/B 573.9 MiB/s 9.35 c/B 5625
XTS dec | 1.67 ns/B 569.6 MiB/s 9.41 c/B 5619±2
GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/twofish-amd64.S | 74 ++++++++++++++++++
cipher/twofish-avx2-amd64.S | 46 +++++++++++
cipher/twofish.c | 147 +++++++++++++++++++++++++++++++++++-
3 files changed, 264 insertions(+), 3 deletions(-)
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index a7a60553..8998d296 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -544,6 +544,80 @@ __twofish_dec_blk3:
CFI_ENDPROC();
ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
+.align 8
+.globl _gcry_twofish_amd64_blk3
+ELF(.type _gcry_twofish_amd64_blk3,@function;)
+_gcry_twofish_amd64_blk3:
+ /* Encrypt or decrypt three blocks through the 3-way helpers. input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ * %ecx: encrypt (0 or 1); 0 calls __twofish_dec_blk3, non-zero calls __twofish_enc_blk3
+ */
+ CFI_STARTPROC();
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ /* Reserve eight 8-byte stack slots: 0-5 hold callee-saved registers, 6 holds dst; slot 7 is not referenced here. */
+ subq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(8 * 8);
+ movq %rbp, (0 * 8)(%rsp);
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+ CFI_REL_OFFSET(%rbp, 0 * 8);
+ CFI_REL_OFFSET(%rbx, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+ CFI_REL_OFFSET(%r14, 4 * 8);
+ CFI_REL_OFFSET(%r15, 5 * 8);
+ /* Set ZF from 'encrypt' now; the movq instructions that follow do not modify flags, so jz below still sees it. */
+ testl %ecx, %ecx;
+ movq %rdx, RX0;
+ movq %rsi, (6 * 8)(%rsp); /* park dst on the stack across the helper call */
+ /* Load six quadwords (three blocks) from src. */
+ movq (0 * 8)(RX0), RAB0;
+ movq (1 * 8)(RX0), RCD0;
+ movq (2 * 8)(RX0), RAB1;
+ movq (3 * 8)(RX0), RCD1;
+ movq (4 * 8)(RX0), RAB2;
+ movq (5 * 8)(RX0), RCD2;
+ /* encrypt == 0 takes the decryption path. */
+ jz .Lblk1_3_dec;
+ call __twofish_enc_blk3;
+ jmp .Lblk1_3_end;
+ .Lblk1_3_dec:
+ call __twofish_dec_blk3;
+
+.Lblk1_3_end:
+ movq (6 * 8)(%rsp), RX0; /* reload dst saved before the call */
+ movq RCD0, (0 * 8)(RX0); /* CD is stored before AB for each block -- presumably the helpers' output layout; confirm there */
+ movq RAB0, (1 * 8)(RX0);
+ movq RCD1, (2 * 8)(RX0);
+ movq RAB1, (3 * 8)(RX0);
+ movq RCD2, (4 * 8)(RX0);
+ movq RAB2, (5 * 8)(RX0);
+ /* Restore the callee-saved registers and release the stack frame. */
+ movq (0 * 8)(%rsp), %rbp;
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbp);
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ addq $(8 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+ EXIT_SYSV_FUNC
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;)
+
.align 8
.globl _gcry_twofish_amd64_ctr_enc
ELF(.type _gcry_twofish_amd64_ctr_enc,@function;)
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
index 930ac792..0cb9a64c 100644
--- a/cipher/twofish-avx2-amd64.S
+++ b/cipher/twofish-avx2-amd64.S
@@ -468,6 +468,52 @@ __twofish_dec_blk16:
CFI_ENDPROC();
ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
+.align 8
+.globl _gcry_twofish_avx2_blk16
+ELF(.type _gcry_twofish_avx2_blk16,@function;)
+_gcry_twofish_avx2_blk16:
+ /* Encrypt or decrypt sixteen blocks through the 16-way helpers. input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %ecx: encrypt; 0 calls __twofish_dec_blk16, non-zero calls __twofish_enc_blk16
+ */
+ CFI_STARTPROC();
+
+ vzeroupper;
+ /* Load eight 32-byte vectors (sixteen blocks) from src. */
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RB0;
+ vmovdqu (2 * 32)(%rdx), RC0;
+ vmovdqu (3 * 32)(%rdx), RD0;
+ vmovdqu (4 * 32)(%rdx), RA1;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RC1;
+ vmovdqu (7 * 32)(%rdx), RD1;
+ /* encrypt == 0 takes the decryption path. */
+ testl %ecx, %ecx;
+ jz .Lblk16_dec;
+ call __twofish_enc_blk16;
+ jmp .Lblk16_end;
+ .Lblk16_dec:
+ call __twofish_dec_blk16;
+
+.Lblk16_end:
+ vmovdqu RA0, (0 * 32)(%rsi); /* results are written to dst in the same register order used for the loads */
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+ /* Zero every vector register before returning. */
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;)
+
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
diff --git a/cipher/twofish.c b/cipher/twofish.c
index b300715b..92c463fc 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -101,7 +101,12 @@ static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
int encrypt);
static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks);
-
+static void _gcry_twofish_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt); /* XTS bulk handler; twofish_setkey stores it in bulk_ops->xts_crypt */
+static void _gcry_twofish_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt); /* ECB bulk handler; twofish_setkey stores it in bulk_ops->ecb_crypt */
/* Structure for an expanded Twofish key. s contains the key-dependent
* S-boxes composed with the MDS matrix; w contains the eight "whitening"
@@ -775,7 +780,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
bulk_ops->cfb_dec = _gcry_twofish_cfb_dec;
bulk_ops->ctr_enc = _gcry_twofish_ctr_enc;
bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt;
- bulk_ops->ocb_auth = _gcry_twofish_ocb_auth;
+ bulk_ops->ocb_auth = _gcry_twofish_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_twofish_xts_crypt;
+ bulk_ops->ecb_crypt = _gcry_twofish_ecb_crypt;
(void)hwfeatures;
@@ -788,6 +795,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
/* Assembler implementations of Twofish using AVX2. Process 16 block in
parallel.
*/
+extern void _gcry_twofish_avx2_blk16 (const TWOFISH_context *c, byte *out,
+ const byte *in, int encrypt) ASM_FUNC_ABI; /* crypts 16 blocks from in to out; encrypt == 0 decrypts */
+
extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
unsigned char *out,
const unsigned char *in,
@@ -835,6 +845,9 @@ extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c,
byte *out, const byte *in);
/* These assembly implementations process three blocks in parallel. */
+extern void _gcry_twofish_amd64_blk3(const TWOFISH_context *c, byte *out,
+ const byte *in, int encrypt); /* crypts 3 blocks from in to out; encrypt == 0 decrypts */
+
extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out,
const byte *in, byte *ctr);
@@ -1501,7 +1514,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
blkn += 3;
twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ c->u_mode.ocb.aad_sum, Ls);
nblocks -= 3;
abuf += 3 * TWOFISH_BLOCKSIZE;
@@ -1527,6 +1540,134 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
}
+static unsigned int
+twofish_crypt_blk1_16(const void *context, byte *out, const byte *in,
+ unsigned int num_blks, int encrypt)
+{ /* Crypt num_blks blocks from 'in' to 'out' (encrypt != 0 encrypts); returns the largest stack burn depth seen. */
+ const TWOFISH_context *ctx = context;
+ unsigned int burn, burn_stack_depth = 0;
+ /* Exactly 16 blocks with AVX2 enabled: one call to the 16-way routine, after which 0 is returned as burn depth. */
+#ifdef USE_AVX2
+ if (num_blks == 16 && ctx->use_avx2)
+ {
+ _gcry_twofish_avx2_blk16 (ctx, out, in, encrypt);
+ return 0;
+ }
+#endif
+ /* Otherwise take three blocks per call with the amd64 assembly while at least three remain. */
+#ifdef USE_AMD64_ASM
+ while (num_blks >= 3)
+ {
+ _gcry_twofish_amd64_blk3 (ctx, out, in, encrypt);
+ burn = 8 * sizeof(void *); /* matches the eight stack slots _gcry_twofish_amd64_blk3 reserves */
+ burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+ out += 3 * TWOFISH_BLOCKSIZE;
+ in += 3 * TWOFISH_BLOCKSIZE;
+ num_blks -= 3;
+ }
+#endif
+ /* Leftover blocks (or all of them when the assembly paths are compiled out) are handled one at a time. */
+ while (num_blks >= 1)
+ {
+ if (encrypt)
+ burn = twofish_encrypt((void *)ctx, out, in); /* cast drops const to fit the callee's parameter */
+ else
+ burn = twofish_decrypt((void *)ctx, out, in);
+
+ burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth; /* keep the maximum, not the sum */
+ out += TWOFISH_BLOCKSIZE;
+ in += TWOFISH_BLOCKSIZE;
+ num_blks--;
+ }
+
+ return burn_stack_depth;
+}
+
+static unsigned int
+twofish_encrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+ unsigned int num_blks)
+{ /* Encrypting callback: twofish_crypt_blk1_16 with encrypt fixed to 1; returns its burn depth. */
+ return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 1);
+}
+
+static unsigned int
+twofish_decrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+ unsigned int num_blks)
+{ /* Decrypting callback: twofish_crypt_blk1_16 with encrypt fixed to 0; returns its burn depth. */
+ return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 0);
+}
+
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. NBLOCKS blocks are handed to bulk_xts_crypt_128 together with TWEAK; ENCRYPT selects the per-block callback. */
+static void
+_gcry_twofish_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ TWOFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* All blocks go through the generic helper; nothing is processed before this point. */
+ if (nblocks)
+ {
+ unsigned char tmpbuf[16 * 16]; /* scratch buffer handed to the helper */
+ unsigned int tmp_used = 16; /* passed by pointer; bounds the wipe below (presumably updated by the helper -- confirm) */
+ size_t tmpbufsize = 15 * 16; /* 15 units without AVX2; a multiple of the 3-block asm step (presumably why -- confirm) */
+ size_t nburn;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ tmpbufsize = 16 * 16; /* whole buffer; presumably lets the callback receive the 16-block count its AVX2 path requires */
+#endif
+
+ nburn = bulk_xts_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16
+ : twofish_decrypt_blk1_16,
+ outbuf, inbuf, nblocks,
+ tweak, tmpbuf, tmpbufsize / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used); /* wipe the first tmp_used bytes of the scratch buffer */
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth); /* skipped when the helper reported a burn depth of 0 */
+}
+
+
+/* Bulk encryption/decryption in ECB mode. NBLOCKS blocks are handed to bulk_ecb_crypt_128; ENCRYPT selects the per-block callback. */
+static void
+_gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ TWOFISH_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* All blocks go through the generic helper; nothing is processed before this point. */
+ if (nblocks)
+ {
+ size_t fn_maxblocks = 15; /* 15 without AVX2; a multiple of the 3-block asm step (presumably why -- confirm) */
+ size_t nburn;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ fn_maxblocks = 16; /* presumably lets the callback receive the 16-block count its AVX2 path requires */
+#endif
+
+ nburn = bulk_ecb_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16
+ : twofish_decrypt_blk1_16,
+ outbuf, inbuf, nblocks, fn_maxblocks);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth); /* skipped when the helper reported a burn depth of 0 */
+}
+
+
/* Test a single encryption and decryption with each key size. */
--
2.37.2
More information about the Gcrypt-devel
mailing list