[PATCH 6/8] twofish: accelerate XTS and ECB modes

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Oct 23 18:16:06 CEST 2022


* cipher/twofish-amd64.S (_gcry_twofish_amd64_blk3): New.
* cipher/twofish-avx2-amd64.S (_gcry_twofish_avx2_blk16): New.
* cipher/twofish.c (_gcry_twofish_xts_crypt, _gcry_twofish_ecb_crypt)
(_gcry_twofish_avx2_blk16, _gcry_twofish_amd64_blk3)
(twofish_crypt_blk1_16, twofish_encrypt_blk1_16)
(twofish_decrypt_blk1_16): New.
(twofish_setkey): Setup XTS and ECB bulk functions.
--

Benchmark on AMD Ryzen 9 7900X:

Before:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      2.52 ns/B     378.2 MiB/s     14.18 c/B      5625
        ECB dec |      2.51 ns/B     380.2 MiB/s     14.11 c/B      5625
        XTS enc |      2.65 ns/B     359.9 MiB/s     14.91 c/B      5625
        XTS dec |      2.63 ns/B     362.0 MiB/s     14.60 c/B      5541

After:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.60 ns/B     594.8 MiB/s      9.02 c/B      5625
        ECB dec |      1.60 ns/B     594.8 MiB/s      9.02 c/B      5625
        XTS enc |      1.66 ns/B     573.9 MiB/s      9.35 c/B      5625
        XTS dec |      1.67 ns/B     569.6 MiB/s      9.41 c/B      5619±2

GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/twofish-amd64.S      |  74 ++++++++++++++++++
 cipher/twofish-avx2-amd64.S |  46 +++++++++++
 cipher/twofish.c            | 147 +++++++++++++++++++++++++++++++++++-
 3 files changed, 264 insertions(+), 3 deletions(-)

diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index a7a60553..8998d296 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -544,6 +544,80 @@ __twofish_dec_blk3:
 	CFI_ENDPROC();
 ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
 
+.align 8
+.globl _gcry_twofish_amd64_blk3
+ELF(.type   _gcry_twofish_amd64_blk3,@function;)
+_gcry_twofish_amd64_blk3:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 *	%ecx: encrypt (0: decrypt, non-zero: encrypt)
+	 */
+	CFI_STARTPROC();
+	ENTER_SYSV_FUNC_PARAMS_0_4
+
+	subq $(8 * 8), %rsp; /* slots 0-5: saved regs, slot 6: dst */
+	CFI_ADJUST_CFA_OFFSET(8 * 8);
+	movq %rbp, (0 * 8)(%rsp);
+	movq %rbx, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+	movq %r14, (4 * 8)(%rsp);
+	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
+
+	testl %ecx, %ecx; /* ZF is tested by the jz below */
+	movq %rdx, RX0;
+	movq %rsi, (6 * 8)(%rsp); /* keep dst on the stack across the call */
+
+	movq (0 * 8)(RX0), RAB0; /* load 3 blocks as six 64-bit halves */
+	movq (1 * 8)(RX0), RCD0;
+	movq (2 * 8)(RX0), RAB1;
+	movq (3 * 8)(RX0), RCD1;
+	movq (4 * 8)(RX0), RAB2;
+	movq (5 * 8)(RX0), RCD2;
+
+	jz .Lblk1_3_dec; /* the movq instructions above leave flags intact */
+		call __twofish_enc_blk3;
+		jmp .Lblk1_3_end;
+	.Lblk1_3_dec:
+		call __twofish_dec_blk3;
+
+.Lblk1_3_end:
+	movq (6 * 8)(%rsp), RX0; /* reload dst */
+	movq RCD0, (0 * 8)(RX0); /* CD half is stored before AB half */
+	movq RAB0, (1 * 8)(RX0);
+	movq RCD1, (2 * 8)(RX0);
+	movq RAB1, (3 * 8)(RX0);
+	movq RCD2, (4 * 8)(RX0);
+	movq RAB2, (5 * 8)(RX0);
+
+	movq (0 * 8)(%rsp), %rbp;
+	movq (1 * 8)(%rsp), %rbx;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+	movq (4 * 8)(%rsp), %r14;
+	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
+	addq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-8 * 8);
+
+	EXIT_SYSV_FUNC
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;)
+
 .align 8
 .globl _gcry_twofish_amd64_ctr_enc
 ELF(.type   _gcry_twofish_amd64_ctr_enc,@function;)
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
index 930ac792..0cb9a64c 100644
--- a/cipher/twofish-avx2-amd64.S
+++ b/cipher/twofish-avx2-amd64.S
@@ -468,6 +468,52 @@ __twofish_dec_blk16:
 	CFI_ENDPROC();
 ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
 
+.align 8
+.globl _gcry_twofish_avx2_blk16
+ELF(.type   _gcry_twofish_avx2_blk16,@function;)
+_gcry_twofish_avx2_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%ecx: encrypt (0: decrypt, non-zero: encrypt)
+	 */
+	CFI_STARTPROC();
+
+	vzeroupper;
+
+	vmovdqu (0 * 32)(%rdx), RA0; /* 8 loads at 32-byte stride from src */
+	vmovdqu (1 * 32)(%rdx), RB0;
+	vmovdqu (2 * 32)(%rdx), RC0;
+	vmovdqu (3 * 32)(%rdx), RD0;
+	vmovdqu (4 * 32)(%rdx), RA1;
+	vmovdqu (5 * 32)(%rdx), RB1;
+	vmovdqu (6 * 32)(%rdx), RC1;
+	vmovdqu (7 * 32)(%rdx), RD1;
+
+	testl %ecx, %ecx;
+	jz .Lblk16_dec; /* zero selects the decryption core */
+		call __twofish_enc_blk16;
+		jmp .Lblk16_end;
+	.Lblk16_dec:
+		call __twofish_dec_blk16;
+
+.Lblk16_end:
+	vmovdqu RA0, (0 * 32)(%rsi); /* stores use the same order as loads */
+	vmovdqu RB0, (1 * 32)(%rsi);
+	vmovdqu RC0, (2 * 32)(%rsi);
+	vmovdqu RD0, (3 * 32)(%rsi);
+	vmovdqu RA1, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RC1, (6 * 32)(%rsi);
+	vmovdqu RD1, (7 * 32)(%rsi);
+
+	vzeroall; /* zero all vector registers before returning */
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;)
+
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
 	vpsubq minus_one, x, x; \
diff --git a/cipher/twofish.c b/cipher/twofish.c
index b300715b..92c463fc 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -101,7 +101,12 @@ static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 				       int encrypt);
 static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 				      size_t nblocks);
-
+static void _gcry_twofish_xts_crypt (void *context, unsigned char *tweak,
+				     void *outbuf_arg, const void *inbuf_arg,
+				     size_t nblocks, int encrypt);
+static void _gcry_twofish_ecb_crypt (void *context, void *outbuf_arg,
+				     const void *inbuf_arg, size_t nblocks,
+				     int encrypt);
 
 /* Structure for an expanded Twofish key.  s contains the key-dependent
  * S-boxes composed with the MDS matrix; w contains the eight "whitening"
@@ -775,7 +780,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
   bulk_ops->cfb_dec = _gcry_twofish_cfb_dec;
   bulk_ops->ctr_enc = _gcry_twofish_ctr_enc;
   bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt;
-  bulk_ops->ocb_auth  = _gcry_twofish_ocb_auth;
+  bulk_ops->ocb_auth = _gcry_twofish_ocb_auth;
+  bulk_ops->xts_crypt = _gcry_twofish_xts_crypt;
+  bulk_ops->ecb_crypt = _gcry_twofish_ecb_crypt;
 
   (void)hwfeatures;
 
@@ -788,6 +795,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
 /* Assembler implementations of Twofish using AVX2.  Process 16 block in
    parallel.
  */
+extern void _gcry_twofish_avx2_blk16 (const TWOFISH_context *c, byte *out,
+				      const byte *in, int encrypt) ASM_FUNC_ABI;
+
 extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
@@ -835,6 +845,9 @@ extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c,
 					      byte *out, const byte *in);
 
 /* These assembly implementations process three blocks in parallel. */
+extern void _gcry_twofish_amd64_blk3(const TWOFISH_context *c, byte *out,
+				     const byte *in, int encrypt);
+
 extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out,
 					const byte *in, byte *ctr);
 
@@ -1501,7 +1514,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 	blkn += 3;
 
 	twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-			      c->u_mode.ocb.aad_sum, Ls);
+			       c->u_mode.ocb.aad_sum, Ls);
 
 	nblocks -= 3;
 	abuf += 3 * TWOFISH_BLOCKSIZE;
@@ -1527,6 +1540,134 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 }
 
 
+static unsigned int
+twofish_crypt_blk1_16(const void *context, byte *out, const byte *in,
+		      unsigned int num_blks, int encrypt)
+{ /* Crypt NUM_BLKS blocks from IN to OUT; returns the largest burn value. */
+  const TWOFISH_context *ctx = context;
+  unsigned int burn, burn_stack_depth = 0; /* running maximum of BURN */
+
+#ifdef USE_AVX2
+  if (num_blks == 16 && ctx->use_avx2) /* taken only for exactly 16 blocks */
+    {
+      _gcry_twofish_avx2_blk16 (ctx, out, in, encrypt);
+      return 0; /* no burn depth is reported for this path */
+    }
+#endif
+
+#ifdef USE_AMD64_ASM
+  while (num_blks >= 3) /* three blocks per assembly call */
+    {
+      _gcry_twofish_amd64_blk3 (ctx, out, in, encrypt);
+      burn = 8 * sizeof(void *); /* blk3 reserves 8 * 8 bytes of stack */
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += 3 * TWOFISH_BLOCKSIZE;
+      in += 3 * TWOFISH_BLOCKSIZE;
+      num_blks -= 3;
+    }
+#endif
+
+  while (num_blks >= 1) /* remaining blocks, one at a time */
+    {
+      if (encrypt)
+	burn = twofish_encrypt((void *)ctx, out, in);
+      else
+	burn = twofish_decrypt((void *)ctx, out, in);
+
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += TWOFISH_BLOCKSIZE;
+      in += TWOFISH_BLOCKSIZE;
+      num_blks--;
+    }
+
+  return burn_stack_depth; /* maximum over the non-AVX2 paths taken */
+}
+
+static unsigned int
+twofish_encrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+			unsigned int num_blks)
+{
+  return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 1); /* encrypt */
+}
+
+static unsigned int
+twofish_decrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+			unsigned int num_blks)
+{
+  return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 0); /* decrypt */
+}
+
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_twofish_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+			 const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  TWOFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Hand all NBLOCKS blocks to bulk_xts_crypt_128. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[16 * 16]; /* scratch space, 16 * 16 bytes */
+      unsigned int tmp_used = 16; /* passed by address; wipe length below */
+      size_t tmpbufsize = 15 * 16; /* 15 * 16 bytes unless AVX2 is used */
+      size_t nburn;
+
+#ifdef USE_AVX2
+      if (ctx->use_avx2)
+	tmpbufsize = 16 * 16; /* whole TMPBUF */
+#endif
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16
+                                              : twofish_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks,
+                                 tweak, tmpbuf, tmpbufsize / 16,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used); /* wipe TMP_USED bytes of scratch */
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg,
+			 size_t nblocks, int encrypt)
+{
+  TWOFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Hand all NBLOCKS blocks to bulk_ecb_crypt_128. */
+  if (nblocks)
+    {
+      size_t fn_maxblocks = 15; /* raised to 16 below when AVX2 is used */
+      size_t nburn;
+
+#ifdef USE_AVX2
+      if (ctx->use_avx2)
+	fn_maxblocks = 16; /* twofish_crypt_blk1_16 uses AVX2 only for 16 */
+#endif
+
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16
+                                              : twofish_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks, fn_maxblocks);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+
 

 /* Test a single encryption and decryption with each key size. */
 
-- 
2.37.2




More information about the Gcrypt-devel mailing list