[PATCH 8/8] serpent: accelerate XTS and ECB modes

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Oct 23 18:16:08 CEST 2022


* cipher/serpent-armv7-neon.S (_gcry_serpent_neon_blk8): New.
* cipher/serpent-avx2-amd64.S (_gcry_serpent_avx2_blk16): New.
* cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_blk8): New.
* cipher/serpent.c (_gcry_serpent_sse2_blk8)
(_gcry_serpent_avx2_blk16, _gcry_serpent_neon_blk8)
(_gcry_serpent_xts_crypt, _gcry_serpent_ecb_crypt)
(serpent_crypt_blk1_16, serpent_encrypt_blk1_16)
(serpent_decrypt_blk1_16): New.
(serpent_setkey): Setup XTS and ECB bulk functions.
--

Benchmark on AMD Ryzen 9 7900X:

Before:
 SERPENT128     |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      5.42 ns/B     176.0 MiB/s     30.47 c/B      5625
        ECB dec |      4.82 ns/B     197.9 MiB/s     27.11 c/B      5625
        XTS enc |      5.57 ns/B     171.3 MiB/s     31.31 c/B      5625
        XTS dec |      4.99 ns/B     191.1 MiB/s     28.07 c/B      5625

After:
 SERPENT128     |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.708 ns/B      1347 MiB/s      3.98 c/B      5625
        ECB dec |     0.694 ns/B      1373 MiB/s      3.91 c/B      5625
        XTS enc |     0.766 ns/B      1246 MiB/s      4.31 c/B      5625
        XTS dec |     0.754 ns/B      1264 MiB/s      4.24 c/B      5625

GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/serpent-armv7-neon.S |  56 ++++++++++++++
 cipher/serpent-avx2-amd64.S |  50 ++++++++++++
 cipher/serpent-sse2-amd64.S |  65 ++++++++++++++++
 cipher/serpent.c            | 147 +++++++++++++++++++++++++++++++++++-
 4 files changed, 317 insertions(+), 1 deletion(-)

diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S
index adff6394..4179ba2c 100644
--- a/cipher/serpent-armv7-neon.S
+++ b/cipher/serpent-armv7-neon.S
@@ -600,6 +600,62 @@ __serpent_dec_blk8:
 	bx lr;
 .size __serpent_dec_blk8,.-__serpent_dec_blk8;
 
+.align 3
+.globl _gcry_serpent_neon_blk8
+.type _gcry_serpent_neon_blk8,%function;
+_gcry_serpent_neon_blk8:
+	/* input:
+	 *	r0: ctx, CTX
+	 *	r1: dst (8 blocks)
+	 *	r2: src (8 blocks)
+	 *	r3: encrypt
+	 */
+
+	push {lr};
+	vpush {RA4-RB2};
+
+	cmp r3, #0
+
+	vld1.8 {RA0, RA1}, [r2]!;
+	vld1.8 {RA2, RA3}, [r2]!;
+	vld1.8 {RB0, RB1}, [r2]!;
+	vld1.8 {RB2, RB3}, [r2]!;
+
+	beq .Lblk8_dec;
+		bl __serpent_enc_blk8;
+		vst1.8 {RA4}, [r1]!;
+		vst1.8 {RA1, RA2}, [r1]!;
+		vst1.8 {RA0}, [r1]!;
+		vst1.8 {RB4}, [r1]!;
+		vst1.8 {RB1, RB2}, [r1]!;
+		vst1.8 {RB0}, [r1]!;
+		b .Lblk8_end;
+	.Lblk8_dec:
+		bl __serpent_dec_blk8;
+		vst1.8 {RA0, RA1}, [r1]!;
+		vst1.8 {RA2, RA3}, [r1]!;
+		vst1.8 {RB0, RB1}, [r1]!;
+		vst1.8 {RB2, RB3}, [r1]!;
+
+.Lblk8_end:
+	/* clear used registers to avoid leaking key-dependent data */
+	veor RA0, RA0;
+	veor RA1, RA1;
+	veor RA2, RA2;
+	veor RA3, RA3;
+
+	vpop {RA4-RB2};
+
+	veor RB3, RB3;
+	veor RB4, RB4;
+	veor RT0, RT0;
+	veor RT1, RT1;
+	veor RT2, RT2;
+	veor RT3, RT3;
+
+	pop {pc};
+.size _gcry_serpent_neon_blk8,.-_gcry_serpent_neon_blk8;
+
 .align 3
 .globl _gcry_serpent_neon_ctr_enc
 .type _gcry_serpent_neon_ctr_enc,%function;
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index d3515a21..54ff61e4 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -583,6 +583,56 @@ __serpent_dec_blk16:
 	CFI_ENDPROC();
 ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)
 
+.align 8
+.globl _gcry_serpent_avx2_blk16
+ELF(.type   _gcry_serpent_avx2_blk16,@function;)
+_gcry_serpent_avx2_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%ecx: encrypt
+	 */
+	CFI_STARTPROC();
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	vmovdqu (2 * 32)(%rdx), RA2;
+	vmovdqu (3 * 32)(%rdx), RA3;
+	vmovdqu (4 * 32)(%rdx), RB0;
+	vmovdqu (5 * 32)(%rdx), RB1;
+	vmovdqu (6 * 32)(%rdx), RB2;
+	vmovdqu (7 * 32)(%rdx), RB3;
+
+	testl %ecx, %ecx;
+	jz .Lblk16_dec;
+		call __serpent_enc_blk16;
+		vmovdqu RA4, (0 * 32)(%rsi);
+		vmovdqu RA1, (1 * 32)(%rsi);
+		vmovdqu RA2, (2 * 32)(%rsi);
+		vmovdqu RA0, (3 * 32)(%rsi);
+		vmovdqu RB4, (4 * 32)(%rsi);
+		vmovdqu RB1, (5 * 32)(%rsi);
+		vmovdqu RB2, (6 * 32)(%rsi);
+		vmovdqu RB0, (7 * 32)(%rsi);
+		jmp .Lblk16_end;
+	.Lblk16_dec:
+		call __serpent_dec_blk16;
+		vmovdqu RA0, (0 * 32)(%rsi);
+		vmovdqu RA1, (1 * 32)(%rsi);
+		vmovdqu RA2, (2 * 32)(%rsi);
+		vmovdqu RA3, (3 * 32)(%rsi);
+		vmovdqu RB0, (4 * 32)(%rsi);
+		vmovdqu RB1, (5 * 32)(%rsi);
+		vmovdqu RB2, (6 * 32)(%rsi);
+		vmovdqu RB3, (7 * 32)(%rsi);
+
+.Lblk16_end:
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;)
+
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
 	vpsubq minus_one, x, x; \
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index b5935095..01723a2a 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -605,6 +605,71 @@ __serpent_dec_blk8:
 	CFI_ENDPROC();
 ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)
 
+.align 8
+.globl _gcry_serpent_sse2_blk8
+ELF(.type   _gcry_serpent_sse2_blk8,@function;)
+_gcry_serpent_sse2_blk8:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%ecx: encrypt
+	 */
+	CFI_STARTPROC();
+
+	movdqu (0 * 16)(%rdx), RA0;
+	movdqu (1 * 16)(%rdx), RA1;
+	movdqu (2 * 16)(%rdx), RA2;
+	movdqu (3 * 16)(%rdx), RA3;
+	movdqu (4 * 16)(%rdx), RB0;
+	movdqu (5 * 16)(%rdx), RB1;
+	movdqu (6 * 16)(%rdx), RB2;
+	movdqu (7 * 16)(%rdx), RB3;
+
+	testl %ecx, %ecx;
+	jz .Lblk8_dec;
+		call __serpent_enc_blk8;
+		movdqu RA4, (0 * 16)(%rsi);
+		movdqu RA1, (1 * 16)(%rsi);
+		movdqu RA2, (2 * 16)(%rsi);
+		movdqu RA0, (3 * 16)(%rsi);
+		movdqu RB4, (4 * 16)(%rsi);
+		movdqu RB1, (5 * 16)(%rsi);
+		movdqu RB2, (6 * 16)(%rsi);
+		movdqu RB0, (7 * 16)(%rsi);
+		jmp .Lblk8_end;
+	.Lblk8_dec:
+		call __serpent_dec_blk8;
+		movdqu RA0, (0 * 16)(%rsi);
+		movdqu RA1, (1 * 16)(%rsi);
+		movdqu RA2, (2 * 16)(%rsi);
+		movdqu RA3, (3 * 16)(%rsi);
+		movdqu RB0, (4 * 16)(%rsi);
+		movdqu RB1, (5 * 16)(%rsi);
+		movdqu RB2, (6 * 16)(%rsi);
+		movdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_end:
+	/* clear used registers to avoid leaking key-dependent data */
+	pxor RA0, RA0;
+	pxor RA1, RA1;
+	pxor RA2, RA2;
+	pxor RA3, RA3;
+	pxor RA4, RA4;
+	pxor RB0, RB0;
+	pxor RB1, RB1;
+	pxor RB2, RB2;
+	pxor RB3, RB3;
+	pxor RB4, RB4;
+	pxor RTMP0, RTMP0;
+	pxor RTMP1, RTMP1;
+	pxor RTMP2, RTMP2;
+	pxor RNOT, RNOT;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;)
+
 .align 8
 .globl _gcry_serpent_sse2_ctr_enc
 ELF(.type   _gcry_serpent_sse2_ctr_enc, at function;)
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 93c561c5..0a9ed27c 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -139,6 +139,9 @@ extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx,
 					unsigned char *offset,
 					unsigned char *checksum,
 					const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_blk8(const serpent_context_t *c, byte *out,
+				    const byte *in, int encrypt) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AVX2
@@ -179,6 +182,9 @@ extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx,
 					unsigned char *offset,
 					unsigned char *checksum,
 					const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_blk16(const serpent_context_t *c, byte *out,
+				     const byte *in, int encrypt) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_NEON
@@ -219,6 +225,9 @@ extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx,
 					unsigned char *offset,
 					unsigned char *checksum,
 					const void *Ls[8]);
+
+extern void _gcry_serpent_neon_blk8(const serpent_context_t *c, byte *out,
+				    const byte *in, int encrypt);
 #endif
 
 
@@ -239,6 +248,12 @@ static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 				       int encrypt);
 static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 				      size_t nblocks);
+static void _gcry_serpent_xts_crypt (void *context, unsigned char *tweak,
+				     void *outbuf_arg, const void *inbuf_arg,
+				     size_t nblocks, int encrypt);
+static void _gcry_serpent_ecb_crypt (void *context, void *outbuf_arg,
+				     const void *inbuf_arg, size_t nblocks,
+				     int encrypt);
 
 
 /*
@@ -790,7 +805,9 @@ serpent_setkey (void *ctx,
   bulk_ops->cfb_dec = _gcry_serpent_cfb_dec;
   bulk_ops->ctr_enc = _gcry_serpent_ctr_enc;
   bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt;
-  bulk_ops->ocb_auth  = _gcry_serpent_ocb_auth;
+  bulk_ops->ocb_auth = _gcry_serpent_ocb_auth;
+  bulk_ops->xts_crypt = _gcry_serpent_xts_crypt;
+  bulk_ops->ecb_crypt = _gcry_serpent_ecb_crypt;
 
   if (serpent_test_ret)
     ret = GPG_ERR_SELFTEST_FAILED;
@@ -1538,6 +1555,134 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   return nblocks;
 }
 
+
+static unsigned int
+serpent_crypt_blk1_16(const void *context, byte *out, const byte *in,
+		      unsigned int num_blks, int encrypt)
+{
+  const serpent_context_t *ctx = context;
+  unsigned int burn, burn_stack_depth = 0; /* returned stack-burn depth */
+
+#ifdef USE_AVX2
+  if (num_blks == 16 && ctx->use_avx2) /* full 16-block AVX2 fast path */
+    {
+      _gcry_serpent_avx2_blk16 (ctx, out, in, encrypt);
+      return 0;
+    }
+#endif
+
+#ifdef USE_SSE2
+  while (num_blks >= 8) /* 8 blocks at a time with SSE2 */
+    {
+      _gcry_serpent_sse2_blk8 (ctx, out, in, encrypt);
+      out += 8 * sizeof(serpent_block_t);
+      in += 8 * sizeof(serpent_block_t);
+      num_blks -= 8;
+    }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      while (num_blks >= 8) /* 8 blocks at a time with NEON */
+	{
+	  _gcry_serpent_neon_blk8 (ctx, out, in, encrypt);
+	  out += 8 * sizeof(serpent_block_t);
+	  in += 8 * sizeof(serpent_block_t);
+	  num_blks -= 8;
+	}
+    }
+#endif
+
+  while (num_blks >= 1) /* remaining blocks, one at a time (scalar) */
+    {
+      if (encrypt)
+	serpent_encrypt_internal((void *)ctx, in, out);
+      else
+	serpent_decrypt_internal((void *)ctx, in, out);
+
+      burn = 2 * sizeof(serpent_block_t); /* stack-burn estimate for scalar path */
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += sizeof(serpent_block_t);
+      in += sizeof(serpent_block_t);
+      num_blks--;
+    }
+
+  return burn_stack_depth;
+}
+
+static unsigned int
+serpent_encrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+			unsigned int num_blks)
+{
+  return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 1); /* encrypt=1 */
+}
+
+static unsigned int
+serpent_decrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+			unsigned int num_blks)
+{
+  return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 0); /* encrypt=0 */
+}
+
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_serpent_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+			 const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process all blocks through the generic XTS bulk helper. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[16 * 16]; /* scratch for up to 16 blocks */
+      unsigned int tmp_used = 16;
+      size_t nburn;
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16
+                                              : serpent_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks,
+                                 tweak, tmpbuf, sizeof(tmpbuf) / 16,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used); /* scratch may hold sensitive data */
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg,
+			 size_t nblocks, int encrypt)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process all blocks through the generic ECB bulk helper. */
+  if (nblocks)
+    {
+      size_t nburn;
+
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16
+                                              : serpent_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks, 16);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
 
 
 /* Serpent test.  */
-- 
2.37.2




More information about the Gcrypt-devel mailing list