[PATCH 8/8] serpent: accelerate XTS and ECB modes
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Oct 23 18:16:08 CEST 2022
* cipher/serpent-armv7-neon.S (_gcry_serpent_neon_blk8): New.
* cipher/serpent-avx2-amd64.S (_gcry_serpent_avx2_blk16): New.
* cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_blk8): New.
* cipher/serpent.c (_gcry_serpent_sse2_blk8)
(_gcry_serpent_avx2_blk16, _gcry_serpent_neon_blk8)
(_gcry_serpent_xts_crypt, _gcry_serpent_ecb_crypt)
(serpent_crypt_blk1_16, serpent_encrypt_blk1_16)
(serpent_decrypt_blk1_16): New.
(serpent_setkey): Setup XTS and ECB bulk functions.
--
Benchmark on AMD Ryzen 9 7900X:
Before:
SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 5.42 ns/B 176.0 MiB/s 30.47 c/B 5625
ECB dec | 4.82 ns/B 197.9 MiB/s 27.11 c/B 5625
XTS enc | 5.57 ns/B 171.3 MiB/s 31.31 c/B 5625
XTS dec | 4.99 ns/B 191.1 MiB/s 28.07 c/B 5625
After:
SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.708 ns/B 1347 MiB/s 3.98 c/B 5625
ECB dec | 0.694 ns/B 1373 MiB/s 3.91 c/B 5625
XTS enc | 0.766 ns/B 1246 MiB/s 4.31 c/B 5625
XTS dec | 0.754 ns/B 1264 MiB/s 4.24 c/B 5625
GnuPG-bug-id: T6242
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/serpent-armv7-neon.S | 56 ++++++++++++++
cipher/serpent-avx2-amd64.S | 50 ++++++++++++
cipher/serpent-sse2-amd64.S | 65 ++++++++++++++++
cipher/serpent.c | 147 +++++++++++++++++++++++++++++++++++-
4 files changed, 317 insertions(+), 1 deletion(-)
diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S
index adff6394..4179ba2c 100644
--- a/cipher/serpent-armv7-neon.S
+++ b/cipher/serpent-armv7-neon.S
@@ -600,6 +600,62 @@ __serpent_dec_blk8:
bx lr;
.size __serpent_dec_blk8,.-__serpent_dec_blk8;
+.align 3
+.globl _gcry_serpent_neon_blk8
+.type _gcry_serpent_neon_blk8,%function;
+_gcry_serpent_neon_blk8:
+	/* input:
+	 *	r0: ctx, CTX
+	 *	r1: dst (8 blocks)
+	 *	r2: src (8 blocks)
+	 *	r3: encrypt (0 = decrypt, nonzero = encrypt)
+	 */
+
+	push {lr};
+	vpush {RA4-RB2};
+
+	cmp r3, #0
+
+	vld1.8 {RA0, RA1}, [r2]!;
+	vld1.8 {RA2, RA3}, [r2]!;
+	vld1.8 {RB0, RB1}, [r2]!;
+	vld1.8 {RB2, RB3}, [r2]!;
+
+	beq .Lblk8_dec;
+	bl __serpent_enc_blk8;
+	/* __serpent_enc_blk8 leaves ciphertext in permuted register order. */
+	vst1.8 {RA4}, [r1]!;
+	vst1.8 {RA1, RA2}, [r1]!;
+	vst1.8 {RA0}, [r1]!;
+	vst1.8 {RB4}, [r1]!;
+	vst1.8 {RB1, RB2}, [r1]!;
+	vst1.8 {RB0}, [r1]!;
+	b .Lblk8_end;
+.Lblk8_dec:
+	bl __serpent_dec_blk8;
+	vst1.8 {RA0, RA1}, [r1]!;
+	vst1.8 {RA2, RA3}, [r1]!;
+	vst1.8 {RB0, RB1}, [r1]!;
+	vst1.8 {RB2, RB3}, [r1]!;
+
+.Lblk8_end:
+	/* clear the used registers */
+	veor RA0, RA0;
+	veor RA1, RA1;
+	veor RA2, RA2;
+	veor RA3, RA3;
+
+	vpop {RA4-RB2};
+
+	veor RB3, RB3;
+	veor RB4, RB4;
+	veor RT0, RT0;
+	veor RT1, RT1;
+	veor RT2, RT2;
+	veor RT3, RT3;
+
+	pop {pc};
+.size _gcry_serpent_neon_blk8,.-_gcry_serpent_neon_blk8;
+
.align 3
.globl _gcry_serpent_neon_ctr_enc
.type _gcry_serpent_neon_ctr_enc,%function;
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index d3515a21..54ff61e4 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -583,6 +583,56 @@ __serpent_dec_blk16:
CFI_ENDPROC();
ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)
+.align 8
+.globl _gcry_serpent_avx2_blk16
+ELF(.type _gcry_serpent_avx2_blk16,@function;)
+_gcry_serpent_avx2_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%ecx: encrypt (0 = decrypt, nonzero = encrypt)
+	 */
+	CFI_STARTPROC();
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RA1;
+	vmovdqu (2 * 32)(%rdx), RA2;
+	vmovdqu (3 * 32)(%rdx), RA3;
+	vmovdqu (4 * 32)(%rdx), RB0;
+	vmovdqu (5 * 32)(%rdx), RB1;
+	vmovdqu (6 * 32)(%rdx), RB2;
+	vmovdqu (7 * 32)(%rdx), RB3;
+
+	testl %ecx, %ecx;
+	jz .Lblk16_dec;
+	call __serpent_enc_blk16;
+	/* __serpent_enc_blk16 leaves ciphertext in permuted register order. */
+	vmovdqu RA4, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA0, (3 * 32)(%rsi);
+	vmovdqu RB4, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB0, (7 * 32)(%rsi);
+	jmp .Lblk16_end;
+.Lblk16_dec:
+	call __serpent_dec_blk16;
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB3, (7 * 32)(%rsi);
+
+.Lblk16_end:
+	/* vzeroall clears all ymm registers, wiping key-dependent state. */
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;)
+
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index b5935095..01723a2a 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -605,6 +605,71 @@ __serpent_dec_blk8:
CFI_ENDPROC();
ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)
+.align 8
+.globl _gcry_serpent_sse2_blk8
+ELF(.type _gcry_serpent_sse2_blk8,@function;)
+_gcry_serpent_sse2_blk8:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%ecx: encrypt (0 = decrypt, nonzero = encrypt)
+	 */
+	CFI_STARTPROC();
+
+	movdqu (0 * 16)(%rdx), RA0;
+	movdqu (1 * 16)(%rdx), RA1;
+	movdqu (2 * 16)(%rdx), RA2;
+	movdqu (3 * 16)(%rdx), RA3;
+	movdqu (4 * 16)(%rdx), RB0;
+	movdqu (5 * 16)(%rdx), RB1;
+	movdqu (6 * 16)(%rdx), RB2;
+	movdqu (7 * 16)(%rdx), RB3;
+
+	testl %ecx, %ecx;
+	jz .Lblk8_dec;
+	call __serpent_enc_blk8;
+	/* __serpent_enc_blk8 leaves ciphertext in permuted register order. */
+	movdqu RA4, (0 * 16)(%rsi);
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA0, (3 * 16)(%rsi);
+	movdqu RB4, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB0, (7 * 16)(%rsi);
+	jmp .Lblk8_end;
+.Lblk8_dec:
+	call __serpent_dec_blk8;
+	movdqu RA0, (0 * 16)(%rsi);
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA3, (3 * 16)(%rsi);
+	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB3, (7 * 16)(%rsi);
+
+.Lblk8_end:
+	/* clear the used registers */
+	pxor RA0, RA0;
+	pxor RA1, RA1;
+	pxor RA2, RA2;
+	pxor RA3, RA3;
+	pxor RA4, RA4;
+	pxor RB0, RB0;
+	pxor RB1, RB1;
+	pxor RB2, RB2;
+	pxor RB3, RB3;
+	pxor RB4, RB4;
+	pxor RTMP0, RTMP0;
+	pxor RTMP1, RTMP1;
+	pxor RTMP2, RTMP2;
+	pxor RNOT, RNOT;
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;)
+
.align 8
.globl _gcry_serpent_sse2_ctr_enc
ELF(.type _gcry_serpent_sse2_ctr_enc, at function;)
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 93c561c5..0a9ed27c 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -139,6 +139,9 @@ extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_blk8(const serpent_context_t *c, byte *out,
+ const byte *in, int encrypt) ASM_FUNC_ABI;
#endif
#ifdef USE_AVX2
@@ -179,6 +182,9 @@ extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_blk16(const serpent_context_t *c, byte *out,
+ const byte *in, int encrypt) ASM_FUNC_ABI;
#endif
#ifdef USE_NEON
@@ -219,6 +225,9 @@ extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx,
unsigned char *offset,
unsigned char *checksum,
const void *Ls[8]);
+
+extern void _gcry_serpent_neon_blk8(const serpent_context_t *c, byte *out,
+ const byte *in, int encrypt);
#endif
@@ -239,6 +248,12 @@ static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
int encrypt);
static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks);
+static void _gcry_serpent_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+static void _gcry_serpent_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
/*
@@ -790,7 +805,9 @@ serpent_setkey (void *ctx,
bulk_ops->cfb_dec = _gcry_serpent_cfb_dec;
bulk_ops->ctr_enc = _gcry_serpent_ctr_enc;
bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt;
- bulk_ops->ocb_auth = _gcry_serpent_ocb_auth;
+ bulk_ops->ocb_auth = _gcry_serpent_ocb_auth;
+ bulk_ops->xts_crypt = _gcry_serpent_xts_crypt;
+ bulk_ops->ecb_crypt = _gcry_serpent_ecb_crypt;
if (serpent_test_ret)
ret = GPG_ERR_SELFTEST_FAILED;
@@ -1538,6 +1555,134 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
return nblocks;
}
+
+/* Encrypt or decrypt NUM_BLKS (1..16) 16-byte blocks, dispatching to the
+ * widest available SIMD implementation and falling back to the scalar
+ * code for any tail.  Returns the required stack burn depth in bytes. */
+static unsigned int
+serpent_crypt_blk1_16(const void *context, byte *out, const byte *in,
+		      unsigned int num_blks, int encrypt)
+{
+  const serpent_context_t *ctx = context;
+  unsigned int burn, burn_stack_depth = 0;
+
+#ifdef USE_AVX2
+  if (num_blks == 16 && ctx->use_avx2)
+    {
+      /* Whole batch handled at once; the asm ends with vzeroall, so no
+       * stack burn is required. */
+      _gcry_serpent_avx2_blk16 (ctx, out, in, encrypt);
+      return 0;
+    }
+#endif
+
+#ifdef USE_SSE2
+  /* No runtime check: SSE2 is baseline on amd64 builds of this module. */
+  while (num_blks >= 8)
+    {
+      _gcry_serpent_sse2_blk8 (ctx, out, in, encrypt);
+      out += 8 * sizeof(serpent_block_t);
+      in += 8 * sizeof(serpent_block_t);
+      num_blks -= 8;
+    }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      while (num_blks >= 8)
+	{
+	  _gcry_serpent_neon_blk8 (ctx, out, in, encrypt);
+	  out += 8 * sizeof(serpent_block_t);
+	  in += 8 * sizeof(serpent_block_t);
+	  num_blks -= 8;
+	}
+    }
+#endif
+
+  /* Scalar fallback for remaining blocks. */
+  while (num_blks >= 1)
+    {
+      if (encrypt)
+	serpent_encrypt_internal((void *)ctx, in, out);
+      else
+	serpent_decrypt_internal((void *)ctx, in, out);
+
+      burn = 2 * sizeof(serpent_block_t);
+      burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth;
+      out += sizeof(serpent_block_t);
+      in += sizeof(serpent_block_t);
+      num_blks--;
+    }
+
+  return burn_stack_depth;
+}
+
+/* Bulk-helper entry point: encrypt 1..16 blocks (see serpent_crypt_blk1_16). */
+static unsigned int
+serpent_encrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+			unsigned int num_blks)
+{
+  return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 1);
+}
+
+/* Bulk-helper entry point: decrypt 1..16 blocks (see serpent_crypt_blk1_16). */
+static unsigned int
+serpent_decrypt_blk1_16(const void *ctx, byte *out, const byte *in,
+			unsigned int num_blks)
+{
+  return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 0);
+}
+
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_serpent_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+			 const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* All blocks are processed through the generic bulk XTS helper. */
+  if (nblocks)
+    {
+      /* Scratch space for up to 16 blocks of 16 bytes each. */
+      unsigned char tmpbuf[16 * 16];
+      unsigned int tmp_used = 16;
+      size_t nburn;
+
+      nburn = bulk_xts_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16
+                                              : serpent_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks,
+                                 tweak, tmpbuf, sizeof(tmpbuf) / 16,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      /* Wipe only the portion of the scratch buffer actually touched. */
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk encryption/decryption in ECB mode. */
+static void
+_gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg,
+			 size_t nblocks, int encrypt)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* All blocks are processed through the generic bulk ECB helper. */
+  if (nblocks)
+    {
+      size_t nburn;
+
+      nburn = bulk_ecb_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16
+                                              : serpent_decrypt_blk1_16,
+                                 outbuf, inbuf, nblocks, 16);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
/* Serpent test. */
--
2.37.2
More information about the Gcrypt-devel
mailing list