[PATCH 3/3] serpent: add parallel processing for CFB decryption
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu May 23 13:15:51 CEST 2013
* cipher/cipher.c (gcry_cipher_open): Add bulf CFB decryption function
for Serpent.
* cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_cfb_dec): New
function.
* cipher/serpent.c (_gcry_serpent_sse2_cfb_dec): New prototype.
(_gcry_serpent_cfb_dec) New function.
(selftest_cfb_128) New function.
(selftest) Call selftest_cfb_128.
* src/cipher.h (_gcry_serpent_cfb_dec): New prototype.
--
Patch makes Serpent-CFB decryption 4.0 times faster on Intel Sandy-Bridge and
2.7 times faster on AMD K10.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/cipher.c | 1
cipher/serpent-sse2-amd64.S | 66 ++++++++++++++++++++++++++++++++
cipher/serpent.c | 88 +++++++++++++++++++++++++++++++++++++++++++
src/cipher.h | 3 +
4 files changed, 158 insertions(+)
diff --git a/cipher/cipher.c b/cipher/cipher.c
index e9a652f..652d795 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -732,6 +732,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
case GCRY_CIPHER_SERPENT192:
case GCRY_CIPHER_SERPENT256:
h->bulk.cbc_dec = _gcry_serpent_cbc_dec;
+ h->bulk.cfb_dec = _gcry_serpent_cfb_dec;
h->bulk.ctr_enc = _gcry_serpent_ctr_enc;
break;
#endif /*USE_SERPENT*/
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 8d8c8dd..5f9e9d2 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -822,5 +822,71 @@ _gcry_serpent_sse2_cbc_dec:
ret
.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;
+.align 8
+.global _gcry_serpent_sse2_cfb_dec
+.type _gcry_serpent_sse2_cfb_dec, at function;
+_gcry_serpent_sse2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+
+ .set RA0, enc_in_a0
+ .set RA1, enc_in_a1
+ .set RA2, enc_in_a2
+ .set RA3, enc_in_a3
+ .set RB0, enc_in_b0
+ .set RB1, enc_in_b1
+ .set RB2, enc_in_b2
+ .set RB3, enc_in_b3
+
+ /* Load input */
+ movdqu (%rcx), RA0;
+ movdqu 0 * 16(%rdx), RA1;
+ movdqu 1 * 16(%rdx), RA2;
+ movdqu 2 * 16(%rdx), RA3;
+ movdqu 3 * 16(%rdx), RB0;
+ movdqu 4 * 16(%rdx), RB1;
+ movdqu 5 * 16(%rdx), RB2;
+ movdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV */
+ movdqu 7 * 16(%rdx), RNOT;
+ movdqu RNOT, (%rcx);
+
+ call __serpent_enc_blk8;
+
+ .set RA0, enc_out_a0
+ .set RA1, enc_out_a1
+ .set RA2, enc_out_a2
+ .set RA3, enc_out_a3
+ .set RB0, enc_out_b0
+ .set RB1, enc_out_b1
+ .set RB2, enc_out_b2
+ .set RB3, enc_out_b3
+
+ pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+ pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA3, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+ pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+
+ movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA3, (3 * 16)(%rsi);
+ movdqu RB0, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB3, (7 * 16)(%rsi);
+
+ ret
+.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;
+
#endif /*defined(USE_SERPENT)*/
#endif /*__x86_64*/
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 7b82b48..95ac7c1 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -74,6 +74,11 @@ extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
+
+extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
#endif
/* A prototype. */
@@ -916,6 +921,71 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_serpent_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks)
+{
+ serpent_context_t *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+
+ if (did_use_sse2)
+ {
+ /* clear SSE2 registers used by serpent-sse2 */
+ asm volatile (
+ "pxor %%xmm0, %%xmm0;\n"
+ "pxor %%xmm1, %%xmm1;\n"
+ "pxor %%xmm2, %%xmm2;\n"
+ "pxor %%xmm3, %%xmm3;\n"
+ "pxor %%xmm4, %%xmm4;\n"
+ "pxor %%xmm5, %%xmm5;\n"
+ "pxor %%xmm6, %%xmm6;\n"
+ "pxor %%xmm7, %%xmm7;\n"
+ "pxor %%xmm10, %%xmm10;\n"
+ "pxor %%xmm11, %%xmm11;\n"
+ "pxor %%xmm12, %%xmm12;\n"
+ "pxor %%xmm13, %%xmm13;\n"
+ :::);
+
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ serpent_encrypt_internal(ctx, iv, iv);
+ buf_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t));
+ outbuf += sizeof(serpent_block_t);
+ inbuf += sizeof(serpent_block_t);
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
@@ -948,6 +1018,21 @@ selftest_cbc_128 (void)
}
+/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = sizeof(serpent_block_t);
+ const int context_size = sizeof(serpent_context_t);
+
+ return _gcry_selftest_helper_cfb_128("SERPENT", &serpent_setkey,
+ &serpent_encrypt, &_gcry_serpent_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
+
/* Serpent test. */
static const char *
@@ -1034,6 +1119,9 @@ serpent_test (void)
if ( (r = selftest_cbc_128 ()) )
return r;
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return NULL;
}
diff --git a/src/cipher.h b/src/cipher.h
index f28990d..9d6cc01 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -113,6 +113,9 @@ void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
+void _gcry_serpent_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks);
/*-- dsa.c --*/
void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);
More information about the Gcrypt-devel
mailing list