[PATCH 3/3] serpent: add parallel processing for CFB decryption

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu May 23 13:15:51 CEST 2013


* cipher/cipher.c (gcry_cipher_open): Add bulf CFB decryption function
for Serpent.
* cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_cfb_dec): New
function.
* cipher/serpent.c (_gcry_serpent_sse2_cfb_dec): New prototype.
(_gcry_serpent_cfb_dec) New function.
(selftest_cfb_128) New function.
(selftest) Call selftest_cfb_128.
* src/cipher.h (_gcry_serpent_cfb_dec): New prototype.
--

Patch makes Serpent-CFB decryption 4.0 times faster on Intel Sandy-Bridge and
2.7 times faster on AMD K10.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/cipher.c             |    1 
 cipher/serpent-sse2-amd64.S |   66 ++++++++++++++++++++++++++++++++
 cipher/serpent.c            |   88 +++++++++++++++++++++++++++++++++++++++++++
 src/cipher.h                |    3 +
 4 files changed, 158 insertions(+)

diff --git a/cipher/cipher.c b/cipher/cipher.c
index e9a652f..652d795 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -732,6 +732,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
 	    case GCRY_CIPHER_SERPENT192:
 	    case GCRY_CIPHER_SERPENT256:
               h->bulk.cbc_dec = _gcry_serpent_cbc_dec;
+              h->bulk.cfb_dec = _gcry_serpent_cfb_dec;
               h->bulk.ctr_enc = _gcry_serpent_ctr_enc;
               break;
 #endif /*USE_SERPENT*/
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 8d8c8dd..5f9e9d2 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -822,5 +822,71 @@ _gcry_serpent_sse2_cbc_dec:
 	ret
 .size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;
 
+.align 8
+.global _gcry_serpent_sse2_cfb_dec
+.type   _gcry_serpent_sse2_cfb_dec, at function;
+_gcry_serpent_sse2_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv
+	 */
+
+	.set RA0, enc_in_a0
+	.set RA1, enc_in_a1
+	.set RA2, enc_in_a2
+	.set RA3, enc_in_a3
+	.set RB0, enc_in_b0
+	.set RB1, enc_in_b1
+	.set RB2, enc_in_b2
+	.set RB3, enc_in_b3
+
+	/* Load input */
+	movdqu (%rcx), RA0;
+	movdqu 0 * 16(%rdx), RA1;
+	movdqu 1 * 16(%rdx), RA2;
+	movdqu 2 * 16(%rdx), RA3;
+	movdqu 3 * 16(%rdx), RB0;
+	movdqu 4 * 16(%rdx), RB1;
+	movdqu 5 * 16(%rdx), RB2;
+	movdqu 6 * 16(%rdx), RB3;
+
+	/* Update IV */
+	movdqu 7 * 16(%rdx), RNOT;
+	movdqu RNOT, (%rcx);
+
+	call __serpent_enc_blk8;
+
+	.set RA0, enc_out_a0
+	.set RA1, enc_out_a1
+	.set RA2, enc_out_a2
+	.set RA3, enc_out_a3
+	.set RB0, enc_out_b0
+	.set RB1, enc_out_b1
+	.set RB2, enc_out_b2
+	.set RB3, enc_out_b3
+
+	pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+	pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+	pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+	pxor_u((3 * 16)(%rdx), RA3, RTMP0);
+	pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+	pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+	pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+	pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+
+	movdqu RA0, (0 * 16)(%rsi);
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA3, (3 * 16)(%rsi);
+	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB3, (7 * 16)(%rsi);
+
+	ret
+.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;
+
 #endif /*defined(USE_SERPENT)*/
 #endif /*__x86_64*/
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 7b82b48..95ac7c1 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -74,6 +74,11 @@ extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
 				       unsigned char *iv);
+
+extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *iv);
 #endif
 
 /* A prototype.  */
@@ -916,6 +921,71 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_serpent_cfb_dec(void *context, unsigned char *iv,
+                      void *outbuf_arg, const void *inbuf_arg,
+                      unsigned int nblocks)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_SSE2
+  {
+    int did_use_sse2 = 0;
+
+    /* Process data in 8 block chunks. */
+    while (nblocks >= 8)
+      {
+        _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 8;
+        outbuf += 8 * sizeof(serpent_block_t);
+        inbuf  += 8 * sizeof(serpent_block_t);
+        did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+        /* clear SSE2 registers used by serpent-sse2 */
+        asm volatile (
+          "pxor %%xmm0, %%xmm0;\n"
+          "pxor %%xmm1, %%xmm1;\n"
+          "pxor %%xmm2, %%xmm2;\n"
+          "pxor %%xmm3, %%xmm3;\n"
+          "pxor %%xmm4, %%xmm4;\n"
+          "pxor %%xmm5, %%xmm5;\n"
+          "pxor %%xmm6, %%xmm6;\n"
+          "pxor %%xmm7, %%xmm7;\n"
+          "pxor %%xmm10, %%xmm10;\n"
+          "pxor %%xmm11, %%xmm11;\n"
+          "pxor %%xmm12, %%xmm12;\n"
+          "pxor %%xmm13, %%xmm13;\n"
+          :::);
+
+        /* serpent-sse2 assembly code does not use stack */
+        if (nblocks == 0)
+          burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      serpent_encrypt_internal(ctx, iv, iv);
+      buf_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t));
+      outbuf += sizeof(serpent_block_t);
+      inbuf  += sizeof(serpent_block_t);
+    }
+
+  _gcry_burn_stack(burn_stack_depth);
+}
+
 

 
 /* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
@@ -948,6 +1018,21 @@ selftest_cbc_128 (void)
 }
 
 
+/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+  const int nblocks = 8+2;
+  const int blocksize = sizeof(serpent_block_t);
+  const int context_size = sizeof(serpent_context_t);
+
+  return _gcry_selftest_helper_cfb_128("SERPENT", &serpent_setkey,
+           &serpent_encrypt, &_gcry_serpent_cfb_dec, nblocks, blocksize,
+	   context_size);
+}
+
+
 /* Serpent test.  */
 
 static const char *
@@ -1034,6 +1119,9 @@ serpent_test (void)
   if ( (r = selftest_cbc_128 ()) )
     return r;
 
+  if ( (r = selftest_cfb_128 ()) )
+    return r;
+
   return NULL;
 }
 
diff --git a/src/cipher.h b/src/cipher.h
index f28990d..9d6cc01 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -113,6 +113,9 @@ void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
 void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
                             void *outbuf_arg, const void *inbuf_arg,
                             unsigned int nblocks);
+void _gcry_serpent_cfb_dec (void *context, unsigned char *iv,
+                            void *outbuf_arg, const void *inbuf_arg,
+                            unsigned int nblocks);
 
 /*-- dsa.c --*/
 void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);




More information about the Gcrypt-devel mailing list