[PATCH 07/10] Add parallelized AES-NI ECB encryption

Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Fri Nov 23 18:22:25 CET 2012


* cipher/cipher-internal.h (struct gcry_cipher_handle): Add
bulk.ecb_enc.
* cipher/cipher.c (gcry_cipher_open) [USE_AES]: Set bulk.ecb_enc
to _gcry_aes_ecb_enc.
(do_ecb_encrypt): Redirect call into bulk.ecb_enc if non-null.
* src/cipher.h (_gcry_aes_ecb_enc): Add new function prototype.
* cipher/rijndael.c (_gcry_aes_ecb_enc): Add new function.
[USE_AESNI] (do_aesni_enc_vec4): Add new function.
--

Parallelized ECB encryption is roughly 2.0x faster on Intel Sandy Bridge (x86-64): in the benchmark below, the ECB encryption time for AES-128 drops from 690ms to 340ms.
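
The speedup comes from the new do_aesni_enc_vec4, which keeps four
independent blocks in flight in xmm1..xmm4 so that the latency of each
AESENC instruction is hidden behind work on the other three blocks.  For
readers who prefer intrinsics over the raw .byte-encoded opcodes used in
the patch (presumably kept so that no AES-NI-aware assembler is needed),
the idea corresponds roughly to the sketch below.  It is illustrative only
and not part of the patch: the helper name is made up, it assumes an
AES-128 key schedule (10 rounds, 11 round keys of 16 bytes each), and it
needs to be built with -maes.

  /* Illustrative sketch, not part of the patch: encrypt four 16-byte
   * blocks in parallel with AES-NI intrinsics (AES-128 only).  */
  #include <emmintrin.h>   /* SSE2 loads/stores/xor */
  #include <wmmintrin.h>   /* AES-NI intrinsics */

  static void
  enc4_sketch (const unsigned char *rk /* 11 * 16 bytes */,
               unsigned char *out, const unsigned char *in)
  {
    __m128i b0 = _mm_loadu_si128 ((const __m128i *)(in +  0));
    __m128i b1 = _mm_loadu_si128 ((const __m128i *)(in + 16));
    __m128i b2 = _mm_loadu_si128 ((const __m128i *)(in + 32));
    __m128i b3 = _mm_loadu_si128 ((const __m128i *)(in + 48));
    __m128i k  = _mm_loadu_si128 ((const __m128i *)rk);
    int r;

    /* Initial whitening: XOR round key 0 into all four blocks.  */
    b0 = _mm_xor_si128 (b0, k);
    b1 = _mm_xor_si128 (b1, k);
    b2 = _mm_xor_si128 (b2, k);
    b3 = _mm_xor_si128 (b3, k);

    /* Rounds 1..9: one AESENC per block per round; the four blocks are
     * independent, so they pipeline.  */
    for (r = 1; r < 10; r++)
      {
        k  = _mm_loadu_si128 ((const __m128i *)(rk + 16 * r));
        b0 = _mm_aesenc_si128 (b0, k);
        b1 = _mm_aesenc_si128 (b1, k);
        b2 = _mm_aesenc_si128 (b2, k);
        b3 = _mm_aesenc_si128 (b3, k);
      }

    /* Final round uses AESENCLAST with the last round key.  */
    k  = _mm_loadu_si128 ((const __m128i *)(rk + 16 * 10));
    b0 = _mm_aesenclast_si128 (b0, k);
    b1 = _mm_aesenclast_si128 (b1, k);
    b2 = _mm_aesenclast_si128 (b2, k);
    b3 = _mm_aesenclast_si128 (b3, k);

    _mm_storeu_si128 ((__m128i *)(out +  0), b0);
    _mm_storeu_si128 ((__m128i *)(out + 16), b1);
    _mm_storeu_si128 ((__m128i *)(out + 32), b2);
    _mm_storeu_si128 ((__m128i *)(out + 48), b3);
  }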

Before:

Running each test 1000 times.
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
AES            690ms   350ms  2130ms   470ms  1890ms   670ms  2220ms  2240ms   490ms   490ms
AES192         900ms   440ms  2460ms   560ms  2210ms   840ms  2550ms  2560ms   570ms   570ms
AES256        1040ms   520ms  2800ms   640ms  2550ms   970ms  2840ms  2850ms   660ms   650ms

After:

$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
AES            340ms   360ms  2130ms   470ms  1870ms   690ms  2200ms  2250ms   500ms   490ms
AES192         430ms   440ms  2460ms   550ms  2210ms   820ms  2540ms  2560ms   570ms   570ms
AES256         500ms   520ms  2790ms   640ms  2540ms   960ms  2830ms  2840ms   650ms   650ms
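
Any application that hands several complete 16-byte blocks to a single
gcry_cipher_encrypt() call in ECB mode picks up the bulk path
automatically; no API change is needed.  A minimal caller-side sketch
(illustrative only, not part of the patch, error handling omitted):

  #include <stddef.h>
  #include <gcrypt.h>

  /* Encrypt BUF of LEN bytes (a multiple of 16) in place with AES-128
   * in ECB mode.  With this patch, all complete blocks of the single
   * gcry_cipher_encrypt() call below go through bulk.ecb_enc and thus
   * the 4-block AES-NI routine when available.  */
  static void
  ecb_encrypt_buffer (const unsigned char key[16],
                      unsigned char *buf, size_t len)
  {
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey (hd, key, 16);
    gcry_cipher_encrypt (hd, buf, len, NULL, 0);  /* in-place */
    gcry_cipher_close (hd);
  }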

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
---
 cipher/cipher-internal.h |    3 +
 cipher/cipher.c          |    8 ++
 cipher/rijndael.c        |  174 ++++++++++++++++++++++++++++++++++++++++++++++
 src/cipher.h             |    2 +
 4 files changed, 187 insertions(+)

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index dcce708..edd8e17 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -89,6 +89,9 @@ struct gcry_cipher_handle
     void (*ctr_enc)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     unsigned int nblocks);
+    void (*ecb_enc)(void *context, void *outbuf_arg,
+                    const void *inbuf_arg,
+                    unsigned int nblocks);
     void (*ecb_dec)(void *context, void *outbuf_arg,
                     const void *inbuf_arg,
                     unsigned int nblocks);
diff --git a/cipher/cipher.c b/cipher/cipher.c
index b0f9773..edc84f7 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -716,6 +716,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
               h->bulk.cbc_enc = _gcry_aes_cbc_enc;
               h->bulk.cbc_dec = _gcry_aes_cbc_dec;
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
+              h->bulk.ecb_enc = _gcry_aes_ecb_enc;
               h->bulk.ecb_dec = _gcry_aes_ecb_dec;
               break;
 #endif /*USE_AES*/
@@ -859,6 +860,13 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
 
   nblocks = inbuflen / c->cipher->blocksize;
 
+  if (nblocks && c->bulk.ecb_enc)
+    {
+      c->bulk.ecb_enc (&c->context.c, outbuf, inbuf, nblocks);
+
+      return 0;
+    }
+
   for (n=0; n < nblocks; n++ )
     {
       c->cipher->encrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf);
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 421b159..5110c72 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -822,6 +822,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
 }
 
 
+/* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4.  */
+static void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "cmp $10, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "cmp $12, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                aesenc_xmm0_xmm1
+                aesenc_xmm0_xmm2
+                aesenc_xmm0_xmm3
+                aesenc_xmm0_xmm4
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                aesenclast_xmm0_xmm1
+                aesenclast_xmm0_xmm2
+                aesenclast_xmm0_xmm3
+                aesenclast_xmm0_xmm4
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
 /* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4.  */
 static void
@@ -1476,6 +1585,71 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
 }
 
 
+/* Bulk encryption of complete blocks in ECB mode.  This function is only
+ * intended for the bulk encryption feature of cipher.c.  */
+void
+_gcry_aes_ecb_enc (void *context, void *outbuf_arg,
+		   const void *inbuf_arg, unsigned int nblocks)
+{
+  RIJNDAEL_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
+    {
+      aesni_prepare ();
+
+      for ( ;nblocks > 3 ; nblocks -= 4 )
+        {
+          asm volatile
+            ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+             "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+             "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+             "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+             : /* No output */
+             : [inbuf] "r" (inbuf)
+             : "memory");
+
+          do_aesni_enc_vec4 (ctx);
+
+          asm volatile
+            ("movdqu %%xmm1, 0*16(%[outbuf])\n\t" /* store output blocks */
+             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+             : /* No output */
+             : [outbuf] "r" (outbuf)
+             : "memory");
+
+          outbuf += 4*BLOCKSIZE;
+          inbuf  += 4*BLOCKSIZE;
+        }
+
+      for ( ;nblocks; nblocks-- )
+        {
+          do_aesni_enc_aligned (ctx, outbuf, inbuf);
+
+          inbuf += BLOCKSIZE;
+          outbuf += BLOCKSIZE;
+        }
+
+      aesni_cleanup ();
+      aesni_cleanup_2_5 ();
+    }
+#endif
+  else
+    for ( ;nblocks; nblocks-- )
+      {
+        rijndael_encrypt (context, outbuf, inbuf);
+        inbuf += BLOCKSIZE;
+        outbuf += BLOCKSIZE;
+      }
+}
+
+
 
 /* Decrypt one block.  A and B need to be aligned on a 4 byte boundary
    and the decryption must have been prepared.  A and B may be the
diff --git a/src/cipher.h b/src/cipher.h
index 6b34e90..66367c1 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -94,6 +94,8 @@ void _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
                         void *outbuf_arg, const void *inbuf_arg,
                         unsigned int nblocks);
+void _gcry_aes_ecb_enc (void *context, void *outbuf_arg,
+                        const void *inbuf_arg, unsigned int nblocks);
 void _gcry_aes_ecb_dec (void *context, void *outbuf_arg,
                         const void *inbuf_arg, unsigned int nblocks);
 



