[PATCH 02/10] Improve parallelizability of CBC decryption for AES-NI
    Jussi Kivilinna 
    jussi.kivilinna at mbnet.fi
       
    Fri Nov 23 18:21:59 CET 2012
    
    
  
* cipher/rijndael.c (_gcry_aes_cbc_dec) [USE_AESNI]: Add AES-NI
specific CBC mode loop with temporary block and IV stored in free SSE
registers.
--
Benchmark results on Intel Core i5-2450M (x86-64) show a ~2.5x to 2.9x improvement in CBC decryption (second CBC column):
Before:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
AES            690ms   780ms  2940ms  2110ms  1880ms   670ms  2250ms  2250ms   490ms   500ms
AES192         890ms   930ms  3260ms  2390ms  2220ms   820ms  2580ms  2590ms   560ms   570ms
AES256        1040ms  1070ms  3590ms  2640ms  2540ms   970ms  2880ms  2890ms   650ms   650ms
After:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
AES            670ms   770ms  2920ms   720ms  1900ms   660ms  2260ms  2250ms   480ms   500ms
AES192         860ms   930ms  3250ms   870ms  2210ms   830ms  2580ms  2580ms   570ms   570ms
AES256        1020ms  1080ms  3580ms  1030ms  2550ms   970ms  2880ms  2870ms   660ms   660ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
---
 cipher/rijndael.c |   97 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 75 insertions(+), 22 deletions(-)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index d081b42..104f869 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1582,33 +1582,86 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
   int i;
   unsigned char savebuf[BLOCKSIZE];
 
-  aesni_prepare ();
-  for ( ;nblocks; nblocks-- )
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy (savebuf, inbuf, BLOCKSIZE);
+      aesni_prepare ();
 
-      if (0)
-        ;
+      if (!ctx->decryption_prepared )
+        {
+          prepare_decryption ( ctx );
+          ctx->decryption_prepared = 1;
+        }
+
+      /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
+         storage, out-of-order CPUs see parallelism even over loop iterations
+         and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
+         improvements are possible with do_aesni_cbc_dec_4() when implemented.
+       */
+      asm volatile
+        ("movdqu %[iv], %%xmm3\n\t"	/* use xmm3 as fast IV storage */
+         : /* No output */
+         : [iv] "m" (*iv)
+         : "memory");
+
+      for ( ;nblocks; nblocks-- )
+        {
+          asm volatile
+            ("movdqu %[inbuf], %%xmm2\n\t"	/* use xmm2 as savebuf */
+             : /* No output */
+             : [inbuf] "m" (*inbuf)
+             : "memory");
+
+          /* uses only xmm0 and xmm1 */
+          do_aesni_dec_aligned (ctx, outbuf, inbuf);
+
+          asm volatile
+            ("movdqu %[outbuf], %%xmm0\n\t"
+             "pxor %%xmm3, %%xmm0\n\t"		/* xor IV with output */
+             "movdqu %%xmm0, %[outbuf]\n\t"
+             "movdqu %%xmm2, %%xmm3\n\t"	/* store savebuf as new IV */
+             : /* No output */
+             : [outbuf] "m" (*outbuf)
+             : "memory");
+
+          outbuf += BLOCKSIZE;
+          inbuf  += BLOCKSIZE;
+        }
+
+      asm volatile
+        ("movdqu %%xmm3, %[iv]\n\t"	/* store IV */
+         : /* No output */
+         : [iv] "m" (*iv)
+         : "memory");
+
+      aesni_cleanup ();
+      aesni_cleanup_2_4 ();
+    }
+#endif /*USE_AESNI*/
+  else
+    for ( ;nblocks; nblocks-- )
+      {
+        /* We need to save INBUF away because it may be identical to
+           OUTBUF.  */
+        memcpy (savebuf, inbuf, BLOCKSIZE);
+
+        if (0)
+          ;
 #ifdef USE_PADLOCK
-      else if (ctx->use_padlock)
-        do_padlock (ctx, 1, outbuf, inbuf);
+        else if (ctx->use_padlock)
+          do_padlock (ctx, 1, outbuf, inbuf);
 #endif /*USE_PADLOCK*/
-#ifdef USE_AESNI
-      else if (ctx->use_aesni)
-        do_aesni (ctx, 1, outbuf, inbuf);
-#endif /*USE_AESNI*/
-      else
-        do_decrypt (ctx, outbuf, inbuf);
+        else
+          do_decrypt (ctx, outbuf, inbuf);
 
-      for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
-        outbuf[i] ^= *ivp++;
-      memcpy (iv, savebuf, BLOCKSIZE);
-      inbuf += BLOCKSIZE;
-      outbuf += BLOCKSIZE;
-    }
-  aesni_cleanup ();
+        for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
+          outbuf[i] ^= *ivp++;
+        memcpy (iv, savebuf, BLOCKSIZE);
+        inbuf += BLOCKSIZE;
+        outbuf += BLOCKSIZE;
+      }
 
   _gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*));
 }
    
    
More information about the Gcrypt-devel
mailing list