[git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-32-ga00c5b2

From: Jussi Kivilinna <cvs at cvs.gnupg.org>
Date: Tue Jan 9 18:17:26 CET 2018


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  a00c5b2988cea256c7823a76ce601febf02c790f (commit)
       via  c9e9cb2eb6a1c659d3825ca627228b732f2f2152 (commit)
      from  b3ec0f752c925cde36f560f0f9309ab6450bbfd9 (commit)

The revisions listed above that are new to this repository have not
appeared in any other notification email, so they are listed in full
below.

- Log -----------------------------------------------------------------
commit a00c5b2988cea256c7823a76ce601febf02c790f
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Jan 6 18:53:20 2018 +0200

    Add AES-NI acceleration for AES-XTS
    
    * cipher/cipher-internal.h (gcry_cipher_handle): Change bulk
    XTS function to take cipher context.
    * cipher/cipher-xts.c (_gcry_cipher_xts_crypt): Ditto.
    * cipher/cipher.c (_gcry_cipher_open_internal): Setup AES-NI
    XTS bulk function.
    * cipher/rijndael-aesni.c (xts_gfmul_const, _gcry_aes_aesni_xts_enc)
    (_gcry_aes_aesni_xts_dec, _gcry_aes_aesni_xts_crypt): New.
    * cipher/rijndael.c (_gcry_aes_aesni_xts_crypt)
    (_gcry_aes_xts_crypt): New.
    * src/cipher.h (_gcry_aes_xts_crypt): New.
    --
    
    Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo):
    
    Before:
            XTS enc |      1.66 ns/B     575.7 MiB/s      6.63 c/B
            XTS dec |      1.66 ns/B     575.5 MiB/s      6.63 c/B
    
    After (~6x faster):
            XTS enc |     0.270 ns/B    3528.5 MiB/s      1.08 c/B
            XTS dec |     0.272 ns/B    3511.5 MiB/s      1.09 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
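
    [Editor's note] For reference, the tweak update that both code paths
    implement is the doubling of the 128-bit tweak in GF(2^128) with the
    reduction constant 0x87, as in the generic fallback added to rijndael.c
    below.  A standalone C sketch of that step (illustrative only, not part
    of the patch; the function name is made up):

    #include <stdint.h>

    /* Multiply a 128-bit XTS tweak, given as two little-endian 64-bit
     * halves, by x in GF(2^128), reducing with x^128 + x^7 + x^2 + x + 1
     * (the constant 0x87).  Mirrors the scalar path in _gcry_aes_xts_crypt. */
    static void
    xts_tweak_mul_x (uint64_t *lo, uint64_t *hi)
    {
      uint64_t carry = -(*hi >> 63) & 0x87; /* 0x87 iff bit 127 is set */

      *hi = (*hi << 1) + (*lo >> 63);       /* shift left by one, carry bit 63 up */
      *lo = (*lo << 1) ^ carry;             /* fold the reduction into the low half */
    }

    The AES-NI variant performs the same update in xmm registers: pshufd and
    psrad build per-quadword sign masks, and a single pand with the
    { 0x87, 0x01 } constant followed by pxor applies both the 0x87 reduction
    and the bit-63 carry between the two quadwords.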

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index b748125..8c897d7 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -146,7 +146,7 @@ struct gcry_cipher_handle
 			const void *inbuf_arg, size_t nblocks, int encrypt);
     size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg,
 		       size_t nblocks);
-    void (*xts_crypt)(gcry_cipher_hd_t c, unsigned char *tweak,
+    void (*xts_crypt)(void *context, unsigned char *tweak,
 		      void *outbuf_arg, const void *inbuf_arg,
 		      size_t nblocks, int encrypt);
   } bulk;
diff --git a/cipher/cipher-xts.c b/cipher/cipher-xts.c
index 4da89e5..06cefbe 100644
--- a/cipher/cipher-xts.c
+++ b/cipher/cipher-xts.c
@@ -93,7 +93,8 @@ _gcry_cipher_xts_crypt (gcry_cipher_hd_t c,
   /* Use a bulk method if available.  */
   if (nblocks && c->bulk.xts_crypt)
     {
-      c->bulk.xts_crypt (c, c->u_ctr.ctr, outbuf, inbuf, nblocks, encrypt);
+      c->bulk.xts_crypt (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks,
+			 encrypt);
       inbuf  += nblocks * GCRY_XTS_BLOCK_LEN;
       outbuf += nblocks * GCRY_XTS_BLOCK_LEN;
       inbuflen -= nblocks * GCRY_XTS_BLOCK_LEN;
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 9812738..063c13d 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -532,6 +532,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
               h->bulk.ocb_crypt = _gcry_aes_ocb_crypt;
               h->bulk.ocb_auth  = _gcry_aes_ocb_auth;
+              h->bulk.xts_crypt = _gcry_aes_xts_crypt;
               break;
 #endif /*USE_AES*/
 #ifdef USE_BLOWFISH
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 3d323cf..50a0745 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -3007,4 +3007,295 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 }
 
 
+static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) =
+  { 0x87, 0x01 };
+
+
+static void
+_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
+			 unsigned char *outbuf, const unsigned char *inbuf,
+			 size_t nblocks)
+{
+  aesni_prepare_2_6_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_6 ();
+
+  /* Preload Tweak */
+  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+		"movdqa %[gfmul], %%xmm6\n\t"
+		:
+		: [tweak] "m" (*tweak),
+		  [gfmul] "m" (*xts_gfmul_const)
+		: "memory" );
+
+  for ( ;nblocks >= 4; nblocks -= 4 )
+    {
+      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * 16))
+		    : [inbuf0] "m" (*(inbuf + 0 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * 16))
+		    : [inbuf1] "m" (*(inbuf + 1 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * 16))
+		    : [inbuf2] "m" (*(inbuf + 2 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm5,    %[outbuf3]\n\t"
+
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf3] "=m" (*(outbuf + 3 * 16))
+		    : [inbuf3] "m" (*(inbuf + 3 * 16))
+		    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+                    "pxor   %%xmm0,     %%xmm1\n\t"
+		    "movdqu %[outbuf1], %%xmm0\n\t"
+		    "movdqu %%xmm1,     %[outbuf0]\n\t"
+		    "movdqu %[outbuf2], %%xmm1\n\t"
+                    "pxor   %%xmm0,     %%xmm2\n\t"
+		    "movdqu %[outbuf3], %%xmm0\n\t"
+                    "pxor   %%xmm1,     %%xmm3\n\t"
+                    "pxor   %%xmm0,     %%xmm4\n\t"
+		    "movdqu %%xmm2,     %[outbuf1]\n\t"
+		    "movdqu %%xmm3,     %[outbuf2]\n\t"
+		    "movdqu %%xmm4,     %[outbuf3]\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+		      [outbuf1] "+m" (*(outbuf + 1 * 16)),
+		      [outbuf2] "+m" (*(outbuf + 2 * 16)),
+		      [outbuf3] "+m" (*(outbuf + 3 * 16))
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE * 4;
+      inbuf += BLOCKSIZE * 4;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "movdqa %%xmm5,    %%xmm4\n\t"
+
+		    "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
+		    "psrad  $31,       %%xmm1\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm1\n\t"
+		    "pxor   %%xmm1,    %%xmm5\n\t"
+		    :
+		    : [inbuf] "m" (*inbuf)
+		    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
+		    "movdqu %%xmm0,    %[outbuf]\n\t"
+		    : [outbuf] "=m" (*outbuf)
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE;
+      inbuf += BLOCKSIZE;
+    }
+
+  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+		: [tweak] "=m" (*tweak)
+		:
+		: "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+static void
+_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
+			 unsigned char *outbuf, const unsigned char *inbuf,
+			 size_t nblocks)
+{
+  aesni_prepare_2_6_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_6 ();
+
+  /* Preload Tweak */
+  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+		"movdqa %[gfmul], %%xmm6\n\t"
+		:
+		: [tweak] "m" (*tweak),
+		  [gfmul] "m" (*xts_gfmul_const)
+		: "memory" );
+
+  for ( ;nblocks >= 4; nblocks -= 4 )
+    {
+      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * 16))
+		    : [inbuf0] "m" (*(inbuf + 0 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * 16))
+		    : [inbuf1] "m" (*(inbuf + 1 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * 16))
+		    : [inbuf2] "m" (*(inbuf + 2 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm5,    %[outbuf3]\n\t"
+
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf3] "=m" (*(outbuf + 3 * 16))
+		    : [inbuf3] "m" (*(inbuf + 3 * 16))
+		    : "memory" );
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+                    "pxor   %%xmm0,     %%xmm1\n\t"
+		    "movdqu %[outbuf1], %%xmm0\n\t"
+		    "movdqu %%xmm1,     %[outbuf0]\n\t"
+		    "movdqu %[outbuf2], %%xmm1\n\t"
+                    "pxor   %%xmm0,     %%xmm2\n\t"
+		    "movdqu %[outbuf3], %%xmm0\n\t"
+                    "pxor   %%xmm1,     %%xmm3\n\t"
+                    "pxor   %%xmm0,     %%xmm4\n\t"
+		    "movdqu %%xmm2,     %[outbuf1]\n\t"
+		    "movdqu %%xmm3,     %[outbuf2]\n\t"
+		    "movdqu %%xmm4,     %[outbuf3]\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+		      [outbuf1] "+m" (*(outbuf + 1 * 16)),
+		      [outbuf2] "+m" (*(outbuf + 2 * 16)),
+		      [outbuf3] "+m" (*(outbuf + 3 * 16))
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE * 4;
+      inbuf += BLOCKSIZE * 4;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "movdqa %%xmm5,    %%xmm4\n\t"
+
+		    "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
+		    "psrad  $31,       %%xmm1\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm1\n\t"
+		    "pxor   %%xmm1,    %%xmm5\n\t"
+		    :
+		    : [inbuf] "m" (*inbuf)
+		    : "memory" );
+
+      do_aesni_dec (ctx);
+
+      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
+		    "movdqu %%xmm0,    %[outbuf]\n\t"
+		    : [outbuf] "=m" (*outbuf)
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE;
+      inbuf += BLOCKSIZE;
+    }
+
+  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+                : [tweak] "=m" (*tweak)
+                :
+                : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+			   unsigned char *outbuf, const unsigned char *inbuf,
+			   size_t nblocks, int encrypt)
+{
+  if (encrypt)
+    _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks);
+  else
+    _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
+}
+
 #endif /* USE_AESNI */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 8637195..548bfa0 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -103,6 +103,11 @@ extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                        int encrypt);
 extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                                       size_t nblocks);
+extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx,
+				       unsigned char *tweak,
+				       unsigned char *outbuf,
+				       const unsigned char *inbuf,
+				       size_t nblocks, int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -1467,6 +1472,85 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 }
 
 
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+void
+_gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+		     void *outbuf_arg, const void *inbuf_arg,
+		     size_t nblocks, int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int burn_depth = 0;
+  rijndael_cryptfn_t crypt_fn;
+  u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+
+  if (encrypt)
+    {
+      if (ctx->prefetch_enc_fn)
+        ctx->prefetch_enc_fn();
+
+      crypt_fn = ctx->encrypt_fn;
+    }
+  else
+    {
+      check_decryption_preparation (ctx);
+
+      if (ctx->prefetch_dec_fn)
+        ctx->prefetch_dec_fn();
+
+      crypt_fn = ctx->decrypt_fn;
+    }
+
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
+    {
+      _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+      burn_depth = 0;
+    }
+#endif /*USE_AESNI*/
+  else
+    {
+      tweak_next_lo = buf_get_le64 (tweak + 0);
+      tweak_next_hi = buf_get_le64 (tweak + 8);
+
+      while (nblocks)
+	{
+	  tweak_lo = tweak_next_lo;
+	  tweak_hi = tweak_next_hi;
+
+	  /* Xor-Encrypt/Decrypt-Xor block. */
+	  tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo;
+	  tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi;
+
+	  buf_put_le64 (outbuf + 0, tmp_lo);
+	  buf_put_le64 (outbuf + 8, tmp_hi);
+
+	  /* Generate next tweak. */
+	  carry = -(tweak_next_hi >> 63) & 0x87;
+	  tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+	  tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+	  burn_depth = crypt_fn (ctx, outbuf, outbuf);
+
+	  buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo);
+	  buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi);
+
+	  outbuf += GCRY_XTS_BLOCK_LEN;
+	  inbuf += GCRY_XTS_BLOCK_LEN;
+	  nblocks--;
+	}
+
+      buf_put_le64 (tweak + 0, tweak_next_lo);
+      buf_put_le64 (tweak + 8, tweak_next_hi);
+    }
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 5 * sizeof(void *));
+}
+
 
 /* Run the self-tests for AES 128.  Returns NULL on success. */
 static const char*
diff --git a/src/cipher.h b/src/cipher.h
index a6f257d..7c2e5d9 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -164,6 +164,9 @@ size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 			    const void *inbuf_arg, size_t nblocks, int encrypt);
 size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 			   size_t nblocks);
+void _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+			  void *outbuf_arg, const void *inbuf_arg,
+			  size_t nblocks, int encrypt);
 
 /*-- blowfish.c --*/
 void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv,
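
    [Editor's note] With the bulk hook wired up in _gcry_cipher_open_internal
    above, the accelerated XTS path is reached through the ordinary cipher
    API.  A minimal usage sketch, assuming libgcrypt 1.8 or later with XTS
    support (error handling omitted):

    #include <stdio.h>
    #include <gcrypt.h>

    int
    main (void)
    {
      gcry_cipher_hd_t hd;
      unsigned char key[32] = { 0 };    /* XTS takes a double-length key.  */
      unsigned char tweak[16] = { 0 };  /* 16-byte tweak, e.g. sector no.  */
      unsigned char buf[4096] = { 0 };  /* whole number of 16-byte blocks  */

      gcry_check_version (NULL);
      gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

      gcry_cipher_open (&hd, GCRY_CIPHER_AES, GCRY_CIPHER_MODE_XTS, 0);
      gcry_cipher_setkey (hd, key, sizeof key);
      gcry_cipher_setiv (hd, tweak, sizeof tweak);
      gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);  /* in-place */
      gcry_cipher_close (hd);

      printf ("first ciphertext byte: %02x\n", buf[0]);
      return 0;
    }

    On CPUs with AES-NI, the gcry_cipher_encrypt call above now lands in
    _gcry_aes_aesni_xts_crypt instead of the block-by-block generic loop.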

commit c9e9cb2eb6a1c659d3825ca627228b732f2f2152
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Jan 6 18:53:20 2018 +0200

    AES-NI improvements for AMD64
    
    * cipher/rijndael-aesni.c [__x86_64__] (aesni_prepare_7_15_variable)
    (aesni_prepare_7_15, aesni_cleanup_7_15, do_aesni_enc_vec8)
    (do_aesni_dec_vec8, do_aesni_ctr_8): New.
    (_gcry_aes_aesni_ctr_enc, _gcry_aes_aesni_cfb_dec)
    (_gcry_aes_aesni_cbc_dec, aesni_ocb_enc, aesni_ocb_dec)
    (_gcry_aes_aesni_ocb_auth) [__x86_64__]: Add eight-block parallel
    processing.
    --
    
    Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo, no HT):
    
    Before:
     AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
            CBC dec |     0.175 ns/B    5448.7 MiB/s     0.700 c/B
            CFB dec |     0.174 ns/B    5466.2 MiB/s     0.698 c/B
            CTR enc |     0.182 ns/B    5226.0 MiB/s     0.730 c/B
            OCB enc |     0.194 ns/B    4913.9 MiB/s     0.776 c/B
            OCB dec |     0.200 ns/B    4769.2 MiB/s     0.800 c/B
           OCB auth |     0.172 ns/B    5545.0 MiB/s     0.688 c/B
    
    After (1.08x to 1.14x faster):
     AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
            CBC dec |     0.157 ns/B    6075.6 MiB/s     0.628 c/B
            CFB dec |     0.158 ns/B    6034.1 MiB/s     0.632 c/B
            CTR enc |     0.159 ns/B    5979.4 MiB/s     0.638 c/B
            OCB enc |     0.175 ns/B    5447.1 MiB/s     0.700 c/B
            OCB dec |     0.183 ns/B    5203.9 MiB/s     0.733 c/B
           OCB auth |     0.156 ns/B    6101.3 MiB/s     0.625 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
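
    [Editor's note] Much of the new do_aesni_ctr_8 below is counter handling:
    a cheap paddb path when the last counter byte is <= 0xf7 (adding 1..8
    cannot carry out of that byte), and a slower path that byte-swaps the
    counter, increments it with 64-bit arithmetic and fixes up carries into
    the high quadword.  The scalar logic being parallelized is roughly the
    following (an illustrative sketch, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Produce the eight successive values of a 128-bit big-endian CTR
     * block, i.e. what do_aesni_ctr_8 computes for its eight xmm counter
     * registers before encrypting them. */
    static void
    ctr_next8 (const uint8_t ctr[16], uint8_t out[8][16])
    {
      int i, j;

      for (i = 0; i < 8; i++)
        {
          unsigned int add = i + 1;

          memcpy (out[i], ctr, 16);
          for (j = 15; j >= 0 && add; j--)
            {
              add += out[i][j];
              out[i][j] = add & 0xff;
              add >>= 8;
            }
        }
    }

    The assembly additionally stores counter + 8 back through %[ctr] so that
    the next 8-block iteration, or the 4-block and single-block tail loops,
    continue from the right value.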

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 735e5cd..3d323cf 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -55,6 +55,7 @@ typedef struct u128_s
 #ifdef __WIN64__
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
 # define aesni_prepare_2_6_variable char win64tmp[16]
+# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
 # define aesni_prepare() do { } while (0)
 # define aesni_prepare_2_6()                                            \
    do { asm volatile ("movdqu %%xmm6, %0\n\t"                           \
@@ -62,6 +63,20 @@ typedef struct u128_s
                       :                                                 \
                       : "memory");                                      \
    } while (0)
+# define aesni_prepare_7_15()                                           \
+   do { asm volatile ("movdqu %%xmm7,  0*16(%0)\n\t"                    \
+                      "movdqu %%xmm8,  1*16(%0)\n\t"                    \
+                      "movdqu %%xmm9,  2*16(%0)\n\t"                    \
+                      "movdqu %%xmm10, 3*16(%0)\n\t"                    \
+                      "movdqu %%xmm11, 4*16(%0)\n\t"                    \
+                      "movdqu %%xmm12, 5*16(%0)\n\t"                    \
+                      "movdqu %%xmm13, 6*16(%0)\n\t"                    \
+                      "movdqu %%xmm14, 7*16(%0)\n\t"                    \
+                      "movdqu %%xmm15, 8*16(%0)\n\t"                    \
+                      :                                                 \
+                      : "r" (win64tmp7_15)                              \
+                      : "memory");                                      \
+   } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
@@ -76,6 +91,20 @@ typedef struct u128_s
                       : "m" (*win64tmp)                                 \
                       : "memory");                                      \
    } while (0)
+# define aesni_cleanup_7_15()                                           \
+   do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t"                     \
+                      "movdqu 1*16(%0), %%xmm8\n\t"                     \
+                      "movdqu 2*16(%0), %%xmm9\n\t"                     \
+                      "movdqu 3*16(%0), %%xmm10\n\t"                    \
+                      "movdqu 4*16(%0), %%xmm11\n\t"                    \
+                      "movdqu 5*16(%0), %%xmm12\n\t"                    \
+                      "movdqu 6*16(%0), %%xmm13\n\t"                    \
+                      "movdqu 7*16(%0), %%xmm14\n\t"                    \
+                      "movdqu 8*16(%0), %%xmm15\n\t"                    \
+                      :                                                 \
+                      : "r" (win64tmp7_15)                              \
+                      : "memory");                                      \
+   } while (0)
 #else
 # define aesni_prepare_2_6_variable
 # define aesni_prepare() do { } while (0)
@@ -91,6 +120,21 @@ typedef struct u128_s
                       "pxor %%xmm5, %%xmm5\n"                           \
                       "pxor %%xmm6, %%xmm6\n":: );                      \
    } while (0)
+# ifdef __x86_64__
+#  define aesni_prepare_7_15_variable
+#  define aesni_prepare_7_15() do { } while (0)
+#  define aesni_cleanup_7_15()                                          \
+   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
+                      "pxor %%xmm8, %%xmm8\n"                           \
+                      "pxor %%xmm9, %%xmm9\n"                           \
+                      "pxor %%xmm10, %%xmm10\n"                         \
+                      "pxor %%xmm11, %%xmm11\n"                         \
+                      "pxor %%xmm12, %%xmm12\n"                         \
+                      "pxor %%xmm13, %%xmm13\n"                         \
+                      "pxor %%xmm14, %%xmm14\n"                         \
+                      "pxor %%xmm15, %%xmm15\n":: );                    \
+   } while (0)
+# endif
 #endif
 
 void
@@ -704,6 +748,314 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 }
 
 
+#ifdef __x86_64__
+
+/* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
+static inline void
+do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+                "pxor   %%xmm0, %%xmm10\n\t"     /* xmm10 ^= key[0] */
+                "pxor   %%xmm0, %%xmm11\n\t"     /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Ldeclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "je .Ldeclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                "aesenclast %%xmm0, %%xmm1\n\t"
+                "aesenclast %%xmm0, %%xmm2\n\t"
+                "aesenclast %%xmm0, %%xmm3\n\t"
+                "aesenclast %%xmm0, %%xmm4\n\t"
+                "aesenclast %%xmm0, %%xmm8\n\t"
+                "aesenclast %%xmm0, %%xmm9\n\t"
+                "aesenclast %%xmm0, %%xmm10\n\t"
+                "aesenclast %%xmm0, %%xmm11\n\t"
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+}
+
+
+/* Decrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
+static inline void
+do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+                "pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
+                "pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Ldeclast%=\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "je .Ldeclast%=\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                "aesdeclast %%xmm0, %%xmm1\n\t"
+                "aesdeclast %%xmm0, %%xmm2\n\t"
+                "aesdeclast %%xmm0, %%xmm3\n\t"
+                "aesdeclast %%xmm0, %%xmm4\n\t"
+                "aesdeclast %%xmm0, %%xmm8\n\t"
+                "aesdeclast %%xmm0, %%xmm9\n\t"
+                "aesdeclast %%xmm0, %%xmm10\n\t"
+                "aesdeclast %%xmm0, %%xmm11\n\t"
+                : /* no output */
+                : [key] "r" (ctx->keyschdec),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+}
+
+#endif /* __x86_64__ */
+
+
 /* Perform a CTR encryption round using the counter CTR and the input
    block A.  Write the result to the output block B and update CTR.
    CTR needs to be a 16 byte aligned little-endian value.  */
@@ -808,7 +1160,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
 #define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
 
   /* Register usage:
-      esi   keyschedule
+      [key] keyschedule
       xmm0  CTR-0
       xmm1  temp / round key
       xmm2  CTR-1
@@ -1003,6 +1355,327 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
 }
 
 
+#ifdef __x86_64__
+
+/* Eight blocks at a time variant of do_aesni_ctr.  */
+static void
+do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
+                unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+  static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
+    {
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+    };
+  const void *bige_addb = bige_addb_const;
+
+  /* Register usage:
+      [key] keyschedule
+      xmm0  CTR-0
+      xmm1  temp / round key
+      xmm2  CTR-1
+      xmm3  CTR-2
+      xmm4  CTR-3
+      xmm5  copy of *ctr
+      xmm6  endian swapping mask
+      xmm8  CTR-4
+      xmm9  CTR-5
+      xmm10 CTR-6
+      xmm11 CTR-7
+      xmm12 temp
+      xmm13 temp
+      xmm14 temp
+      xmm15 temp
+   */
+
+  asm volatile (/* detect if 8-bit carry handling is needed */
+                "cmpb   $0xf7, 15(%[ctr])\n\t"
+                "ja     .Ladd32bit%=\n\t"
+
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
+                "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
+                "movdqa 1*16(%[addb]), %%xmm3\n\t"  /* xmm3 := be(2) */
+                "movdqa 2*16(%[addb]), %%xmm4\n\t"  /* xmm4 := be(3) */
+                "movdqa 3*16(%[addb]), %%xmm8\n\t"  /* xmm8 := be(4) */
+                "movdqa 4*16(%[addb]), %%xmm9\n\t"  /* xmm9 := be(5) */
+                "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */
+                "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */
+                "movdqa 7*16(%[addb]), %%xmm5\n\t"  /* xmm5 := be(8) */
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
+                "paddb  %%xmm0, %%xmm2\n\t"     /* xmm2 := be(1) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm3\n\t"     /* xmm3 := be(2) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm8\n\t"     /* xmm8 := be(4) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm9\n\t"     /* xmm9 := be(5) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm10\n\t"    /* xmm10 := be(6) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm11\n\t"    /* xmm11 := be(7) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(8) + CTR (xmm0) */
+                "jmp    .Lstore_ctr%=\n\t"
+
+                ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
+                "movdqa %%xmm0, %%xmm2\n\t"
+                "pcmpeqd %%xmm1, %%xmm1\n\t"
+                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
+                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
+                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
+                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
+                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
+                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
+                "movdqa %%xmm4, %%xmm8\n\t"     /* xmm8 := xmm4     */
+                "psubq  %%xmm1, %%xmm8\n\t"     /* xmm8++           */
+                "movdqa %%xmm8, %%xmm9\n\t"     /* xmm9 := xmm8     */
+                "psubq  %%xmm1, %%xmm9\n\t"     /* xmm9++           */
+                "movdqa %%xmm9, %%xmm10\n\t"    /* xmm10 := xmm9    */
+                "psubq  %%xmm1, %%xmm10\n\t"    /* xmm10++          */
+                "movdqa %%xmm10, %%xmm11\n\t"   /* xmm11 := xmm10   */
+                "psubq  %%xmm1, %%xmm11\n\t"    /* xmm11++          */
+                "movdqa %%xmm11, %%xmm5\n\t"    /* xmm5 := xmm11    */
+                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */
+
+                /* detect if 64-bit carry handling is needed */
+                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
+                "jne    .Lno_carry%=\n\t"
+                "movl   12(%[ctr]), %%esi\n\t"
+                "bswapl %%esi\n\t"
+                "cmpl   $0xfffffff8, %%esi\n\t"
+                "jb     .Lno_carry%=\n\t"       /* no carry */
+
+                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
+                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffff8 */
+                "cmpl   $0xfffffffa, %%esi\n\t"
+                "jb     .Lcarry_xmm11%=\n\t"     /* esi == 0xfffffff9 */
+                "je     .Lcarry_xmm10%=\n\t"     /* esi == 0xfffffffa */
+                "cmpl   $0xfffffffc, %%esi\n\t"
+                "jb     .Lcarry_xmm9%=\n\t"     /* esi == 0xfffffffb */
+                "je     .Lcarry_xmm8%=\n\t"     /* esi == 0xfffffffc */
+                "cmpl   $0xfffffffe, %%esi\n\t"
+                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
+                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
+                /* esi == 0xffffffff */
+
+                "psubq   %%xmm1, %%xmm2\n\t"
+                ".Lcarry_xmm3%=:\n\t"
+                "psubq   %%xmm1, %%xmm3\n\t"
+                ".Lcarry_xmm4%=:\n\t"
+                "psubq   %%xmm1, %%xmm4\n\t"
+                ".Lcarry_xmm8%=:\n\t"
+                "psubq   %%xmm1, %%xmm8\n\t"
+                ".Lcarry_xmm9%=:\n\t"
+                "psubq   %%xmm1, %%xmm9\n\t"
+                ".Lcarry_xmm10%=:\n\t"
+                "psubq   %%xmm1, %%xmm10\n\t"
+                ".Lcarry_xmm11%=:\n\t"
+                "psubq   %%xmm1, %%xmm11\n\t"
+                ".Lcarry_xmm5%=:\n\t"
+                "psubq   %%xmm1, %%xmm5\n\t"
+
+                ".Lno_carry%=:\n\t"
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
+                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
+                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
+                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
+                "pshufb %%xmm6, %%xmm8\n\t"     /* xmm8 := be(xmm8) */
+                "pshufb %%xmm6, %%xmm9\n\t"     /* xmm9 := be(xmm9) */
+                "pshufb %%xmm6, %%xmm10\n\t"    /* xmm10 := be(xmm10) */
+                "pshufb %%xmm6, %%xmm11\n\t"    /* xmm11 := be(xmm11) */
+
+                ".Lstore_ctr%=:\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
+                :
+                : [ctr] "r" (ctr),
+                  [key] "r" (ctx->keyschenc),
+                  [addb] "r" (bige_addb)
+                : "%esi", "cc", "memory");
+
+  asm volatile ("pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
+                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "jb .Lenclast%=\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "je .Lenclast%=\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                "aesenclast %%xmm1, %%xmm0\n\t"
+                "aesenclast %%xmm1, %%xmm2\n\t"
+                "aesenclast %%xmm1, %%xmm3\n\t"
+                "aesenclast %%xmm1, %%xmm4\n\t"
+                "aesenclast %%xmm1, %%xmm8\n\t"
+                "aesenclast %%xmm1, %%xmm9\n\t"
+                "aesenclast %%xmm1, %%xmm10\n\t"
+                "aesenclast %%xmm1, %%xmm11\n\t"
+                :
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+
+  asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
+                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
+                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
+                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
+                "movdqu 4*16(%[src]), %%xmm1\n\t"  /* Get block 5.      */
+                "pxor %%xmm12, %%xmm0\n\t"         /* EncCTR-1 ^= input */
+                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
+                "pxor %%xmm13, %%xmm2\n\t"         /* EncCTR-2 ^= input */
+                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
+                "pxor %%xmm14, %%xmm3\n\t"         /* EncCTR-3 ^= input */
+                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
+                "pxor %%xmm15, %%xmm4\n\t"         /* EncCTR-4 ^= input */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
+                "pxor %%xmm1,  %%xmm8\n\t"         /* EncCTR-5 ^= input */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
+                "pxor %%xmm12, %%xmm9\n\t"         /* EncCTR-6 ^= input */
+                "movdqu %%xmm2, 1*16(%[dst])\n\t"  /* Store block 2.    */
+                "pxor %%xmm13, %%xmm10\n\t"        /* EncCTR-7 ^= input */
+                "movdqu %%xmm3, 2*16(%[dst])\n\t"  /* Store block 3.    */
+                "pxor %%xmm14, %%xmm11\n\t"        /* EncCTR-8 ^= input */
+                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
+                "movdqu %%xmm8, 4*16(%[dst])\n\t"  /* Store block 8.    */
+                "movdqu %%xmm9, 5*16(%[dst])\n\t"  /* Store block 9.    */
+                "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10.   */
+                "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11.   */
+                :
+                : [src] "r" (a),
+                  [dst] "r" (b)
+                : "memory");
+}
+
+#endif /* __x86_64__ */
+
+
 unsigned int
 _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                          const unsigned char *src)
@@ -1123,7 +1796,25 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                   [ctr] "m" (*ctr)
                 : "memory");
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
       outbuf += 4*BLOCKSIZE;
@@ -1175,6 +1866,76 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
                 : "memory" );
 
   /* CFB decryption can be parallelized */
+
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8; nblocks -= 8)
+	{
+	  asm volatile
+	    ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
+	     "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+	     "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+	     "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+	     "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
+	     "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
+	     "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
+	     "movdqu 6*16(%[inbuf]), %%xmm11\n\t"
+
+	     "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */
+
+	     "movdqa %%xmm2, %%xmm12\n\t"
+	     "movdqa %%xmm3, %%xmm13\n\t"
+	     "movdqa %%xmm4, %%xmm14\n\t"
+	     "movdqa %%xmm8, %%xmm15\n\t"
+	     : /* No output */
+	     : [inbuf] "r" (inbuf)
+	     : "memory");
+
+	  do_aesni_enc_vec8 (ctx);
+
+	  asm volatile
+	    (
+	     "pxor %%xmm12, %%xmm1\n\t"
+	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+	     "pxor %%xmm13, %%xmm2\n\t"
+	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+	     "pxor %%xmm14, %%xmm3\n\t"
+	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+	     "pxor %%xmm15, %%xmm4\n\t"
+	     "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
+
+	     "pxor %%xmm12, %%xmm8\n\t"
+	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+	     "pxor %%xmm13, %%xmm9\n\t"
+	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+	     "pxor %%xmm14, %%xmm10\n\t"
+	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+	     "pxor %%xmm15, %%xmm11\n\t"
+	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+	     "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+	     "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+	     "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+	     : /* No output */
+	     : [inbuf] "r" (inbuf),
+	       [outbuf] "r" (outbuf)
+	     : "memory");
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
   for ( ;nblocks >= 4; nblocks -= 4)
     {
       asm volatile
@@ -1260,7 +2021,76 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
      : [iv] "m" (*iv)
      : "memory");
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  asm volatile
+	    ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
+	     "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+	     "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+	     "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+	     "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
+	     "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
+	     "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
+	     "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
+
+	     "movdqa %%xmm1, %%xmm12\n\t"
+	     "movdqa %%xmm2, %%xmm13\n\t"
+	     "movdqa %%xmm3, %%xmm14\n\t"
+	     "movdqa %%xmm4, %%xmm15\n\t"
+
+	     : /* No output */
+	     : [inbuf] "r" (inbuf)
+	     : "memory");
+
+	  do_aesni_dec_vec8 (ctx);
+
+	  asm volatile
+	    ("pxor %%xmm5, %%xmm1\n\t"		/* xor IV with output */
+
+	     "pxor %%xmm12, %%xmm2\n\t"		/* xor IV with output */
+	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+
+	     "pxor %%xmm13, %%xmm3\n\t"		/* xor IV with output */
+	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+
+	     "pxor %%xmm14, %%xmm4\n\t"		/* xor IV with output */
+	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+
+	     "pxor %%xmm15, %%xmm8\n\t"		/* xor IV with output */
+	     "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
+	     "pxor %%xmm12, %%xmm9\n\t"		/* xor IV with output */
+	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+	     "pxor %%xmm13, %%xmm10\n\t"		/* xor IV with output */
+	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+	     "pxor %%xmm14, %%xmm11\n\t"		/* xor IV with output */
+	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+	     "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+	     "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+	     "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+	     : /* No output */
+	     : [inbuf] "r" (inbuf),
+	       [outbuf] "r" (outbuf)
+	     : "memory");
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       asm volatile
         ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
@@ -1386,7 +2216,142 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0])
+		    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm1,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqa %%xmm5,    %%xmm12\n\t"
+			:
+			: [l1] "m" (*c->u_mode.ocb.L[1]),
+			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm2,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqa %%xmm5,    %%xmm13\n\t"
+			:
+			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm3,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqa %%xmm5,    %%xmm14\n\t"
+			:
+			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm15\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm15,   %%xmm5\n\t"
+			"pxor   %%xmm4,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqa %%xmm5,    %%xmm15\n\t"
+			:
+			: [l3] "m" (*l),
+			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm8,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			"movdqu %%xmm5,    %[outbuf4]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm9,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			"movdqu %%xmm5,    %[outbuf5]\n\t"
+			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm10,   %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l7],     %%xmm11\n\t"
+			"pxor   %%xmm11,   %%xmm5\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm11,   %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			:
+			: [l7] "m" (*l),
+			  [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_enc_vec8 (ctx);
+
+	  asm volatile ("pxor   %%xmm12,   %%xmm1\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+			"movdqu %[outbuf4],%%xmm0\n\t"
+			"movdqu %[outbuf5],%%xmm12\n\t"
+			"movdqu %[outbuf6],%%xmm13\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+			"pxor   %%xmm15,   %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"pxor   %%xmm12,   %%xmm9\n\t"
+			"pxor   %%xmm13,   %%xmm10\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			"movdqu %%xmm1,    %[outbuf0]\n\t"
+			"movdqu %%xmm2,    %[outbuf1]\n\t"
+			"movdqu %%xmm3,    %[outbuf2]\n\t"
+			"movdqu %%xmm4,    %[outbuf3]\n\t"
+			"movdqu %%xmm8,    %[outbuf4]\n\t"
+			"movdqu %%xmm9,    %[outbuf5]\n\t"
+			"movdqu %%xmm10,   %[outbuf6]\n\t"
+			"movdqu %%xmm11,   %[outbuf7]\n\t"
+			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+			  [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+			  [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+			  [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+			  [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+			:
+			: "memory" );
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1394,9 +2359,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm1,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm1\n\t"
 		    "movdqu %%xmm5,    %[outbuf0]\n\t"
@@ -1414,19 +2379,17 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
 		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-		    "movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm3,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
 		    "movdqu %%xmm5,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [l2] "m" (*c->u_mode.ocb.L[0]),
-		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l3],     %%xmm4\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
 		    "pxor   %%xmm4,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
@@ -1551,7 +2514,142 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0])
+		    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqa %%xmm5,    %%xmm12\n\t"
+			:
+			: [l1] "m" (*c->u_mode.ocb.L[1]),
+			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqa %%xmm5,    %%xmm13\n\t"
+			:
+			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqa %%xmm5,    %%xmm14\n\t"
+			:
+			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqa %%xmm5,    %%xmm15\n\t"
+			:
+			: [l3] "m" (*l),
+			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			"movdqu %%xmm5,    %[outbuf4]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			"movdqu %%xmm5,    %[outbuf5]\n\t"
+			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			:
+			: [l7] "m" (*l),
+			  [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_dec_vec8 (ctx);
+
+	  asm volatile ("pxor   %%xmm12,   %%xmm1\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+			"movdqu %[outbuf4],%%xmm0\n\t"
+			"movdqu %[outbuf5],%%xmm12\n\t"
+			"movdqu %[outbuf6],%%xmm13\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+			"pxor   %%xmm15,   %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"pxor   %%xmm12,   %%xmm9\n\t"
+			"pxor   %%xmm13,   %%xmm10\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			"movdqu %%xmm1,    %[outbuf0]\n\t"
+			"movdqu %%xmm2,    %[outbuf1]\n\t"
+			"movdqu %%xmm3,    %[outbuf2]\n\t"
+			"movdqu %%xmm4,    %[outbuf3]\n\t"
+			"movdqu %%xmm8,    %[outbuf4]\n\t"
+			"movdqu %%xmm9,    %[outbuf5]\n\t"
+			"movdqu %%xmm10,   %[outbuf6]\n\t"
+			"movdqu %%xmm11,   %[outbuf7]\n\t"
+			"pxor   %%xmm2,    %%xmm1\n\t"
+			"pxor   %%xmm4,    %%xmm1\n\t"
+			"pxor   %%xmm9,    %%xmm1\n\t"
+			"pxor   %%xmm11,   %%xmm1\n\t"
+			"pxor   %%xmm3,    %%xmm6\n\t"
+			"pxor   %%xmm8,    %%xmm6\n\t"
+			"pxor   %%xmm10,   %%xmm6\n\t"
+			"pxor   %%xmm1,    %%xmm6\n\t"
+			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+			  [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+			  [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+			  [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+			  [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+			:
+			: "memory" );
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1559,9 +2657,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm1\n\t"
 		    "movdqu %%xmm5,    %[outbuf0]\n\t"
 		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
@@ -1577,14 +2675,12 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
 		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-		    "movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
 		    "movdqu %%xmm5,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [l2] "m" (*c->u_mode.ocb.L[0]),
-		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
       asm volatile ("movdqu %[l3],     %%xmm0\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
@@ -1722,16 +2818,115 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       abuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+		    "movdqu %[l1], %%xmm12\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l1] "m" (*c->u_mode.ocb.L[1])
+		    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+	  asm volatile ("movdqu %[abuf0],  %%xmm1\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			:
+			: [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf1],  %%xmm2\n\t"
+			"pxor   %%xmm12,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			:
+			: [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			:
+			: [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[abuf3],  %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			:
+			: [l3] "m" (*l),
+			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[abuf4],  %%xmm8\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			:
+			: [abuf4] "m" (*(abuf + 4 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf5],  %%xmm9\n\t"
+			"pxor   %%xmm12,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			:
+			: [abuf5] "m" (*(abuf + 5 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf6],  %%xmm10\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			:
+			: [abuf6] "m" (*(abuf + 6 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
+			"movdqu %[abuf7],  %%xmm11\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			:
+			: [l7] "m" (*l),
+			  [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_enc_vec8 (ctx);
+
+	  asm volatile ("pxor   %%xmm2,   %%xmm1\n\t"
+			"pxor   %%xmm3,   %%xmm1\n\t"
+			"pxor   %%xmm4,   %%xmm1\n\t"
+			"pxor   %%xmm8,   %%xmm1\n\t"
+			"pxor   %%xmm9,   %%xmm6\n\t"
+			"pxor   %%xmm10,  %%xmm6\n\t"
+			"pxor   %%xmm11,  %%xmm6\n\t"
+			"pxor   %%xmm1,   %%xmm6\n\t"
+			:
+			:
+			: "memory" );
+
+	  abuf += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
 		    "movdqu %[abuf0],  %%xmm1\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm1\n\t"
 		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
@@ -1745,9 +2940,8 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
 		      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-		    "movdqu %[abuf2],  %%xmm3\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
 		    :
 		    : [l2] "m" (*c->u_mode.ocb.L[0]),

-----------------------------------------------------------------------
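
A note on the CFB and CBC hunks above: the comment "CFB decryption can be
parallelized" is the property both 8-way loops rely on.  In CFB and CBC
decryption every plaintext block depends only on ciphertext (and the IV) --
P_i = ENCIPHER(K, C_{i-1}) xor C_i for CFB, P_i = DECIPHER(K, C_i) xor C_{i-1}
for CBC -- so the eight block-cipher invocations per iteration have no data
dependency on one another and do_aesni_enc_vec8()/do_aesni_dec_vec8() can keep
xmm1-xmm4 and xmm8-xmm11 in flight at once.  The scalar C sketch below shows
the same structure for CBC decryption; it is an illustration only, not code
from this commit, and toy_ctx/block_decrypt() are hypothetical stand-ins for
the real AES context and block primitive.

/* Sketch: 8-way CBC decryption in portable C (illustration only).  */
#include <stddef.h>
#include <string.h>

#define BLOCKSIZE 16

typedef struct { unsigned char rk[240]; } toy_ctx;     /* stand-in context */

static void block_decrypt (const toy_ctx *ctx, unsigned char *blk)
{
  (void)ctx; (void)blk;           /* stand-in for one AES block decryption */
}

static void
cbc_dec_bulk (const toy_ctx *ctx, unsigned char *iv,
              unsigned char *outbuf, const unsigned char *inbuf,
              size_t nblocks)
{
  unsigned char tmp[8][BLOCKSIZE];
  size_t i, j;

  for ( ; nblocks >= 8; nblocks -= 8)
    {
      /* The eight decryptions are independent of each other...  */
      for (i = 0; i < 8; i++)
        {
          memcpy (tmp[i], inbuf + i * BLOCKSIZE, BLOCKSIZE);
          block_decrypt (ctx, tmp[i]);
        }

      /* ...and the chaining XOR uses only ciphertext and the IV:
         P_i = DECIPHER(K, C_i) xor C_{i-1}.  */
      for (i = 0; i < 8; i++)
        {
          const unsigned char *prev = i ? inbuf + (i - 1) * BLOCKSIZE : iv;
          for (j = 0; j < BLOCKSIZE; j++)
            outbuf[i * BLOCKSIZE + j] = tmp[i][j] ^ prev[j];
        }

      memcpy (iv, inbuf + 7 * BLOCKSIZE, BLOCKSIZE); /* IV for next chunk */
      inbuf  += 8 * BLOCKSIZE;
      outbuf += 8 * BLOCKSIZE;
    }
  /* The real code then falls through to 4-block and single-block tails.  */
}

In the assembly the role of tmp[]/prev is played by xmm12-xmm15 (copies of
the first four ciphertext blocks) plus reloads of blocks 4-7 from inbuf
after do_aesni_dec_vec8().
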

Summary of changes:
 cipher/cipher-internal.h |    2 +-
 cipher/cipher-xts.c      |    3 +-
 cipher/cipher.c          |    1 +
 cipher/rijndael-aesni.c  | 1539 +++++++++++++++++++++++++++++++++++++++++++++-
 cipher/rijndael.c        |   84 +++
 src/cipher.h             |    3 +
 6 files changed, 1603 insertions(+), 29 deletions(-)
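
For readers untangling the register shuffling in the OCB hunks: the running
Offset is kept in xmm5 and the Checksum in xmm6, and each block applies the
recurrences repeated in the comments above (Offset_i = Offset_{i-1} xor
L_{ntz(i)}; Checksum_i = Checksum_{i-1} xor P_i; C_i = Offset_i xor
ENCIPHER(K, P_i xor Offset_i)).  The scalar C below is a reference rendering
of the encryption path only; it is an illustration, not code from this
commit, and toy_ctx, block_encrypt() and get_l() are hypothetical stand-ins
for the real context, block primitive and ocb_get_l().

/* Sketch: per-block OCB encryption recurrences in portable C.  */
#include <stddef.h>
#include <string.h>

#define BLOCKSIZE 16

typedef struct { unsigned char rk[240]; } toy_ctx;     /* stand-in context */

static void block_encrypt (const toy_ctx *ctx, unsigned char *blk)
{
  (void)ctx; (void)blk;           /* stand-in for one AES block encryption */
}

static void xor_block (unsigned char *dst, const unsigned char *src)
{
  size_t i;
  for (i = 0; i < BLOCKSIZE; i++)
    dst[i] ^= src[i];
}

/* Stand-in for ocb_get_l(): L_{ntz(i)} from a precomputed table.  */
static const unsigned char *
get_l (const unsigned char L[][BLOCKSIZE], size_t i)
{
  size_t ntz = 0;
  while ((i & 1) == 0)
    {
      i >>= 1;
      ntz++;
    }
  return L[ntz];
}

static void
ocb_enc_blocks (const toy_ctx *ctx, const unsigned char L[][BLOCKSIZE],
                unsigned char *offset, unsigned char *checksum,
                unsigned char *outbuf, const unsigned char *inbuf,
                size_t n, size_t nblocks)
{
  unsigned char tmp[BLOCKSIZE];

  while (nblocks--)
    {
      n++;
      xor_block (offset, get_l (L, n));  /* Offset_i = Offset_{i-1} ^ L_{ntz(i)} */
      xor_block (checksum, inbuf);       /* Checksum_i = Checksum_{i-1} ^ P_i    */

      memcpy (tmp, inbuf, BLOCKSIZE);
      xor_block (tmp, offset);           /* P_i xor Offset_i                     */
      block_encrypt (ctx, tmp);          /* ENCIPHER(K, P_i xor Offset_i)        */
      xor_block (tmp, offset);           /* C_i                                  */
      memcpy (outbuf, tmp, BLOCKSIZE);

      inbuf  += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }
}

The 8-way paths above amortize this by loading L[0] once into xmm7 (and L[1]
into xmm10 or xmm12), updating the Offset eight times per iteration, and
running the eight ENCIPHER/DECIPHER calls together in do_aesni_enc_vec8() /
do_aesni_dec_vec8().
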


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org

