[git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-32-ga00c5b2

by Jussi Kivilinna cvs at cvs.gnupg.org
Tue Jan 9 18:17:26 CET 2018


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  a00c5b2988cea256c7823a76ce601febf02c790f (commit)
       via  c9e9cb2eb6a1c659d3825ca627228b732f2f2152 (commit)
      from  b3ec0f752c925cde36f560f0f9309ab6450bbfd9 (commit)

The revisions listed above that are new to this repository have not
appeared in any other notification email, so they are listed in full
below.

- Log -----------------------------------------------------------------
commit a00c5b2988cea256c7823a76ce601febf02c790f
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Jan 6 18:53:20 2018 +0200

    Add AES-NI acceleration for AES-XTS
    
    * cipher/cipher-internal.h (gcry_cipher_handle): Change bulk
    XTS function to take cipher context.
    * cipher/cipher-xts.c (_gcry_cipher_xts_crypt): Ditto.
    * cipher/cipher.c (_gcry_cipher_open_internal): Setup AES-NI
    XTS bulk function.
    * cipher/rijndael-aesni.c (xts_gfmul_const, _gcry_aes_aesni_xts_enc)
    (_gcry_aes_aesni_xts_dec, _gcry_aes_aesni_xts_crypt): New.
    * cipher/rijndael.c (_gcry_aes_aesni_xts_crypt)
    (_gcry_aes_xts_crypt): New.
    * src/cipher.h (_gcry_aes_xts_crypt): New.
    --
    
    Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo):
    
    Before:
            XTS enc |      1.66 ns/B     575.7 MiB/s      6.63 c/B
            XTS dec |      1.66 ns/B     575.5 MiB/s      6.63 c/B
    
    After (~6x faster):
            XTS enc |     0.270 ns/B    3528.5 MiB/s      1.08 c/B
            XTS dec |     0.272 ns/B    3511.5 MiB/s      1.09 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
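
For reference, these figures come from libgcrypt's cipher benchmark; any
caller that uses XTS through the regular cipher API picks up the new bulk
path automatically, and the AES-NI variant is selected inside
_gcry_aes_xts_crypt when the CPU supports it.  A minimal usage sketch
(assuming AES-128 in XTS mode, i.e. a 32-byte combined key K1||K2 and a
16-byte tweak set via setiv; library initialization and error handling
omitted):

    #include <gcrypt.h>
    #include <stddef.h>

    /* Sketch only: encrypt one 4096-byte sector in place with AES-128-XTS. */
    static void
    xts_encrypt_sector (const unsigned char key[32],
                        const unsigned char tweak[16],
                        unsigned char *sector, size_t sectorlen)
    {
      gcry_cipher_hd_t hd;

      gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_XTS, 0);
      gcry_cipher_setkey (hd, key, 32);
      gcry_cipher_setiv (hd, tweak, 16);                     /* XTS tweak */
      gcry_cipher_encrypt (hd, sector, sectorlen, NULL, 0);  /* in place */
      gcry_cipher_close (hd);
    }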

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index b748125..8c897d7 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -146,7 +146,7 @@ struct gcry_cipher_handle
 			const void *inbuf_arg, size_t nblocks, int encrypt);
     size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg,
 		       size_t nblocks);
-    void (*xts_crypt)(gcry_cipher_hd_t c, unsigned char *tweak,
+    void (*xts_crypt)(void *context, unsigned char *tweak,
 		      void *outbuf_arg, const void *inbuf_arg,
 		      size_t nblocks, int encrypt);
   } bulk;
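
The bulk XTS hook now receives the raw algorithm context instead of the
cipher handle, matching the shape of the other per-algorithm bulk helpers
so that _gcry_aes_xts_crypt can be plugged in directly.  For reference
only, a standalone typedef with the same shape as the new member (not part
of the patch):

    typedef void (*bulk_xts_crypt_t) (void *context, unsigned char *tweak,
                                      void *outbuf_arg, const void *inbuf_arg,
                                      size_t nblocks, int encrypt);
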
diff --git a/cipher/cipher-xts.c b/cipher/cipher-xts.c
index 4da89e5..06cefbe 100644
--- a/cipher/cipher-xts.c
+++ b/cipher/cipher-xts.c
@@ -93,7 +93,8 @@ _gcry_cipher_xts_crypt (gcry_cipher_hd_t c,
   /* Use a bulk method if available.  */
   if (nblocks && c->bulk.xts_crypt)
     {
-      c->bulk.xts_crypt (c, c->u_ctr.ctr, outbuf, inbuf, nblocks, encrypt);
+      c->bulk.xts_crypt (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks,
+			 encrypt);
       inbuf  += nblocks * GCRY_XTS_BLOCK_LEN;
       outbuf += nblocks * GCRY_XTS_BLOCK_LEN;
       inbuflen -= nblocks * GCRY_XTS_BLOCK_LEN;
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 9812738..063c13d 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -532,6 +532,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
               h->bulk.ocb_crypt = _gcry_aes_ocb_crypt;
               h->bulk.ocb_auth  = _gcry_aes_ocb_auth;
+              h->bulk.xts_crypt = _gcry_aes_xts_crypt;
               break;
 #endif /*USE_AES*/
 #ifdef USE_BLOWFISH
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 3d323cf..50a0745 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -3007,4 +3007,295 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 }
 
 
+static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) =
+  { 0x87, 0x01 };
+
+
+static void
+_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
+			 unsigned char *outbuf, const unsigned char *inbuf,
+			 size_t nblocks)
+{
+  aesni_prepare_2_6_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_6 ();
+
+  /* Preload Tweak */
+  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+		"movdqa %[gfmul], %%xmm6\n\t"
+		:
+		: [tweak] "m" (*tweak),
+		  [gfmul] "m" (*xts_gfmul_const)
+		: "memory" );
+
+  for ( ;nblocks >= 4; nblocks -= 4 )
+    {
+      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * 16))
+		    : [inbuf0] "m" (*(inbuf + 0 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * 16))
+		    : [inbuf1] "m" (*(inbuf + 1 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * 16))
+		    : [inbuf2] "m" (*(inbuf + 2 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm5,    %[outbuf3]\n\t"
+
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf3] "=m" (*(outbuf + 3 * 16))
+		    : [inbuf3] "m" (*(inbuf + 3 * 16))
+		    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+                    "pxor   %%xmm0,     %%xmm1\n\t"
+		    "movdqu %[outbuf1], %%xmm0\n\t"
+		    "movdqu %%xmm1,     %[outbuf0]\n\t"
+		    "movdqu %[outbuf2], %%xmm1\n\t"
+                    "pxor   %%xmm0,     %%xmm2\n\t"
+		    "movdqu %[outbuf3], %%xmm0\n\t"
+                    "pxor   %%xmm1,     %%xmm3\n\t"
+                    "pxor   %%xmm0,     %%xmm4\n\t"
+		    "movdqu %%xmm2,     %[outbuf1]\n\t"
+		    "movdqu %%xmm3,     %[outbuf2]\n\t"
+		    "movdqu %%xmm4,     %[outbuf3]\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+		      [outbuf1] "+m" (*(outbuf + 1 * 16)),
+		      [outbuf2] "+m" (*(outbuf + 2 * 16)),
+		      [outbuf3] "+m" (*(outbuf + 3 * 16))
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE * 4;
+      inbuf += BLOCKSIZE * 4;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "movdqa %%xmm5,    %%xmm4\n\t"
+
+		    "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
+		    "psrad  $31,       %%xmm1\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm1\n\t"
+		    "pxor   %%xmm1,    %%xmm5\n\t"
+		    :
+		    : [inbuf] "m" (*inbuf)
+		    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
+		    "movdqu %%xmm0,    %[outbuf]\n\t"
+		    : [outbuf] "=m" (*outbuf)
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE;
+      inbuf += BLOCKSIZE;
+    }
+
+  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+		: [tweak] "=m" (*tweak)
+		:
+		: "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+static void
+_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
+			 unsigned char *outbuf, const unsigned char *inbuf,
+			 size_t nblocks)
+{
+  aesni_prepare_2_6_variable;
+
+  aesni_prepare ();
+  aesni_prepare_2_6 ();
+
+  /* Preload Tweak */
+  asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+		"movdqa %[gfmul], %%xmm6\n\t"
+		:
+		: [tweak] "m" (*tweak),
+		  [gfmul] "m" (*xts_gfmul_const)
+		: "memory" );
+
+  for ( ;nblocks >= 4; nblocks -= 4 )
+    {
+      asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * 16))
+		    : [inbuf0] "m" (*(inbuf + 0 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * 16))
+		    : [inbuf1] "m" (*(inbuf + 1 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+
+		    "movdqa %%xmm4,    %%xmm0\n\t"
+		    "paddd  %%xmm4,    %%xmm4\n\t"
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * 16))
+		    : [inbuf2] "m" (*(inbuf + 2 * 16))
+		    : "memory" );
+
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm5,    %[outbuf3]\n\t"
+
+		    "psrad  $31,       %%xmm0\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    : [outbuf3] "=m" (*(outbuf + 3 * 16))
+		    : [inbuf3] "m" (*(inbuf + 3 * 16))
+		    : "memory" );
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+                    "pxor   %%xmm0,     %%xmm1\n\t"
+		    "movdqu %[outbuf1], %%xmm0\n\t"
+		    "movdqu %%xmm1,     %[outbuf0]\n\t"
+		    "movdqu %[outbuf2], %%xmm1\n\t"
+                    "pxor   %%xmm0,     %%xmm2\n\t"
+		    "movdqu %[outbuf3], %%xmm0\n\t"
+                    "pxor   %%xmm1,     %%xmm3\n\t"
+                    "pxor   %%xmm0,     %%xmm4\n\t"
+		    "movdqu %%xmm2,     %[outbuf1]\n\t"
+		    "movdqu %%xmm3,     %[outbuf2]\n\t"
+		    "movdqu %%xmm4,     %[outbuf3]\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+		      [outbuf1] "+m" (*(outbuf + 1 * 16)),
+		      [outbuf2] "+m" (*(outbuf + 2 * 16)),
+		      [outbuf3] "+m" (*(outbuf + 3 * 16))
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE * 4;
+      inbuf += BLOCKSIZE * 4;
+    }
+
+  for ( ;nblocks; nblocks-- )
+    {
+      asm volatile ("movdqu %[inbuf],  %%xmm0\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "movdqa %%xmm5,    %%xmm4\n\t"
+
+		    "pshufd $0x13,     %%xmm5,  %%xmm1\n\t"
+		    "psrad  $31,       %%xmm1\n\t"
+		    "paddq  %%xmm5,    %%xmm5\n\t"
+		    "pand   %%xmm6,    %%xmm1\n\t"
+		    "pxor   %%xmm1,    %%xmm5\n\t"
+		    :
+		    : [inbuf] "m" (*inbuf)
+		    : "memory" );
+
+      do_aesni_dec (ctx);
+
+      asm volatile ("pxor   %%xmm4,    %%xmm0\n\t"
+		    "movdqu %%xmm0,    %[outbuf]\n\t"
+		    : [outbuf] "=m" (*outbuf)
+		    :
+		    : "memory" );
+
+      outbuf += BLOCKSIZE;
+      inbuf += BLOCKSIZE;
+    }
+
+  asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+                : [tweak] "=m" (*tweak)
+                :
+                : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+			   unsigned char *outbuf, const unsigned char *inbuf,
+			   size_t nblocks, int encrypt)
+{
+  if (encrypt)
+    _gcry_aes_aesni_xts_enc(ctx, tweak, outbuf, inbuf, nblocks);
+  else
+    _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
+}
+
 #endif /* USE_AESNI */
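
The per-block tweak update in the assembly above multiplies the 128-bit
tweak by x in GF(2^128): pshufd/psrad broadcast the relevant sign bits,
pand with xts_gfmul_const selects the 0x87 reduction constant and the
bit-63 carry, paddq doubles each 64-bit half, and pxor folds the
corrections back in.  A rough standalone SSE2-intrinsics equivalent, as an
illustration only (not the patch's code):

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* Illustration: multiply the 128-bit XTS tweak by x in GF(2^128),
       mirroring the pshufd/psrad/pand/paddq/pxor sequence above.  */
    static __m128i
    xts_tweak_double (__m128i tweak)
    {
      const __m128i gfmul = _mm_set_epi64x (1, 0x87);
      /* Low dword <- sign of bit 127, dword 2 <- sign of bit 63.  */
      __m128i t = _mm_srai_epi32 (_mm_shuffle_epi32 (tweak, 0x13), 31);
      t = _mm_and_si128 (t, gfmul);           /* { 0x87 or 0, carry or 0 } */
      tweak = _mm_add_epi64 (tweak, tweak);   /* shift each 64-bit half left */
      return _mm_xor_si128 (tweak, t);        /* apply carry and reduction */
    }
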
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 8637195..548bfa0 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -103,6 +103,11 @@ extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                        int encrypt);
 extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                                       size_t nblocks);
+extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx,
+				       unsigned char *tweak,
+				       unsigned char *outbuf,
+				       const unsigned char *inbuf,
+				       size_t nblocks, int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -1467,6 +1472,85 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 }
 
 
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+void
+_gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+		     void *outbuf_arg, const void *inbuf_arg,
+		     size_t nblocks, int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int burn_depth = 0;
+  rijndael_cryptfn_t crypt_fn;
+  u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+
+  if (encrypt)
+    {
+      if (ctx->prefetch_enc_fn)
+        ctx->prefetch_enc_fn();
+
+      crypt_fn = ctx->encrypt_fn;
+    }
+  else
+    {
+      check_decryption_preparation (ctx);
+
+      if (ctx->prefetch_dec_fn)
+        ctx->prefetch_dec_fn();
+
+      crypt_fn = ctx->decrypt_fn;
+    }
+
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
+    {
+      _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+      burn_depth = 0;
+    }
+#endif /*USE_AESNI*/
+  else
+    {
+      tweak_next_lo = buf_get_le64 (tweak + 0);
+      tweak_next_hi = buf_get_le64 (tweak + 8);
+
+      while (nblocks)
+	{
+	  tweak_lo = tweak_next_lo;
+	  tweak_hi = tweak_next_hi;
+
+	  /* Xor-Encrypt/Decrypt-Xor block. */
+	  tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo;
+	  tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi;
+
+	  buf_put_le64 (outbuf + 0, tmp_lo);
+	  buf_put_le64 (outbuf + 8, tmp_hi);
+
+	  /* Generate next tweak. */
+	  carry = -(tweak_next_hi >> 63) & 0x87;
+	  tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+	  tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+	  burn_depth = crypt_fn (ctx, outbuf, outbuf);
+
+	  buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo);
+	  buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi);
+
+	  outbuf += GCRY_XTS_BLOCK_LEN;
+	  inbuf += GCRY_XTS_BLOCK_LEN;
+	  nblocks--;
+	}
+
+      buf_put_le64 (tweak + 0, tweak_next_lo);
+      buf_put_le64 (tweak + 8, tweak_next_hi);
+    }
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 5 * sizeof(void *));
+}
+
 

 /* Run the self-tests for AES 128.  Returns NULL on success. */
 static const char*
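
The generic fallback above keeps the tweak as two little-endian 64-bit
halves and performs the same multiplication by x as the AES-NI path.  A
minimal standalone restatement against a plain 16-byte buffer (assuming a
little-endian host in place of buf_get_le64/buf_put_le64; illustration
only):

    #include <stdint.h>
    #include <string.h>

    static void
    xts_tweak_double_generic (unsigned char tweak[16])
    {
      uint64_t lo, hi, carry;

      memcpy (&lo, tweak + 0, 8);      /* assumes little-endian host */
      memcpy (&hi, tweak + 8, 8);

      carry = -(hi >> 63) & 0x87;      /* 0x87 iff bit 127 is set */
      hi = (hi << 1) + (lo >> 63);     /* carry bit 63 into the high half */
      lo = (lo << 1) ^ carry;          /* reduce mod x^128 + x^7 + x^2 + x + 1 */

      memcpy (tweak + 0, &lo, 8);
      memcpy (tweak + 8, &hi, 8);
    }
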
diff --git a/src/cipher.h b/src/cipher.h
index a6f257d..7c2e5d9 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -164,6 +164,9 @@ size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 			    const void *inbuf_arg, size_t nblocks, int encrypt);
 size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 			   size_t nblocks);
+void _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+			  void *outbuf_arg, const void *inbuf_arg,
+			  size_t nblocks, int encrypt);
 
 /*-- blowfish.c --*/
 void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv,

commit c9e9cb2eb6a1c659d3825ca627228b732f2f2152
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Jan 6 18:53:20 2018 +0200

    AES-NI improvements for AMD64
    
    * cipher/rijndael-aesni.c [__x86_64__] (aesni_prepare_7_15_variable)
    (aesni_prepare_7_15, aesni_cleanup_7_15, do_aesni_enc_vec8)
    (do_aesni_dec_vec8, do_aesni_ctr_8): New.
    (_gcry_aes_aesni_ctr_enc, _gcry_aes_aesni_cfb_dec)
    (_gcry_aes_aesni_cbc_dec, aesni_ocb_enc, aesni_ocb_dec)
    (_gcry_aes_aesni_ocb_auth) [__x86_64__]: Add processing of 8 blocks
    in parallel.
    --
    
    Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo, no HT):
    
    Before:
     AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
            CBC dec |     0.175 ns/B    5448.7 MiB/s     0.700 c/B
            CFB dec |     0.174 ns/B    5466.2 MiB/s     0.698 c/B
            CTR enc |     0.182 ns/B    5226.0 MiB/s     0.730 c/B
            OCB enc |     0.194 ns/B    4913.9 MiB/s     0.776 c/B
            OCB dec |     0.200 ns/B    4769.2 MiB/s     0.800 c/B
           OCB auth |     0.172 ns/B    5545.0 MiB/s     0.688 c/B
    
    After (1.08x to 1.14x faster):
     AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
            CBC dec |     0.157 ns/B    6075.6 MiB/s     0.628 c/B
            CFB dec |     0.158 ns/B    6034.1 MiB/s     0.632 c/B
            CTR enc |     0.159 ns/B    5979.4 MiB/s     0.638 c/B
            OCB enc |     0.175 ns/B    5447.1 MiB/s     0.700 c/B
            OCB dec |     0.183 ns/B    5203.9 MiB/s     0.733 c/B
           OCB auth |     0.156 ns/B    6101.3 MiB/s     0.625 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
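
For reference, the new do_aesni_ctr_8 below precomputes CTR+1 ... CTR+8 in
xmm registers: when the counter's last byte is 0xf7 or less it simply adds
the byte constants be(1)..be(8), otherwise it falls back to wider
arithmetic with explicit carry handling.  A scalar sketch of the
equivalent big-endian counter addition (illustration only, not the
patch's code):

    /* Add a small increment to a 128-bit big-endian counter,
       propagating the carry through all 16 bytes.  */
    static void
    ctr_add_be128 (unsigned char ctr[16], unsigned int add)
    {
      unsigned int carry = add;
      int i;

      for (i = 15; i >= 0 && carry; i--)
        {
          carry += ctr[i];
          ctr[i] = (unsigned char)(carry & 0xff);
          carry >>= 8;
        }
    }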

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 735e5cd..3d323cf 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -55,6 +55,7 @@ typedef struct u128_s
 #ifdef __WIN64__
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
 # define aesni_prepare_2_6_variable char win64tmp[16]
+# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
 # define aesni_prepare() do { } while (0)
 # define aesni_prepare_2_6()                                            \
    do { asm volatile ("movdqu %%xmm6, %0\n\t"                           \
@@ -62,6 +63,20 @@ typedef struct u128_s
                       :                                                 \
                       : "memory");                                      \
    } while (0)
+# define aesni_prepare_7_15()                                           \
+   do { asm volatile ("movdqu %%xmm7,  0*16(%0)\n\t"                    \
+                      "movdqu %%xmm8,  1*16(%0)\n\t"                    \
+                      "movdqu %%xmm9,  2*16(%0)\n\t"                    \
+                      "movdqu %%xmm10, 3*16(%0)\n\t"                    \
+                      "movdqu %%xmm11, 4*16(%0)\n\t"                    \
+                      "movdqu %%xmm12, 5*16(%0)\n\t"                    \
+                      "movdqu %%xmm13, 6*16(%0)\n\t"                    \
+                      "movdqu %%xmm14, 7*16(%0)\n\t"                    \
+                      "movdqu %%xmm15, 8*16(%0)\n\t"                    \
+                      :                                                 \
+                      : "r" (win64tmp7_15)                              \
+                      : "memory");                                      \
+   } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
@@ -76,6 +91,20 @@ typedef struct u128_s
                       : "m" (*win64tmp)                                 \
                       : "memory");                                      \
    } while (0)
+# define aesni_cleanup_7_15()                                           \
+   do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t"                     \
+                      "movdqu 1*16(%0), %%xmm8\n\t"                     \
+                      "movdqu 2*16(%0), %%xmm9\n\t"                     \
+                      "movdqu 3*16(%0), %%xmm10\n\t"                    \
+                      "movdqu 4*16(%0), %%xmm11\n\t"                    \
+                      "movdqu 5*16(%0), %%xmm12\n\t"                    \
+                      "movdqu 6*16(%0), %%xmm13\n\t"                    \
+                      "movdqu 7*16(%0), %%xmm14\n\t"                    \
+                      "movdqu 8*16(%0), %%xmm15\n\t"                    \
+                      :                                                 \
+                      : "r" (win64tmp7_15)                              \
+                      : "memory");                                      \
+   } while (0)
 #else
 # define aesni_prepare_2_6_variable
 # define aesni_prepare() do { } while (0)
@@ -91,6 +120,21 @@ typedef struct u128_s
                       "pxor %%xmm5, %%xmm5\n"                           \
                       "pxor %%xmm6, %%xmm6\n":: );                      \
    } while (0)
+# ifdef __x86_64__
+#  define aesni_prepare_7_15_variable
+#  define aesni_prepare_7_15() do { } while (0)
+#  define aesni_cleanup_7_15()                                          \
+   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
+                      "pxor %%xmm8, %%xmm8\n"                           \
+                      "pxor %%xmm9, %%xmm9\n"                           \
+                      "pxor %%xmm10, %%xmm10\n"                         \
+                      "pxor %%xmm11, %%xmm11\n"                         \
+                      "pxor %%xmm12, %%xmm12\n"                         \
+                      "pxor %%xmm13, %%xmm13\n"                         \
+                      "pxor %%xmm14, %%xmm14\n"                         \
+                      "pxor %%xmm15, %%xmm15\n":: );                    \
+   } while (0)
+# endif
 #endif
 
 void
@@ -704,6 +748,314 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 }
 
 
+#ifdef __x86_64__
+
+/* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
+static inline void
+do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+                "pxor   %%xmm0, %%xmm10\n\t"     /* xmm10 ^= key[0] */
+                "pxor   %%xmm0, %%xmm11\n\t"     /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Ldeclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "je .Ldeclast%=\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                "aesenc %%xmm0, %%xmm1\n\t"
+                "aesenc %%xmm0, %%xmm2\n\t"
+                "aesenc %%xmm0, %%xmm3\n\t"
+                "aesenc %%xmm0, %%xmm4\n\t"
+                "aesenc %%xmm0, %%xmm8\n\t"
+                "aesenc %%xmm0, %%xmm9\n\t"
+                "aesenc %%xmm0, %%xmm10\n\t"
+                "aesenc %%xmm0, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                "aesenclast %%xmm0, %%xmm1\n\t"
+                "aesenclast %%xmm0, %%xmm2\n\t"
+                "aesenclast %%xmm0, %%xmm3\n\t"
+                "aesenclast %%xmm0, %%xmm4\n\t"
+                "aesenclast %%xmm0, %%xmm8\n\t"
+                "aesenclast %%xmm0, %%xmm9\n\t"
+                "aesenclast %%xmm0, %%xmm10\n\t"
+                "aesenclast %%xmm0, %%xmm11\n\t"
+                : /* no output */
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+}
+
+
+/* Decrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
+static inline void
+do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
+{
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+                "pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
+                "pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "jb .Ldeclast%=\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "je .Ldeclast%=\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                "aesdec %%xmm0, %%xmm1\n\t"
+                "aesdec %%xmm0, %%xmm2\n\t"
+                "aesdec %%xmm0, %%xmm3\n\t"
+                "aesdec %%xmm0, %%xmm4\n\t"
+                "aesdec %%xmm0, %%xmm8\n\t"
+                "aesdec %%xmm0, %%xmm9\n\t"
+                "aesdec %%xmm0, %%xmm10\n\t"
+                "aesdec %%xmm0, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                "aesdeclast %%xmm0, %%xmm1\n\t"
+                "aesdeclast %%xmm0, %%xmm2\n\t"
+                "aesdeclast %%xmm0, %%xmm3\n\t"
+                "aesdeclast %%xmm0, %%xmm4\n\t"
+                "aesdeclast %%xmm0, %%xmm8\n\t"
+                "aesdeclast %%xmm0, %%xmm9\n\t"
+                "aesdeclast %%xmm0, %%xmm10\n\t"
+                "aesdeclast %%xmm0, %%xmm11\n\t"
+                : /* no output */
+                : [key] "r" (ctx->keyschdec),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+}
+
+#endif /* __x86_64__ */
+
+
 /* Perform a CTR encryption round using the counter CTR and the input
    block A.  Write the result to the output block B and update CTR.
    CTR needs to be a 16 byte aligned little-endian value.  */
@@ -808,7 +1160,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
 #define aesenclast_xmm1_xmm4  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe1\n\t"
 
   /* Register usage:
-      esi   keyschedule
+      [key] keyschedule
       xmm0  CTR-0
       xmm1  temp / round key
       xmm2  CTR-1
@@ -1003,6 +1355,327 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
 }
 
 
+#ifdef __x86_64__
+
+/* Eight blocks at a time variant of do_aesni_ctr.  */
+static void
+do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
+                unsigned char *ctr, unsigned char *b, const unsigned char *a)
+{
+  static const byte bige_addb_const[8][16] __attribute__ ((aligned (16))) =
+    {
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }
+    };
+  const void *bige_addb = bige_addb_const;
+
+  /* Register usage:
+      [key] keyschedule
+      xmm0  CTR-0
+      xmm1  temp / round key
+      xmm2  CTR-1
+      xmm3  CTR-2
+      xmm4  CTR-3
+      xmm5  copy of *ctr
+      xmm6  endian swapping mask
+      xmm8  CTR-4
+      xmm9  CTR-5
+      xmm10 CTR-6
+      xmm11 CTR-7
+      xmm12 temp
+      xmm13 temp
+      xmm14 temp
+      xmm15 temp
+   */
+
+  asm volatile (/* detect if 8-bit carry handling is needed */
+                "cmpb   $0xf7, 15(%[ctr])\n\t"
+                "ja     .Ladd32bit%=\n\t"
+
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
+                "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
+                "movdqa 1*16(%[addb]), %%xmm3\n\t"  /* xmm3 := be(2) */
+                "movdqa 2*16(%[addb]), %%xmm4\n\t"  /* xmm4 := be(3) */
+                "movdqa 3*16(%[addb]), %%xmm8\n\t"  /* xmm8 := be(4) */
+                "movdqa 4*16(%[addb]), %%xmm9\n\t"  /* xmm9 := be(5) */
+                "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */
+                "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */
+                "movdqa 7*16(%[addb]), %%xmm5\n\t"  /* xmm5 := be(8) */
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
+                "paddb  %%xmm0, %%xmm2\n\t"     /* xmm2 := be(1) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm3\n\t"     /* xmm3 := be(2) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm8\n\t"     /* xmm8 := be(4) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm9\n\t"     /* xmm9 := be(5) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm10\n\t"    /* xmm10 := be(6) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm11\n\t"    /* xmm11 := be(7) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(8) + CTR (xmm0) */
+                "jmp    .Lstore_ctr%=\n\t"
+
+                ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
+                "movdqa %%xmm0, %%xmm2\n\t"
+                "pcmpeqd %%xmm1, %%xmm1\n\t"
+                "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := le(xmm2) */
+                "psubq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
+                "movdqa %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
+                "psubq  %%xmm1, %%xmm3\n\t"     /* xmm3++           */
+                "movdqa %%xmm3, %%xmm4\n\t"     /* xmm4 := xmm3     */
+                "psubq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
+                "movdqa %%xmm4, %%xmm8\n\t"     /* xmm8 := xmm4     */
+                "psubq  %%xmm1, %%xmm8\n\t"     /* xmm8++           */
+                "movdqa %%xmm8, %%xmm9\n\t"     /* xmm9 := xmm8     */
+                "psubq  %%xmm1, %%xmm9\n\t"     /* xmm9++           */
+                "movdqa %%xmm9, %%xmm10\n\t"    /* xmm10 := xmm9    */
+                "psubq  %%xmm1, %%xmm10\n\t"    /* xmm10++          */
+                "movdqa %%xmm10, %%xmm11\n\t"   /* xmm11 := xmm10   */
+                "psubq  %%xmm1, %%xmm11\n\t"    /* xmm11++          */
+                "movdqa %%xmm11, %%xmm5\n\t"    /* xmm5 := xmm11    */
+                "psubq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */
+
+                /* detect if 64-bit carry handling is needed */
+                "cmpl   $0xffffffff, 8(%[ctr])\n\t"
+                "jne    .Lno_carry%=\n\t"
+                "movl   12(%[ctr]), %%esi\n\t"
+                "bswapl %%esi\n\t"
+                "cmpl   $0xfffffff8, %%esi\n\t"
+                "jb     .Lno_carry%=\n\t"       /* no carry */
+
+                "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
+                "je     .Lcarry_xmm5%=\n\t"     /* esi == 0xfffffff8 */
+                "cmpl   $0xfffffffa, %%esi\n\t"
+                "jb     .Lcarry_xmm11%=\n\t"     /* esi == 0xfffffff9 */
+                "je     .Lcarry_xmm10%=\n\t"     /* esi == 0xfffffffa */
+                "cmpl   $0xfffffffc, %%esi\n\t"
+                "jb     .Lcarry_xmm9%=\n\t"     /* esi == 0xfffffffb */
+                "je     .Lcarry_xmm8%=\n\t"     /* esi == 0xfffffffc */
+                "cmpl   $0xfffffffe, %%esi\n\t"
+                "jb     .Lcarry_xmm4%=\n\t"     /* esi == 0xfffffffd */
+                "je     .Lcarry_xmm3%=\n\t"     /* esi == 0xfffffffe */
+                /* esi == 0xffffffff */
+
+                "psubq   %%xmm1, %%xmm2\n\t"
+                ".Lcarry_xmm3%=:\n\t"
+                "psubq   %%xmm1, %%xmm3\n\t"
+                ".Lcarry_xmm4%=:\n\t"
+                "psubq   %%xmm1, %%xmm4\n\t"
+                ".Lcarry_xmm8%=:\n\t"
+                "psubq   %%xmm1, %%xmm8\n\t"
+                ".Lcarry_xmm9%=:\n\t"
+                "psubq   %%xmm1, %%xmm9\n\t"
+                ".Lcarry_xmm10%=:\n\t"
+                "psubq   %%xmm1, %%xmm10\n\t"
+                ".Lcarry_xmm11%=:\n\t"
+                "psubq   %%xmm1, %%xmm11\n\t"
+                ".Lcarry_xmm5%=:\n\t"
+                "psubq   %%xmm1, %%xmm5\n\t"
+
+                ".Lno_carry%=:\n\t"
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
+
+                "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
+                "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
+                "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
+                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
+                "pshufb %%xmm6, %%xmm8\n\t"     /* xmm8 := be(xmm8) */
+                "pshufb %%xmm6, %%xmm9\n\t"     /* xmm9 := be(xmm9) */
+                "pshufb %%xmm6, %%xmm10\n\t"    /* xmm10 := be(xmm10) */
+                "pshufb %%xmm6, %%xmm11\n\t"    /* xmm11 := be(xmm11) */
+
+                ".Lstore_ctr%=:\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
+                :
+                : [ctr] "r" (ctr),
+                  [key] "r" (ctx->keyschenc),
+                  [addb] "r" (bige_addb)
+                : "%esi", "cc", "memory");
+
+  asm volatile ("pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
+                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
+                "movdqa 0x10(%[key]), %%xmm1\n\t"
+                "cmpl $12, %[rounds]\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x20(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x30(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x40(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x50(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x60(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x70(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x80(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0x90(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xa0(%[key]), %%xmm1\n\t"
+                "jb .Lenclast%=\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xb0(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xc0(%[key]), %%xmm1\n\t"
+                "je .Lenclast%=\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xd0(%[key]), %%xmm1\n\t"
+                "aesenc %%xmm1, %%xmm0\n\t"
+                "aesenc %%xmm1, %%xmm2\n\t"
+                "aesenc %%xmm1, %%xmm3\n\t"
+                "aesenc %%xmm1, %%xmm4\n\t"
+                "aesenc %%xmm1, %%xmm8\n\t"
+                "aesenc %%xmm1, %%xmm9\n\t"
+                "aesenc %%xmm1, %%xmm10\n\t"
+                "aesenc %%xmm1, %%xmm11\n\t"
+                "movdqa 0xe0(%[key]), %%xmm1\n"
+
+                ".Lenclast%=:\n\t"
+                "aesenclast %%xmm1, %%xmm0\n\t"
+                "aesenclast %%xmm1, %%xmm2\n\t"
+                "aesenclast %%xmm1, %%xmm3\n\t"
+                "aesenclast %%xmm1, %%xmm4\n\t"
+                "aesenclast %%xmm1, %%xmm8\n\t"
+                "aesenclast %%xmm1, %%xmm9\n\t"
+                "aesenclast %%xmm1, %%xmm10\n\t"
+                "aesenclast %%xmm1, %%xmm11\n\t"
+                :
+                : [key] "r" (ctx->keyschenc),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+
+  asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
+                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
+                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
+                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
+                "movdqu 4*16(%[src]), %%xmm1\n\t"  /* Get block 5.      */
+                "pxor %%xmm12, %%xmm0\n\t"         /* EncCTR-1 ^= input */
+                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
+                "pxor %%xmm13, %%xmm2\n\t"         /* EncCTR-2 ^= input */
+                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
+                "pxor %%xmm14, %%xmm3\n\t"         /* EncCTR-3 ^= input */
+                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
+                "pxor %%xmm15, %%xmm4\n\t"         /* EncCTR-4 ^= input */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
+                "pxor %%xmm1,  %%xmm8\n\t"         /* EncCTR-5 ^= input */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
+                "pxor %%xmm12, %%xmm9\n\t"         /* EncCTR-6 ^= input */
+                "movdqu %%xmm2, 1*16(%[dst])\n\t"  /* Store block 2.    */
+                "pxor %%xmm13, %%xmm10\n\t"        /* EncCTR-7 ^= input */
+                "movdqu %%xmm3, 2*16(%[dst])\n\t"  /* Store block 3.    */
+                "pxor %%xmm14, %%xmm11\n\t"        /* EncCTR-8 ^= input */
+                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
+                "movdqu %%xmm8, 4*16(%[dst])\n\t"  /* Store block 8.    */
+                "movdqu %%xmm9, 5*16(%[dst])\n\t"  /* Store block 9.    */
+                "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10.   */
+                "movdqu %%xmm11, 7*16(%[dst])\n\t" /* Store block 11.   */
+                :
+                : [src] "r" (a),
+                  [dst] "r" (b)
+                : "memory");
+}
+
+#endif /* __x86_64__ */
+
+
 unsigned int
 _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                          const unsigned char *src)
@@ -1123,7 +1796,25 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                   [ctr] "m" (*ctr)
                 : "memory");
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  do_aesni_ctr_8 (ctx, ctr, outbuf, inbuf);
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       do_aesni_ctr_4 (ctx, ctr, outbuf, inbuf);
       outbuf += 4*BLOCKSIZE;
@@ -1175,6 +1866,76 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
                 : "memory" );
 
   /* CFB decryption can be parallelized */
+
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8; nblocks -= 8)
+	{
+	  asm volatile
+	    ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
+	     "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+	     "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+	     "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+	     "movdqu 3*16(%[inbuf]), %%xmm8\n\t"
+	     "movdqu 4*16(%[inbuf]), %%xmm9\n\t"
+	     "movdqu 5*16(%[inbuf]), %%xmm10\n\t"
+	     "movdqu 6*16(%[inbuf]), %%xmm11\n\t"
+
+	     "movdqu 7*16(%[inbuf]), %%xmm6\n\t" /* update IV */
+
+	     "movdqa %%xmm2, %%xmm12\n\t"
+	     "movdqa %%xmm3, %%xmm13\n\t"
+	     "movdqa %%xmm4, %%xmm14\n\t"
+	     "movdqa %%xmm8, %%xmm15\n\t"
+	     : /* No output */
+	     : [inbuf] "r" (inbuf)
+	     : "memory");
+
+	  do_aesni_enc_vec8 (ctx);
+
+	  asm volatile
+	    (
+	     "pxor %%xmm12, %%xmm1\n\t"
+	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+	     "pxor %%xmm13, %%xmm2\n\t"
+	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+	     "pxor %%xmm14, %%xmm3\n\t"
+	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+	     "pxor %%xmm15, %%xmm4\n\t"
+	     "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
+
+	     "pxor %%xmm12, %%xmm8\n\t"
+	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+	     "pxor %%xmm13, %%xmm9\n\t"
+	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+	     "pxor %%xmm14, %%xmm10\n\t"
+	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+	     "pxor %%xmm15, %%xmm11\n\t"
+	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+	     "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+	     "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+	     "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+	     : /* No output */
+	     : [inbuf] "r" (inbuf),
+	       [outbuf] "r" (outbuf)
+	     : "memory");
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
   for ( ;nblocks >= 4; nblocks -= 4)
     {
       asm volatile
@@ -1260,7 +2021,76 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
      : [iv] "m" (*iv)
      : "memory");
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  asm volatile
+	    ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
+	     "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+	     "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+	     "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+	     "movdqu 4*16(%[inbuf]), %%xmm8\n\t"
+	     "movdqu 5*16(%[inbuf]), %%xmm9\n\t"
+	     "movdqu 6*16(%[inbuf]), %%xmm10\n\t"
+	     "movdqu 7*16(%[inbuf]), %%xmm11\n\t"
+
+	     "movdqa %%xmm1, %%xmm12\n\t"
+	     "movdqa %%xmm2, %%xmm13\n\t"
+	     "movdqa %%xmm3, %%xmm14\n\t"
+	     "movdqa %%xmm4, %%xmm15\n\t"
+
+	     : /* No output */
+	     : [inbuf] "r" (inbuf)
+	     : "memory");
+
+	  do_aesni_dec_vec8 (ctx);
+
+	  asm volatile
+	    ("pxor %%xmm5, %%xmm1\n\t"		/* xor IV with output */
+
+	     "pxor %%xmm12, %%xmm2\n\t"		/* xor IV with output */
+	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+
+	     "pxor %%xmm13, %%xmm3\n\t"		/* xor IV with output */
+	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
+
+	     "pxor %%xmm14, %%xmm4\n\t"		/* xor IV with output */
+	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
+
+	     "pxor %%xmm15, %%xmm8\n\t"		/* xor IV with output */
+	     "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
+	     "pxor %%xmm12, %%xmm9\n\t"		/* xor IV with output */
+	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+	     "pxor %%xmm13, %%xmm10\n\t"		/* xor IV with output */
+	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+	     "pxor %%xmm14, %%xmm11\n\t"		/* xor IV with output */
+	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+	     "movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+	     "movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+	     "movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+
+	     : /* No output */
+	     : [inbuf] "r" (inbuf),
+	       [outbuf] "r" (outbuf)
+	     : "memory");
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       asm volatile
         ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
@@ -1386,7 +2216,142 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0])
+		    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm1,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqa %%xmm5,    %%xmm12\n\t"
+			:
+			: [l1] "m" (*c->u_mode.ocb.L[1]),
+			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm2,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqa %%xmm5,    %%xmm13\n\t"
+			:
+			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm3,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqa %%xmm5,    %%xmm14\n\t"
+			:
+			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm15\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm15,   %%xmm5\n\t"
+			"pxor   %%xmm4,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqa %%xmm5,    %%xmm15\n\t"
+			:
+			: [l3] "m" (*l),
+			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm8,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			"movdqu %%xmm5,    %[outbuf4]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm9,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			"movdqu %%xmm5,    %[outbuf5]\n\t"
+			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm10,   %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l7],     %%xmm11\n\t"
+			"pxor   %%xmm11,   %%xmm5\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm11,   %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			:
+			: [l7] "m" (*l),
+			  [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_enc_vec8 (ctx);
+
+	  asm volatile ("pxor   %%xmm12,   %%xmm1\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+			"movdqu %[outbuf4],%%xmm0\n\t"
+			"movdqu %[outbuf5],%%xmm12\n\t"
+			"movdqu %[outbuf6],%%xmm13\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+			"pxor   %%xmm15,   %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"pxor   %%xmm12,   %%xmm9\n\t"
+			"pxor   %%xmm13,   %%xmm10\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			"movdqu %%xmm1,    %[outbuf0]\n\t"
+			"movdqu %%xmm2,    %[outbuf1]\n\t"
+			"movdqu %%xmm3,    %[outbuf2]\n\t"
+			"movdqu %%xmm4,    %[outbuf3]\n\t"
+			"movdqu %%xmm8,    %[outbuf4]\n\t"
+			"movdqu %%xmm9,    %[outbuf5]\n\t"
+			"movdqu %%xmm10,   %[outbuf6]\n\t"
+			"movdqu %%xmm11,   %[outbuf7]\n\t"
+			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+			  [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+			  [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+			  [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+			  [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+			:
+			: "memory" );
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+    aesni_cleanup_7_15();
+  }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1394,9 +2359,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm1,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm1\n\t"
 		    "movdqu %%xmm5,    %[outbuf0]\n\t"
@@ -1414,19 +2379,17 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
 		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-		    "movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm3,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
 		    "movdqu %%xmm5,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [l2] "m" (*c->u_mode.ocb.L[0]),
-		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l3],     %%xmm4\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
 		    "pxor   %%xmm4,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
@@ -1551,7 +2514,142 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0])
+		    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqa %%xmm5,    %%xmm12\n\t"
+			:
+			: [l1] "m" (*c->u_mode.ocb.L[1]),
+			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqa %%xmm5,    %%xmm13\n\t"
+			:
+			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqa %%xmm5,    %%xmm14\n\t"
+			:
+			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqa %%xmm5,    %%xmm15\n\t"
+			:
+			: [l3] "m" (*l),
+			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			"movdqu %%xmm5,    %[outbuf4]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
+			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+			"pxor   %%xmm10,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			"movdqu %%xmm5,    %[outbuf5]\n\t"
+			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
+			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			:
+			: [l7] "m" (*l),
+			  [inbuf7] "m" (*(inbuf + 7 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_dec_vec8 (ctx);
+
+	  asm volatile ("pxor   %%xmm12,   %%xmm1\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+			"movdqu %[outbuf4],%%xmm0\n\t"
+			"movdqu %[outbuf5],%%xmm12\n\t"
+			"movdqu %[outbuf6],%%xmm13\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+			"pxor   %%xmm15,   %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"pxor   %%xmm12,   %%xmm9\n\t"
+			"pxor   %%xmm13,   %%xmm10\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			"movdqu %%xmm1,    %[outbuf0]\n\t"
+			"movdqu %%xmm2,    %[outbuf1]\n\t"
+			"movdqu %%xmm3,    %[outbuf2]\n\t"
+			"movdqu %%xmm4,    %[outbuf3]\n\t"
+			"movdqu %%xmm8,    %[outbuf4]\n\t"
+			"movdqu %%xmm9,    %[outbuf5]\n\t"
+			"movdqu %%xmm10,   %[outbuf6]\n\t"
+			"movdqu %%xmm11,   %[outbuf7]\n\t"
+			"pxor   %%xmm2,    %%xmm1\n\t"
+			"pxor   %%xmm4,    %%xmm1\n\t"
+			"pxor   %%xmm9,    %%xmm1\n\t"
+			"pxor   %%xmm11,   %%xmm1\n\t"
+			"pxor   %%xmm3,    %%xmm6\n\t"
+			"pxor   %%xmm8,    %%xmm6\n\t"
+			"pxor   %%xmm10,   %%xmm6\n\t"
+			"pxor   %%xmm1,    %%xmm6\n\t"
+			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
+			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
+			  [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
+			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)),
+			  [outbuf4] "+m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "+m" (*(outbuf + 5 * BLOCKSIZE)),
+			  [outbuf6] "+m" (*(outbuf + 6 * BLOCKSIZE)),
+			  [outbuf7] "=m" (*(outbuf + 7 * BLOCKSIZE))
+			:
+			: "memory" );
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
@@ -1559,9 +2657,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm1\n\t"
 		    "movdqu %%xmm5,    %[outbuf0]\n\t"
 		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
@@ -1577,14 +2675,12 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
 		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-		    "movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
 		    "movdqu %%xmm5,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [l2] "m" (*c->u_mode.ocb.L[0]),
-		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
       asm volatile ("movdqu %[l3],     %%xmm0\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
@@ -1722,16 +2818,115 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       abuf += BLOCKSIZE;
     }
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_7_15_variable;
+
+      aesni_prepare_7_15();
+
+      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+		    "movdqu %[l1], %%xmm12\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l1] "m" (*c->u_mode.ocb.L[1])
+		    : "memory" );
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+	  asm volatile ("movdqu %[abuf0],  %%xmm1\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			:
+			: [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf1],  %%xmm2\n\t"
+			"pxor   %%xmm12,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			:
+			: [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			:
+			: [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[abuf3],  %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			:
+			: [l3] "m" (*l),
+			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  n += 4;
+	  l = ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[abuf4],  %%xmm8\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			:
+			: [abuf4] "m" (*(abuf + 4 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf5],  %%xmm9\n\t"
+			"pxor   %%xmm12,   %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			:
+			: [abuf5] "m" (*(abuf + 5 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[abuf6],  %%xmm10\n\t"
+			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			:
+			: [abuf6] "m" (*(abuf + 6 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
+			"movdqu %[abuf7],  %%xmm11\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			:
+			: [l7] "m" (*l),
+			  [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_enc_vec8 (ctx);
+
+	  asm volatile ("pxor   %%xmm2,   %%xmm1\n\t"
+			"pxor   %%xmm3,   %%xmm1\n\t"
+			"pxor   %%xmm4,   %%xmm1\n\t"
+			"pxor   %%xmm8,   %%xmm1\n\t"
+			"pxor   %%xmm9,   %%xmm6\n\t"
+			"pxor   %%xmm10,  %%xmm6\n\t"
+			"pxor   %%xmm11,  %%xmm6\n\t"
+			"pxor   %%xmm1,   %%xmm6\n\t"
+			:
+			:
+			: "memory" );
+
+	  abuf += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_7_15();
+    }
+#endif
+
+  for ( ;nblocks >= 4 ; nblocks -= 4 )
     {
       n += 4;
       l = ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
 		    "movdqu %[abuf0],  %%xmm1\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm1\n\t"
 		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
@@ -1745,9 +2940,8 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 		    : [l1] "m" (*c->u_mode.ocb.L[1]),
 		      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-		    "movdqu %[abuf2],  %%xmm3\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
+      asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
 		    :
 		    : [l2] "m" (*c->u_mode.ocb.L[0]),

-----------------------------------------------------------------------
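
Editorial note: the comment lines inside the new 8-block loops above restate the OCB
recurrences that the offset register (xmm5) and checksum register (xmm6) implement.
As an editorial illustration only (not libgcrypt code; xor_block, ntz and the L-table
layout here are stand-ins), the same bookkeeping in plain C looks roughly like this:

#include <stddef.h>
#include <stdint.h>

#define OCB_BLOCK 16

/* XOR one 16-byte block into another.  */
void
xor_block (unsigned char *dst, const unsigned char *src)
{
  int i;
  for (i = 0; i < OCB_BLOCK; i++)
    dst[i] ^= src[i];
}

/* Number of trailing zero bits of the 1-based block index.  */
unsigned int
ntz (uint64_t i)
{
  unsigned int n = 0;
  while ((i & 1) == 0)
    {
      i >>= 1;
      n++;
    }
  return n;
}

/* Walk NBLOCKS plaintext blocks starting at 1-based index FIRST_I,
   updating OFFSET and CHECKSUM exactly as the assembly comments say:
     Offset_i   = Offset_{i-1} xor L_{ntz(i)}
     Checksum_i = Checksum_{i-1} xor P_i
   The remaining step, C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i),
   is done in the loops above by XORing the offsets around one call to
   do_aesni_enc_vec8.  */
void
ocb_walk (unsigned char offset[OCB_BLOCK], unsigned char checksum[OCB_BLOCK],
          const unsigned char (*L)[OCB_BLOCK], uint64_t first_i,
          const unsigned char *plain, size_t nblocks)
{
  size_t k;
  for (k = 0; k < nblocks; k++)
    {
      uint64_t i = first_i + k;                     /* 1-based block number */
      xor_block (offset, L[ntz (i)]);               /* advance the offset */
      xor_block (checksum, plain + k * OCB_BLOCK);  /* fold in the plaintext */
    }
}

Since ntz(i) is 0 for every odd i and 1 whenever i is twice an odd number, only every
fourth offset needs a table walk; that is why the loops above keep L[0] and L[1] in
spare registers and call ocb_get_l() just once per group of four blocks.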

Summary of changes:
 cipher/cipher-internal.h |    2 +-
 cipher/cipher-xts.c      |    3 +-
 cipher/cipher.c          |    1 +
 cipher/rijndael-aesni.c  | 1539 +++++++++++++++++++++++++++++++++++++++++++++-
 cipher/rijndael.c        |   84 +++
 src/cipher.h             |    3 +
 6 files changed, 1603 insertions(+), 29 deletions(-)
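
Editorial note: on the application side nothing needs to change. Whenever libgcrypt
has detected AES-NI and (on x86_64) at least eight full blocks remain in a request,
the mode code now takes the 8-block loops automatically. A minimal sketch of driving
the OCB path through the public API, with placeholder key and nonce values and all
error checking omitted:

#include <gcrypt.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[16] = { 0 };       /* placeholder key */
  unsigned char nonce[12] = { 0 };     /* placeholder nonce */
  unsigned char buf[64 * 16] = { 0 };  /* 64 blocks; plenty for the 8-way loop */
  unsigned char tag[16];

  gcry_check_version (NULL);
  gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_OCB, 0);
  gcry_cipher_setkey (hd, key, sizeof key);
  gcry_cipher_setiv (hd, nonce, sizeof nonce);

  /* One whole-buffer, in-place call; the mode driver passes the full
     span of complete blocks to the registered bulk function.  */
  gcry_cipher_final (hd);
  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
  gcry_cipher_gettag (hd, tag, sizeof tag);

  gcry_cipher_close (hd);
  return 0;
}

The same holds for the new CBC and CFB decryption loops: handing gcry_cipher_decrypt()
large buffers, rather than block-sized pieces, is what lets the per-mode code reach the
wide paths.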


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org



