[git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-131-g9d9c4fd

by Jussi Kivilinna cvs at cvs.gnupg.org
Tue Nov 20 20:17:52 CET 2018


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  9d9c4fd18b445ff414d11678285d54af3afdb222 (commit)
       via  b42de67f34871a2520cfe370af513f2aab6e4f75 (commit)
      from  af0bbdb9019e0b4a72e87e8b1b4a55506d349834 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
commit 9d9c4fd18b445ff414d11678285d54af3afdb222
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Tue Nov 20 21:16:08 2018 +0200

    Add clang target pragma for mixed C/assembly x86-64 implementations
    
    * cipher/cipher-gcm-intel-pclmul.c: Add target 'no-sse' attribute
    pragma for clang.
    * cipher/crc-intel-pclmul.c: Ditto.
    * cipher/rijndael-aesni.c: Ditto.
    * cipher/rijndael-ssse3-amd64.c: Ditto.
    * cipher/sha1-intel-shaext.c: Ditto.
    * cipher/sha256-intel-shaext.c: Ditto.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
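
For reference, the shape each of these files takes after the change (the
existing "pragma GCC target" plus the new clang attribute push/pop pair) is
sketched below.  The function bodies are elided and the #if condition around
the GCC pragma is simplified for illustration; only the clang lines are taken
verbatim from the hunks that follow.

    /* Keep compiler-generated SSE code out of the C glue between the
     * inline-asm blocks (sketch; surrounding #if condition simplified). */
    #if defined(__GNUC__) && !defined(__clang__)
    #  pragma GCC target("no-sse")
    #endif
    #if __clang__
    #  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
    #endif

    /* ... functions mixing C and x86-64 inline assembly ... */

    #if __clang__
    #  pragma clang attribute pop
    #endif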

diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index 0f26277..60ae7aa 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -37,6 +37,9 @@
 /* Prevent compiler from issuing SSE instructions between asm blocks. */
 #  pragma GCC target("no-sse")
 #endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
 
 
 /*
@@ -474,4 +477,8 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
   return 0;
 }
 
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
 #endif /* GCM_USE_INTEL_PCLMUL */
diff --git a/cipher/crc-intel-pclmul.c b/cipher/crc-intel-pclmul.c
index 8ff08ec..482b260 100644
--- a/cipher/crc-intel-pclmul.c
+++ b/cipher/crc-intel-pclmul.c
@@ -39,6 +39,9 @@
 /* Prevent compiler from issuing SSE instructions between asm blocks. */
 #  pragma GCC target("no-sse")
 #endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
 
 
 #define ALIGNED_16 __attribute__ ((aligned (16)))
@@ -922,4 +925,8 @@ _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
 #endif
 }
 
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
 #endif /* USE_INTEL_PCLMUL */
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index c1ebab0..483387c 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -39,6 +39,9 @@
 /* Prevent compiler from issuing SSE instructions between asm blocks. */
 #  pragma GCC target("no-sse")
 #endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
 
 
 #define ALWAYS_INLINE inline __attribute__((always_inline))
@@ -3514,4 +3517,8 @@ _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
     _gcry_aes_aesni_xts_dec(ctx, tweak, outbuf, inbuf, nblocks);
 }
 
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
 #endif /* USE_AESNI */
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index fa481bb..0c1ae6e 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -55,6 +55,9 @@
 /* Prevent compiler from issuing SSE instructions between asm blocks. */
 #  pragma GCC target("no-sse")
 #endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
 
 
 /* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
@@ -726,4 +729,8 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   return 0;
 }
 
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
 #endif /* USE_SSSE3 */
diff --git a/cipher/sha1-intel-shaext.c b/cipher/sha1-intel-shaext.c
index 5a2349e..d7e3d4f 100644
--- a/cipher/sha1-intel-shaext.c
+++ b/cipher/sha1-intel-shaext.c
@@ -29,6 +29,9 @@
 /* Prevent compiler from issuing SSE instructions between asm blocks. */
 #  pragma GCC target("no-sse")
 #endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
 
 /* Two macros to be called prior and after the use of SHA-EXT
    instructions.  There should be no external function calls between
@@ -278,4 +281,8 @@ _gcry_sha1_transform_intel_shaext(void *state, const unsigned char *data,
   return 0;
 }
 
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
 #endif /* HAVE_GCC_INLINE_ASM_SHA_EXT */
diff --git a/cipher/sha256-intel-shaext.c b/cipher/sha256-intel-shaext.c
index 0c107bb..2eda42d 100644
--- a/cipher/sha256-intel-shaext.c
+++ b/cipher/sha256-intel-shaext.c
@@ -29,6 +29,9 @@
 /* Prevent compiler from issuing SSE instructions between asm blocks. */
 #  pragma GCC target("no-sse")
 #endif
+#if __clang__
+#  pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
+#endif
 
 /* Two macros to be called prior and after the use of SHA-EXT
    instructions.  There should be no external function calls between
@@ -349,4 +352,8 @@ _gcry_sha256_transform_intel_shaext(u32 state[8], const unsigned char *data,
   return 0;
 }
 
+#if __clang__
+#  pragma clang attribute pop
+#endif
+
 #endif /* HAVE_GCC_INLINE_ASM_SHA_EXT */

commit b42de67f34871a2520cfe370af513f2aab6e4f75
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Tue Nov 20 21:16:08 2018 +0200

    Optimizations for AES-NI OCB
    
    * cipher/cipher-internal.h (gcry_cipher_handle): New pre-computed OCB
    values L0L1 and L0L1L0; Swap dimensions for OCB L table.
    * cipher/cipher-ocb.c (_gcry_cipher_ocb_set_nonce): Setup L0L1 and
    L0L1L0 values.
    (ocb_crypt): Process input in 24 KiB chunks for better cache locality
    during checksumming.
    * cipher/rijndael-aesni.c (ALWAYS_INLINE): New macro for always
    inlining functions, change all functions with 'inline' to use
    ALWAYS_INLINE.
    (NO_INLINE): New macro.
    (aesni_prepare_2_6_variable, aesni_prepare_7_15_variable): Rename to...
    (aesni_prepare_2_7_variable, aesni_prepare_8_15_variable): ...these and
    adjust accordingly (xmm7 moved from *_7_15 to *_2_7).
    (aesni_prepare_2_6, aesni_prepare_7_15): Rename to...
    (aesni_prepare_2_7, aesni_prepare_8_15): ...these and adjust
    accordingly.
    (aesni_cleanup_2_6, aesni_cleanup_7_15): Rename to...
    (aesni_cleanup_2_7, aesni_cleanup_8_15): ...these and adjust
    accordingly.
    (aesni_ocb_checksum): New.
    (aesni_ocb_enc, aesni_ocb_dec): Calculate OCB offsets in parallel
    with the help of the pre-computed offsets L0+L1 and L0+L1+L0; Do checksum
    calculation as a separate pass instead of inline; Use NO_INLINE.
    (_gcry_aes_aesni_ocb_auth): Calculate OCB offsets in parallel
    with the help of the pre-computed offsets L0+L1 and L0+L1+L0.
    * cipher/rijndael-internal.h (RIJNDAEL_context_s) [USE_AESNI]: Add
    'use_avx2' and 'use_avx'.
    * cipher/rijndael.c (do_setkey) [USE_AESNI]: Set 'use_avx2' if
    Intel AVX2 HW feature is available and 'use_avx' if Intel AVX HW
    feature is available.
    * tests/basic.c (do_check_ocb_cipher): New test vector; increase
    size of temporary buffers for new test vector.
    (check_ocb_cipher_largebuf_split): Make test plaintext non-uniform
    for better checksum testing.
    (check_ocb_cipher_checksum): New.
    (check_ocb_cipher_largebuf): Call check_ocb_cipher_checksum.
    (check_ocb_cipher): New expected tags for check_ocb_cipher_largebuf
    test runs.
    --
    
    Benchmark on Haswell i7-4970k @ 4.0 GHz:
    
    Before:
     AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
            OCB enc |     0.175 ns/B      5436 MiB/s     0.702 c/B
            OCB dec |     0.184 ns/B      5184 MiB/s     0.736 c/B
           OCB auth |     0.156 ns/B      6097 MiB/s     0.626 c/B
    
    After (enc +2% faster, dec +7% faster):
            OCB enc |     0.172 ns/B      5547 MiB/s     0.688 c/B
            OCB dec |     0.171 ns/B      5582 MiB/s     0.683 c/B
           OCB auth |     0.156 ns/B      6097 MiB/s     0.626 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
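
For context on why the new pre-computed L0L1 and L0L1L0 blocks help: within a
group of four consecutive OCB blocks ending at an index n with n % 4 == 0, the
ntz() sequence is 0, 1, 0, ntz(n), so all four offsets can be derived directly
from the offset left by the previous group instead of being chained one after
another, which is what lets the 8-block paths compute offsets in parallel.
The sketch below is illustrative C under that assumption (xor16 and
ocb_offsets_for_group are made-up names, not libgcrypt API); the patch itself
does the equivalent work in SSE registers.

    #include <string.h>

    #define OCB_BLOCK_LEN 16

    /* Illustrative helper, not part of libgcrypt. */
    static void xor16 (unsigned char *dst, const unsigned char *src)
    {
      int i;
      for (i = 0; i < OCB_BLOCK_LEN; i++)
        dst[i] ^= src[i];
    }

    /* Offsets for the four blocks of one group, given the offset left by the
     * previous group.  L0L1 = L[0]^L[1] and L0L1L0 = L[0]^L0L1 are the values
     * set up in _gcry_cipher_ocb_set_nonce; l is L[ntz] of the fourth block. */
    static void
    ocb_offsets_for_group (unsigned char off[4][OCB_BLOCK_LEN],
                           const unsigned char prev[OCB_BLOCK_LEN],
                           const unsigned char L0[OCB_BLOCK_LEN],
                           const unsigned char L0L1[OCB_BLOCK_LEN],
                           const unsigned char L0L1L0[OCB_BLOCK_LEN],
                           const unsigned char l[OCB_BLOCK_LEN])
    {
      memcpy (off[0], prev, OCB_BLOCK_LEN);   xor16 (off[0], L0);     /* ntz = 0 */
      memcpy (off[1], prev, OCB_BLOCK_LEN);   xor16 (off[1], L0L1);   /* ntz = 1 */
      memcpy (off[2], prev, OCB_BLOCK_LEN);   xor16 (off[2], L0L1L0); /* ntz = 0 */
      memcpy (off[3], off[2], OCB_BLOCK_LEN); xor16 (off[3], l);      /* ntz >= 2 */
    }

The checksum, in turn, is now a plain XOR over the plaintext done as its own
pass (aesni_ocb_checksum), which is why ocb_crypt caps each bulk call at
24 KiB: the data is still in L1 cache when the second pass touches it.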

diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index f93363b..8988696 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -319,7 +319,9 @@ struct gcry_cipher_handle
       /* Helper variables and pre-computed table of L values.  */
       unsigned char L_star[OCB_BLOCK_LEN];
       unsigned char L_dollar[OCB_BLOCK_LEN];
-      unsigned char L[OCB_BLOCK_LEN][OCB_L_TABLE_SIZE];
+      unsigned char L0L1[OCB_BLOCK_LEN];
+      unsigned char L0L1L0[OCB_BLOCK_LEN];
+      unsigned char L[OCB_L_TABLE_SIZE][OCB_BLOCK_LEN];
 
       /* The tag is valid if marks.tag has been set.  */
       unsigned char tag[OCB_BLOCK_LEN];
diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c
index f71520a..58f7be7 100644
--- a/cipher/cipher-ocb.c
+++ b/cipher/cipher-ocb.c
@@ -170,6 +170,11 @@ _gcry_cipher_ocb_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce,
   double_block_cpy (c->u_mode.ocb.L[0], c->u_mode.ocb.L_dollar);
   for (i = 1; i < OCB_L_TABLE_SIZE; i++)
     double_block_cpy (c->u_mode.ocb.L[i], c->u_mode.ocb.L[i-1]);
+  /* Precalculated offsets L0+L1, L0+L1+L0 */
+  cipher_block_xor (c->u_mode.ocb.L0L1,
+		    c->u_mode.ocb.L[0], c->u_mode.ocb.L[1], OCB_BLOCK_LEN);
+  cipher_block_xor (c->u_mode.ocb.L0L1L0,
+		    c->u_mode.ocb.L[0], c->u_mode.ocb.L0L1, OCB_BLOCK_LEN);
 
   /* Prepare the nonce.  */
   memset (ktop, 0, (OCB_BLOCK_LEN - noncelen));
@@ -519,6 +524,12 @@ ocb_crypt (gcry_cipher_hd_t c, int encrypt,
 
       nblks = nblks < nmaxblks ? nblks : nmaxblks;
 
+      /* Since checksum xoring is done before/after encryption/decryption,
+	process input in 24KiB chunks to keep data loaded in L1 cache for
+	checksumming. */
+      if (nblks > 24 * 1024 / OCB_BLOCK_LEN)
+	nblks = 24 * 1024 / OCB_BLOCK_LEN;
+
       /* Use a bulk method if available.  */
       if (nblks && c->bulk.ocb_crypt)
         {
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index d190c0a..c1ebab0 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -41,6 +41,10 @@
 #endif
 
 
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+
+
 typedef struct u128_s
 {
   u32 a, b, c, d;
@@ -49,7 +53,7 @@ typedef struct u128_s
 
 /* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
    because of 'pragma target'. */
-static inline const unsigned char *
+static ALWAYS_INLINE const unsigned char *
 aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
 {
   unsigned long ntz;
@@ -71,78 +75,78 @@ aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
    the key or the data.  */
 #ifdef __WIN64__
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
-# define aesni_prepare_2_6_variable char win64tmp[16]
-# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
+# define aesni_prepare_2_7_variable char win64tmp[16 * 2]
+# define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8]
 # define aesni_prepare() do { } while (0)
-# define aesni_prepare_2_6()                                            \
+# define aesni_prepare_2_7()                                            \
    do { asm volatile ("movdqu %%xmm6, %0\n\t"                           \
-                      : "=m" (*win64tmp)                                \
+		      "movdqu %%xmm7, %1\n\t"                           \
+                      : "=m" (*win64tmp), "=m" (*(win64tmp+16))         \
                       :                                                 \
                       : "memory");                                      \
    } while (0)
-# define aesni_prepare_7_15()                                           \
-   do { asm volatile ("movdqu %%xmm7,  0*16(%0)\n\t"                    \
-                      "movdqu %%xmm8,  1*16(%0)\n\t"                    \
-                      "movdqu %%xmm9,  2*16(%0)\n\t"                    \
-                      "movdqu %%xmm10, 3*16(%0)\n\t"                    \
-                      "movdqu %%xmm11, 4*16(%0)\n\t"                    \
-                      "movdqu %%xmm12, 5*16(%0)\n\t"                    \
-                      "movdqu %%xmm13, 6*16(%0)\n\t"                    \
-                      "movdqu %%xmm14, 7*16(%0)\n\t"                    \
-                      "movdqu %%xmm15, 8*16(%0)\n\t"                    \
+# define aesni_prepare_8_15()                                           \
+   do { asm volatile ("movdqu %%xmm8,  0*16(%0)\n\t"                    \
+                      "movdqu %%xmm9,  1*16(%0)\n\t"                    \
+                      "movdqu %%xmm10, 2*16(%0)\n\t"                    \
+                      "movdqu %%xmm11, 3*16(%0)\n\t"                    \
+                      "movdqu %%xmm12, 4*16(%0)\n\t"                    \
+                      "movdqu %%xmm13, 5*16(%0)\n\t"                    \
+                      "movdqu %%xmm14, 6*16(%0)\n\t"                    \
+                      "movdqu %%xmm15, 7*16(%0)\n\t"                    \
                       :                                                 \
-                      : "r" (win64tmp7_15)                              \
+                      : "r" (win64tmp8_15)                              \
                       : "memory");                                      \
    } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
    } while (0)
-# define aesni_cleanup_2_6()                                            \
+# define aesni_cleanup_2_7()                                            \
    do { asm volatile ("movdqu %0,   %%xmm6\n\t"                         \
+		      "movdqu %1,   %%xmm7\n\t"                         \
                       "pxor %%xmm2, %%xmm2\n"                           \
                       "pxor %%xmm3, %%xmm3\n"                           \
                       "pxor %%xmm4, %%xmm4\n"                           \
                       "pxor %%xmm5, %%xmm5\n"                           \
                       :                                                 \
-                      : "m" (*win64tmp)                                 \
+                      : "m" (*win64tmp), "m" (*(win64tmp+16))           \
                       : "memory");                                      \
    } while (0)
-# define aesni_cleanup_7_15()                                           \
-   do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t"                     \
-                      "movdqu 1*16(%0), %%xmm8\n\t"                     \
-                      "movdqu 2*16(%0), %%xmm9\n\t"                     \
-                      "movdqu 3*16(%0), %%xmm10\n\t"                    \
-                      "movdqu 4*16(%0), %%xmm11\n\t"                    \
-                      "movdqu 5*16(%0), %%xmm12\n\t"                    \
-                      "movdqu 6*16(%0), %%xmm13\n\t"                    \
-                      "movdqu 7*16(%0), %%xmm14\n\t"                    \
-                      "movdqu 8*16(%0), %%xmm15\n\t"                    \
+# define aesni_cleanup_8_15()                                           \
+   do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t"                     \
+                      "movdqu 1*16(%0), %%xmm9\n\t"                     \
+                      "movdqu 2*16(%0), %%xmm10\n\t"                    \
+                      "movdqu 3*16(%0), %%xmm11\n\t"                    \
+                      "movdqu 4*16(%0), %%xmm12\n\t"                    \
+                      "movdqu 5*16(%0), %%xmm13\n\t"                    \
+                      "movdqu 6*16(%0), %%xmm14\n\t"                    \
+                      "movdqu 7*16(%0), %%xmm15\n\t"                    \
                       :                                                 \
-                      : "r" (win64tmp7_15)                              \
+                      : "r" (win64tmp8_15)                              \
                       : "memory");                                      \
    } while (0)
 #else
-# define aesni_prepare_2_6_variable
+# define aesni_prepare_2_7_variable
 # define aesni_prepare() do { } while (0)
-# define aesni_prepare_2_6() do { } while (0)
+# define aesni_prepare_2_7() do { } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
    } while (0)
-# define aesni_cleanup_2_6()                                            \
-   do { asm volatile ("pxor %%xmm2, %%xmm2\n\t"                         \
+# define aesni_cleanup_2_7()                                            \
+   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
+                      "pxor %%xmm2, %%xmm2\n\t"                         \
                       "pxor %%xmm3, %%xmm3\n"                           \
                       "pxor %%xmm4, %%xmm4\n"                           \
                       "pxor %%xmm5, %%xmm5\n"                           \
                       "pxor %%xmm6, %%xmm6\n":: );                      \
    } while (0)
 # ifdef __x86_64__
-#  define aesni_prepare_7_15_variable
-#  define aesni_prepare_7_15() do { } while (0)
-#  define aesni_cleanup_7_15()                                          \
-   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
-                      "pxor %%xmm8, %%xmm8\n"                           \
+#  define aesni_prepare_8_15_variable
+#  define aesni_prepare_8_15() do { } while (0)
+#  define aesni_cleanup_8_15()                                          \
+   do { asm volatile ("pxor %%xmm8, %%xmm8\n"                           \
                       "pxor %%xmm9, %%xmm9\n"                           \
                       "pxor %%xmm10, %%xmm10\n"                         \
                       "pxor %%xmm11, %%xmm11\n"                         \
@@ -157,10 +161,10 @@ aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
 void
 _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   if (ctx->rounds < 12)
     {
@@ -383,12 +387,12 @@ _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
     }
 
   aesni_cleanup();
-  aesni_cleanup_2_6();
+  aesni_cleanup_2_7();
 }
 
 
 /* Make a decryption key from an encryption key. */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_prepare_decryption (RIJNDAEL_context *ctx)
 {
   /* The AES-NI decrypt instructions use the Equivalent Inverse
@@ -447,7 +451,7 @@ _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
 
 /* Encrypt one block using the Intel AES-NI instructions.  Block is input
  * and output through SSE register xmm0. */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_enc (const RIJNDAEL_context *ctx)
 {
 #define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
@@ -500,7 +504,7 @@ do_aesni_enc (const RIJNDAEL_context *ctx)
 
 /* Decrypt one block using the Intel AES-NI instructions.  Block is input
  * and output through SSE register xmm0. */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_dec (const RIJNDAEL_context *ctx)
 {
 #define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
@@ -553,7 +557,7 @@ do_aesni_dec (const RIJNDAEL_context *ctx)
 
 /* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
 {
 #define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
@@ -662,7 +666,7 @@ do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
 
 /* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 {
 #define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
@@ -773,7 +777,7 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 
 /* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
 {
   asm volatile ("movdqa (%[key]), %%xmm0\n\t"
@@ -925,7 +929,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
 
 /* Decrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
 {
   asm volatile ("movdqa (%[key]), %%xmm0\n\t"
@@ -1757,10 +1761,10 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
                          size_t nblocks, int cbc_mac)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
                 : /* No output */
@@ -1794,7 +1798,7 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -1805,10 +1809,10 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
                 "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
@@ -1820,9 +1824,9 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
@@ -1831,7 +1835,7 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -1848,7 +1852,7 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
       inbuf  += BLOCKSIZE;
     }
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -1876,10 +1880,10 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
                          size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   asm volatile ("movdqu %[iv], %%xmm6\n\t"
                 : /* No output */
@@ -1891,9 +1895,9 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8; nblocks -= 8)
 	{
@@ -1953,7 +1957,7 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2022,7 +2026,7 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -2031,10 +2035,10 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
                          size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   if ( !ctx->decryption_prepared )
     {
@@ -2051,9 +2055,9 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
@@ -2113,7 +2117,7 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2187,11 +2191,175 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
      : "memory");
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
-static void
+static ALWAYS_INLINE void
+aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
+		    size_t nblocks)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+
+  /* Calculate checksum */
+  asm volatile ("movdqu %[checksum], %%xmm6\n\t"
+                "pxor %%xmm1, %%xmm1\n\t"
+                "pxor %%xmm2, %%xmm2\n\t"
+                "pxor %%xmm3, %%xmm3\n\t"
+                :
+                :[checksum] "m" (*c->u_ctr.ctr)
+                : "memory" );
+
+  if (0) {}
+#if defined(HAVE_GCC_INLINE_ASM_AVX2)
+  else if (nblocks >= 16 && ctx->use_avx2)
+    {
+      /* Use wider 256-bit registers for fast xoring of plaintext. */
+      asm volatile ("vzeroupper\n\t"
+		    "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
+		    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
+		    "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
+		    "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+                    :
+                    :
+                    : "memory");
+
+      for (;nblocks >= 16; nblocks -= 16)
+	{
+	  asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
+			"vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
+			"vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
+			"vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
+			"vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
+			"vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
+			"vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
+			"vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
+			:
+			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
+			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
+			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
+			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)),
+			  [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
+			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
+			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
+			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
+			: "memory" );
+	  plaintext += BLOCKSIZE * 16;
+	}
+
+      asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
+		    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
+		    "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
+		    "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
+		    "vextracti128 $1, %%ymm6, %%xmm0\n\t"
+		    "vextracti128 $1, %%ymm1, %%xmm4\n\t"
+		    "vextracti128 $1, %%ymm2, %%xmm5\n\t"
+		    "vextracti128 $1, %%ymm3, %%xmm7\n\t"
+		    "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
+		    "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
+		    "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
+		    "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
+		    "vzeroupper\n\t"
+		    :
+		    :
+		    : "memory" );
+    }
+#endif
+#if defined(HAVE_GCC_INLINE_ASM_AVX)
+  else if (nblocks >= 16 && ctx->use_avx)
+    {
+      /* Same as AVX2, except using 256-bit floating point instructions. */
+      asm volatile ("vzeroupper\n\t"
+		    "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
+		    "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
+		    "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
+		    "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
+                    :
+                    :
+                    : "memory");
+
+      for (;nblocks >= 16; nblocks -= 16)
+	{
+	  asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
+			"vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
+			"vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
+			"vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
+			"vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
+			"vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
+			"vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
+			"vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
+			:
+			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
+			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
+			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
+			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)),
+			  [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
+			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
+			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
+			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
+			: "memory" );
+	  plaintext += BLOCKSIZE * 16;
+	}
+
+      asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
+		    "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
+		    "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
+		    "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
+		    "vextractf128 $1, %%ymm6, %%xmm0\n\t"
+		    "vextractf128 $1, %%ymm1, %%xmm4\n\t"
+		    "vextractf128 $1, %%ymm2, %%xmm5\n\t"
+		    "vextractf128 $1, %%ymm3, %%xmm7\n\t"
+		    "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
+		    "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
+		    "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
+		    "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
+		    "vzeroupper\n\t"
+		    :
+		    :
+		    : "memory" );
+    }
+#endif
+
+  for (;nblocks >= 4; nblocks -= 4)
+    {
+      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
+		    "movdqu %[ptr1], %%xmm4\n\t"
+		    "movdqu %[ptr2], %%xmm5\n\t"
+		    "movdqu %[ptr3], %%xmm7\n\t"
+		    "pxor %%xmm0, %%xmm6\n\t"
+		    "pxor %%xmm4, %%xmm1\n\t"
+		    "pxor %%xmm5, %%xmm2\n\t"
+		    "pxor %%xmm7, %%xmm3\n\t"
+		    :
+		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
+		      [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
+		      [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
+		      [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
+		    : "memory" );
+      plaintext += BLOCKSIZE * 4;
+    }
+
+  for (;nblocks >= 1; nblocks -= 1)
+    {
+      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
+		    "pxor %%xmm0, %%xmm6\n\t"
+		    :
+		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
+		    : "memory" );
+      plaintext += BLOCKSIZE;
+    }
+
+  asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+		"pxor %%xmm2, %%xmm6\n\t"
+		"pxor %%xmm3, %%xmm6\n\t"
+		"movdqu %%xmm6, %[checksum]\n\t"
+		: [checksum] "=m" (*c->u_ctr.ctr)
+		:
+		: "memory" );
+}
+
+
+static unsigned int NO_INLINE
 aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                const void *inbuf_arg, size_t nblocks)
 {
@@ -2200,31 +2368,28 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
   const unsigned char *l;
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
+
+  aesni_ocb_checksum (c, inbuf_arg, nblocks);
 
-  /* Preload Offset and Checksum */
+  /* Preload Offset */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
-                "movdqu %[ctr], %%xmm6\n\t"
                 : /* No output */
-                : [iv] "m" (*c->u_iv.iv),
-                  [ctr] "m" (*c->u_ctr.ctr)
+                : [iv] "m" (*c->u_iv.iv)
                 : "memory" );
 
-
   for ( ;nblocks && n % 4; nblocks-- )
     {
       l = aes_ocb_get_l(c, ++n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
       asm volatile ("movdqu %[l],     %%xmm1\n\t"
                     "movdqu %[inbuf], %%xmm0\n\t"
                     "pxor   %%xmm1,   %%xmm5\n\t"
-                    "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
                     : [l] "m" (*l),
@@ -2246,11 +2411,11 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
-      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+      asm volatile ("movdqu %[l0], %%xmm6\n\t"
 		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0])
 		    : "memory" );
@@ -2260,78 +2425,78 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 	  n += 4;
 	  l = aes_ocb_get_l(c, n);
 
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-
-	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqa %%xmm5,    %%xmm12\n\t"
+	  asm volatile ("movdqu %[l0l1],   %%xmm10\n\t"
+			"movdqu %[l0l1l0], %%xmm11\n\t"
+			"movdqu %[l3],     %%xmm15\n\t"
 			:
-			: [l1] "m" (*c->u_mode.ocb.L[1]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: [l0l1] "m" (*c->u_mode.ocb.L0L1),
+			  [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0),
+			  [l3] "m" (*l)
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm2,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqa %%xmm5,    %%xmm13\n\t"
+
+	  n += 4;
+	  l = aes_ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i)  */
+	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+			"movdqu %[inbuf1], %%xmm2\n\t"
+			"movdqu %[inbuf2], %%xmm3\n\t"
 			:
-			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
+			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
+			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqa %%xmm5,    %%xmm14\n\t"
+	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+			"movdqu %[inbuf4], %%xmm8\n\t"
+			"movdqu %[inbuf5], %%xmm9\n\t"
 			:
-			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
+			  [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
+			  [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm15\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
+	  asm volatile ("movdqa %%xmm6,    %%xmm12\n\t"
+			"pxor   %%xmm5,    %%xmm12\n\t"
+			"pxor   %%xmm12,   %%xmm1\n\t"
+
+			"movdqa %%xmm10,   %%xmm13\n\t"
+			"pxor   %%xmm5,    %%xmm13\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+
+			"movdqa %%xmm11,   %%xmm14\n\t"
+			"pxor   %%xmm5,    %%xmm14\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+
+			"pxor   %%xmm11,   %%xmm5\n\t"
 			"pxor   %%xmm15,   %%xmm5\n\t"
-			"pxor   %%xmm4,    %%xmm6\n\t"
 			"pxor   %%xmm5,    %%xmm4\n\t"
 			"movdqa %%xmm5,    %%xmm15\n\t"
-			:
-			: [l3] "m" (*l),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
 
-	  n += 4;
-	  l = aes_ocb_get_l(c, n);
-
-	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm8,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm8\n\t"
-			"movdqu %%xmm5,    %[outbuf4]\n\t"
-			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
-			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm9,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm9\n\t"
-			"movdqu %%xmm5,    %[outbuf5]\n\t"
-			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
-			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			"movdqa %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"movdqu %%xmm0,    %[outbuf4]\n\t"
+
+			"movdqa %%xmm10,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm9\n\t"
+			"movdqu %%xmm0,    %[outbuf5]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			:
 			: "memory" );
 	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm10,   %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm10\n\t"
-			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm10\n\t"
+			"movdqu %%xmm0,    %[outbuf6]\n\t"
 			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
 			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[l7],     %%xmm11\n\t"
+	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
 			"pxor   %%xmm11,   %%xmm5\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
 			"movdqu %[inbuf7], %%xmm11\n\t"
-			"pxor   %%xmm11,   %%xmm6\n\t"
 			"pxor   %%xmm5,    %%xmm11\n\t"
 			:
 			: [l7] "m" (*l),
@@ -2374,7 +2539,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-    aesni_cleanup_7_15();
+    aesni_cleanup_8_15();
   }
 #endif
 
@@ -2384,44 +2549,46 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       l = aes_ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm1,    %%xmm6\n\t"
-		    "pxor   %%xmm5,    %%xmm1\n\t"
-		    "movdqu %%xmm5,    %[outbuf0]\n\t"
-		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    "movdqu %[l0l1],   %%xmm3\n\t"
+		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
 		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-		    "movdqu %[inbuf1], %%xmm2\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
-		    "pxor   %%xmm2,    %%xmm6\n\t"
-		    "pxor   %%xmm5,    %%xmm2\n\t"
-		    "movdqu %%xmm5,    %[outbuf1]\n\t"
-		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-		    : [l1] "m" (*c->u_mode.ocb.L[1]),
-		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+      asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t"
+		    "movdqu %[l3],     %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm0,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0),
+		      [l3] "m" (*l)
 		    : "memory" );
-      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm3,    %%xmm6\n\t"
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
-		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    "pxor   %%xmm3,    %%xmm2\n\t"
+		    "movdqu %%xmm3,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm0,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    :
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm4\n\t"
+      asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
 		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
-		    "pxor   %%xmm4,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
-		    : [l3] "m" (*l),
-		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
 		    : "memory" );
 
       do_aesni_enc_vec4 (ctx);
@@ -2453,12 +2620,10 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       l = aes_ocb_get_l(c, ++n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
       asm volatile ("movdqu %[l],     %%xmm1\n\t"
                     "movdqu %[inbuf], %%xmm0\n\t"
                     "pxor   %%xmm1,   %%xmm5\n\t"
-                    "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
                     : [l] "m" (*l),
@@ -2479,30 +2644,31 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = n;
   asm volatile ("movdqu %%xmm5, %[iv]\n\t"
-                "movdqu %%xmm6, %[ctr]\n\t"
-                : [iv] "=m" (*c->u_iv.iv),
-                  [ctr] "=m" (*c->u_ctr.ctr)
+                : [iv] "=m" (*c->u_iv.iv)
                 :
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
+
+  return 0;
 }
 
 
-static void
+static unsigned int NO_INLINE
 aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
-               const void *inbuf_arg, size_t nblocks)
+               const void *inbuf_arg, size_t nblocks_arg)
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
   const unsigned char *l;
-  aesni_prepare_2_6_variable;
+  size_t nblocks = nblocks_arg;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   if ( !ctx->decryption_prepared )
     {
@@ -2510,12 +2676,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       ctx->decryption_prepared = 1;
     }
 
-  /* Preload Offset and Checksum */
+  /* Preload Offset */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
-                "movdqu %[ctr], %%xmm6\n\t"
                 : /* No output */
-                : [iv] "m" (*c->u_iv.iv),
-                  [ctr] "m" (*c->u_ctr.ctr)
+                : [iv] "m" (*c->u_iv.iv)
                 : "memory" );
 
   for ( ;nblocks && n % 4; nblocks-- )
@@ -2524,7 +2688,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       asm volatile ("movdqu %[l],     %%xmm1\n\t"
                     "movdqu %[inbuf], %%xmm0\n\t"
                     "pxor   %%xmm1,   %%xmm5\n\t"
@@ -2537,7 +2700,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       do_aesni_dec (ctx);
 
       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
-                    "pxor   %%xmm0, %%xmm6\n\t"
                     "movdqu %%xmm0, %[outbuf]\n\t"
                     : [outbuf] "=m" (*outbuf)
                     :
@@ -2550,11 +2712,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
-      asm volatile ("movdqu %[l0], %%xmm7\n\t"
+      asm volatile ("movdqu %[l0], %%xmm6\n\t"
 		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0])
 		    : "memory" );
@@ -2564,70 +2726,78 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 	  n += 4;
 	  l = aes_ocb_get_l(c, n);
 
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-
-	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqa %%xmm5,    %%xmm12\n\t"
+	  asm volatile ("movdqu %[l0l1],   %%xmm10\n\t"
+			"movdqu %[l0l1l0], %%xmm11\n\t"
+			"movdqu %[l3],     %%xmm15\n\t"
 			:
-			: [l1] "m" (*c->u_mode.ocb.L[1]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: [l0l1] "m" (*c->u_mode.ocb.L0L1),
+			  [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0),
+			  [l3] "m" (*l)
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqa %%xmm5,    %%xmm13\n\t"
+
+	  n += 4;
+	  l = aes_ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+			"movdqu %[inbuf1], %%xmm2\n\t"
+			"movdqu %[inbuf2], %%xmm3\n\t"
 			:
-			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
+			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
+			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqa %%xmm5,    %%xmm14\n\t"
+	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+			"movdqu %[inbuf4], %%xmm8\n\t"
+			"movdqu %[inbuf5], %%xmm9\n\t"
 			:
-			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
+			  [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
+			  [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
+	  asm volatile ("movdqa %%xmm6,    %%xmm12\n\t"
+			"pxor   %%xmm5,    %%xmm12\n\t"
+			"pxor   %%xmm12,   %%xmm1\n\t"
+
+			"movdqa %%xmm10,   %%xmm13\n\t"
+			"pxor   %%xmm5,    %%xmm13\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+
+			"movdqa %%xmm11,   %%xmm14\n\t"
+			"pxor   %%xmm5,    %%xmm14\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+
+			"pxor   %%xmm11,   %%xmm5\n\t"
+			"pxor   %%xmm15,   %%xmm5\n\t"
 			"pxor   %%xmm5,    %%xmm4\n\t"
 			"movdqa %%xmm5,    %%xmm15\n\t"
-			:
-			: [l3] "m" (*l),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  n += 4;
-	  l = aes_ocb_get_l(c, n);
 
-	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm8\n\t"
-			"movdqu %%xmm5,    %[outbuf4]\n\t"
-			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
-			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm9\n\t"
-			"movdqu %%xmm5,    %[outbuf5]\n\t"
-			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
-			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			"movdqa %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"movdqu %%xmm0,    %[outbuf4]\n\t"
+
+			"movdqa %%xmm10,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm9\n\t"
+			"movdqu %%xmm0,    %[outbuf5]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			:
 			: "memory" );
 	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm10\n\t"
-			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm10\n\t"
+			"movdqu %%xmm0,    %[outbuf6]\n\t"
 			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
 			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
 			: "memory" );
 	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
-			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm11,   %%xmm5\n\t"
 			"pxor   %%xmm0,    %%xmm5\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
 			"pxor   %%xmm5,    %%xmm11\n\t"
 			:
 			: [l7] "m" (*l),
@@ -2655,14 +2825,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 			"movdqu %%xmm9,    %[outbuf5]\n\t"
 			"movdqu %%xmm10,   %[outbuf6]\n\t"
 			"movdqu %%xmm11,   %[outbuf7]\n\t"
-			"pxor   %%xmm2,    %%xmm1\n\t"
-			"pxor   %%xmm4,    %%xmm1\n\t"
-			"pxor   %%xmm9,    %%xmm1\n\t"
-			"pxor   %%xmm11,   %%xmm1\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm8,    %%xmm6\n\t"
-			"pxor   %%xmm10,   %%xmm6\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
 			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
 			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
 			  [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
@@ -2678,7 +2840,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2688,40 +2850,46 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       l = aes_ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
+      /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i)  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm5,    %%xmm1\n\t"
-		    "movdqu %%xmm5,    %[outbuf0]\n\t"
-		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    "movdqu %[l0l1],   %%xmm3\n\t"
+		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
 		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-		    "movdqu %[inbuf1], %%xmm2\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
-		    "pxor   %%xmm5,    %%xmm2\n\t"
-		    "movdqu %%xmm5,    %[outbuf1]\n\t"
-		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-		    : [l1] "m" (*c->u_mode.ocb.L[1]),
-		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+      asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t"
+		    "movdqu %[l3],     %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm0,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0),
+		      [l3] "m" (*l)
 		    : "memory" );
-      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
-		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    "pxor   %%xmm3,    %%xmm2\n\t"
+		    "movdqu %%xmm3,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm0,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    :
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+      asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
-		    : [l3] "m" (*l),
-		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
 		    : "memory" );
 
       do_aesni_dec_vec4 (ctx);
@@ -2737,10 +2905,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 		    "movdqu %%xmm3,    %[outbuf2]\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    "movdqu %%xmm4,    %[outbuf3]\n\t"
-		    "pxor   %%xmm1,    %%xmm6\n\t"
-		    "pxor   %%xmm2,    %%xmm6\n\t"
-		    "pxor   %%xmm3,    %%xmm6\n\t"
-		    "pxor   %%xmm4,    %%xmm6\n\t"
 		    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
 		      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
 		      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
@@ -2771,7 +2935,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       do_aesni_dec (ctx);
 
       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
-                    "pxor   %%xmm0, %%xmm6\n\t"
                     "movdqu %%xmm0, %[outbuf]\n\t"
                     : [outbuf] "=m" (*outbuf)
                     :
@@ -2783,14 +2946,16 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = n;
   asm volatile ("movdqu %%xmm5, %[iv]\n\t"
-                "movdqu %%xmm6, %[ctr]\n\t"
-                : [iv] "=m" (*c->u_iv.iv),
-                  [ctr] "=m" (*c->u_ctr.ctr)
+                : [iv] "=m" (*c->u_iv.iv)
                 :
                 : "memory" );
 
+  aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
+
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
+
+  return 0;
 }
 
 
@@ -2799,11 +2964,9 @@ _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
                           const void *inbuf_arg, size_t nblocks, int encrypt)
 {
   if (encrypt)
-    aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+    return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
   else
-    aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
-
-  return 0;
+    return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
 }
 
 
@@ -2815,10 +2978,10 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   const unsigned char *abuf = abuf_arg;
   u64 n = c->u_mode.ocb.aad_nblocks;
   const unsigned char *l;
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   /* Preload Offset and Sum */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
@@ -2856,15 +3019,17 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
-      asm volatile ("movdqu %[l0], %%xmm7\n\t"
-		    "movdqu %[l1], %%xmm12\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm7\n\t"
+		    "movdqu %[l0l1],   %%xmm12\n\t"
+		    "movdqu %[l0l1l0], %%xmm13\n\t"
 		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
-		      [l1] "m" (*c->u_mode.ocb.L[1])
+		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
+		      [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0)
 		    : "memory" );
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
@@ -2872,63 +3037,66 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 	  n += 4;
 	  l = aes_ocb_get_l(c, n);
 
+	  asm volatile ("movdqu %[l3],   %%xmm0\n\t"
+			"pxor   %%xmm13, %%xmm0\n\t"
+			:
+			: [l3] "m" (*l)
+			: "memory" );
+
+	  n += 4;
+	  l = aes_ocb_get_l(c, n);
+
+	  asm volatile ("movdqu %[l7],   %%xmm14\n\t"
+			"pxor   %%xmm13, %%xmm14\n\t"
+			:
+			: [l7] "m" (*l)
+			: "memory" );
+
 	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
 	  asm volatile ("movdqu %[abuf0],  %%xmm1\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqu %[abuf1],  %%xmm2\n\t"
+			"movdqu %[abuf2],  %%xmm3\n\t"
+			"movdqu %[abuf3],  %%xmm4\n\t"
+			"movdqu %[abuf4],  %%xmm8\n\t"
+			"movdqu %[abuf5],  %%xmm9\n\t"
+			"movdqu %[abuf6],  %%xmm10\n\t"
+			"movdqu %[abuf7],  %%xmm11\n\t"
 			:
-			: [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+			: [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)),
+			  [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)),
+			  [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)),
+			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)),
+			  [abuf4] "m" (*(abuf + 4 * BLOCKSIZE)),
+			  [abuf5] "m" (*(abuf + 5 * BLOCKSIZE)),
+			  [abuf6] "m" (*(abuf + 6 * BLOCKSIZE)),
+			  [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[abuf1],  %%xmm2\n\t"
-			"pxor   %%xmm12,   %%xmm5\n\t"
+	  asm volatile ("pxor   %%xmm7,    %%xmm1\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+
+			"pxor   %%xmm12,   %%xmm2\n\t"
 			"pxor   %%xmm5,    %%xmm2\n\t"
-			:
-			: [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
+
+			"pxor   %%xmm13,   %%xmm3\n\t"
 			"pxor   %%xmm5,    %%xmm3\n\t"
-			:
-			: [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[abuf3],  %%xmm4\n\t"
+
 			"pxor   %%xmm0,    %%xmm5\n\t"
 			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l),
-			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
-			: "memory" );
 
-	  n += 4;
-	  l = aes_ocb_get_l(c, n);
-
-	  asm volatile ("movdqu %[abuf4],  %%xmm8\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
+			"pxor   %%xmm7,    %%xmm8\n\t"
 			"pxor   %%xmm5,    %%xmm8\n\t"
-			:
-			: [abuf4] "m" (*(abuf + 4 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[abuf5],  %%xmm9\n\t"
-			"pxor   %%xmm12,   %%xmm5\n\t"
+
+			"pxor   %%xmm12,   %%xmm9\n\t"
 			"pxor   %%xmm5,    %%xmm9\n\t"
-			:
-			: [abuf5] "m" (*(abuf + 5 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[abuf6],  %%xmm10\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
+
+			"pxor   %%xmm13,   %%xmm10\n\t"
 			"pxor   %%xmm5,    %%xmm10\n\t"
-			:
-			: [abuf6] "m" (*(abuf + 6 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
-			"movdqu %[abuf7],  %%xmm11\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
+
+			"pxor   %%xmm14,   %%xmm5\n\t"
 			"pxor   %%xmm5,    %%xmm11\n\t"
 			:
-			: [l7] "m" (*l),
-			  [abuf7] "m" (*(abuf + 7 * BLOCKSIZE))
+			:
 			: "memory" );
 
 	  do_aesni_enc_vec8 (ctx);
@@ -2948,7 +3116,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 	  abuf += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2959,36 +3127,41 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
 		    "movdqu %[abuf0],  %%xmm1\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %[l0l1],   %%xmm3\n\t"
 		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l0l1] "m" (*c->u_mode.ocb.L0L1),
 		      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-		    "movdqu %[abuf1],  %%xmm2\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
-		    "pxor   %%xmm5,    %%xmm2\n\t"
+      asm volatile ("movdqu %[l0l1l0], %%xmm4\n\t"
+		    "movdqu %[l3],     %%xmm7\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
 		    :
-		    : [l1] "m" (*c->u_mode.ocb.L[1]),
-		      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+		    : [l0l1l0] "m" (*c->u_mode.ocb.L0L1L0),
+		      [l3] "m" (*l)
 		    : "memory" );
-      asm volatile ("movdqu %[abuf2],  %%xmm3\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
+      asm volatile ("movdqu %[abuf1],  %%xmm2\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "pxor   %%xmm3,    %%xmm2\n\t"
 		    :
-		    : [l2] "m" (*c->u_mode.ocb.L[0]),
-		      [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+		    : [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[abuf2],  %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    :
+		    : [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("pxor   %%xmm7,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[abuf3],  %%xmm4\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
-		    : [l3] "m" (*l),
-		      [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+		    : [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
 		    : "memory" );
 
       do_aesni_enc_vec4 (ctx);
@@ -3038,7 +3211,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 
   return 0;
 }
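
For reference, the recurrences quoted in the comments above are
Offset_i = Offset_{i-1} xor L_{ntz(i)} and
Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i).  The point of the
L values preloaded into xmm7/xmm12/xmm13 for the eight-block loop is
that, within a group of four (or eight) blocks, each offset equals the
group's base offset xor a cumulative XOR of L values, so the per-block
XORs no longer depend on one another.  A minimal C sketch of that idea
follows; xor_into, next_offset, group_offsets, the two-dimensional L
table and __builtin_ctzll (standing in for ntz) are illustrative
stand-ins, not libgcrypt's internal API:

    #include <stdint.h>
    #include <string.h>

    #define BLK 16

    static void xor_into (uint8_t dst[BLK], const uint8_t src[BLK])
    {
      int i;
      for (i = 0; i < BLK; i++)
        dst[i] ^= src[i];
    }

    /* Serial form: Offset_i = Offset_{i-1} xor L[ntz(i)], one dependent
       step per block (i is the 1-based block number). */
    static void next_offset (uint8_t offset[BLK], const uint8_t L[][BLK],
                             uint64_t i)
    {
      xor_into (offset, L[__builtin_ctzll (i)]);
    }

    /* Grouped form: for blocks n+1..n+4 (n % 4 == 0), each offset is the
       group's base offset xor a cumulative delta, so the four block XORs
       can be computed independently of one another. */
    static void group_offsets (const uint8_t base[BLK],
                               const uint8_t L[][BLK],
                               uint64_t n, uint8_t out[4][BLK])
    {
      uint8_t delta[BLK] = { 0 };
      int j;

      for (j = 0; j < 4; j++)
        {
          xor_into (delta, L[__builtin_ctzll (n + 1 + j)]);
          memcpy (out[j], base, BLK);
          xor_into (out[j], delta);
        }
    }
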
@@ -3053,10 +3226,10 @@ _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
 			 unsigned char *outbuf, const unsigned char *inbuf,
 			 size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   /* Preload Tweak */
   asm volatile ("movdqu %[tweak], %%xmm5\n\t"
@@ -3182,7 +3355,7 @@ _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
 		: "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -3191,10 +3364,10 @@ _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
 			 unsigned char *outbuf, const unsigned char *inbuf,
 			 size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   if ( !ctx->decryption_prepared )
     {
@@ -3326,7 +3499,7 @@ _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 160fb8c..876d55f 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -143,6 +143,8 @@ typedef struct RIJNDAEL_context_s
 #endif /*USE_PADLOCK*/
 #ifdef USE_AESNI
   unsigned int use_aesni:1;           /* AES-NI shall be used.  */
+  unsigned int use_avx:1;             /* AVX shall be used. */
+  unsigned int use_avx2:1;            /* AVX2 shall be used. */
 #endif /*USE_AESNI*/
 #ifdef USE_SSSE3
   unsigned int use_ssse3:1;           /* SSSE3 shall be used.  */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 1bc8b0f..8094537 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -334,6 +334,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       ctx->prefetch_enc_fn = NULL;
       ctx->prefetch_dec_fn = NULL;
       ctx->use_aesni = 1;
+      ctx->use_avx = !!(hwfeatures & HWF_INTEL_AVX);
+      ctx->use_avx2 = !!(hwfeatures & HWF_INTEL_AVX2);
       if (hd)
         {
           hd->bulk.cfb_enc = _gcry_aes_aesni_cfb_enc;
diff --git a/tests/basic.c b/tests/basic.c
index f3d8951..0afae30 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -4411,11 +4411,114 @@ do_check_ocb_cipher (int inplace)
       "1792A4E31E0755FB03E31B22116E6C2DDF9EFD6E33D536F1"
       "A0124B0A55BAE884ED93481529C76B6AD0C515F4D1CDD4FD"
       "AC4F02AA"
+    },
+    { GCRY_CIPHER_AES, 12, "0F0E0D0C0B0A09080706050403020100",
+      "BBAA9988776655443322110D",
+      "000102030405060708090A0B0C0D0E0F1011121314151617"
+      "18191A1B1C1D1E1F2021222324252627",
+      /* test vector for checksumming */
+      "01000000000000000000000000000000"
+      "02000000000000000000000000000000"
+      "04000000000000000000000000000000"
+      "08000000000000000000000000000000"
+      "10000000000000000000000000000000"
+      "20000000000000000000000000000000"
+      "40000000000000000000000000000000"
+      "80000000000000000000000000000000"
+      "00010000000000000000000000000000"
+      "00020000000000000000000000000000"
+      "00040000000000000000000000000000"
+      "00080000000000000000000000000000"
+      "00100000000000000000000000000000"
+      "00200000000000000000000000000000"
+      "00400000000000000000000000000000"
+      "00800000000000000000000000000000"
+      "00000100000000000000000000000000"
+      "00000200000000000000000000000000"
+      "00000400000000000000000000000000"
+      "00000800000000000000000000000000"
+      "00001000000000000000000000000000"
+      "00002000000000000000000000000000"
+      "00004000000000000000000000000000"
+      "00008000000000000000000000000000"
+      "00000001000000000000000000000000"
+      "00000002000000000000000000000000"
+      "00000004000000000000000000000000"
+      "00000008000000000000000000000000"
+      "00000010000000000000000000000000"
+      "00000020000000000000000000000000"
+      "00000040000000000000000000000000"
+      "00000080000000000000000000000000"
+      "00000000010000000000000000000000"
+      "00000000020000000000000000000000"
+      "00000000040000000000000000000000"
+      "00000000080000000000000000000000"
+      "00000000100000000000000000000000"
+      "00000000200000000000000000000000"
+      "00000000400000000000000000000000"
+      "00000000800000000000000000000000"
+      "00000000000100000000000000000000"
+      "00000000000200000000000000000000"
+      "00000000000400000000000000000000"
+      "00000000000800000000000000000000"
+      "00000000001000000000000000000000"
+      "00000000002000000000000000000000"
+      "00000000004000000000000000000000"
+      "00000000008000000000000000000000",
+      "01105c6e36f6ac480f022c51e31ed702"
+      "90fda4b7b783194d4b4be8e4e1e2dff4"
+      "6a0804d1c5f9f808ea7933e31c063233"
+      "2bf65a22b20bb13cde3b80b3682ba965"
+      "b1207c58916f7856fa9968b410e50dee"
+      "98b35c071163d1b352b9bbccd09fde29"
+      "b850f40e71a8ae7d2e2d577f5ee39c46"
+      "7fa28130b50a123c29958e4665dda9a5"
+      "e0793997f8f19633a96392141d6e0e88"
+      "77850ed4364065d1d2f8746e2f1d5fd1"
+      "996cdde03215306503a30e41f58ef3c4"
+      "400365cfea4fa6381157c12a46598edf"
+      "18604854462ec66e3d3cf26d4723cb6a"
+      "9d801095048086a606fdb9192760889b"
+      "a8ce2e70e1b55a469137a9e2e6734565"
+      "283cb1e2c74f37e0854d03e33f8ba499"
+      "ef5d9af4edfce077c6280338f0a64286"
+      "2e6bc27ebd5a4c91b3778e22631251c8"
+      "c5bb75a10945597a9d6c274fc82d3338"
+      "b403a0a549d1375f26e71ef22bce0941"
+      "93ea87e2ed72fce0546148c351eec3be"
+      "867bb1b96070c377fff3c98e21562beb"
+      "475cfe28abcaaedf49981f6599b15140"
+      "ea6130d24407079f18ba9d4a8960b082"
+      "b39c57320e2e064f02fde88c23112146"
+      "1cac3655868aef584714826ee4f361fb"
+      "e6d692e1589cbb9dd3c74fa628df2a1f"
+      "3b0029b1d62b7e9978013ed3c793c1dd"
+      "1f184c8f7022a853cac40b74ac749aa3"
+      "f33f0d14732dfda0f2c3c20591bf1f5a"
+      "710ec0d0bca342baa5146068a78ff58c"
+      "66316312b7a98af35a0f4e92799b4047"
+      "f047ae61f25c28d232ce5c168cc745d6"
+      "6da13cb0f9e38a696635dba7a21571cf"
+      "cd64ec8cc33db7879f59a90d9edd00f6"
+      "a899e39ab36b9269a3ac04ebad9326bf"
+      "53cd9b400168a61714cd628a4056d236"
+      "bd8622c76daa54cb65f5db2fe03bafbe"
+      "0b23549ae31136f607293e8093a21934"
+      "74fd5e9c2451b4c8e0499e6ad34fafc8"
+      "ab77722a282f7f84b14ddebf7e696300"
+      "c1ef92d4a0263c6cca104530f996e272"
+      "f58992ff68d642b071a5848dc4acf2ae"
+      "28fb1f27ae0f297d5136a7a0a4a03e89"
+      "b588755b8217a1c62773790e69261269"
+      "19f45daf7b3ccf18e3fc590a9a0e172f"
+      "033ac4d13c3decc4c62d7de718ace802"
+      "140452dc850989f6762e3578bbb04be3"
+      "1a237c599c4649f4e586b2de"
     }
   };
   gpg_error_t err = 0;
   gcry_cipher_hd_t hde, hdd;
-  unsigned char out[MAX_DATA_LEN];
+  unsigned char out[1024];
   unsigned char tag[16];
   int tidx;
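
The "test vector for checksumming" plaintext above sets exactly one bit
per 16-byte block (block j has bit j%8 of byte j/8 set), presumably so
that every block contributes a distinct bit to the OCB checksum and a
block that the bulk path drops or mis-accumulates shows up as a tag
mismatch.  A small standalone generator (not part of the test suite,
just a sketch) that reproduces the 48 plaintext blocks:

    #include <stdio.h>

    int main (void)
    {
      unsigned char block[16];
      int j, k;

      for (j = 0; j < 48; j++)
        {
          for (k = 0; k < 16; k++)
            block[k] = 0;
          block[j / 8] = 1 << (j % 8);   /* one distinct bit per block */

          for (k = 0; k < 16; k++)
            printf ("%02X", block[k]);
          printf ("\n");
        }
      return 0;
    }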
 
@@ -4548,7 +4651,7 @@ do_check_ocb_cipher (int inplace)
             }
           else
             {
-              err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN,
+              err = gcry_cipher_encrypt (hde, out, sizeof(out),
                                          plain, plainlen);
             }
         }
@@ -4605,7 +4708,7 @@ do_check_ocb_cipher (int inplace)
             }
           else
             {
-              unsigned char tmp[MAX_DATA_LEN];
+              unsigned char tmp[sizeof(out)];
 
               memcpy(tmp, out, plainlen);
               err = gcry_cipher_decrypt (hdd, out, plainlen, tmp, plainlen);
@@ -4696,7 +4799,7 @@ check_ocb_cipher_largebuf_split (int algo, int keylen, const char *tagexpect,
     }
 
   for (i = 0; i < buflen; i++)
-    inbuf[i] = 'a';
+    inbuf[i] = (i + 181081) * 5039;
 
   err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_OCB, 0);
   if (!err)
@@ -4855,6 +4958,131 @@ out_free:
 
 
 static void
+check_ocb_cipher_checksum (int algo, int keylen)
+{
+  static const unsigned char key[32] =
+	"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
+	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
+  static const unsigned char nonce[12] =
+	"\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F\x00\x01\x02\x03";
+  const size_t buflen = 128 * 16;
+  unsigned char *inbuf, *outbuf;
+  gpg_error_t err = 0;
+  gcry_cipher_hd_t hde, hde2;
+  unsigned char tag[16];
+  unsigned char tag2[16];
+  int i;
+
+  inbuf = xmalloc(buflen);
+  if (!inbuf)
+    {
+      fail ("out-of-memory\n");
+      return;
+    }
+  outbuf = xmalloc(buflen);
+  if (!outbuf)
+  /* note: the allocation just made is outbuf, so check outbuf here */
+    {
+      fail ("out-of-memory\n");
+      xfree(inbuf);
+      return;
+    }
+
+  memset(inbuf, 0, buflen);
+  for (i = 0; i < 128; i += 16)
+    {
+      unsigned char *blk = inbuf + i;
+      int bit2set = i / 16;
+      int byteidx = bit2set / 8;
+      int bitpos = bit2set % 8;
+
+      blk[byteidx] |= 1 << bitpos;
+    }
+
+  err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_OCB, 0);
+  if (!err)
+    err = gcry_cipher_open (&hde2, algo, GCRY_CIPHER_MODE_OCB, 0);
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_open failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      goto out_free;
+    }
+
+  err = gcry_cipher_setkey (hde, key, keylen);
+  if (!err)
+    err = gcry_cipher_setkey (hde2, key, keylen);
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_setkey failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      gcry_cipher_close (hde);
+      gcry_cipher_close (hde2);
+      goto out_free;
+    }
+
+  err = gcry_cipher_setiv (hde, nonce, 12);
+  if (!err)
+    err = gcry_cipher_setiv (hde2, nonce, 12);
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_setiv failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      gcry_cipher_close (hde);
+      gcry_cipher_close (hde2);
+      goto out_free;
+    }
+
+  err = gcry_cipher_final (hde);
+  if (!err)
+    {
+      err = gcry_cipher_encrypt (hde, outbuf, buflen, inbuf, buflen);
+    }
+  for (i = 0; i < buflen && !err; i += 16)
+    {
+      if (i + 16 == buflen)
+	err = gcry_cipher_final (hde2);
+      if (!err)
+	err = gcry_cipher_encrypt (hde2, outbuf + i, 16, inbuf + i, 16);
+    }
+
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_encrypt failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      gcry_cipher_close (hde);
+      gcry_cipher_close (hde2);
+      goto out_free;
+    }
+
+  /* Check that the tag matches. */
+  err = gcry_cipher_gettag (hde, tag, 16);
+  if (err)
+    {
+      fail ("cipher_ocb, gcry_cipher_gettag failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+    }
+  err = gcry_cipher_gettag (hde2, tag2, 16);
+  if (err)
+    {
+      fail ("cipher_ocb, gcry_cipher_gettag failed (checksum2, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+    }
+  if (memcmp (tag, tag2, 16))
+    {
+      mismatch (tag, 16, tag2, 16);
+      fail ("cipher-ocb, encrypt tag mismatch (checksum, algo %d)\n", algo);
+    }
+
+  gcry_cipher_close (hde);
+  gcry_cipher_close (hde2);
+
+out_free:
+  xfree(inbuf);
+  xfree(outbuf);
+}
+
+
+static void
 check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
 {
   unsigned int split;
@@ -4863,6 +5091,8 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
     {
       check_ocb_cipher_largebuf_split(algo, keylen, tagexpect, split);
     }
+
+  check_ocb_cipher_checksum(algo, keylen);
 }
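
A note on the expected-tag changes in the next hunk:
check_ocb_cipher_largebuf_split now fills its input with
(i + 181081) * 5039 instead of the constant 'a'.  The product is
reduced modulo 256 when stored into the unsigned char buffer, giving a
varied but deterministic byte pattern, hence the regenerated expected
tags below.  A tiny sketch (hypothetical, only to show the resulting
byte values):

    #include <stdio.h>

    int main (void)
    {
      int i;

      /* The int product is reduced mod 256 on assignment to the
         unsigned char input buffer. */
      for (i = 0; i < 8; i++)
        printf ("inbuf[%d] = 0x%02x\n", i,
                (unsigned) (unsigned char) ((i + 181081) * 5039));
      return 0;
    }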
 
 
@@ -5108,35 +5338,25 @@ check_ocb_cipher (void)
 
   /* Check large buffer encryption/decryption. */
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES, 16,
-			    "\xf5\xf3\x12\x7d\x58\x2d\x96\xe8"
-			    "\x33\xfd\x7a\x4f\x42\x60\x5d\x20");
+    "\xc1\x5b\xf1\x80\xa4\xd5\xea\xfd\xae\x17\xa6\xcd\x6b\x10\xa8\xea");
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES256, 32,
-			    "\xfa\x26\xa5\xbf\xf6\x7d\x3a\x8d"
-			    "\xfe\x96\x67\xc9\xc8\x41\x03\x51");
+    "\x2b\xb7\x25\x6b\x77\xc7\xfb\x21\x5c\xc9\x6c\x36\x17\x1a\x1a\xd5");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA128, 16,
-			    "\x28\x23\x38\x45\x2b\xfd\x42\x45"
-			    "\x43\x64\x7e\x67\x7f\xf4\x8b\xcd");
+    "\xe0\xae\x3f\x29\x3a\xee\xd8\xe3\xf2\x20\xc1\xa2\xd8\x72\x12\xd9");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA192, 24,
-			    "\xee\xca\xe5\x39\x27\x2d\x33\xe7"
-			    "\x79\x74\xb0\x1d\x37\x12\xd5\x6c");
+    "\xd7\x98\x71\xcf\x19\x5c\xa3\x3d\x6c\xfc\xc9\xbe\x9f\x13\x6b\xbd");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA256, 32,
-			    "\x39\x39\xd0\x2d\x05\x68\x74\xee"
-			    "\x18\x6b\xea\x3d\x0b\xd3\x58\xae");
+    "\x03\xf6\xec\x1a\x0e\xae\x66\x24\x2b\xba\x26\x0f\xb3\xb3\x1f\xb9");
   check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 16,
-			    "\x63\xe3\x0e\xb9\x11\x6f\x14\xba"
-			    "\x79\xe4\xa7\x9e\xad\x3c\x02\x0c");
+    "\x1c\xf9\xc7\xfc\x3a\x32\xac\xc7\x5e\x0a\xc2\x5c\x90\xd6\xf6\xf9");
   check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 32,
-			    "\xf6\xd4\xfe\x4e\x50\x85\x13\x59"
-			    "\x69\x0e\x4c\x67\x3e\xdd\x47\x90");
+    "\x53\x02\xc8\x0d\x4e\x9a\x44\x9e\x43\xd4\xaa\x06\x30\x93\xcc\x16");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT128, 16,
-			    "\x3c\xfb\x66\x14\x3c\xc8\x6c\x67"
-			    "\x26\xb8\x23\xeb\xaf\x43\x98\x69");
+    "\xd3\x64\xac\x40\x48\x88\x77\xe2\x41\x26\x4c\xde\x21\x29\x21\x8d");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT192, 24,
-			    "\x5e\x62\x27\xc5\x32\xc3\x1d\xe6"
-			    "\x2e\x65\xe7\xd6\xfb\x05\xd7\xb2");
+    "\x99\xeb\x35\xb0\x62\x4e\x7b\xf1\x5e\x9f\xed\x32\x78\x90\x0b\xd0");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT256, 32,
-			    "\xe7\x8b\xe6\xd4\x2f\x7a\x36\x4c"
-			    "\xba\xee\x20\xe2\x68\xf4\xcb\xcc");
+    "\x71\x66\x2f\x68\xbf\xdd\xcc\xb1\xbf\x81\x56\x5f\x01\x73\xeb\x44");
 
   /* Check that the AAD data is correctly buffered.  */
   check_ocb_cipher_splitaad ();

-----------------------------------------------------------------------

Summary of changes:
 cipher/cipher-gcm-intel-pclmul.c |   7 +
 cipher/cipher-internal.h         |   4 +-
 cipher/cipher-ocb.c              |  11 +
 cipher/crc-intel-pclmul.c        |   7 +
 cipher/rijndael-aesni.c          | 878 +++++++++++++++++++++++----------------
 cipher/rijndael-internal.h       |   2 +
 cipher/rijndael-ssse3-amd64.c    |   7 +
 cipher/rijndael.c                |   2 +
 cipher/sha1-intel-shaext.c       |   7 +
 cipher/sha256-intel-shaext.c     |   7 +
 tests/basic.c                    | 268 ++++++++++--
 11 files changed, 826 insertions(+), 374 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits



