[PATCH] Optimizations for AES-NI OCB

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Nov 11 14:41:09 CET 2018


* cipher/cipher-ocb.c (ocb_crypt): Process input in 24KiB chunks for
better cache locality during checksumming.
* cipher/rijndael-aesni.c (ALWAYS_INLINE): New macro for always
inlining functions, change all functions with 'inline' to use
ALWAYS_INLINE.
(NO_INLINE): New macro.
(aesni_prepare_2_6_variable, aesni_prepare_7_15_variable): Rename to...
(aesni_prepare_2_7_variable, aesni_prepare_8_15_variable): ...these and
adjust accordingly (xmm7 moved from *_7_15 to *_2_7).
(aesni_prepare_2_6, aesni_prepare_7_15): Rename to...
(aesni_prepare_2_7, aesni_prepare_8_15): ...these and adjust
accordingly.
(aesni_cleanup_2_6, aesni_cleanup_7_15): Rename to...
(aesni_cleanup_2_7, aesni_cleanup_8_15): ...these and adjust
accordingly.
(aesni_ocb_checksum): New.
(aesni_ocb_enc, aesni_ocb_dec): Calculate OCB offsets in parallel
with the help of the precalculated offsets L0+L1 and L0+L1+L0; do
checksum calculation as a separate pass instead of inline; use
NO_INLINE.
* cipher/rijndael-internal.h (RIJNDAEL_context_s) [USE_AESNI]: Add
'use_avx2'.
* cipher/rijndael.c (do_setkey) [USE_AESNI]: Set 'use_avx2' if
Intel AVX2 HW feature is available.
* tests/basic.c (do_check_ocb_cipher): New test vector; increase
size of temporary buffers for new test vector.
(check_ocb_cipher_largebuf_split): Make test plaintext non-uniform
for better checksum testing.
(check_ocb_cipher_checksum): New.
(check_ocb_cipher_largebuf): Call check_ocb_cipher_checksum.
(check_ocb_cipher): New expected tags for check_ocb_cipher_largebuf
test runs.
--

Benchmark on Haswell i7-4790K @ 4.0 GHz:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |     0.175 ns/B      5436 MiB/s     0.702 c/B
        OCB dec |     0.184 ns/B      5184 MiB/s     0.736 c/B

After (enc +2% faster, dec +7% faster):
        OCB enc |     0.172 ns/B      5557 MiB/s     0.686 c/B
        OCB dec |     0.171 ns/B      5572 MiB/s     0.685 c/B
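
The chunking in ocb_crypt() simply caps how many blocks each bulk call
processes, so that the separate checksum pass re-reads data that is
still resident in the L1 data cache.  A minimal sketch of that control
flow (hypothetical helper name, not the actual ocb_crypt() code):

  #include <stddef.h>

  #define OCB_BLOCK_LEN    16
  #define L1_CHUNK_BLOCKS  (24 * 1024 / OCB_BLOCK_LEN)

  static void
  crypt_in_chunks (size_t nblks_total)
  {
    while (nblks_total > 0)
      {
        size_t nblks = nblks_total;

        if (nblks > L1_CHUNK_BLOCKS)
          nblks = L1_CHUNK_BLOCKS;

        /* ... checksum + bulk encrypt (or bulk decrypt + checksum)
           these nblks blocks while they are still cached ... */

        nblks_total -= nblks;
      }
  }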

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed
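
The OCB checksum that the new aesni_ocb_checksum() accumulates with
SSE2/AVX2 is just the XOR of all plaintext blocks.  A portable scalar
equivalent, assuming 16-byte blocks (illustrative only, not code from
this patch):

  #include <stddef.h>

  #define BLOCKSIZE 16

  /* Checksum = P_1 xor P_2 xor ... xor P_n, computed as its own pass
     over the plaintext instead of interleaved with the cipher. */
  static void
  ocb_checksum_ref (unsigned char checksum[BLOCKSIZE],
                    const unsigned char *plaintext, size_t nblocks)
  {
    size_t i;

    for (i = 0; i < nblocks * BLOCKSIZE; i++)
      checksum[i % BLOCKSIZE] ^= plaintext[i];
  }

The vectorized version keeps several independent XOR accumulator
registers and folds them together at the end; the new test plaintext
below sets a different bit in every block so that a mix-up between
accumulator lanes would change the final tag.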

diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c
index f71520ad2..cb6afd2b5 100644
--- a/cipher/cipher-ocb.c
+++ b/cipher/cipher-ocb.c
@@ -519,6 +519,12 @@ ocb_crypt (gcry_cipher_hd_t c, int encrypt,
 
       nblks = nblks < nmaxblks ? nblks : nmaxblks;
 
+      /* Since checksum xoring is done before/after encryption/decryption,
+	process input in 24KiB chunks to keep data loaded in L1 cache for
+	checksumming. */
+      if (nblks > 24 * 1024 / OCB_BLOCK_LEN)
+	nblks = 24 * 1024 / OCB_BLOCK_LEN;
+
       /* Use a bulk method if available.  */
       if (nblks && c->bulk.ocb_crypt)
         {
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index d190c0ac4..081bf124c 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -41,6 +41,10 @@
 #endif
 
 
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+
+
 typedef struct u128_s
 {
   u32 a, b, c, d;
@@ -49,7 +53,7 @@ typedef struct u128_s
 
 /* Copy of ocb_get_l needed here as GCC is unable to inline ocb_get_l
    because of 'pragma target'. */
-static inline const unsigned char *
+static ALWAYS_INLINE const unsigned char *
 aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
 {
   unsigned long ntz;
@@ -71,78 +75,78 @@ aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
    the key or the data.  */
 #ifdef __WIN64__
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
-# define aesni_prepare_2_6_variable char win64tmp[16]
-# define aesni_prepare_7_15_variable char win64tmp7_15[16 * 9]
+# define aesni_prepare_2_7_variable char win64tmp[16 * 2]
+# define aesni_prepare_8_15_variable char win64tmp8_15[16 * 8]
 # define aesni_prepare() do { } while (0)
-# define aesni_prepare_2_6()                                            \
+# define aesni_prepare_2_7()                                            \
    do { asm volatile ("movdqu %%xmm6, %0\n\t"                           \
-                      : "=m" (*win64tmp)                                \
+		      "movdqu %%xmm7, %1\n\t"                           \
+                      : "=m" (*win64tmp), "=m" (*(win64tmp+16))         \
                       :                                                 \
                       : "memory");                                      \
    } while (0)
-# define aesni_prepare_7_15()                                           \
-   do { asm volatile ("movdqu %%xmm7,  0*16(%0)\n\t"                    \
-                      "movdqu %%xmm8,  1*16(%0)\n\t"                    \
-                      "movdqu %%xmm9,  2*16(%0)\n\t"                    \
-                      "movdqu %%xmm10, 3*16(%0)\n\t"                    \
-                      "movdqu %%xmm11, 4*16(%0)\n\t"                    \
-                      "movdqu %%xmm12, 5*16(%0)\n\t"                    \
-                      "movdqu %%xmm13, 6*16(%0)\n\t"                    \
-                      "movdqu %%xmm14, 7*16(%0)\n\t"                    \
-                      "movdqu %%xmm15, 8*16(%0)\n\t"                    \
+# define aesni_prepare_8_15()                                           \
+   do { asm volatile ("movdqu %%xmm8,  0*16(%0)\n\t"                    \
+                      "movdqu %%xmm9,  1*16(%0)\n\t"                    \
+                      "movdqu %%xmm10, 2*16(%0)\n\t"                    \
+                      "movdqu %%xmm11, 3*16(%0)\n\t"                    \
+                      "movdqu %%xmm12, 4*16(%0)\n\t"                    \
+                      "movdqu %%xmm13, 5*16(%0)\n\t"                    \
+                      "movdqu %%xmm14, 6*16(%0)\n\t"                    \
+                      "movdqu %%xmm15, 7*16(%0)\n\t"                    \
                       :                                                 \
-                      : "r" (win64tmp7_15)                              \
+                      : "r" (win64tmp8_15)                              \
                       : "memory");                                      \
    } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
    } while (0)
-# define aesni_cleanup_2_6()                                            \
+# define aesni_cleanup_2_7()                                            \
    do { asm volatile ("movdqu %0,   %%xmm6\n\t"                         \
+		      "movdqu %1,   %%xmm7\n\t"                         \
                       "pxor %%xmm2, %%xmm2\n"                           \
                       "pxor %%xmm3, %%xmm3\n"                           \
                       "pxor %%xmm4, %%xmm4\n"                           \
                       "pxor %%xmm5, %%xmm5\n"                           \
                       :                                                 \
-                      : "m" (*win64tmp)                                 \
+                      : "m" (*win64tmp), "m" (*(win64tmp+16))           \
                       : "memory");                                      \
    } while (0)
-# define aesni_cleanup_7_15()                                           \
-   do { asm volatile ("movdqu 0*16(%0), %%xmm7\n\t"                     \
-                      "movdqu 1*16(%0), %%xmm8\n\t"                     \
-                      "movdqu 2*16(%0), %%xmm9\n\t"                     \
-                      "movdqu 3*16(%0), %%xmm10\n\t"                    \
-                      "movdqu 4*16(%0), %%xmm11\n\t"                    \
-                      "movdqu 5*16(%0), %%xmm12\n\t"                    \
-                      "movdqu 6*16(%0), %%xmm13\n\t"                    \
-                      "movdqu 7*16(%0), %%xmm14\n\t"                    \
-                      "movdqu 8*16(%0), %%xmm15\n\t"                    \
+# define aesni_cleanup_8_15()                                           \
+   do { asm volatile ("movdqu 0*16(%0), %%xmm8\n\t"                     \
+                      "movdqu 1*16(%0), %%xmm9\n\t"                     \
+                      "movdqu 2*16(%0), %%xmm10\n\t"                    \
+                      "movdqu 3*16(%0), %%xmm11\n\t"                    \
+                      "movdqu 4*16(%0), %%xmm12\n\t"                    \
+                      "movdqu 5*16(%0), %%xmm13\n\t"                    \
+                      "movdqu 6*16(%0), %%xmm14\n\t"                    \
+                      "movdqu 7*16(%0), %%xmm15\n\t"                    \
                       :                                                 \
-                      : "r" (win64tmp7_15)                              \
+                      : "r" (win64tmp8_15)                              \
                       : "memory");                                      \
    } while (0)
 #else
-# define aesni_prepare_2_6_variable
+# define aesni_prepare_2_7_variable
 # define aesni_prepare() do { } while (0)
-# define aesni_prepare_2_6() do { } while (0)
+# define aesni_prepare_2_7() do { } while (0)
 # define aesni_cleanup()                                                \
    do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                         \
                       "pxor %%xmm1, %%xmm1\n" :: );                     \
    } while (0)
-# define aesni_cleanup_2_6()                                            \
-   do { asm volatile ("pxor %%xmm2, %%xmm2\n\t"                         \
+# define aesni_cleanup_2_7()                                            \
+   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
+                      "pxor %%xmm2, %%xmm2\n\t"                         \
                       "pxor %%xmm3, %%xmm3\n"                           \
                       "pxor %%xmm4, %%xmm4\n"                           \
                       "pxor %%xmm5, %%xmm5\n"                           \
                       "pxor %%xmm6, %%xmm6\n":: );                      \
    } while (0)
 # ifdef __x86_64__
-#  define aesni_prepare_7_15_variable
-#  define aesni_prepare_7_15() do { } while (0)
-#  define aesni_cleanup_7_15()                                          \
-   do { asm volatile ("pxor %%xmm7, %%xmm7\n\t"                         \
-                      "pxor %%xmm8, %%xmm8\n"                           \
+#  define aesni_prepare_8_15_variable
+#  define aesni_prepare_8_15() do { } while (0)
+#  define aesni_cleanup_8_15()                                          \
+   do { asm volatile ("pxor %%xmm8, %%xmm8\n"                           \
                       "pxor %%xmm9, %%xmm9\n"                           \
                       "pxor %%xmm10, %%xmm10\n"                         \
                       "pxor %%xmm11, %%xmm11\n"                         \
@@ -157,10 +161,10 @@ aes_ocb_get_l (gcry_cipher_hd_t c, u64 n)
 void
 _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   if (ctx->rounds < 12)
     {
@@ -383,12 +387,12 @@ _gcry_aes_aesni_do_setkey (RIJNDAEL_context *ctx, const byte *key)
     }
 
   aesni_cleanup();
-  aesni_cleanup_2_6();
+  aesni_cleanup_2_7();
 }
 
 
 /* Make a decryption key from an encryption key. */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_prepare_decryption (RIJNDAEL_context *ctx)
 {
   /* The AES-NI decrypt instructions use the Equivalent Inverse
@@ -447,7 +451,7 @@ _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
 
 /* Encrypt one block using the Intel AES-NI instructions.  Block is input
  * and output through SSE register xmm0. */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_enc (const RIJNDAEL_context *ctx)
 {
 #define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
@@ -500,7 +504,7 @@ do_aesni_enc (const RIJNDAEL_context *ctx)
 
 /* Decrypt one block using the Intel AES-NI instructions.  Block is input
  * and output through SSE register xmm0. */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_dec (const RIJNDAEL_context *ctx)
 {
 #define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
@@ -553,7 +557,7 @@ do_aesni_dec (const RIJNDAEL_context *ctx)
 
 /* Encrypt four blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
 {
 #define aesenc_xmm0_xmm1      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
@@ -662,7 +666,7 @@ do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
 
 /* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 {
 #define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
@@ -773,7 +777,7 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 
 /* Encrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
 {
   asm volatile ("movdqa (%[key]), %%xmm0\n\t"
@@ -925,7 +929,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
 
 /* Decrypt eight blocks using the Intel AES-NI instructions.  Blocks are input
  * and output through SSE registers xmm1 to xmm4 and xmm8 to xmm11.  */
-static inline void
+static ALWAYS_INLINE void
 do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
 {
   asm volatile ("movdqa (%[key]), %%xmm0\n\t"
@@ -1757,10 +1761,10 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
                          size_t nblocks, int cbc_mac)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
                 : /* No output */
@@ -1794,7 +1798,7 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -1805,10 +1809,10 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
                 "movdqa %[ctr], %%xmm5\n\t"  /* Preload CTR */
@@ -1820,9 +1824,9 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
@@ -1831,7 +1835,7 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -1848,7 +1852,7 @@ _gcry_aes_aesni_ctr_enc (RIJNDAEL_context *ctx, unsigned char *ctr,
       inbuf  += BLOCKSIZE;
     }
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -1876,10 +1880,10 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
                          size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   asm volatile ("movdqu %[iv], %%xmm6\n\t"
                 : /* No output */
@@ -1891,9 +1895,9 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8; nblocks -= 8)
 	{
@@ -1953,7 +1957,7 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2022,7 +2026,7 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -2031,10 +2035,10 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
                          unsigned char *outbuf, const unsigned char *inbuf,
                          size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6();
+  aesni_prepare_2_7();
 
   if ( !ctx->decryption_prepared )
     {
@@ -2051,9 +2055,9 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
@@ -2113,7 +2117,7 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2187,11 +2191,119 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
      : "memory");
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
-static void
+static ALWAYS_INLINE void
+aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
+		    size_t nblocks)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+
+  /* Calculate checksum */
+  asm volatile ("movdqu %[checksum], %%xmm6\n\t"
+                "pxor %%xmm1, %%xmm1\n\t"
+                "pxor %%xmm2, %%xmm2\n\t"
+                "pxor %%xmm3, %%xmm3\n\t"
+                :
+                :[checksum] "m" (*c->u_ctr.ctr)
+                : "memory" );
+
+#if defined(HAVE_GCC_INLINE_ASM_AVX2)
+  if (nblocks >= 16 && ctx->use_avx2)
+    {
+      asm volatile ("vzeroupper\n\t"
+		    "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
+		    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
+		    "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
+		    "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+                    :
+                    :
+                    : "memory");
+
+      for (;nblocks >= 16; nblocks -= 16)
+	{
+	  asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
+			"vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
+			"vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
+			"vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
+			"vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
+			"vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
+			"vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
+			"vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
+			:
+			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
+			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
+			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
+			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2)),
+			  [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
+			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
+			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
+			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
+			: "memory" );
+	  plaintext += BLOCKSIZE * 16;
+	}
+
+      asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
+		    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
+		    "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
+		    "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
+		    "vextracti128 $1, %%ymm6, %%xmm0\n\t"
+		    "vextracti128 $1, %%ymm1, %%xmm4\n\t"
+		    "vextracti128 $1, %%ymm2, %%xmm5\n\t"
+		    "vextracti128 $1, %%ymm3, %%xmm7\n\t"
+		    "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
+		    "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
+		    "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
+		    "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
+		    "vzeroupper\n\t"
+		    :
+		    :
+		    : "memory" );
+    }
+#endif
+
+  for (;nblocks >= 4; nblocks -= 4)
+    {
+      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
+		    "movdqu %[ptr1], %%xmm4\n\t"
+		    "movdqu %[ptr2], %%xmm5\n\t"
+		    "movdqu %[ptr3], %%xmm7\n\t"
+		    "pxor %%xmm0, %%xmm6\n\t"
+		    "pxor %%xmm4, %%xmm1\n\t"
+		    "pxor %%xmm5, %%xmm2\n\t"
+		    "pxor %%xmm7, %%xmm3\n\t"
+		    :
+		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
+		      [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
+		      [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
+		      [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
+		    : "memory" );
+      plaintext += BLOCKSIZE * 4;
+    }
+
+  for (;nblocks >= 1; nblocks -= 1)
+    {
+      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
+		    "pxor %%xmm0, %%xmm6\n\t"
+		    :
+		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
+		    : "memory" );
+      plaintext += BLOCKSIZE;
+    }
+
+  asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+		"pxor %%xmm2, %%xmm6\n\t"
+		"pxor %%xmm3, %%xmm6\n\t"
+		"movdqu %%xmm6, %[checksum]\n\t"
+		: [checksum] "=m" (*c->u_ctr.ctr)
+		:
+		: "memory" );
+}
+
+
+static unsigned int NO_INLINE
 aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                const void *inbuf_arg, size_t nblocks)
 {
@@ -2200,31 +2312,35 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
   const unsigned char *l;
-  aesni_prepare_2_6_variable;
+  byte tempbuf[16 * 2 + 15];
+  byte *l0l1;
+  byte *l0l1l0;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
+
+  aesni_ocb_checksum (c, inbuf_arg, nblocks);
 
-  /* Preload Offset and Checksum */
+  asm volatile ("" : "=r" (l0l1) : "0" (tempbuf) : "memory");
+  l0l1 = l0l1 + (-(uintptr_t)l0l1 & 15);
+  l0l1l0 = l0l1 + 16;
+
+  /* Preload Offset */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
-                "movdqu %[ctr], %%xmm6\n\t"
                 : /* No output */
-                : [iv] "m" (*c->u_iv.iv),
-                  [ctr] "m" (*c->u_ctr.ctr)
+                : [iv] "m" (*c->u_iv.iv)
                 : "memory" );
 
-
   for ( ;nblocks && n % 4; nblocks-- )
     {
       l = aes_ocb_get_l(c, ++n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
       asm volatile ("movdqu %[l],     %%xmm1\n\t"
                     "movdqu %[inbuf], %%xmm0\n\t"
                     "pxor   %%xmm1,   %%xmm5\n\t"
-                    "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
                     : [l] "m" (*l),
@@ -2243,95 +2359,103 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
+  asm volatile ("movdqu %[l0], %%xmm6\n\t"
+		"movdqu %[l1], %%xmm0\n\t"
+		"pxor %%xmm6, %%xmm0\n\t"
+		"movdqa %%xmm0, %[l0l1]\n\t"
+		"pxor %%xmm6, %%xmm0\n\t"
+		"movdqa %%xmm0, %[l0l1l0]\n\t"
+		: [l0l1] "=m" (*l0l1),
+		  [l0l1l0] "=m" (*l0l1l0)
+		: [l0] "m" (*c->u_mode.ocb.L[0]),
+		  [l1] "m" (*c->u_mode.ocb.L[1])
+		: "memory" );
+
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
-
-      aesni_prepare_7_15();
+      aesni_prepare_8_15_variable;
 
-      asm volatile ("movdqu %[l0], %%xmm7\n\t"
-		    :
-		    : [l0] "m" (*c->u_mode.ocb.L[0])
-		    : "memory" );
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
 	  n += 4;
 	  l = aes_ocb_get_l(c, n);
 
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-
-	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqa %%xmm5,    %%xmm12\n\t"
+	  asm volatile ("movdqa %[l0l1],   %%xmm10\n\t"
+			"movdqa %[l0l1l0], %%xmm11\n\t"
+			"movdqu %[l3],     %%xmm15\n\t"
 			:
-			: [l1] "m" (*c->u_mode.ocb.L[1]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: [l0l1] "m" (*l0l1),
+			  [l0l1l0] "m" (*l0l1l0),
+			  [l3] "m" (*l)
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm2,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqa %%xmm5,    %%xmm13\n\t"
+
+	  n += 4;
+	  l = aes_ocb_get_l(c, n);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+			"movdqu %[inbuf1], %%xmm2\n\t"
+			"movdqu %[inbuf2], %%xmm3\n\t"
 			:
-			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
+			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
+			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqa %%xmm5,    %%xmm14\n\t"
+	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+			"movdqu %[inbuf4], %%xmm8\n\t"
+			"movdqu %[inbuf5], %%xmm9\n\t"
 			:
-			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
+			  [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
+			  [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm15\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
+	  asm volatile ("movdqa %%xmm6,    %%xmm12\n\t"
+			"pxor   %%xmm5,    %%xmm12\n\t"
+			"pxor   %%xmm12,   %%xmm1\n\t"
+
+			"movdqa %%xmm10,   %%xmm13\n\t"
+			"pxor   %%xmm5,    %%xmm13\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+
+			"movdqa %%xmm11,   %%xmm14\n\t"
+			"pxor   %%xmm5,    %%xmm14\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+
+			"pxor   %%xmm11,   %%xmm5\n\t"
 			"pxor   %%xmm15,   %%xmm5\n\t"
-			"pxor   %%xmm4,    %%xmm6\n\t"
 			"pxor   %%xmm5,    %%xmm4\n\t"
 			"movdqa %%xmm5,    %%xmm15\n\t"
-			:
-			: [l3] "m" (*l),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
 
-	  n += 4;
-	  l = aes_ocb_get_l(c, n);
-
-	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm8,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm8\n\t"
-			"movdqu %%xmm5,    %[outbuf4]\n\t"
-			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
-			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm9,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm9\n\t"
-			"movdqu %%xmm5,    %[outbuf5]\n\t"
-			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
-			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			"movdqa %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"movdqu %%xmm0,    %[outbuf4]\n\t"
+
+			"movdqa %%xmm10,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm9\n\t"
+			"movdqu %%xmm0,    %[outbuf5]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			:
 			: "memory" );
 	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm10,   %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm10\n\t"
-			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm10\n\t"
+			"movdqu %%xmm0,    %[outbuf6]\n\t"
 			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
 			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[l7],     %%xmm11\n\t"
+	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
 			"pxor   %%xmm11,   %%xmm5\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
 			"movdqu %[inbuf7], %%xmm11\n\t"
-			"pxor   %%xmm11,   %%xmm6\n\t"
 			"pxor   %%xmm5,    %%xmm11\n\t"
 			:
 			: [l7] "m" (*l),
@@ -2374,7 +2498,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-    aesni_cleanup_7_15();
+    aesni_cleanup_8_15();
   }
 #endif
 
@@ -2384,44 +2508,47 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       l = aes_ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
+
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm1,    %%xmm6\n\t"
-		    "pxor   %%xmm5,    %%xmm1\n\t"
-		    "movdqu %%xmm5,    %[outbuf0]\n\t"
-		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    "movdqa %[l0l1],   %%xmm3\n\t"
+		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l0l1] "m" (*l0l1),
 		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-		    "movdqu %[inbuf1], %%xmm2\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
-		    "pxor   %%xmm2,    %%xmm6\n\t"
-		    "pxor   %%xmm5,    %%xmm2\n\t"
-		    "movdqu %%xmm5,    %[outbuf1]\n\t"
-		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-		    : [l1] "m" (*c->u_mode.ocb.L[1]),
-		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+      asm volatile ("movdqa %[l0l1l0], %%xmm4\n\t"
+		    "movdqu %[l3],     %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm0,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0l1l0] "m" (*l0l1l0),
+		      [l3] "m" (*l)
 		    : "memory" );
-      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm3,    %%xmm6\n\t"
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
-		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    "pxor   %%xmm3,    %%xmm2\n\t"
+		    "movdqu %%xmm3,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm0,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    :
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm4\n\t"
+      asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
 		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
-		    "pxor   %%xmm4,    %%xmm6\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
-		    : [l3] "m" (*l),
-		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
 		    : "memory" );
 
       do_aesni_enc_vec4 (ctx);
@@ -2453,12 +2580,10 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
       l = aes_ocb_get_l(c, ++n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
       asm volatile ("movdqu %[l],     %%xmm1\n\t"
                     "movdqu %[inbuf], %%xmm0\n\t"
                     "pxor   %%xmm1,   %%xmm5\n\t"
-                    "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
                     : [l] "m" (*l),
@@ -2479,30 +2604,41 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = n;
   asm volatile ("movdqu %%xmm5, %[iv]\n\t"
-                "movdqu %%xmm6, %[ctr]\n\t"
-                : [iv] "=m" (*c->u_iv.iv),
-                  [ctr] "=m" (*c->u_ctr.ctr)
+                : [iv] "=m" (*c->u_iv.iv)
                 :
                 : "memory" );
 
+  asm volatile ("pxor   %%xmm0, %%xmm0\n\t"
+		"movdqa %%xmm0, %[l0l1]\n\t"
+		"movdqa %%xmm0, %[l0l1l0]\n\t"
+		: [l0l1] "=m" (*l0l1),
+		  [l0l1l0] "=m" (*l0l1l0)
+		:
+		: "memory" );
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
+
+  return 0;
 }
 
 
-static void
+static unsigned int NO_INLINE
 aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
-               const void *inbuf_arg, size_t nblocks)
+               const void *inbuf_arg, size_t nblocks_arg)
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
   const unsigned char *l;
-  aesni_prepare_2_6_variable;
+  size_t nblocks = nblocks_arg;
+  byte tempbuf[16 * 2 + 15];
+  byte *l0l1;
+  byte *l0l1l0;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   if ( !ctx->decryption_prepared )
     {
@@ -2510,12 +2646,14 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       ctx->decryption_prepared = 1;
     }
 
-  /* Preload Offset and Checksum */
+  asm volatile ("" : "=r" (l0l1) : "0" (tempbuf) : "memory");
+  l0l1 = l0l1 + (-(uintptr_t)l0l1 & 15);
+  l0l1l0 = l0l1 + 16;
+
+  /* Preload Offset */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
-                "movdqu %[ctr], %%xmm6\n\t"
                 : /* No output */
-                : [iv] "m" (*c->u_iv.iv),
-                  [ctr] "m" (*c->u_ctr.ctr)
+                : [iv] "m" (*c->u_iv.iv)
                 : "memory" );
 
   for ( ;nblocks && n % 4; nblocks-- )
@@ -2524,7 +2662,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
       asm volatile ("movdqu %[l],     %%xmm1\n\t"
                     "movdqu %[inbuf], %%xmm0\n\t"
                     "pxor   %%xmm1,   %%xmm5\n\t"
@@ -2537,7 +2674,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       do_aesni_dec (ctx);
 
       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
-                    "pxor   %%xmm0, %%xmm6\n\t"
                     "movdqu %%xmm0, %[outbuf]\n\t"
                     : [outbuf] "=m" (*outbuf)
                     :
@@ -2547,87 +2683,103 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       outbuf += BLOCKSIZE;
     }
 
+  asm volatile ("movdqu %[l0], %%xmm6\n\t"
+		"movdqu %[l1], %%xmm0\n\t"
+		"pxor %%xmm6, %%xmm0\n\t"
+		"movdqa %%xmm0, %[l0l1]\n\t"
+		"pxor %%xmm6, %%xmm0\n\t"
+		"movdqa %%xmm0, %[l0l1l0]\n\t"
+		: [l0l1] "=m" (*l0l1),
+		  [l0l1l0] "=m" (*l0l1l0)
+		: [l0] "m" (*c->u_mode.ocb.L[0]),
+		  [l1] "m" (*c->u_mode.ocb.L[1])
+		: "memory" );
+
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
-
-      aesni_prepare_7_15();
+      aesni_prepare_8_15_variable;
 
-      asm volatile ("movdqu %[l0], %%xmm7\n\t"
-		    :
-		    : [l0] "m" (*c->u_mode.ocb.L[0])
-		    : "memory" );
+      aesni_prepare_8_15();
 
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
 	  n += 4;
 	  l = aes_ocb_get_l(c, n);
 
+	  asm volatile ("movdqa %[l0l1],   %%xmm10\n\t"
+			"movdqa %[l0l1l0], %%xmm11\n\t"
+			"movdqu %[l3],     %%xmm15\n\t"
+			:
+			: [l0l1] "m" (*l0l1),
+			  [l0l1l0] "m" (*l0l1l0),
+			  [l3] "m" (*l)
+			: "memory" );
+
+	  n += 4;
+	  l = aes_ocb_get_l(c, n);
+
 	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
 
-	  asm volatile ("movdqu %[l1],     %%xmm10\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqa %%xmm5,    %%xmm12\n\t"
+	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+			"movdqu %[inbuf1], %%xmm2\n\t"
+			"movdqu %[inbuf2], %%xmm3\n\t"
 			:
-			: [l1] "m" (*c->u_mode.ocb.L[1]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)),
+			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
+			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqa %%xmm5,    %%xmm13\n\t"
-			:
-			: [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqa %%xmm5,    %%xmm14\n\t"
+	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+			"movdqu %[inbuf4], %%xmm8\n\t"
+			"movdqu %[inbuf5], %%xmm9\n\t"
 			:
-			: [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
+			  [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE)),
+			  [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
+	  asm volatile ("movdqa %%xmm6,    %%xmm12\n\t"
+			"pxor   %%xmm5,    %%xmm12\n\t"
+			"pxor   %%xmm12,   %%xmm1\n\t"
+
+			"movdqa %%xmm10,   %%xmm13\n\t"
+			"pxor   %%xmm5,    %%xmm13\n\t"
+			"pxor   %%xmm13,   %%xmm2\n\t"
+
+			"movdqa %%xmm11,   %%xmm14\n\t"
+			"pxor   %%xmm5,    %%xmm14\n\t"
+			"pxor   %%xmm14,   %%xmm3\n\t"
+
+			"pxor   %%xmm11,   %%xmm5\n\t"
+			"pxor   %%xmm15,   %%xmm5\n\t"
 			"pxor   %%xmm5,    %%xmm4\n\t"
 			"movdqa %%xmm5,    %%xmm15\n\t"
-			:
-			: [l3] "m" (*l),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
 
-	  n += 4;
-	  l = aes_ocb_get_l(c, n);
-
-	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm8\n\t"
-			"movdqu %%xmm5,    %[outbuf4]\n\t"
-			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE))
-			: [inbuf4] "m" (*(inbuf + 4 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
-			"pxor   %%xmm10,   %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm9\n\t"
-			"movdqu %%xmm5,    %[outbuf5]\n\t"
-			: [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
-			: [inbuf5] "m" (*(inbuf + 5 * BLOCKSIZE))
+			"movdqa %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm8\n\t"
+			"movdqu %%xmm0,    %[outbuf4]\n\t"
+
+			"movdqa %%xmm10,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm9\n\t"
+			"movdqu %%xmm0,    %[outbuf5]\n\t"
+			: [outbuf4] "=m" (*(outbuf + 4 * BLOCKSIZE)),
+			  [outbuf5] "=m" (*(outbuf + 5 * BLOCKSIZE))
+			:
 			: "memory" );
 	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
-			"pxor   %%xmm7,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm10\n\t"
-			"movdqu %%xmm5,    %[outbuf6]\n\t"
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"pxor   %%xmm5,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm10\n\t"
+			"movdqu %%xmm0,    %[outbuf6]\n\t"
 			: [outbuf6] "=m" (*(outbuf + 6 * BLOCKSIZE))
 			: [inbuf6] "m" (*(inbuf + 6 * BLOCKSIZE))
 			: "memory" );
 	  asm volatile ("movdqu %[l7],     %%xmm0\n\t"
-			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm11,   %%xmm5\n\t"
 			"pxor   %%xmm0,    %%xmm5\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
 			"pxor   %%xmm5,    %%xmm11\n\t"
 			:
 			: [l7] "m" (*l),
@@ -2655,14 +2807,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 			"movdqu %%xmm9,    %[outbuf5]\n\t"
 			"movdqu %%xmm10,   %[outbuf6]\n\t"
 			"movdqu %%xmm11,   %[outbuf7]\n\t"
-			"pxor   %%xmm2,    %%xmm1\n\t"
-			"pxor   %%xmm4,    %%xmm1\n\t"
-			"pxor   %%xmm9,    %%xmm1\n\t"
-			"pxor   %%xmm11,   %%xmm1\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm8,    %%xmm6\n\t"
-			"pxor   %%xmm10,   %%xmm6\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
 			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
 			  [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)),
 			  [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
@@ -2678,7 +2822,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 	  inbuf  += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -2688,40 +2832,47 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       l = aes_ocb_get_l(c, n);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0],     %%xmm4\n\t"
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
 		    "movdqu %[inbuf0], %%xmm1\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
-		    "pxor   %%xmm5,    %%xmm1\n\t"
-		    "movdqu %%xmm5,    %[outbuf0]\n\t"
-		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    "movdqa %[l0l1],   %%xmm3\n\t"
+		    :
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [l0l1] "m" (*l0l1),
 		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-		    "movdqu %[inbuf1], %%xmm2\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
-		    "pxor   %%xmm5,    %%xmm2\n\t"
-		    "movdqu %%xmm5,    %[outbuf1]\n\t"
-		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-		    : [l1] "m" (*c->u_mode.ocb.L[1]),
-		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+      asm volatile ("movdqa %[l0l1l0], %%xmm4\n\t"
+		    "movdqu %[l3],     %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm0,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0l1l0] "m" (*l0l1l0),
+		      [l3] "m" (*l)
 		    : "memory" );
-      asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
-		    "pxor   %%xmm4,    %%xmm5\n\t"
+      asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
 		    "pxor   %%xmm5,    %%xmm3\n\t"
-		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    "pxor   %%xmm3,    %%xmm2\n\t"
+		    "movdqu %%xmm3,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqa %%xmm4,    %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm5,    %%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm0,    %[outbuf2]\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-		    : [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    :
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 		    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+      asm volatile ("pxor   %%xmm6,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm5\n\t"
 		    "movdqu %[inbuf3], %%xmm4\n\t"
-		    "pxor   %%xmm0,    %%xmm5\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    :
-		    : [l3] "m" (*l),
-		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
 		    : "memory" );
 
       do_aesni_dec_vec4 (ctx);
@@ -2737,10 +2888,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 		    "movdqu %%xmm3,    %[outbuf2]\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    "movdqu %%xmm4,    %[outbuf3]\n\t"
-		    "pxor   %%xmm1,    %%xmm6\n\t"
-		    "pxor   %%xmm2,    %%xmm6\n\t"
-		    "pxor   %%xmm3,    %%xmm6\n\t"
-		    "pxor   %%xmm4,    %%xmm6\n\t"
 		    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
 		      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
 		      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
@@ -2771,7 +2918,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       do_aesni_dec (ctx);
 
       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
-                    "pxor   %%xmm0, %%xmm6\n\t"
                     "movdqu %%xmm0, %[outbuf]\n\t"
                     : [outbuf] "=m" (*outbuf)
                     :
@@ -2783,14 +2929,23 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = n;
   asm volatile ("movdqu %%xmm5, %[iv]\n\t"
-                "movdqu %%xmm6, %[ctr]\n\t"
-                : [iv] "=m" (*c->u_iv.iv),
-                  [ctr] "=m" (*c->u_ctr.ctr)
+                : [iv] "=m" (*c->u_iv.iv)
                 :
                 : "memory" );
 
+  aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
+
+  asm volatile ("pxor   %%xmm0, %%xmm0\n\t"
+		"movdqa %%xmm0, %[l0l1]\n\t"
+		"movdqa %%xmm0, %[l0l1l0]\n\t"
+		: [l0l1] "=m" (*l0l1),
+		  [l0l1l0] "=m" (*l0l1l0)
+		:
+		: "memory" );
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
+
+  return 0;
 }
 
 
@@ -2799,11 +2954,9 @@ _gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
                           const void *inbuf_arg, size_t nblocks, int encrypt)
 {
   if (encrypt)
-    aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+    return aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
   else
-    aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
-
-  return 0;
+    return aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
 }
 
 
@@ -2815,10 +2968,10 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   const unsigned char *abuf = abuf_arg;
   u64 n = c->u_mode.ocb.aad_nblocks;
   const unsigned char *l;
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   /* Preload Offset and Sum */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
@@ -2856,9 +3009,9 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      aesni_prepare_7_15_variable;
+      aesni_prepare_8_15_variable;
 
-      aesni_prepare_7_15();
+      aesni_prepare_8_15();
 
       asm volatile ("movdqu %[l0], %%xmm7\n\t"
 		    "movdqu %[l1], %%xmm12\n\t"
@@ -2948,7 +3101,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 	  abuf += 8*BLOCKSIZE;
 	}
 
-      aesni_cleanup_7_15();
+      aesni_cleanup_8_15();
     }
 #endif
 
@@ -3038,7 +3191,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 
   return 0;
 }
@@ -3053,10 +3206,10 @@ _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
 			 unsigned char *outbuf, const unsigned char *inbuf,
 			 size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   /* Preload Tweak */
   asm volatile ("movdqu %[tweak], %%xmm5\n\t"
@@ -3182,7 +3335,7 @@ _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
 		: "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
@@ -3191,10 +3344,10 @@ _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
 			 unsigned char *outbuf, const unsigned char *inbuf,
 			 size_t nblocks)
 {
-  aesni_prepare_2_6_variable;
+  aesni_prepare_2_7_variable;
 
   aesni_prepare ();
-  aesni_prepare_2_6 ();
+  aesni_prepare_2_7 ();
 
   if ( !ctx->decryption_prepared )
     {
@@ -3326,7 +3479,7 @@ _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
                 : "memory" );
 
   aesni_cleanup ();
-  aesni_cleanup_2_6 ();
+  aesni_cleanup_2_7 ();
 }
 
 
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 160fb8c36..1dcfcd5e4 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -143,6 +143,7 @@ typedef struct RIJNDAEL_context_s
 #endif /*USE_PADLOCK*/
 #ifdef USE_AESNI
   unsigned int use_aesni:1;           /* AES-NI shall be used.  */
+  unsigned int use_avx2:1;            /* AVX2 shall be used. */
 #endif /*USE_AESNI*/
 #ifdef USE_SSSE3
   unsigned int use_ssse3:1;           /* SSSE3 shall be used.  */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 1bc8b0fc2..e8ec7993b 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -334,6 +334,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       ctx->prefetch_enc_fn = NULL;
       ctx->prefetch_dec_fn = NULL;
       ctx->use_aesni = 1;
+      ctx->use_avx2 = !!(hwfeatures & HWF_INTEL_AVX2);
       if (hd)
         {
           hd->bulk.cfb_enc = _gcry_aes_aesni_cfb_enc;
diff --git a/tests/basic.c b/tests/basic.c
index f3d895153..0afae3047 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -4411,11 +4411,114 @@ do_check_ocb_cipher (int inplace)
       "1792A4E31E0755FB03E31B22116E6C2DDF9EFD6E33D536F1"
       "A0124B0A55BAE884ED93481529C76B6AD0C515F4D1CDD4FD"
       "AC4F02AA"
+    },
+    { GCRY_CIPHER_AES, 12, "0F0E0D0C0B0A09080706050403020100",
+      "BBAA9988776655443322110D",
+      "000102030405060708090A0B0C0D0E0F1011121314151617"
+      "18191A1B1C1D1E1F2021222324252627",
+      /* test vector for checksumming */
+      "01000000000000000000000000000000"
+      "02000000000000000000000000000000"
+      "04000000000000000000000000000000"
+      "08000000000000000000000000000000"
+      "10000000000000000000000000000000"
+      "20000000000000000000000000000000"
+      "40000000000000000000000000000000"
+      "80000000000000000000000000000000"
+      "00010000000000000000000000000000"
+      "00020000000000000000000000000000"
+      "00040000000000000000000000000000"
+      "00080000000000000000000000000000"
+      "00100000000000000000000000000000"
+      "00200000000000000000000000000000"
+      "00400000000000000000000000000000"
+      "00800000000000000000000000000000"
+      "00000100000000000000000000000000"
+      "00000200000000000000000000000000"
+      "00000400000000000000000000000000"
+      "00000800000000000000000000000000"
+      "00001000000000000000000000000000"
+      "00002000000000000000000000000000"
+      "00004000000000000000000000000000"
+      "00008000000000000000000000000000"
+      "00000001000000000000000000000000"
+      "00000002000000000000000000000000"
+      "00000004000000000000000000000000"
+      "00000008000000000000000000000000"
+      "00000010000000000000000000000000"
+      "00000020000000000000000000000000"
+      "00000040000000000000000000000000"
+      "00000080000000000000000000000000"
+      "00000000010000000000000000000000"
+      "00000000020000000000000000000000"
+      "00000000040000000000000000000000"
+      "00000000080000000000000000000000"
+      "00000000100000000000000000000000"
+      "00000000200000000000000000000000"
+      "00000000400000000000000000000000"
+      "00000000800000000000000000000000"
+      "00000000000100000000000000000000"
+      "00000000000200000000000000000000"
+      "00000000000400000000000000000000"
+      "00000000000800000000000000000000"
+      "00000000001000000000000000000000"
+      "00000000002000000000000000000000"
+      "00000000004000000000000000000000"
+      "00000000008000000000000000000000",
+      "01105c6e36f6ac480f022c51e31ed702"
+      "90fda4b7b783194d4b4be8e4e1e2dff4"
+      "6a0804d1c5f9f808ea7933e31c063233"
+      "2bf65a22b20bb13cde3b80b3682ba965"
+      "b1207c58916f7856fa9968b410e50dee"
+      "98b35c071163d1b352b9bbccd09fde29"
+      "b850f40e71a8ae7d2e2d577f5ee39c46"
+      "7fa28130b50a123c29958e4665dda9a5"
+      "e0793997f8f19633a96392141d6e0e88"
+      "77850ed4364065d1d2f8746e2f1d5fd1"
+      "996cdde03215306503a30e41f58ef3c4"
+      "400365cfea4fa6381157c12a46598edf"
+      "18604854462ec66e3d3cf26d4723cb6a"
+      "9d801095048086a606fdb9192760889b"
+      "a8ce2e70e1b55a469137a9e2e6734565"
+      "283cb1e2c74f37e0854d03e33f8ba499"
+      "ef5d9af4edfce077c6280338f0a64286"
+      "2e6bc27ebd5a4c91b3778e22631251c8"
+      "c5bb75a10945597a9d6c274fc82d3338"
+      "b403a0a549d1375f26e71ef22bce0941"
+      "93ea87e2ed72fce0546148c351eec3be"
+      "867bb1b96070c377fff3c98e21562beb"
+      "475cfe28abcaaedf49981f6599b15140"
+      "ea6130d24407079f18ba9d4a8960b082"
+      "b39c57320e2e064f02fde88c23112146"
+      "1cac3655868aef584714826ee4f361fb"
+      "e6d692e1589cbb9dd3c74fa628df2a1f"
+      "3b0029b1d62b7e9978013ed3c793c1dd"
+      "1f184c8f7022a853cac40b74ac749aa3"
+      "f33f0d14732dfda0f2c3c20591bf1f5a"
+      "710ec0d0bca342baa5146068a78ff58c"
+      "66316312b7a98af35a0f4e92799b4047"
+      "f047ae61f25c28d232ce5c168cc745d6"
+      "6da13cb0f9e38a696635dba7a21571cf"
+      "cd64ec8cc33db7879f59a90d9edd00f6"
+      "a899e39ab36b9269a3ac04ebad9326bf"
+      "53cd9b400168a61714cd628a4056d236"
+      "bd8622c76daa54cb65f5db2fe03bafbe"
+      "0b23549ae31136f607293e8093a21934"
+      "74fd5e9c2451b4c8e0499e6ad34fafc8"
+      "ab77722a282f7f84b14ddebf7e696300"
+      "c1ef92d4a0263c6cca104530f996e272"
+      "f58992ff68d642b071a5848dc4acf2ae"
+      "28fb1f27ae0f297d5136a7a0a4a03e89"
+      "b588755b8217a1c62773790e69261269"
+      "19f45daf7b3ccf18e3fc590a9a0e172f"
+      "033ac4d13c3decc4c62d7de718ace802"
+      "140452dc850989f6762e3578bbb04be3"
+      "1a237c599c4649f4e586b2de"
     }
   };
   gpg_error_t err = 0;
   gcry_cipher_hd_t hde, hdd;
-  unsigned char out[MAX_DATA_LEN];
+  unsigned char out[1024];
   unsigned char tag[16];
   int tidx;
 
@@ -4548,7 +4651,7 @@ do_check_ocb_cipher (int inplace)
             }
           else
             {
-              err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN,
+              err = gcry_cipher_encrypt (hde, out, sizeof(out),
                                          plain, plainlen);
             }
         }
@@ -4605,7 +4708,7 @@ do_check_ocb_cipher (int inplace)
             }
           else
             {
-              unsigned char tmp[MAX_DATA_LEN];
+              unsigned char tmp[sizeof(out)];
 
               memcpy(tmp, out, plainlen);
               err = gcry_cipher_decrypt (hdd, out, plainlen, tmp, plainlen);
@@ -4696,7 +4799,7 @@ check_ocb_cipher_largebuf_split (int algo, int keylen, const char *tagexpect,
     }
 
   for (i = 0; i < buflen; i++)
-    inbuf[i] = 'a';
+    inbuf[i] = (i + 181081) * 5039;
 
   err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_OCB, 0);
   if (!err)
@@ -4854,6 +4957,131 @@ out_free:
 }
 
 
+static void
+check_ocb_cipher_checksum (int algo, int keylen)
+{
+  static const unsigned char key[32] =
+	"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
+	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
+  static const unsigned char nonce[12] =
+	"\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F\x00\x01\x02\x03";
+  const size_t buflen = 128 * 16;
+  unsigned char *inbuf, *outbuf;
+  gpg_error_t err = 0;
+  gcry_cipher_hd_t hde, hde2;
+  unsigned char tag[16];
+  unsigned char tag2[16];
+  int i;
+
+  inbuf = xmalloc(buflen);
+  if (!inbuf)
+    {
+      fail ("out-of-memory\n");
+      return;
+    }
+  outbuf = xmalloc(buflen);
+  if (!outbuf)
+    {
+      fail ("out-of-memory\n");
+      xfree(inbuf);
+      return;
+    }
+
+  memset(inbuf, 0, buflen);
+  for (i = 0; i < buflen; i += 16)
+    {
+      unsigned char *blk = inbuf + i;
+      int bit2set = i / 16;
+      int byteidx = bit2set / 8;
+      int bitpos = bit2set % 8;
+
+      blk[byteidx] |= 1 << bitpos;
+    }
+
+  err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_OCB, 0);
+  if (!err)
+    err = gcry_cipher_open (&hde2, algo, GCRY_CIPHER_MODE_OCB, 0);
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_open failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      goto out_free;
+    }
+
+  err = gcry_cipher_setkey (hde, key, keylen);
+  if (!err)
+    err = gcry_cipher_setkey (hde2, key, keylen);
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_setkey failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      gcry_cipher_close (hde);
+      gcry_cipher_close (hde2);
+      goto out_free;
+    }
+
+  err = gcry_cipher_setiv (hde, nonce, 12);
+  if (!err)
+    err = gcry_cipher_setiv (hde2, nonce, 12);
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_setiv failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      gcry_cipher_close (hde);
+      gcry_cipher_close (hde2);
+      goto out_free;
+    }
+
+  err = gcry_cipher_final (hde);
+  if (!err)
+    {
+      err = gcry_cipher_encrypt (hde, outbuf, buflen, inbuf, buflen);
+    }
+  for (i = 0; i < buflen && !err; i += 16)
+    {
+      if (i + 16 == buflen)
+	err = gcry_cipher_final (hde2);
+      if (!err)
+	err = gcry_cipher_encrypt (hde2, outbuf + i, 16, inbuf + i, 16);
+    }
+
+  if (err)
+    {
+      fail ("cipher-ocb, gcry_cipher_encrypt failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+      gcry_cipher_close (hde);
+      gcry_cipher_close (hde2);
+      goto out_free;
+    }
+
+  /* Check that the tag matches. */
+  err = gcry_cipher_gettag (hde, tag, 16);
+  if (err)
+    {
+      fail ("cipher_ocb, gcry_cipher_gettag failed (checksum, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+    }
+  err = gcry_cipher_gettag (hde2, tag2, 16);
+  if (err)
+    {
+      fail ("cipher_ocb, gcry_cipher_gettag failed (checksum2, algo %d): %s\n",
+	    algo, gpg_strerror (err));
+    }
+  if (memcmp (tag, tag2, 16))
+    {
+      mismatch (tag, 16, tag2, 16);
+      fail ("cipher-ocb, encrypt tag mismatch (checksum, algo %d)\n", algo);
+    }
+
+  gcry_cipher_close (hde);
+  gcry_cipher_close (hde2);
+
+out_free:
+  xfree(inbuf);
+  xfree(outbuf);
+}
+
+
 static void
 check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
 {
@@ -4863,6 +5091,8 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
     {
       check_ocb_cipher_largebuf_split(algo, keylen, tagexpect, split);
     }
+
+  check_ocb_cipher_checksum(algo, keylen);
 }
 
 
@@ -5108,35 +5338,25 @@ check_ocb_cipher (void)
 
   /* Check large buffer encryption/decryption. */
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES, 16,
-			    "\xf5\xf3\x12\x7d\x58\x2d\x96\xe8"
-			    "\x33\xfd\x7a\x4f\x42\x60\x5d\x20");
+    "\xc1\x5b\xf1\x80\xa4\xd5\xea\xfd\xae\x17\xa6\xcd\x6b\x10\xa8\xea");
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES256, 32,
-			    "\xfa\x26\xa5\xbf\xf6\x7d\x3a\x8d"
-			    "\xfe\x96\x67\xc9\xc8\x41\x03\x51");
+    "\x2b\xb7\x25\x6b\x77\xc7\xfb\x21\x5c\xc9\x6c\x36\x17\x1a\x1a\xd5");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA128, 16,
-			    "\x28\x23\x38\x45\x2b\xfd\x42\x45"
-			    "\x43\x64\x7e\x67\x7f\xf4\x8b\xcd");
+    "\xe0\xae\x3f\x29\x3a\xee\xd8\xe3\xf2\x20\xc1\xa2\xd8\x72\x12\xd9");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA192, 24,
-			    "\xee\xca\xe5\x39\x27\x2d\x33\xe7"
-			    "\x79\x74\xb0\x1d\x37\x12\xd5\x6c");
+    "\xd7\x98\x71\xcf\x19\x5c\xa3\x3d\x6c\xfc\xc9\xbe\x9f\x13\x6b\xbd");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA256, 32,
-			    "\x39\x39\xd0\x2d\x05\x68\x74\xee"
-			    "\x18\x6b\xea\x3d\x0b\xd3\x58\xae");
+    "\x03\xf6\xec\x1a\x0e\xae\x66\x24\x2b\xba\x26\x0f\xb3\xb3\x1f\xb9");
   check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 16,
-			    "\x63\xe3\x0e\xb9\x11\x6f\x14\xba"
-			    "\x79\xe4\xa7\x9e\xad\x3c\x02\x0c");
+    "\x1c\xf9\xc7\xfc\x3a\x32\xac\xc7\x5e\x0a\xc2\x5c\x90\xd6\xf6\xf9");
   check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 32,
-			    "\xf6\xd4\xfe\x4e\x50\x85\x13\x59"
-			    "\x69\x0e\x4c\x67\x3e\xdd\x47\x90");
+    "\x53\x02\xc8\x0d\x4e\x9a\x44\x9e\x43\xd4\xaa\x06\x30\x93\xcc\x16");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT128, 16,
-			    "\x3c\xfb\x66\x14\x3c\xc8\x6c\x67"
-			    "\x26\xb8\x23\xeb\xaf\x43\x98\x69");
+    "\xd3\x64\xac\x40\x48\x88\x77\xe2\x41\x26\x4c\xde\x21\x29\x21\x8d");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT192, 24,
-			    "\x5e\x62\x27\xc5\x32\xc3\x1d\xe6"
-			    "\x2e\x65\xe7\xd6\xfb\x05\xd7\xb2");
+    "\x99\xeb\x35\xb0\x62\x4e\x7b\xf1\x5e\x9f\xed\x32\x78\x90\x0b\xd0");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT256, 32,
-			    "\xe7\x8b\xe6\xd4\x2f\x7a\x36\x4c"
-			    "\xba\xee\x20\xe2\x68\xf4\xcb\xcc");
+    "\x71\x66\x2f\x68\xbf\xdd\xcc\xb1\xbf\x81\x56\x5f\x01\x73\xeb\x44");
 
   /* Check that the AAD data is correctly buffered.  */
   check_ocb_cipher_splitaad ();
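
A note on the offset precalculation used in aesni_ocb_enc/dec above:
within an aligned group of four blocks (n % 4 == 0), ntz(n+1) = 0,
ntz(n+2) = 1 and ntz(n+3) = 0, so the first three offsets depend only
on the incoming offset and the constants L0, L0^L1 and L0^L1^L0, and
can be formed independently instead of as a serial xor chain.  A
scalar sketch of the idea (illustrative names; the patch keeps these
values in xmm registers):

  #include <stdint.h>

  typedef struct { uint64_t lo, hi; } block128;

  static block128
  xor128 (block128 a, block128 b)
  {
    block128 r = { a.lo ^ b.lo, a.hi ^ b.hi };
    return r;
  }

  /* Derive Offset_{n+1..n+4} from Offset_n (n % 4 == 0) in parallel;
     l_ntz is L_{ntz(n+4)} and out[3] is also the new running offset. */
  static void
  next_four_offsets (block128 offset, block128 l0, block128 l0l1,
                     block128 l0l1l0, block128 l_ntz, block128 out[4])
  {
    out[0] = xor128 (offset, l0);      /* Offset_n ^ L0           */
    out[1] = xor128 (offset, l0l1);    /* Offset_n ^ L0 ^ L1      */
    out[2] = xor128 (offset, l0l1l0);  /* Offset_n ^ L0 ^ L1 ^ L0 */
    out[3] = xor128 (out[2], l_ntz);   /* ... ^ L_{ntz(n+4)}      */
  }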



