[PATCH] Simplify OCB offset calculation for parallel implementations

Jussi Kivilinna jussi.kivilinna at iki.fi
Tue Aug 11 22:12:44 CEST 2015


* cipher/camellia-glue.c (_gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): Always precalculate the Ls array, instead of
doing so only when 'blkn % <parallel blocks> == 0'.
* cipher/serpent.c (_gcry_serpent_ocb_crypt)
(_gcry_serpent_ocb_auth): Ditto.
* cipher/rijndael-aesni.c (get_l): Remove low-bit checks.
(aesni_ocb_enc, aesni_ocb_dec, _gcry_aes_aesni_ocb_auth): Handle leading
blocks until the block counter is a multiple of 4, so that the parallel
block-processing loop can use the 'c->u_mode.ocb.L' array directly.
* tests/basic.c (check_ocb_cipher_largebuf): Rename to...
(check_ocb_cipher_largebuf_split): ...this and add an option to process
the large buffer as two separate chunks.
(check_ocb_cipher_largebuf): New.
--

Patch simplifies the source code and reduces object size.
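
The camellia and serpent changes rely on one property of OCB's
L_{ntz(i)} offsets: ntz(i) depends only on i modulo the chunk width,
except at multiples of the width.  The Ls table can therefore be built
once, rotated by n = width - (blkn % width) slots, and reused for any
starting block counter; the single counter-dependent slot always lands
at Ls[(width-1 + n) % width] and is refreshed once per chunk.  Below is
a minimal standalone sketch (not libgcrypt code) that checks this for
the 32-block case; ntz() and the index-valued table stand in for the
real L pointers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int
ntz (uint64_t x)
{
  unsigned int n = 0;
  while (!(x & 1))
    {
      x >>= 1;
      n++;
    }
  return n;
}

int
main (void)
{
  uint64_t blkn;

  for (blkn = 0; blkn < 4096; blkn++)  /* any starting block counter */
    {
      unsigned int Ls[32];
      unsigned int n = 32 - (blkn % 32);
      unsigned int i, j;

      /* Fixed part of the pattern, rotated by n slots; the stored
         values are the L-array indices. */
      for (i = 0; i < 32; i += 8)
        {
          Ls[(i + 0 + n) % 32] = 0;
          Ls[(i + 1 + n) % 32] = 1;
          Ls[(i + 2 + n) % 32] = 0;
          Ls[(i + 3 + n) % 32] = 2;
          Ls[(i + 4 + n) % 32] = 0;
          Ls[(i + 5 + n) % 32] = 1;
          Ls[(i + 6 + n) % 32] = 0;
        }
      Ls[(7 + n) % 32] = 3;
      Ls[(15 + n) % 32] = 4;
      Ls[(23 + n) % 32] = 3;

      /* The one counter-dependent slot; (blkn + 32) - (blkn + 32) % 32
         is what the chunk loop computes after 'blkn += 32'. */
      Ls[(31 + n) % 32] = ntz ((blkn + 32) - (blkn + 32) % 32);

      /* Block blkn+j+1 of the chunk must use L[ntz(blkn+j+1)]. */
      for (j = 0; j < 32; j++)
        assert (Ls[j] == ntz (blkn + j + 1));
    }

  puts ("rotated Ls table matches ntz() for every starting counter");
  return 0;
}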
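
The rijndael-aesni.c change takes a different route: it processes
leading blocks one at a time until the block counter n is a multiple
of 4, after which every 4-block group has a fixed ntz pattern, so the
first three offsets come straight from c->u_mode.ocb.L and only the
fourth needs get_l().  A standalone sketch of that invariant:

#include <assert.h>
#include <stdint.h>

static unsigned int
ntz (uint64_t x)
{
  unsigned int n = 0;
  while (!(x & 1))
    {
      x >>= 1;
      n++;
    }
  return n;
}

int
main (void)
{
  uint64_t n;

  for (n = 0; n < (1UL << 20); n += 4)  /* any counter aligned to 4 */
    {
      assert (ntz (n + 1) == 0);  /* block n+1 always uses L[0] */
      assert (ntz (n + 2) == 1);  /* block n+2 always uses L[1] */
      assert (ntz (n + 3) == 0);  /* block n+3 always uses L[0] */
      assert (ntz (n + 4) >= 2);  /* only block n+4 varies: get_l() */
    }
  return 0;
}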
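
The reworked test exercises exactly these resume points: splitting the
large buffer makes the second gcry_cipher_encrypt/decrypt call start at
a block counter that is not a multiple of the parallel width.  A quick
illustration of the split positions the new loop generates (16 bytes is
the block size of all ciphers under test):

#include <stdio.h>

int
main (void)
{
  unsigned int split;

  /* Same loop as the new check_ocb_cipher_largebuf(). */
  for (split = 0; split < 32 * 16; split = split * 2 + 16)
    printf ("split at %3u bytes = %2u blocks%s\n",
            split, split / 16, split ? "" : " (no split)");

  /* Prints 0, 16, 48, 112, 240 and 496 bytes, i.e. resume points at
     0, 1, 3, 7, 15 and 31 blocks, hitting both the serial leading-block
     path and the parallel path. */
  return 0;
}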

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-glue.c  |  254 +++++++++------------
 cipher/rijndael-aesni.c |  562 ++++++++++++++++++++++++-----------------------
 cipher/serpent.c        |  370 +++++++++++++------------------
 tests/basic.c           |   48 ++++
 4 files changed, 586 insertions(+), 648 deletions(-)

diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 2d5dd20..dee0169 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -631,58 +631,47 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_aesni_avx2 = 0;
       const void *Ls[32];
+      unsigned int n = 32 - (blkn % 32);
+      const void **l;
       int i;
 
-      if (blkn % 32 == 0)
+      if (nblocks >= 32)
 	{
 	  for (i = 0; i < 32; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	  Ls[15] = c->u_mode.ocb.L[4];
-	  Ls[23] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 32] = c->u_mode.ocb.L[3];
+	  Ls[(15 + n) % 32] = c->u_mode.ocb.L[4];
+	  Ls[(23 + n) % 32] = c->u_mode.ocb.L[3];
+	  l = &Ls[(31 + n) % 32];
 
-      /* Process data in 32 block chunks. */
-      while (nblocks >= 32)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 32 == 0)
+	  /* Process data in 32 block chunks. */
+	  while (nblocks >= 32)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 32;
-	      Ls[31] = ocb_get_l(c, l_tmp, blkn);
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 32);
+
+	      if (encrypt)
+		_gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+						  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+						  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 32;
+	      outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+	      inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx2 = 1;
 	    }
-	  else
-	    {
-	      for (i = 0; i < 32; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
-
-	  if (encrypt)
-	    _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-					      c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-					      c->u_ctr.ctr, Ls);
-
-	  nblocks -= 32;
-	  outbuf += 32 * CAMELLIA_BLOCK_SIZE;
-	  inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx2 = 1;
 	}
 
       if (did_use_aesni_avx2)
@@ -703,56 +692,45 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_aesni_avx = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
+
+	      if (encrypt)
+		_gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+						c->u_ctr.ctr, Ls);
+	      else
+		_gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+						c->u_ctr.ctr, Ls);
+
+	      nblocks -= 16;
+	      outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+	      inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx = 1;
 	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
-
-	  if (encrypt)
-	    _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-					    c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-					    c->u_ctr.ctr, Ls);
-
-	  nblocks -= 16;
-	  outbuf += 16 * CAMELLIA_BLOCK_SIZE;
-	  inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx = 1;
 	}
 
       if (did_use_aesni_avx)
@@ -803,53 +781,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_aesni_avx2 = 0;
       const void *Ls[32];
+      unsigned int n = 32 - (blkn % 32);
+      const void **l;
       int i;
 
-      if (blkn % 32 == 0)
+      if (nblocks >= 32)
 	{
 	  for (i = 0; i < 32; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	  Ls[15] = c->u_mode.ocb.L[4];
-	  Ls[23] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 32] = c->u_mode.ocb.L[3];
+	  Ls[(15 + n) % 32] = c->u_mode.ocb.L[4];
+	  Ls[(23 + n) % 32] = c->u_mode.ocb.L[3];
+	  l = &Ls[(31 + n) % 32];
 
-      /* Process data in 32 block chunks. */
-      while (nblocks >= 32)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 32 == 0)
+	  /* Process data in 32 block chunks. */
+	  while (nblocks >= 32)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 32;
-	      Ls[31] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 32; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 32);
 
-	  _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-					    c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
+						 c->u_mode.ocb.aad_offset,
+						 c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 32;
-	  abuf += 32 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx2 = 1;
+	      nblocks -= 32;
+	      abuf += 32 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx2 = 1;
+	    }
 	}
 
       if (did_use_aesni_avx2)
@@ -870,51 +838,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_aesni_avx = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
 
-	  _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-					    c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf,
+						c->u_mode.ocb.aad_offset,
+						c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 16;
-	  abuf += 16 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx = 1;
+	      nblocks -= 16;
+	      abuf += 16 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx = 1;
+	    }
 	}
 
       if (did_use_aesni_avx)
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 882cc79..be57b3d 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1307,11 +1307,7 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
   const unsigned char *l;
   unsigned int ntz;
 
-  if (i & 1)
-    return c->u_mode.ocb.L[0];
-  else if (i & 2)
-    return c->u_mode.ocb.L[1];
-  else if (i & 0xffffffffU)
+  if (i & 0xffffffffU)
     {
       asm ("rep;bsf %k[low], %k[ntz]\n\t"
            : [ntz] "=r" (ntz)
@@ -1376,7 +1372,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
-  const unsigned char *l[4] = {};
+  const unsigned char *l;
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1390,103 +1386,112 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                   [ctr] "m" (*c->u_ctr.ctr)
                 : "memory" );
 
-  if (nblocks > 3)
+
+  for ( ;nblocks && n % 4; nblocks-- )
+    {
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm5\n\t"
+                    "pxor   %%xmm0,   %%xmm6\n\t"
+                    "pxor   %%xmm5,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
     {
-      if (n % 4 == 0)
-	{
-	  l[0] = c->u_mode.ocb.L[0];
-	  l[1] = c->u_mode.ocb.L[1];
-	  l[2] = c->u_mode.ocb.L[0];
-	}
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (n % 4 == 0)
-	    {
-	      n += 4;
-	      l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
-	    }
-	  else
-	    {
-	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
-	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
-	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
-	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
-	      n += 4;
-	    }
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqu %%xmm5,    %[outbuf0]\n\t"
-			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
-			: [l0] "m" (*l[0]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-			"movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm2,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqu %%xmm5,    %[outbuf1]\n\t"
-			: [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-			: [l1] "m" (*l[1]),
-			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-			"movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqu %%xmm5,    %[outbuf2]\n\t"
-			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-			: [l2] "m" (*l[2]),
-			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm4,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l[3]),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  do_aesni_enc_vec4 (ctx);
-
-	  asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm1\n\t"
-			"movdqu %%xmm1,    %[outbuf0]\n\t"
-			"movdqu %[outbuf1],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm2\n\t"
-			"movdqu %%xmm2,    %[outbuf1]\n\t"
-			"movdqu %[outbuf2],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm3\n\t"
-			"movdqu %%xmm3,    %[outbuf2]\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			"movdqu %%xmm4,    %[outbuf3]\n\t"
-			: [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
-			  [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
-			  [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
-			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
-			:
-			: "memory" );
-
-	  outbuf += 4*BLOCKSIZE;
-	  inbuf  += 4*BLOCKSIZE;
-	}
+      /* l_tmp will be used only every 65536-th block. */
+      n += 4;
+      l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm1,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+		    "movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm2,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [l1] "m" (*c->u_mode.ocb.L[1]),
+		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm3,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+		    : [l2] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    :
+		    : [l3] "m" (*l),
+		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm1,    %[outbuf0]\n\t"
+		    "movdqu %[outbuf1],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm2\n\t"
+		    "movdqu %%xmm2,    %[outbuf1]\n\t"
+		    "movdqu %[outbuf2],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm3,    %[outbuf2]\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm4,    %[outbuf3]\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+		      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+		      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+		    :
+		    : "memory" );
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
     }
 
   for ( ;nblocks; nblocks-- )
     {
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
@@ -1497,7 +1502,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                     "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l[0]),
+                    : [l] "m" (*l),
                       [inbuf] "m" (*inbuf)
                     : "memory" );
 
@@ -1537,7 +1542,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
-  const unsigned char *l[4] = {};
+  const unsigned char *l;
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1551,103 +1556,111 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                   [ctr] "m" (*c->u_ctr.ctr)
                 : "memory" );
 
-  if (nblocks > 3)
+  for ( ;nblocks && n % 4; nblocks-- )
+    {
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm5\n\t"
+                    "pxor   %%xmm5,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_aesni_dec (ctx);
+
+      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
+                    "pxor   %%xmm0, %%xmm6\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
     {
-      if (n % 4 == 0)
-	{
-	  l[0] = c->u_mode.ocb.L[0];
-	  l[1] = c->u_mode.ocb.L[1];
-	  l[2] = c->u_mode.ocb.L[0];
-	}
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (n % 4 == 0)
-	    {
-	      n += 4;
-	      l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
-	    }
-	  else
-	    {
-	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
-	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
-	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
-	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
-	      n += 4;
-	    }
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqu %%xmm5,    %[outbuf0]\n\t"
-			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
-			: [l0] "m" (*l[0]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-			"movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqu %%xmm5,    %[outbuf1]\n\t"
-			: [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-			: [l1] "m" (*l[1]),
-			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-			"movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqu %%xmm5,    %[outbuf2]\n\t"
-			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-			: [l2] "m" (*l[2]),
-			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l[3]),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  do_aesni_dec_vec4 (ctx);
-
-	  asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm1\n\t"
-			"movdqu %%xmm1,    %[outbuf0]\n\t"
-			"movdqu %[outbuf1],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm2\n\t"
-			"movdqu %%xmm2,    %[outbuf1]\n\t"
-			"movdqu %[outbuf2],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm3\n\t"
-			"movdqu %%xmm3,    %[outbuf2]\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			"movdqu %%xmm4,    %[outbuf3]\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
-			"pxor   %%xmm2,    %%xmm6\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm4,    %%xmm6\n\t"
-			: [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
-			  [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
-			  [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
-			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
-			:
-			: "memory" );
-
-	  outbuf += 4*BLOCKSIZE;
-	  inbuf  += 4*BLOCKSIZE;
-	}
+      /* l_tmp will be used only every 65536-th block. */
+      n += 4;
+      l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+		    "movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [l1] "m" (*c->u_mode.ocb.L[1]),
+		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+		    : [l2] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    :
+		    : [l3] "m" (*l),
+		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : "memory" );
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm1,    %[outbuf0]\n\t"
+		    "movdqu %[outbuf1],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm2\n\t"
+		    "movdqu %%xmm2,    %[outbuf1]\n\t"
+		    "movdqu %[outbuf2],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm3,    %[outbuf2]\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm4,    %[outbuf3]\n\t"
+		    "pxor   %%xmm1,    %%xmm6\n\t"
+		    "pxor   %%xmm2,    %%xmm6\n\t"
+		    "pxor   %%xmm3,    %%xmm6\n\t"
+		    "pxor   %%xmm4,    %%xmm6\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+		      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+		      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+		    :
+		    : "memory" );
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
     }
 
   for ( ;nblocks; nblocks-- )
     {
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
@@ -1657,7 +1670,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                     "pxor   %%xmm1,   %%xmm5\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l[0]),
+                    : [l] "m" (*l),
                       [inbuf] "m" (*inbuf)
                     : "memory" );
 
@@ -1708,7 +1721,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   u64 n = c->u_mode.ocb.aad_nblocks;
-  const unsigned char *l[4] = {};
+  const unsigned char *l;
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1722,90 +1735,91 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                   [ctr] "m" (*c->u_mode.ocb.aad_sum)
                 : "memory" );
 
-  if (nblocks > 3)
+  for ( ;nblocks && n % 4; nblocks-- )
+    {
+      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                c->u_mode.ocb.aad_sum);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[abuf],  %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm5\n\t"
+                    "pxor   %%xmm5,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [abuf] "m" (*abuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm0,   %%xmm6\n\t"
+                    :
+                    :
+                    : "memory" );
+
+      abuf += BLOCKSIZE;
+    }
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
     {
-      if (n % 4 == 0)
-	{
-	  l[0] = c->u_mode.ocb.L[0];
-	  l[1] = c->u_mode.ocb.L[1];
-	  l[2] = c->u_mode.ocb.L[0];
-	}
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (n % 4 == 0)
-	    {
-	      n += 4;
-	      l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	    }
-	  else
-	    {
-	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      n += 4;
-	    }
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-			"movdqu %[abuf0],  %%xmm1\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			:
-			: [l0] "m" (*l[0]),
-			  [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-			"movdqu %[abuf1],  %%xmm2\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			:
-			: [l1] "m" (*l[1]),
-			  [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-			"movdqu %[abuf2],  %%xmm3\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			:
-			: [l2] "m" (*l[2]),
-			  [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[abuf3],  %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l[3]),
-			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  do_aesni_enc_vec4 (ctx);
-
-	  asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
-			"pxor   %%xmm2,   %%xmm6\n\t"
-			"pxor   %%xmm3,   %%xmm6\n\t"
-			"pxor   %%xmm4,   %%xmm6\n\t"
-			:
-			:
-			: "memory" );
-
-	  abuf += 4*BLOCKSIZE;
-	}
+      /* l_tmp will be used only every 65536-th block. */
+      n += 4;
+      l = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
+		c->u_mode.ocb.aad_sum);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+		    "movdqu %[abuf0],  %%xmm1\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+		    "movdqu %[abuf1],  %%xmm2\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    :
+		    : [l1] "m" (*c->u_mode.ocb.L[1]),
+		      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+		    "movdqu %[abuf2],  %%xmm3\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    :
+		    : [l2] "m" (*c->u_mode.ocb.L[0]),
+		      [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+		    "movdqu %[abuf3],  %%xmm4\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    :
+		    : [l3] "m" (*l),
+		      [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+		    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
+		    "pxor   %%xmm2,   %%xmm6\n\t"
+		    "pxor   %%xmm3,   %%xmm6\n\t"
+		    "pxor   %%xmm4,   %%xmm6\n\t"
+		    :
+		    :
+		    : "memory" );
+
+      abuf += 4*BLOCKSIZE;
     }
 
   for ( ;nblocks; nblocks-- )
     {
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                   c->u_mode.ocb.aad_sum);
+      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                c->u_mode.ocb.aad_sum);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
@@ -1814,7 +1828,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                     "pxor   %%xmm1,   %%xmm5\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l[0]),
+                    : [l] "m" (*l),
                       [abuf] "m" (*abuf)
                     : "memory" );
 
diff --git a/cipher/serpent.c b/cipher/serpent.c
index a47a1b7..fc3afa6 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1250,56 +1250,45 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_avx2 = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
+
+	      if (encrypt)
+		_gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 16;
+	      outbuf += 16 * sizeof(serpent_block_t);
+	      inbuf  += 16 * sizeof(serpent_block_t);
+	      did_use_avx2 = 1;
 	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
-
-	  if (encrypt)
-	    _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-
-	  nblocks -= 16;
-	  outbuf += 16 * sizeof(serpent_block_t);
-	  inbuf  += 16 * sizeof(serpent_block_t);
-	  did_use_avx2 = 1;
 	}
 
       if (did_use_avx2)
@@ -1317,51 +1306,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   {
     int did_use_sse2 = 0;
     const void *Ls[8];
-    int i;
+    unsigned int n = 8 - (blkn % 8);
+    const void **l;
 
-    if (blkn % 8 == 0)
+    if (nblocks >= 8)
       {
-	Ls[0] = c->u_mode.ocb.L[0];
-	Ls[1] = c->u_mode.ocb.L[1];
-	Ls[2] = c->u_mode.ocb.L[0];
-	Ls[3] = c->u_mode.ocb.L[2];
-	Ls[4] = c->u_mode.ocb.L[0];
-	Ls[5] = c->u_mode.ocb.L[1];
-	Ls[6] = c->u_mode.ocb.L[0];
-      }
-
-    /* Process data in 8 block chunks. */
-    while (nblocks >= 8)
-      {
-	/* l_tmp will be used only every 65536-th block. */
-	if (blkn % 8 == 0)
+	Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	l = &Ls[(7 + n) % 8];
+
+	/* Process data in 8 block chunks. */
+	while (nblocks >= 8)
 	  {
+	    /* l_tmp will be used only every 65536-th block. */
 	    blkn += 8;
-	    Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	  }
-	else
-	  {
-	    for (i = 0; i < 8; i += 4)
-	      {
-		Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		blkn += 4;
-	      }
+	    *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
+
+	    if (encrypt)
+	      _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	    else
+	      _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	    nblocks -= 8;
+	    outbuf += 8 * sizeof(serpent_block_t);
+	    inbuf  += 8 * sizeof(serpent_block_t);
+	    did_use_sse2 = 1;
 	  }
-
-	if (encrypt)
-	  _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-	else
-	  _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-
-	nblocks -= 8;
-	outbuf += 8 * sizeof(serpent_block_t);
-	inbuf  += 8 * sizeof(serpent_block_t);
-	did_use_sse2 = 1;
       }
 
     if (did_use_sse2)
@@ -1380,51 +1357,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_neon = 0;
       const void *Ls[8];
-      int i;
+      unsigned int n = 8 - (blkn % 8);
+      const void **l;
 
-      if (blkn % 8 == 0)
+      if (nblocks >= 8)
 	{
-	  Ls[0] = c->u_mode.ocb.L[0];
-	  Ls[1] = c->u_mode.ocb.L[1];
-	  Ls[2] = c->u_mode.ocb.L[0];
-	  Ls[3] = c->u_mode.ocb.L[2];
-	  Ls[4] = c->u_mode.ocb.L[0];
-	  Ls[5] = c->u_mode.ocb.L[1];
-	  Ls[6] = c->u_mode.ocb.L[0];
-	}
-
-      /* Process data in 8 block chunks. */
-      while (nblocks >= 8)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 8 == 0)
+	  Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	  Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	  l = &Ls[(7 + n) % 8];
+
+	  /* Process data in 8 block chunks. */
+	  while (nblocks >= 8)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 8;
-	      Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 8; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
+
+	      if (encrypt)
+		_gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 8;
+	      outbuf += 8 * sizeof(serpent_block_t);
+	      inbuf  += 8 * sizeof(serpent_block_t);
+	      did_use_neon = 1;
 	    }
-
-	  if (encrypt)
-	    _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-				       c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-				       c->u_ctr.ctr, Ls);
-
-	  nblocks -= 8;
-	  outbuf += 8 * sizeof(serpent_block_t);
-	  inbuf  += 8 * sizeof(serpent_block_t);
-	  did_use_neon = 1;
 	}
 
       if (did_use_neon)
@@ -1471,51 +1436,40 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_avx2 = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
 
-	  _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-				      c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					  c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 16;
-	  abuf += 16 * sizeof(serpent_block_t);
-	  did_use_avx2 = 1;
+	      nblocks -= 16;
+	      abuf += 16 * sizeof(serpent_block_t);
+	      did_use_avx2 = 1;
+	    }
 	}
 
       if (did_use_avx2)
@@ -1533,46 +1487,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   {
     int did_use_sse2 = 0;
     const void *Ls[8];
-    int i;
+    unsigned int n = 8 - (blkn % 8);
+    const void **l;
 
-    if (blkn % 8 == 0)
+    if (nblocks >= 8)
       {
-	Ls[0] = c->u_mode.ocb.L[0];
-	Ls[1] = c->u_mode.ocb.L[1];
-	Ls[2] = c->u_mode.ocb.L[0];
-	Ls[3] = c->u_mode.ocb.L[2];
-	Ls[4] = c->u_mode.ocb.L[0];
-	Ls[5] = c->u_mode.ocb.L[1];
-	Ls[6] = c->u_mode.ocb.L[0];
-      }
-
-    /* Process data in 8 block chunks. */
-    while (nblocks >= 8)
-      {
-	/* l_tmp will be used only every 65536-th block. */
-	if (blkn % 8 == 0)
+	Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	l = &Ls[(7 + n) % 8];
+
+	/* Process data in 8 block chunks. */
+	while (nblocks >= 8)
 	  {
+	    /* l_tmp will be used only every 65536-th block. */
 	    blkn += 8;
-	    Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	  }
-	else
-	  {
-	    for (i = 0; i < 8; i += 4)
-	      {
-		Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		blkn += 4;
-	      }
-	  }
+	    *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
 
-	_gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-				    c->u_mode.ocb.aad_sum, Ls);
+	    _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					c->u_mode.ocb.aad_sum, Ls);
 
-	nblocks -= 8;
-	abuf += 8 * sizeof(serpent_block_t);
-	did_use_sse2 = 1;
+	    nblocks -= 8;
+	    abuf += 8 * sizeof(serpent_block_t);
+	    did_use_sse2 = 1;
+	  }
       }
 
     if (did_use_sse2)
@@ -1591,46 +1533,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_neon = 0;
       const void *Ls[8];
-      int i;
-
-      if (blkn % 8 == 0)
-	{
-	  Ls[0] = c->u_mode.ocb.L[0];
-	  Ls[1] = c->u_mode.ocb.L[1];
-	  Ls[2] = c->u_mode.ocb.L[0];
-	  Ls[3] = c->u_mode.ocb.L[2];
-	  Ls[4] = c->u_mode.ocb.L[0];
-	  Ls[5] = c->u_mode.ocb.L[1];
-	  Ls[6] = c->u_mode.ocb.L[0];
-	}
+      unsigned int n = 8 - (blkn % 8);
+      const void **l;
 
-      /* Process data in 8 block chunks. */
-      while (nblocks >= 8)
+      if (nblocks >= 8)
 	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 8 == 0)
+	  Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	  Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	  l = &Ls[(7 + n) % 8];
+
+	  /* Process data in 8 block chunks. */
+	  while (nblocks >= 8)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 8;
-	      Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 8; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
 
-	  _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-				      c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					  c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 8;
-	  abuf += 8 * sizeof(serpent_block_t);
-	  did_use_neon = 1;
+	      nblocks -= 8;
+	      abuf += 8 * sizeof(serpent_block_t);
+	      did_use_neon = 1;
+	    }
 	}
 
       if (did_use_neon)
diff --git a/tests/basic.c b/tests/basic.c
index c1aa76a..4ea91a9 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3153,7 +3153,8 @@ do_check_ocb_cipher (int inplace)
 
 
 static void
-check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
+check_ocb_cipher_largebuf_split (int algo, int keylen, const char *tagexpect,
+				 unsigned int splitpos)
 {
   static const unsigned char key[32] =
         "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
@@ -3219,7 +3220,14 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
       goto out_free;
     }
 
-  err = gcry_cipher_authenticate (hde, inbuf, buflen);
+  if (splitpos)
+    {
+      err = gcry_cipher_authenticate (hde, inbuf, splitpos);
+    }
+  if (!err)
+    {
+      err = gcry_cipher_authenticate (hde, inbuf + splitpos, buflen - splitpos);
+    }
   if (err)
     {
       fail ("cipher-ocb, gcry_cipher_authenticate failed (large, algo %d): %s\n",
@@ -3229,10 +3237,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
       goto out_free;
     }
 
-  err = gcry_cipher_final (hde);
+  if (splitpos)
+    {
+      err = gcry_cipher_encrypt (hde, outbuf, splitpos, inbuf, splitpos);
+    }
   if (!err)
     {
-      err = gcry_cipher_encrypt (hde, outbuf, buflen, inbuf, buflen);
+      err = gcry_cipher_final (hde);
+      if (!err)
+	{
+	  err = gcry_cipher_encrypt (hde, outbuf + splitpos, buflen - splitpos,
+				    inbuf + splitpos, buflen - splitpos);
+	}
     }
   if (err)
     {
@@ -3267,10 +3283,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
     }
 
   /* Now for the decryption.  */
-  err = gcry_cipher_final (hdd);
+  if (splitpos)
+    {
+      err = gcry_cipher_decrypt (hdd, outbuf, splitpos, NULL, 0);
+    }
   if (!err)
     {
-      err = gcry_cipher_decrypt (hdd, outbuf, buflen, NULL, 0);
+      err = gcry_cipher_final (hdd);
+      if (!err)
+	{
+	  err = gcry_cipher_decrypt (hdd, outbuf + splitpos, buflen - splitpos,
+				     NULL, 0);
+	}
     }
   if (err)
     {
@@ -3319,6 +3343,18 @@ out_free:
 
 
 static void
+check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
+{
+  unsigned int split;
+
+  for (split = 0; split < 32 * 16; split = split * 2 + 16)
+    {
+      check_ocb_cipher_largebuf_split(algo, keylen, tagexpect, split);
+    }
+}
+
+
+static void
 check_ocb_cipher (void)
 {
   /* Check OCB cipher with separate destination and source buffers for
