[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-253-g24ebf53

by Jussi Kivilinna <cvs at cvs.gnupg.org>
Wed Aug 12 17:19:26 CEST 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  24ebf53f1e8a8afa27dcd768339bda70a740bb03 (commit)
       via  e11895da1f4af9782d89e92ba2e6b1a63235b54b (commit)
      from  80321eb3a63a20f86734d6eebb3f419c0ec895aa (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 24ebf53f1e8a8afa27dcd768339bda70a740bb03
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Tue Aug 11 07:22:16 2015 +0300

    Simplify OCB offset calculation for parallel implementations
    
    * cipher/camellia-glue.c (_gcry_camellia_ocb_crypt)
    (_gcry_camellia_ocb_auth): Precalculate Ls array always, instead of
    just if 'blkn % <parallel blocks> == 0'.
    * cipher/serpent.c (_gcry_serpent_ocb_crypt)
    (_gcry_serpent_ocb_auth): Ditto.
    * cipher/rijndael-aesni.c (get_l): Remove low-bit checks.
    (aes_ocb_enc, aes_ocb_dec, _gcry_aes_aesni_ocb_auth): Handle leading
    blocks until block counter is multiple of 4, so that parallel block
    processing loop can use 'c->u_mode.ocb.L' array directly.
    * tests/basic.c (check_ocb_cipher_largebuf): Rename to...
    (check_ocb_cipher_largebuf_split): ...this and add option to process
    large buffer as two split buffers.
    (check_ocb_cipher_largebuf): New.
    --
    
    Patch simplifies the source and reduces object size.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
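
For reference, the rijndael-aesni change relies on a simple number-theoretic
fact: whenever the block counter n is a multiple of 4, the trailing-zero
counts of n+1, n+2 and n+3 are always 0, 1 and 0, so within an aligned
4-block chunk only the fourth offset, L_{ntz(n+4)}, varies from chunk to
chunk.  The camellia and serpent variants exploit the same fact for their
8/16/32-block chunks, but instead of consuming leading blocks they rotate
the precomputed Ls table by 'blkn mod chunk-size' and refresh the single
variable slot each iteration.  The standalone sketch below (illustrative
only, not part of the patch) prints the pattern:

    #include <stdio.h>
    #include <stdint.h>

    /* Count trailing zero bits; i must be nonzero.  */
    static unsigned int ntz64 (uint64_t i)
    {
      unsigned int n = 0;
      while ((i & 1) == 0)
        {
          i >>= 1;
          n++;
        }
      return n;
    }

    int main (void)
    {
      uint64_t n;

      /* Blocks n+1..n+4 of any 4-aligned chunk use L0, L1, L0 and one
         variable offset; only ntz(n+4) changes between chunks.  */
      for (n = 0; n < 32; n += 4)
        printf ("blocks %2u..%2u -> L%u L%u L%u L%u\n",
                (unsigned) (n + 1), (unsigned) (n + 4),
                ntz64 (n + 1), ntz64 (n + 2),
                ntz64 (n + 3), ntz64 (n + 4));
      return 0;
    }

The output shows the fixed prefix L0 L1 L0 followed by L2, L3, L2, L4, ...,
which matches the constant Ls entries (L[0], L[1], L[0], L[2], ..., L[3],
L[4]) precalculated by the parallel implementations above.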

diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 2d5dd20..dee0169 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -631,58 +631,47 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_aesni_avx2 = 0;
       const void *Ls[32];
+      unsigned int n = 32 - (blkn % 32);
+      const void **l;
       int i;
 
-      if (blkn % 32 == 0)
+      if (nblocks >= 32)
 	{
 	  for (i = 0; i < 32; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	  Ls[15] = c->u_mode.ocb.L[4];
-	  Ls[23] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 32] = c->u_mode.ocb.L[3];
+	  Ls[(15 + n) % 32] = c->u_mode.ocb.L[4];
+	  Ls[(23 + n) % 32] = c->u_mode.ocb.L[3];
+	  l = &Ls[(31 + n) % 32];
 
-      /* Process data in 32 block chunks. */
-      while (nblocks >= 32)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 32 == 0)
+	  /* Process data in 32 block chunks. */
+	  while (nblocks >= 32)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 32;
-	      Ls[31] = ocb_get_l(c, l_tmp, blkn);
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 32);
+
+	      if (encrypt)
+		_gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+						  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+						  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 32;
+	      outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+	      inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx2 = 1;
 	    }
-	  else
-	    {
-	      for (i = 0; i < 32; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
-
-	  if (encrypt)
-	    _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-					      c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-					      c->u_ctr.ctr, Ls);
-
-	  nblocks -= 32;
-	  outbuf += 32 * CAMELLIA_BLOCK_SIZE;
-	  inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx2 = 1;
 	}
 
       if (did_use_aesni_avx2)
@@ -703,56 +692,45 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_aesni_avx = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
+
+	      if (encrypt)
+		_gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+						c->u_ctr.ctr, Ls);
+	      else
+		_gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+						c->u_ctr.ctr, Ls);
+
+	      nblocks -= 16;
+	      outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+	      inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx = 1;
 	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
-
-	  if (encrypt)
-	    _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-					    c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-					    c->u_ctr.ctr, Ls);
-
-	  nblocks -= 16;
-	  outbuf += 16 * CAMELLIA_BLOCK_SIZE;
-	  inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx = 1;
 	}
 
       if (did_use_aesni_avx)
@@ -803,53 +781,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_aesni_avx2 = 0;
       const void *Ls[32];
+      unsigned int n = 32 - (blkn % 32);
+      const void **l;
       int i;
 
-      if (blkn % 32 == 0)
+      if (nblocks >= 32)
 	{
 	  for (i = 0; i < 32; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	  Ls[15] = c->u_mode.ocb.L[4];
-	  Ls[23] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 32] = c->u_mode.ocb.L[3];
+	  Ls[(15 + n) % 32] = c->u_mode.ocb.L[4];
+	  Ls[(23 + n) % 32] = c->u_mode.ocb.L[3];
+	  l = &Ls[(31 + n) % 32];
 
-      /* Process data in 32 block chunks. */
-      while (nblocks >= 32)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 32 == 0)
+	  /* Process data in 32 block chunks. */
+	  while (nblocks >= 32)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 32;
-	      Ls[31] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 32; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 32);
 
-	  _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-					    c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
+						 c->u_mode.ocb.aad_offset,
+						 c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 32;
-	  abuf += 32 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx2 = 1;
+	      nblocks -= 32;
+	      abuf += 32 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx2 = 1;
+	    }
 	}
 
       if (did_use_aesni_avx2)
@@ -870,51 +838,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_aesni_avx = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
 
-	  _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-					    c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf,
+						c->u_mode.ocb.aad_offset,
+						c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 16;
-	  abuf += 16 * CAMELLIA_BLOCK_SIZE;
-	  did_use_aesni_avx = 1;
+	      nblocks -= 16;
+	      abuf += 16 * CAMELLIA_BLOCK_SIZE;
+	      did_use_aesni_avx = 1;
+	    }
 	}
 
       if (did_use_aesni_avx)
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 6678785..5c85903 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1338,11 +1338,7 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
   const unsigned char *l;
   unsigned int ntz;
 
-  if (i & 1)
-    return c->u_mode.ocb.L[0];
-  else if (i & 2)
-    return c->u_mode.ocb.L[1];
-  else if (i & 0xffffffffU)
+  if (i & 0xffffffffU)
     {
       asm ("rep;bsf %k[low], %k[ntz]\n\t"
            : [ntz] "=r" (ntz)
@@ -1407,7 +1403,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
-  const unsigned char *l[4] = {};
+  const unsigned char *l;
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1421,103 +1417,112 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                   [ctr] "m" (*c->u_ctr.ctr)
                 : "memory" );
 
-  if (nblocks > 3)
+
+  for ( ;nblocks && n % 4; nblocks-- )
+    {
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm5\n\t"
+                    "pxor   %%xmm0,   %%xmm6\n\t"
+                    "pxor   %%xmm5,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
     {
-      if (n % 4 == 0)
-	{
-	  l[0] = c->u_mode.ocb.L[0];
-	  l[1] = c->u_mode.ocb.L[1];
-	  l[2] = c->u_mode.ocb.L[0];
-	}
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (n % 4 == 0)
-	    {
-	      n += 4;
-	      l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
-	    }
-	  else
-	    {
-	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
-	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
-	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
-	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
-	      n += 4;
-	    }
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqu %%xmm5,    %[outbuf0]\n\t"
-			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
-			: [l0] "m" (*l[0]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-			"movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm2,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqu %%xmm5,    %[outbuf1]\n\t"
-			: [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-			: [l1] "m" (*l[1]),
-			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-			"movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqu %%xmm5,    %[outbuf2]\n\t"
-			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-			: [l2] "m" (*l[2]),
-			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm4,    %%xmm6\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l[3]),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  do_aesni_enc_vec4 (ctx);
-
-	  asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm1\n\t"
-			"movdqu %%xmm1,    %[outbuf0]\n\t"
-			"movdqu %[outbuf1],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm2\n\t"
-			"movdqu %%xmm2,    %[outbuf1]\n\t"
-			"movdqu %[outbuf2],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm3\n\t"
-			"movdqu %%xmm3,    %[outbuf2]\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			"movdqu %%xmm4,    %[outbuf3]\n\t"
-			: [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
-			  [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
-			  [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
-			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
-			:
-			: "memory" );
-
-	  outbuf += 4*BLOCKSIZE;
-	  inbuf  += 4*BLOCKSIZE;
-	}
+      /* l_tmp will be used only every 65536-th block. */
+      n += 4;
+      l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm1,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+		    "movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm2,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [l1] "m" (*c->u_mode.ocb.L[1]),
+		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm3,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+		    : [l2] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm4,    %%xmm6\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    :
+		    : [l3] "m" (*l),
+		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm1,    %[outbuf0]\n\t"
+		    "movdqu %[outbuf1],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm2\n\t"
+		    "movdqu %%xmm2,    %[outbuf1]\n\t"
+		    "movdqu %[outbuf2],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm3,    %[outbuf2]\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm4,    %[outbuf3]\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+		      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+		      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+		    :
+		    : "memory" );
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
     }
 
   for ( ;nblocks; nblocks-- )
     {
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
@@ -1528,7 +1533,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                     "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l[0]),
+                    : [l] "m" (*l),
                       [inbuf] "m" (*inbuf)
                     : "memory" );
 
@@ -1568,7 +1573,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
-  const unsigned char *l[4] = {};
+  const unsigned char *l;
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1582,103 +1587,111 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                   [ctr] "m" (*c->u_ctr.ctr)
                 : "memory" );
 
-  if (nblocks > 3)
+  for ( ;nblocks && n % 4; nblocks-- )
+    {
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm5\n\t"
+                    "pxor   %%xmm5,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_aesni_dec (ctx);
+
+      asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
+                    "pxor   %%xmm0, %%xmm6\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
     {
-      if (n % 4 == 0)
-	{
-	  l[0] = c->u_mode.ocb.L[0];
-	  l[1] = c->u_mode.ocb.L[1];
-	  l[2] = c->u_mode.ocb.L[0];
-	}
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (n % 4 == 0)
-	    {
-	      n += 4;
-	      l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
-	    }
-	  else
-	    {
-	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
-	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
-	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
-	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
-	      n += 4;
-	    }
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-			"movdqu %[inbuf0], %%xmm1\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			"movdqu %%xmm5,    %[outbuf0]\n\t"
-			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
-			: [l0] "m" (*l[0]),
-			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-			"movdqu %[inbuf1], %%xmm2\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			"movdqu %%xmm5,    %[outbuf1]\n\t"
-			: [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-			: [l1] "m" (*l[1]),
-			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-			"movdqu %[inbuf2], %%xmm3\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			"movdqu %%xmm5,    %[outbuf2]\n\t"
-			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-			: [l2] "m" (*l[2]),
-			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[inbuf3], %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l[3]),
-			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  do_aesni_dec_vec4 (ctx);
-
-	  asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm1\n\t"
-			"movdqu %%xmm1,    %[outbuf0]\n\t"
-			"movdqu %[outbuf1],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm2\n\t"
-			"movdqu %%xmm2,    %[outbuf1]\n\t"
-			"movdqu %[outbuf2],%%xmm0\n\t"
-			"pxor   %%xmm0,    %%xmm3\n\t"
-			"movdqu %%xmm3,    %[outbuf2]\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			"movdqu %%xmm4,    %[outbuf3]\n\t"
-			"pxor   %%xmm1,    %%xmm6\n\t"
-			"pxor   %%xmm2,    %%xmm6\n\t"
-			"pxor   %%xmm3,    %%xmm6\n\t"
-			"pxor   %%xmm4,    %%xmm6\n\t"
-			: [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
-			  [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
-			  [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
-			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
-			:
-			: "memory" );
-
-	  outbuf += 4*BLOCKSIZE;
-	  inbuf  += 4*BLOCKSIZE;
-	}
+      /* l_tmp will be used only every 65536-th block. */
+      n += 4;
+      l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+		    "movdqu %[inbuf0], %%xmm1\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    "movdqu %%xmm5,    %[outbuf0]\n\t"
+		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+		    "movdqu %[inbuf1], %%xmm2\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    "movdqu %%xmm5,    %[outbuf1]\n\t"
+		    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+		    : [l1] "m" (*c->u_mode.ocb.L[1]),
+		      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+		    "movdqu %[inbuf2], %%xmm3\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    "movdqu %%xmm5,    %[outbuf2]\n\t"
+		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+		    : [l2] "m" (*c->u_mode.ocb.L[0]),
+		      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+		    "movdqu %[inbuf3], %%xmm4\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    :
+		    : [l3] "m" (*l),
+		      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+		    : "memory" );
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm1\n\t"
+		    "movdqu %%xmm1,    %[outbuf0]\n\t"
+		    "movdqu %[outbuf1],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm2\n\t"
+		    "movdqu %%xmm2,    %[outbuf1]\n\t"
+		    "movdqu %[outbuf2],%%xmm0\n\t"
+		    "pxor   %%xmm0,    %%xmm3\n\t"
+		    "movdqu %%xmm3,    %[outbuf2]\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    "movdqu %%xmm4,    %[outbuf3]\n\t"
+		    "pxor   %%xmm1,    %%xmm6\n\t"
+		    "pxor   %%xmm2,    %%xmm6\n\t"
+		    "pxor   %%xmm3,    %%xmm6\n\t"
+		    "pxor   %%xmm4,    %%xmm6\n\t"
+		    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+		      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+		      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+		    :
+		    : "memory" );
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf  += 4*BLOCKSIZE;
     }
 
   for ( ;nblocks; nblocks-- )
     {
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
@@ -1688,7 +1701,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                     "pxor   %%xmm1,   %%xmm5\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l[0]),
+                    : [l] "m" (*l),
                       [inbuf] "m" (*inbuf)
                     : "memory" );
 
@@ -1739,7 +1752,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   u64 n = c->u_mode.ocb.aad_nblocks;
-  const unsigned char *l[4] = {};
+  const unsigned char *l;
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1753,90 +1766,91 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                   [ctr] "m" (*c->u_mode.ocb.aad_sum)
                 : "memory" );
 
-  if (nblocks > 3)
+  for ( ;nblocks && n % 4; nblocks-- )
+    {
+      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                c->u_mode.ocb.aad_sum);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[abuf],  %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm5\n\t"
+                    "pxor   %%xmm5,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [abuf] "m" (*abuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor   %%xmm0,   %%xmm6\n\t"
+                    :
+                    :
+                    : "memory" );
+
+      abuf += BLOCKSIZE;
+    }
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
     {
-      if (n % 4 == 0)
-	{
-	  l[0] = c->u_mode.ocb.L[0];
-	  l[1] = c->u_mode.ocb.L[1];
-	  l[2] = c->u_mode.ocb.L[0];
-	}
-
-      for ( ;nblocks > 3 ; nblocks -= 4 )
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (n % 4 == 0)
-	    {
-	      n += 4;
-	      l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	    }
-	  else
-	    {
-	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset,
-			  c->u_mode.ocb.aad_sum);
-	      n += 4;
-	    }
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-			"movdqu %[abuf0],  %%xmm1\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm1\n\t"
-			:
-			: [l0] "m" (*l[0]),
-			  [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-			"movdqu %[abuf1],  %%xmm2\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm2\n\t"
-			:
-			: [l1] "m" (*l[1]),
-			  [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-			"movdqu %[abuf2],  %%xmm3\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm3\n\t"
-			:
-			: [l2] "m" (*l[2]),
-			  [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
-			: "memory" );
-	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-			"movdqu %[abuf3],  %%xmm4\n\t"
-			"pxor   %%xmm0,    %%xmm5\n\t"
-			"pxor   %%xmm5,    %%xmm4\n\t"
-			:
-			: [l3] "m" (*l[3]),
-			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
-			: "memory" );
-
-	  do_aesni_enc_vec4 (ctx);
-
-	  asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
-			"pxor   %%xmm2,   %%xmm6\n\t"
-			"pxor   %%xmm3,   %%xmm6\n\t"
-			"pxor   %%xmm4,   %%xmm6\n\t"
-			:
-			:
-			: "memory" );
-
-	  abuf += 4*BLOCKSIZE;
-	}
+      /* l_tmp will be used only every 65536-th block. */
+      n += 4;
+      l = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
+		c->u_mode.ocb.aad_sum);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+		    "movdqu %[abuf0],  %%xmm1\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm1\n\t"
+		    :
+		    : [l0] "m" (*c->u_mode.ocb.L[0]),
+		      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+		    "movdqu %[abuf1],  %%xmm2\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm2\n\t"
+		    :
+		    : [l1] "m" (*c->u_mode.ocb.L[1]),
+		      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+		    "movdqu %[abuf2],  %%xmm3\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm3\n\t"
+		    :
+		    : [l2] "m" (*c->u_mode.ocb.L[0]),
+		      [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+		    : "memory" );
+      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+		    "movdqu %[abuf3],  %%xmm4\n\t"
+		    "pxor   %%xmm0,    %%xmm5\n\t"
+		    "pxor   %%xmm5,    %%xmm4\n\t"
+		    :
+		    : [l3] "m" (*l),
+		      [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+		    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
+		    "pxor   %%xmm2,   %%xmm6\n\t"
+		    "pxor   %%xmm3,   %%xmm6\n\t"
+		    "pxor   %%xmm4,   %%xmm6\n\t"
+		    :
+		    :
+		    : "memory" );
+
+      abuf += 4*BLOCKSIZE;
     }
 
   for ( ;nblocks; nblocks-- )
     {
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                   c->u_mode.ocb.aad_sum);
+      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                c->u_mode.ocb.aad_sum);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
@@ -1845,7 +1859,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                     "pxor   %%xmm1,   %%xmm5\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l[0]),
+                    : [l] "m" (*l),
                       [abuf] "m" (*abuf)
                     : "memory" );
 
diff --git a/cipher/serpent.c b/cipher/serpent.c
index a47a1b7..fc3afa6 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1250,56 +1250,45 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_avx2 = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
+
+	      if (encrypt)
+		_gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 16;
+	      outbuf += 16 * sizeof(serpent_block_t);
+	      inbuf  += 16 * sizeof(serpent_block_t);
+	      did_use_avx2 = 1;
 	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
-
-	  if (encrypt)
-	    _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-
-	  nblocks -= 16;
-	  outbuf += 16 * sizeof(serpent_block_t);
-	  inbuf  += 16 * sizeof(serpent_block_t);
-	  did_use_avx2 = 1;
 	}
 
       if (did_use_avx2)
@@ -1317,51 +1306,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   {
     int did_use_sse2 = 0;
     const void *Ls[8];
-    int i;
+    unsigned int n = 8 - (blkn % 8);
+    const void **l;
 
-    if (blkn % 8 == 0)
+    if (nblocks >= 8)
       {
-	Ls[0] = c->u_mode.ocb.L[0];
-	Ls[1] = c->u_mode.ocb.L[1];
-	Ls[2] = c->u_mode.ocb.L[0];
-	Ls[3] = c->u_mode.ocb.L[2];
-	Ls[4] = c->u_mode.ocb.L[0];
-	Ls[5] = c->u_mode.ocb.L[1];
-	Ls[6] = c->u_mode.ocb.L[0];
-      }
-
-    /* Process data in 8 block chunks. */
-    while (nblocks >= 8)
-      {
-	/* l_tmp will be used only every 65536-th block. */
-	if (blkn % 8 == 0)
+	Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	l = &Ls[(7 + n) % 8];
+
+	/* Process data in 8 block chunks. */
+	while (nblocks >= 8)
 	  {
+	    /* l_tmp will be used only every 65536-th block. */
 	    blkn += 8;
-	    Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	  }
-	else
-	  {
-	    for (i = 0; i < 8; i += 4)
-	      {
-		Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		blkn += 4;
-	      }
+	    *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
+
+	    if (encrypt)
+	      _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	    else
+	      _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	    nblocks -= 8;
+	    outbuf += 8 * sizeof(serpent_block_t);
+	    inbuf  += 8 * sizeof(serpent_block_t);
+	    did_use_sse2 = 1;
 	  }
-
-	if (encrypt)
-	  _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-	else
-	  _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-				      c->u_ctr.ctr, Ls);
-
-	nblocks -= 8;
-	outbuf += 8 * sizeof(serpent_block_t);
-	inbuf  += 8 * sizeof(serpent_block_t);
-	did_use_sse2 = 1;
       }
 
     if (did_use_sse2)
@@ -1380,51 +1357,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     {
       int did_use_neon = 0;
       const void *Ls[8];
-      int i;
+      unsigned int n = 8 - (blkn % 8);
+      const void **l;
 
-      if (blkn % 8 == 0)
+      if (nblocks >= 8)
 	{
-	  Ls[0] = c->u_mode.ocb.L[0];
-	  Ls[1] = c->u_mode.ocb.L[1];
-	  Ls[2] = c->u_mode.ocb.L[0];
-	  Ls[3] = c->u_mode.ocb.L[2];
-	  Ls[4] = c->u_mode.ocb.L[0];
-	  Ls[5] = c->u_mode.ocb.L[1];
-	  Ls[6] = c->u_mode.ocb.L[0];
-	}
-
-      /* Process data in 8 block chunks. */
-      while (nblocks >= 8)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 8 == 0)
+	  Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	  Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	  l = &Ls[(7 + n) % 8];
+
+	  /* Process data in 8 block chunks. */
+	  while (nblocks >= 8)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 8;
-	      Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 8; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
+
+	      if (encrypt)
+		_gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 8;
+	      outbuf += 8 * sizeof(serpent_block_t);
+	      inbuf  += 8 * sizeof(serpent_block_t);
+	      did_use_neon = 1;
 	    }
-
-	  if (encrypt)
-	    _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-				       c->u_ctr.ctr, Ls);
-	  else
-	    _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-				       c->u_ctr.ctr, Ls);
-
-	  nblocks -= 8;
-	  outbuf += 8 * sizeof(serpent_block_t);
-	  inbuf  += 8 * sizeof(serpent_block_t);
-	  did_use_neon = 1;
 	}
 
       if (did_use_neon)
@@ -1471,51 +1436,40 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_avx2 = 0;
       const void *Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      const void **l;
       int i;
 
-      if (blkn % 16 == 0)
+      if (nblocks >= 16)
 	{
 	  for (i = 0; i < 16; i += 8)
 	    {
-	      Ls[i + 0] = c->u_mode.ocb.L[0];
-	      Ls[i + 1] = c->u_mode.ocb.L[1];
-	      Ls[i + 2] = c->u_mode.ocb.L[0];
-	      Ls[i + 3] = c->u_mode.ocb.L[2];
-	      Ls[i + 4] = c->u_mode.ocb.L[0];
-	      Ls[i + 5] = c->u_mode.ocb.L[1];
-	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	      Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
 	    }
 
-	  Ls[7] = c->u_mode.ocb.L[3];
-	}
+	  Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
 
-      /* Process data in 16 block chunks. */
-      while (nblocks >= 16)
-	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 16 == 0)
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 16;
-	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 16; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
 
-	  _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-				      c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					  c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 16;
-	  abuf += 16 * sizeof(serpent_block_t);
-	  did_use_avx2 = 1;
+	      nblocks -= 16;
+	      abuf += 16 * sizeof(serpent_block_t);
+	      did_use_avx2 = 1;
+	    }
 	}
 
       if (did_use_avx2)
@@ -1533,46 +1487,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   {
     int did_use_sse2 = 0;
     const void *Ls[8];
-    int i;
+    unsigned int n = 8 - (blkn % 8);
+    const void **l;
 
-    if (blkn % 8 == 0)
+    if (nblocks >= 8)
       {
-	Ls[0] = c->u_mode.ocb.L[0];
-	Ls[1] = c->u_mode.ocb.L[1];
-	Ls[2] = c->u_mode.ocb.L[0];
-	Ls[3] = c->u_mode.ocb.L[2];
-	Ls[4] = c->u_mode.ocb.L[0];
-	Ls[5] = c->u_mode.ocb.L[1];
-	Ls[6] = c->u_mode.ocb.L[0];
-      }
-
-    /* Process data in 8 block chunks. */
-    while (nblocks >= 8)
-      {
-	/* l_tmp will be used only every 65536-th block. */
-	if (blkn % 8 == 0)
+	Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	l = &Ls[(7 + n) % 8];
+
+	/* Process data in 8 block chunks. */
+	while (nblocks >= 8)
 	  {
+	    /* l_tmp will be used only every 65536-th block. */
 	    blkn += 8;
-	    Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	  }
-	else
-	  {
-	    for (i = 0; i < 8; i += 4)
-	      {
-		Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		blkn += 4;
-	      }
-	  }
+	    *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
 
-	_gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-				    c->u_mode.ocb.aad_sum, Ls);
+	    _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					c->u_mode.ocb.aad_sum, Ls);
 
-	nblocks -= 8;
-	abuf += 8 * sizeof(serpent_block_t);
-	did_use_sse2 = 1;
+	    nblocks -= 8;
+	    abuf += 8 * sizeof(serpent_block_t);
+	    did_use_sse2 = 1;
+	  }
       }
 
     if (did_use_sse2)
@@ -1591,46 +1533,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     {
       int did_use_neon = 0;
       const void *Ls[8];
-      int i;
-
-      if (blkn % 8 == 0)
-	{
-	  Ls[0] = c->u_mode.ocb.L[0];
-	  Ls[1] = c->u_mode.ocb.L[1];
-	  Ls[2] = c->u_mode.ocb.L[0];
-	  Ls[3] = c->u_mode.ocb.L[2];
-	  Ls[4] = c->u_mode.ocb.L[0];
-	  Ls[5] = c->u_mode.ocb.L[1];
-	  Ls[6] = c->u_mode.ocb.L[0];
-	}
+      unsigned int n = 8 - (blkn % 8);
+      const void **l;
 
-      /* Process data in 8 block chunks. */
-      while (nblocks >= 8)
+      if (nblocks >= 8)
 	{
-	  /* l_tmp will be used only every 65536-th block. */
-	  if (blkn % 8 == 0)
+	  Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+	  Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+	  Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+	  Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+	  l = &Ls[(7 + n) % 8];
+
+	  /* Process data in 8 block chunks. */
+	  while (nblocks >= 8)
 	    {
+	      /* l_tmp will be used only every 65536-th block. */
 	      blkn += 8;
-	      Ls[7] = ocb_get_l(c, l_tmp, blkn);
-	    }
-	  else
-	    {
-	      for (i = 0; i < 8; i += 4)
-		{
-		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
-		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
-		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
-		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
-		  blkn += 4;
-		}
-	    }
+	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
 
-	  _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
-				      c->u_mode.ocb.aad_sum, Ls);
+	      _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					  c->u_mode.ocb.aad_sum, Ls);
 
-	  nblocks -= 8;
-	  abuf += 8 * sizeof(serpent_block_t);
-	  did_use_neon = 1;
+	      nblocks -= 8;
+	      abuf += 8 * sizeof(serpent_block_t);
+	      did_use_neon = 1;
+	    }
 	}
 
       if (did_use_neon)
diff --git a/tests/basic.c b/tests/basic.c
index c1aa76a..4ea91a9 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3153,7 +3153,8 @@ do_check_ocb_cipher (int inplace)
 
 
 static void
-check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
+check_ocb_cipher_largebuf_split (int algo, int keylen, const char *tagexpect,
+				 unsigned int splitpos)
 {
   static const unsigned char key[32] =
         "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
@@ -3219,7 +3220,14 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
       goto out_free;
     }
 
-  err = gcry_cipher_authenticate (hde, inbuf, buflen);
+  if (splitpos)
+    {
+      err = gcry_cipher_authenticate (hde, inbuf, splitpos);
+    }
+  if (!err)
+    {
+      err = gcry_cipher_authenticate (hde, inbuf + splitpos, buflen - splitpos);
+    }
   if (err)
     {
       fail ("cipher-ocb, gcry_cipher_authenticate failed (large, algo %d): %s\n",
@@ -3229,10 +3237,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
       goto out_free;
     }
 
-  err = gcry_cipher_final (hde);
+  if (splitpos)
+    {
+      err = gcry_cipher_encrypt (hde, outbuf, splitpos, inbuf, splitpos);
+    }
   if (!err)
     {
-      err = gcry_cipher_encrypt (hde, outbuf, buflen, inbuf, buflen);
+      err = gcry_cipher_final (hde);
+      if (!err)
+	{
+	  err = gcry_cipher_encrypt (hde, outbuf + splitpos, buflen - splitpos,
+				    inbuf + splitpos, buflen - splitpos);
+	}
     }
   if (err)
     {
@@ -3267,10 +3283,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
     }
 
   /* Now for the decryption.  */
-  err = gcry_cipher_final (hdd);
+  if (splitpos)
+    {
+      err = gcry_cipher_decrypt (hdd, outbuf, splitpos, NULL, 0);
+    }
   if (!err)
     {
-      err = gcry_cipher_decrypt (hdd, outbuf, buflen, NULL, 0);
+      err = gcry_cipher_final (hdd);
+      if (!err)
+	{
+	  err = gcry_cipher_decrypt (hdd, outbuf + splitpos, buflen - splitpos,
+				     NULL, 0);
+	}
     }
   if (err)
     {
@@ -3319,6 +3343,18 @@ out_free:
 
 
 static void
+check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
+{
+  unsigned int split;
+
+  for (split = 0; split < 32 * 16; split = split * 2 + 16)
+    {
+      check_ocb_cipher_largebuf_split(algo, keylen, tagexpect, split);
+    }
+}
+
+
+static void
 check_ocb_cipher (void)
 {
   /* Check OCB cipher with separate destination and source buffers for

commit e11895da1f4af9782d89e92ba2e6b1a63235b54b
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Mon Aug 10 20:48:02 2015 +0300

    Add carryless 8-bit addition fast-path for AES-NI CTR mode
    
    * cipher/rijndael-aesni.c (do_aesni_ctr_4): Do addition using
    CTR in big-endian form if the least-significant byte does not overflow.
    --
    
    Patch improves AES-NI CTR speed by 20%.
    
    Benchmark on Intel Haswell (3.2 GHz):
    
    Before:
     AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
            CTR enc |     0.273 ns/B    3489.8 MiB/s     0.875 c/B
            CTR dec |     0.273 ns/B    3491.0 MiB/s     0.874 c/B
    
    After:
            CTR enc |     0.228 ns/B    4190.0 MiB/s     0.729 c/B
            CTR dec |     0.228 ns/B    4190.2 MiB/s     0.729 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
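
The fast path hinges on one observation: the CTR block is kept in
big-endian form, and as long as its least-significant byte is at most 0xfb,
adding 1..4 cannot carry into the next byte, so a plain byte-wise add
(paddb) on the big-endian value gives the same result as a full 128-bit
increment with carry propagation.  A minimal standalone check of the
boundary case (helper name ctr_add_be is illustrative, not from the patch):

    #include <stdio.h>
    #include <string.h>

    /* Reference: big-endian 128-bit add with full carry propagation.  */
    static void ctr_add_be (unsigned char ctr[16], unsigned int add)
    {
      int i;
      unsigned int carry = add;

      for (i = 15; i >= 0 && carry; i--)
        {
          carry += ctr[i];
          ctr[i] = carry & 0xff;
          carry >>= 8;
        }
    }

    int main (void)
    {
      unsigned char ref[16] = { 0 }, fast[16] = { 0 };

      ref[15] = fast[15] = 0xfb;  /* largest value still taking the fast path */
      ctr_add_be (ref, 4);        /* full carry-propagating increment */
      fast[15] += 4;              /* byte-wise add; no carry can occur */
      printf ("%s\n", memcmp (ref, fast, 16) == 0 ? "match" : "mismatch");
      return 0;
    }

When the low byte is above 0xfb, the patched code falls back to the
existing 32-bit addition path with carry and byte-swap handling
(.Ladd32bit in the diff below).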

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 882cc79..6678785 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -787,6 +787,13 @@ static void
 do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                 unsigned char *ctr, unsigned char *b, const unsigned char *a)
 {
+  static const byte bige_addb_const[4][16] __attribute__ ((aligned (16))) =
+    {
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
+    };
 #define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
 #define aesenc_xmm1_xmm2      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd1\n\t"
 #define aesenc_xmm1_xmm3      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd9\n\t"
@@ -807,7 +814,25 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
       xmm6  endian swapping mask
    */
 
-  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
+  asm volatile (/* detect if 8-bit carry handling is needed */
+                "cmpb   $0xfb, 15(%[ctr])\n\t"
+                "ja     .Ladd32bit%=\n\t"
+
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
+                "movdqa %[addb_1], %%xmm2\n\t"  /* xmm2 := be(1) */
+                "movdqa %[addb_2], %%xmm3\n\t"  /* xmm3 := be(2) */
+                "movdqa %[addb_3], %%xmm4\n\t"  /* xmm4 := be(3) */
+                "movdqa %[addb_4], %%xmm5\n\t"  /* xmm5 := be(4) */
+                "paddb  %%xmm0, %%xmm2\n\t"     /* xmm2 := be(1) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm3\n\t"     /* xmm3 := be(2) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
+                "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(4) + CTR (xmm0) */
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
+                "movl   %[rounds], %%esi\n\t"
+                "jmp    .Lstore_ctr%=\n\t"
+
+                ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                 "movdqa %%xmm0, %%xmm2\n\t"
                 "pcmpeqd %%xmm1, %%xmm1\n\t"
                 "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
@@ -852,6 +877,8 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                 "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
                 "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
                 "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
+
+                ".Lstore_ctr%=:\n\t"
                 "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
 
                 "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
@@ -956,7 +983,11 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                   [src] "r" (a),
                   [dst] "r" (b),
                   [key] "r" (ctx->keyschenc),
-                  [rounds] "g" (ctx->rounds)
+                  [rounds] "g" (ctx->rounds),
+                  [addb_1] "m" (bige_addb_const[0][0]),
+                  [addb_2] "m" (bige_addb_const[1][0]),
+                  [addb_3] "m" (bige_addb_const[2][0]),
+                  [addb_4] "m" (bige_addb_const[3][0])
                 : "%esi", "cc", "memory");
 #undef aesenc_xmm1_xmm0
 #undef aesenc_xmm1_xmm2

-----------------------------------------------------------------------

Summary of changes:
 cipher/camellia-glue.c  | 254 +++++++++-----------
 cipher/rijndael-aesni.c | 597 ++++++++++++++++++++++++++----------------------
 cipher/serpent.c        | 370 ++++++++++++------------------
 tests/basic.c           |  48 +++-
 4 files changed, 619 insertions(+), 650 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org

