[PATCH] Optimize OCB offset calculation

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Aug 7 19:21:04 CEST 2015


* cipher/cipher-internal.h (ocb_get_l): New.
* cipher/cipher-ocb.c (_gcry_cipher_ocb_authenticate)
(ocb_crypt): Use 'ocb_get_l' instead of '_gcry_cipher_ocb_get_l'.
* cipher/camellia-glue.c (get_l): Remove.
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Precalculate
offset array when the block counter is aligned to the parallel
operation size; use 'ocb_get_l' instead of 'get_l'.
* cipher/rijndael-aesni.c (get_l): Add fast path for the two most
frequent offsets, covering 75% of lookups.
(aesni_ocb_enc, aesni_ocb_dec, _gcry_aes_aesni_ocb_auth): Precalculate
offset array when the block counter is aligned to the parallel
operation size.
* cipher/rijndael-ssse3-amd64.c (get_l): Add fast path for the two
most frequent offsets, covering 75% of lookups.
* cipher/rijndael.c (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth): Use
'ocb_get_l' instead of '_gcry_cipher_ocb_get_l'.
* cipher/serpent.c (get_l): Remove.
(_gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth): Precalculate
offset array when the block counter is aligned to the parallel
operation size; use 'ocb_get_l' instead of 'get_l'.
* cipher/twofish.c (get_l): Remove.
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Use 'ocb_get_l'
instead of 'get_l'.
--

This patch optimizes the OCB offset calculation, both in the generic
code and in the assembly implementations that process blocks in
parallel.
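
The fast path works because ntz(i) over the block index follows the
ruler sequence 0,1,0,2,0,1,0,3,...: every odd index uses L[0] (50% of
blocks) and every index congruent to 2 mod 4 uses L[1] (another 25%),
so 75% of lookups need no ctz computation at all.  A minimal
standalone sketch of the pattern (using the GCC/Clang builtin in
place of libgcrypt's _gcry_ctz64):

  #include <stdio.h>

  int main (void)
  {
    /* For i = 1..16 this prints the ruler sequence
       0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4 of L-table indices.  */
    unsigned long long i;
    for (i = 1; i <= 16; i++)
      printf ("block %2llu -> L[%d]\n", i, __builtin_ctzll (i));
    return 0;
  }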

Benchmark of OCB AES-NI on Intel Haswell:

 $ tests/bench-slope --cpu-mhz 3201 cipher aes

 Before:
  AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
         CTR enc |     0.274 ns/B    3483.9 MiB/s     0.876 c/B
         CTR dec |     0.273 ns/B    3490.0 MiB/s     0.875 c/B
         OCB enc |     0.289 ns/B    3296.1 MiB/s     0.926 c/B
         OCB dec |     0.299 ns/B    3189.9 MiB/s     0.957 c/B
        OCB auth |     0.260 ns/B    3670.0 MiB/s     0.832 c/B

 After:
  AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
         CTR enc |     0.273 ns/B    3489.4 MiB/s     0.875 c/B
         CTR dec |     0.273 ns/B    3487.5 MiB/s     0.875 c/B
         OCB enc |     0.248 ns/B    3852.8 MiB/s     0.792 c/B
         OCB dec |     0.261 ns/B    3659.5 MiB/s     0.834 c/B
        OCB auth |     0.227 ns/B    4205.5 MiB/s     0.726 c/B
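
 (The cycles/byte column is derived from the measured speed and the
 given clock, c/B = ns/B * MHz / 1000; e.g. 0.273 ns/B * 3.201 GHz
 gives roughly 0.875 c/B for CTR, with small differences due to
 rounding of the displayed ns/B figure.)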

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-glue.c        |  161 ++++++++++---
 cipher/cipher-internal.h      |   20 ++
 cipher/cipher-ocb.c           |    5 
 cipher/rijndael-aesni.c       |  498 +++++++++++++++++++++++------------------
 cipher/rijndael-ssse3-amd64.c |    6 
 cipher/rijndael.c             |   24 --
 cipher/serpent.c              |  209 +++++++++++++----
 cipher/twofish.c              |   25 --
 8 files changed, 597 insertions(+), 351 deletions(-)
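
Note on the aligned fast path used throughout the bulk functions
below: when the block counter is a multiple of the chunk size, the
offsets L_{ntz(blkn+k)} for all but the last block of the chunk are
the same in every iteration, so the Ls[] pointer array can be filled
once before the loop and only its last slot refreshed per chunk.  A
sketch of the 16-block case (hypothetical standalone form; the real
code below points into c->u_mode.ocb.L and sets Ls[15] via ocb_get_l
each iteration):

  static void
  fill_aligned_Ls16 (const void *L[5], const void *Ls[16])
  {
    int i;

    /* Ruler sequence of ntz(blkn + k) for k = 1..15, blkn % 16 == 0. */
    for (i = 0; i < 16; i += 8)
      {
        Ls[i + 0] = L[0];  Ls[i + 1] = L[1];
        Ls[i + 2] = L[0];  Ls[i + 3] = L[2];
        Ls[i + 4] = L[0];  Ls[i + 5] = L[1];
        Ls[i + 6] = L[0];
      }
    Ls[7] = L[3];
    /* Ls[15] depends on ntz(blkn + 16) and must be set per chunk.  */
  }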

diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 99516fc..2d5dd20 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -604,19 +604,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
-static inline const unsigned char *
-get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
-{
-  unsigned int ntz = _gcry_ctz64 (i);
-
-  if (ntz < OCB_L_TABLE_SIZE)
-      return c->u_mode.ocb.L[ntz];
-  else
-      return _gcry_cipher_ocb_get_l (c, l_tmp, i);
-}
-#endif
-
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
 size_t
 _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -646,17 +633,43 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       const void *Ls[32];
       int i;
 
+      if (blkn % 32 == 0)
+	{
+	  for (i = 0; i < 32; i += 8)
+	    {
+	      Ls[i + 0] = c->u_mode.ocb.L[0];
+	      Ls[i + 1] = c->u_mode.ocb.L[1];
+	      Ls[i + 2] = c->u_mode.ocb.L[0];
+	      Ls[i + 3] = c->u_mode.ocb.L[2];
+	      Ls[i + 4] = c->u_mode.ocb.L[0];
+	      Ls[i + 5] = c->u_mode.ocb.L[1];
+	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[7] = c->u_mode.ocb.L[3];
+	  Ls[15] = c->u_mode.ocb.L[4];
+	  Ls[23] = c->u_mode.ocb.L[3];
+	}
+
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 32; i += 4)
+	  if (blkn % 32 == 0)
+	    {
+	      blkn += 32;
+	      Ls[31] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      for (i = 0; i < 32; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  if (encrypt)
@@ -692,17 +705,41 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       const void *Ls[16];
       int i;
 
+      if (blkn % 16 == 0)
+	{
+	  for (i = 0; i < 16; i += 8)
+	    {
+	      Ls[i + 0] = c->u_mode.ocb.L[0];
+	      Ls[i + 1] = c->u_mode.ocb.L[1];
+	      Ls[i + 2] = c->u_mode.ocb.L[0];
+	      Ls[i + 3] = c->u_mode.ocb.L[2];
+	      Ls[i + 4] = c->u_mode.ocb.L[0];
+	      Ls[i + 5] = c->u_mode.ocb.L[1];
+	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[7] = c->u_mode.ocb.L[3];
+	}
+
       /* Process data in 16 block chunks. */
       while (nblocks >= 16)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 16; i += 4)
+	  if (blkn % 16 == 0)
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      blkn += 16;
+	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
+	    {
+	      for (i = 0; i < 16; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  if (encrypt)
@@ -768,17 +805,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       const void *Ls[32];
       int i;
 
+      if (blkn % 32 == 0)
+	{
+	  for (i = 0; i < 32; i += 8)
+	    {
+	      Ls[i + 0] = c->u_mode.ocb.L[0];
+	      Ls[i + 1] = c->u_mode.ocb.L[1];
+	      Ls[i + 2] = c->u_mode.ocb.L[0];
+	      Ls[i + 3] = c->u_mode.ocb.L[2];
+	      Ls[i + 4] = c->u_mode.ocb.L[0];
+	      Ls[i + 5] = c->u_mode.ocb.L[1];
+	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[7] = c->u_mode.ocb.L[3];
+	  Ls[15] = c->u_mode.ocb.L[4];
+	  Ls[23] = c->u_mode.ocb.L[3];
+	}
+
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 32; i += 4)
+	  if (blkn % 32 == 0)
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      blkn += 32;
+	      Ls[31] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
+	    {
+	      for (i = 0; i < 32; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
@@ -809,17 +872,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       const void *Ls[16];
       int i;
 
+      if (blkn % 16 == 0)
+	{
+	  for (i = 0; i < 16; i += 8)
+	    {
+	      Ls[i + 0] = c->u_mode.ocb.L[0];
+	      Ls[i + 1] = c->u_mode.ocb.L[1];
+	      Ls[i + 2] = c->u_mode.ocb.L[0];
+	      Ls[i + 3] = c->u_mode.ocb.L[2];
+	      Ls[i + 4] = c->u_mode.ocb.L[0];
+	      Ls[i + 5] = c->u_mode.ocb.L[1];
+	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[7] = c->u_mode.ocb.L[3];
+	}
+
       /* Process data in 16 block chunks. */
       while (nblocks >= 16)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 16; i += 4)
+	  if (blkn % 16 == 0)
+	    {
+	      blkn += 16;
+	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      for (i = 0; i < 16; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index bb86d37..29c6f33 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -448,4 +448,24 @@ const unsigned char *_gcry_cipher_ocb_get_l
 /*           */ (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 n);
 
 
+/* Inline version of _gcry_cipher_ocb_get_l, with hard-coded fast paths for
+   most common cases.  */
+static inline const unsigned char *
+ocb_get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 n)
+{
+  if (n & 1)
+    return c->u_mode.ocb.L[0];
+  else if (n & 2)
+    return c->u_mode.ocb.L[1];
+  else
+    {
+      unsigned int ntz = _gcry_ctz64 (n);
+
+      if (ntz < OCB_L_TABLE_SIZE)
+	return c->u_mode.ocb.L[ntz];
+      else
+	return _gcry_cipher_ocb_get_l (c, l_tmp, n);
+    }
+}
+
 #endif /*G10_CIPHER_INTERNAL_H*/
diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c
index 096975a..a3a2c9b 100644
--- a/cipher/cipher-ocb.c
+++ b/cipher/cipher-ocb.c
@@ -280,7 +280,7 @@ _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       buf_xor_1 (c->u_mode.ocb.aad_offset,
-                 _gcry_cipher_ocb_get_l (c, l_tmp, c->u_mode.ocb.aad_nblocks),
+                 ocb_get_l (c, l_tmp, c->u_mode.ocb.aad_nblocks),
                  OCB_BLOCK_LEN);
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
       buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN);
@@ -392,8 +392,7 @@ ocb_crypt (gcry_cipher_hd_t c, int encrypt,
 
           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_iv.iv,
-                     _gcry_cipher_ocb_get_l (c, l_tmp,
-                                             c->u_mode.ocb.data_nblocks),
+                     ocb_get_l (c, l_tmp, c->u_mode.ocb.data_nblocks),
                      OCB_BLOCK_LEN);
           /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
           buf_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN);
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 910bc68..882cc79 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1307,7 +1307,11 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
   const unsigned char *l;
   unsigned int ntz;
 
-  if (i & 0xffffffffU)
+  if (i & 1)
+    return c->u_mode.ocb.L[0];
+  else if (i & 2)
+    return c->u_mode.ocb.L[1];
+  else if (i & 0xffffffffU)
     {
       asm ("rep;bsf %k[low], %k[ntz]\n\t"
            : [ntz] "=r" (ntz)
@@ -1372,6 +1376,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
+  const unsigned char *l[4] = {};
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1385,87 +1390,103 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                   [ctr] "m" (*c->u_ctr.ctr)
                 : "memory" );
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+  if (nblocks > 3)
     {
-      const unsigned char *l[4];
-
-      /* l_tmp will be used only every 65536-th block. */
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-      l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-      l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-      l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
-      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-                    "movdqu %[inbuf0], %%xmm1\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm1,    %%xmm6\n\t"
-                    "pxor   %%xmm5,    %%xmm1\n\t"
-                    "movdqu %%xmm5,    %[outbuf0]\n\t"
-                    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
-                    : [l0] "m" (*l[0]),
-                      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-                    "movdqu %[inbuf1], %%xmm2\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm2,    %%xmm6\n\t"
-                    "pxor   %%xmm5,    %%xmm2\n\t"
-                    "movdqu %%xmm5,    %[outbuf1]\n\t"
-                    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-                    : [l1] "m" (*l[1]),
-                      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-                    "movdqu %[inbuf2], %%xmm3\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm3,    %%xmm6\n\t"
-                    "pxor   %%xmm5,    %%xmm3\n\t"
-                    "movdqu %%xmm5,    %[outbuf2]\n\t"
-                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-                    : [l2] "m" (*l[2]),
-                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-                    "movdqu %[inbuf3], %%xmm4\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm4,    %%xmm6\n\t"
-                    "pxor   %%xmm5,    %%xmm4\n\t"
-                    :
-                    : [l3] "m" (*l[3]),
-                      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-                    : "memory" );
-
-      do_aesni_enc_vec4 (ctx);
-
-      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
-                    "pxor   %%xmm0,    %%xmm1\n\t"
-                    "movdqu %%xmm1,    %[outbuf0]\n\t"
-                    "movdqu %[outbuf1],%%xmm0\n\t"
-                    "pxor   %%xmm0,    %%xmm2\n\t"
-                    "movdqu %%xmm2,    %[outbuf1]\n\t"
-                    "movdqu %[outbuf2],%%xmm0\n\t"
-                    "pxor   %%xmm0,    %%xmm3\n\t"
-                    "movdqu %%xmm3,    %[outbuf2]\n\t"
-                    "pxor   %%xmm5,    %%xmm4\n\t"
-                    "movdqu %%xmm4,    %[outbuf3]\n\t"
-                    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
-                      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
-                      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
-                      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
-                    :
-                    : "memory" );
-
-      outbuf += 4*BLOCKSIZE;
-      inbuf  += 4*BLOCKSIZE;
+      if (n % 4 == 0)
+	{
+	  l[0] = c->u_mode.ocb.L[0];
+	  l[1] = c->u_mode.ocb.L[1];
+	  l[2] = c->u_mode.ocb.L[0];
+	}
+
+      for ( ;nblocks > 3 ; nblocks -= 4 )
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  if (n % 4 == 0)
+	    {
+	      n += 4;
+	      l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+	    }
+	  else
+	    {
+	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
+	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
+	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
+	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
+	      n += 4;
+	    }
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm1,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqu %%xmm5,    %[outbuf0]\n\t"
+			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+			: [l0] "m" (*l[0]),
+			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+			"movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm2,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqu %%xmm5,    %[outbuf1]\n\t"
+			: [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+			: [l1] "m" (*l[1]),
+			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+			"movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm3,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqu %%xmm5,    %[outbuf2]\n\t"
+			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+			: [l2] "m" (*l[2]),
+			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm4,    %%xmm6\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			:
+			: [l3] "m" (*l[3]),
+			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_enc_vec4 (ctx);
+
+	  asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm1\n\t"
+			"movdqu %%xmm1,    %[outbuf0]\n\t"
+			"movdqu %[outbuf1],%%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm2\n\t"
+			"movdqu %%xmm2,    %[outbuf1]\n\t"
+			"movdqu %[outbuf2],%%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm3\n\t"
+			"movdqu %%xmm3,    %[outbuf2]\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqu %%xmm4,    %[outbuf3]\n\t"
+			: [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+			  [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+			  [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+			:
+			: "memory" );
+
+	  outbuf += 4*BLOCKSIZE;
+	  inbuf  += 4*BLOCKSIZE;
+	}
     }
+
   for ( ;nblocks; nblocks-- )
     {
-      const unsigned char *l;
-
-      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Checksum_i = Checksum_{i-1} xor P_i  */
@@ -1476,7 +1497,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                     "pxor   %%xmm0,   %%xmm6\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l),
+                    : [l] "m" (*l[0]),
                       [inbuf] "m" (*inbuf)
                     : "memory" );
 
@@ -1516,6 +1537,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   u64 n = c->u_mode.ocb.data_nblocks;
+  const unsigned char *l[4] = {};
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1529,87 +1551,103 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                   [ctr] "m" (*c->u_ctr.ctr)
                 : "memory" );
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+  if (nblocks > 3)
     {
-      const unsigned char *l[4];
-
-      /* l_tmp will be used only every 65536-th block. */
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-      l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-      l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-      l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-      /* Checksum_i = Checksum_{i-1} xor P_i  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-                    "movdqu %[inbuf0], %%xmm1\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm1\n\t"
-                    "movdqu %%xmm5,    %[outbuf0]\n\t"
-                    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
-                    : [l0] "m" (*l[0]),
-                      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-                    "movdqu %[inbuf1], %%xmm2\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm2\n\t"
-                    "movdqu %%xmm5,    %[outbuf1]\n\t"
-                    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
-                    : [l1] "m" (*l[1]),
-                      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-                    "movdqu %[inbuf2], %%xmm3\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm3\n\t"
-                    "movdqu %%xmm5,    %[outbuf2]\n\t"
-                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
-                    : [l2] "m" (*l[2]),
-                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-                    "movdqu %[inbuf3], %%xmm4\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm4\n\t"
-                    :
-                    : [l3] "m" (*l[3]),
-                      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
-                    : "memory" );
-
-      do_aesni_dec_vec4 (ctx);
-
-      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
-                    "pxor   %%xmm0,    %%xmm1\n\t"
-                    "movdqu %%xmm1,    %[outbuf0]\n\t"
-                    "movdqu %[outbuf1],%%xmm0\n\t"
-                    "pxor   %%xmm0,    %%xmm2\n\t"
-                    "movdqu %%xmm2,    %[outbuf1]\n\t"
-                    "movdqu %[outbuf2],%%xmm0\n\t"
-                    "pxor   %%xmm0,    %%xmm3\n\t"
-                    "movdqu %%xmm3,    %[outbuf2]\n\t"
-                    "pxor   %%xmm5,    %%xmm4\n\t"
-                    "movdqu %%xmm4,    %[outbuf3]\n\t"
-                    "pxor   %%xmm1,    %%xmm6\n\t"
-                    "pxor   %%xmm2,    %%xmm6\n\t"
-                    "pxor   %%xmm3,    %%xmm6\n\t"
-                    "pxor   %%xmm4,    %%xmm6\n\t"
-                    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
-                      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
-                      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
-                      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
-                    :
-                    : "memory" );
-
-      outbuf += 4*BLOCKSIZE;
-      inbuf  += 4*BLOCKSIZE;
+      if (n % 4 == 0)
+	{
+	  l[0] = c->u_mode.ocb.L[0];
+	  l[1] = c->u_mode.ocb.L[1];
+	  l[2] = c->u_mode.ocb.L[0];
+	}
+
+      for ( ;nblocks > 3 ; nblocks -= 4 )
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  if (n % 4 == 0)
+	    {
+	      n += 4;
+	      l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+	    }
+	  else
+	    {
+	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
+	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
+	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
+	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
+	      n += 4;
+	    }
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqu %%xmm5,    %[outbuf0]\n\t"
+			: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+			: [l0] "m" (*l[0]),
+			  [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+			"movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqu %%xmm5,    %[outbuf1]\n\t"
+			: [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+			: [l1] "m" (*l[1]),
+			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+			"movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqu %%xmm5,    %[outbuf2]\n\t"
+			: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+			: [l2] "m" (*l[2]),
+			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			:
+			: [l3] "m" (*l[3]),
+			  [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_dec_vec4 (ctx);
+
+	  asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm1\n\t"
+			"movdqu %%xmm1,    %[outbuf0]\n\t"
+			"movdqu %[outbuf1],%%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm2\n\t"
+			"movdqu %%xmm2,    %[outbuf1]\n\t"
+			"movdqu %[outbuf2],%%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm3\n\t"
+			"movdqu %%xmm3,    %[outbuf2]\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqu %%xmm4,    %[outbuf3]\n\t"
+			"pxor   %%xmm1,    %%xmm6\n\t"
+			"pxor   %%xmm2,    %%xmm6\n\t"
+			"pxor   %%xmm3,    %%xmm6\n\t"
+			"pxor   %%xmm4,    %%xmm6\n\t"
+			: [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+			  [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+			  [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+			  [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+			:
+			: "memory" );
+
+	  outbuf += 4*BLOCKSIZE;
+	  inbuf  += 4*BLOCKSIZE;
+	}
     }
+
   for ( ;nblocks; nblocks-- )
     {
-      const unsigned char *l;
-
-      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
@@ -1619,7 +1657,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                     "pxor   %%xmm1,   %%xmm5\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l),
+                    : [l] "m" (*l[0]),
                       [inbuf] "m" (*inbuf)
                     : "memory" );
 
@@ -1670,6 +1708,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   RIJNDAEL_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   u64 n = c->u_mode.ocb.aad_nblocks;
+  const unsigned char *l[4] = {};
   aesni_prepare_2_6_variable;
 
   aesni_prepare ();
@@ -1683,73 +1722,90 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                   [ctr] "m" (*c->u_mode.ocb.aad_sum)
                 : "memory" );
 
-  for ( ;nblocks > 3 ; nblocks -= 4 )
+  if (nblocks > 3)
     {
-      const unsigned char *l[4];
-
-      /* l_tmp will be used only every 65536-th block. */
-      l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                   c->u_mode.ocb.aad_sum);
-      l[1] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                   c->u_mode.ocb.aad_sum);
-      l[2] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                   c->u_mode.ocb.aad_sum);
-      l[3] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                   c->u_mode.ocb.aad_sum);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      asm volatile ("movdqu %[l0],     %%xmm0\n\t"
-                    "movdqu %[abuf0],  %%xmm1\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm1\n\t"
-                    :
-                    : [l0] "m" (*l[0]),
-                      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l1],     %%xmm0\n\t"
-                    "movdqu %[abuf1],  %%xmm2\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm2\n\t"
-                    :
-                    : [l1] "m" (*l[1]),
-                      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l2],     %%xmm0\n\t"
-                    "movdqu %[abuf2],  %%xmm3\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm3\n\t"
-                    :
-                    : [l2] "m" (*l[2]),
-                      [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
-                    : "memory" );
-      asm volatile ("movdqu %[l3],     %%xmm0\n\t"
-                    "movdqu %[abuf3],  %%xmm4\n\t"
-                    "pxor   %%xmm0,    %%xmm5\n\t"
-                    "pxor   %%xmm5,    %%xmm4\n\t"
-                    :
-                    : [l3] "m" (*l[3]),
-                      [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
-                    : "memory" );
-
-      do_aesni_enc_vec4 (ctx);
-
-      asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
-                    "pxor   %%xmm2,   %%xmm6\n\t"
-                    "pxor   %%xmm3,   %%xmm6\n\t"
-                    "pxor   %%xmm4,   %%xmm6\n\t"
-                    :
-                    :
-                    : "memory" );
-
-      abuf += 4*BLOCKSIZE;
+      if (n % 4 == 0)
+	{
+	  l[0] = c->u_mode.ocb.L[0];
+	  l[1] = c->u_mode.ocb.L[1];
+	  l[2] = c->u_mode.ocb.L[0];
+	}
+
+      for ( ;nblocks > 3 ; nblocks -= 4 )
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  if (n % 4 == 0)
+	    {
+	      n += 4;
+	      l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
+			  c->u_mode.ocb.aad_sum);
+	    }
+	  else
+	    {
+	      l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset,
+			  c->u_mode.ocb.aad_sum);
+	      l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset,
+			  c->u_mode.ocb.aad_sum);
+	      l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset,
+			  c->u_mode.ocb.aad_sum);
+	      l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset,
+			  c->u_mode.ocb.aad_sum);
+	      n += 4;
+	    }
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+	  asm volatile ("movdqu %[l0],     %%xmm0\n\t"
+			"movdqu %[abuf0],  %%xmm1\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			:
+			: [l0] "m" (*l[0]),
+			  [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l1],     %%xmm0\n\t"
+			"movdqu %[abuf1],  %%xmm2\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			:
+			: [l1] "m" (*l[1]),
+			  [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l2],     %%xmm0\n\t"
+			"movdqu %[abuf2],  %%xmm3\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			:
+			: [l2] "m" (*l[2]),
+			  [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+			: "memory" );
+	  asm volatile ("movdqu %[l3],     %%xmm0\n\t"
+			"movdqu %[abuf3],  %%xmm4\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			:
+			: [l3] "m" (*l[3]),
+			  [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+			: "memory" );
+
+	  do_aesni_enc_vec4 (ctx);
+
+	  asm volatile ("pxor   %%xmm1,   %%xmm6\n\t"
+			"pxor   %%xmm2,   %%xmm6\n\t"
+			"pxor   %%xmm3,   %%xmm6\n\t"
+			"pxor   %%xmm4,   %%xmm6\n\t"
+			:
+			:
+			: "memory" );
+
+	  abuf += 4*BLOCKSIZE;
+	}
     }
+
   for ( ;nblocks; nblocks-- )
     {
-      const unsigned char *l;
-
-      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
-                c->u_mode.ocb.aad_sum);
+      l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                   c->u_mode.ocb.aad_sum);
 
       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
@@ -1758,7 +1814,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                     "pxor   %%xmm1,   %%xmm5\n\t"
                     "pxor   %%xmm5,   %%xmm0\n\t"
                     :
-                    : [l] "m" (*l),
+                    : [l] "m" (*l[0]),
                       [abuf] "m" (*abuf)
                     : "memory" );
 
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index 0cdb532..937d868 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -535,7 +535,11 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
   const unsigned char *l;
   unsigned int ntz;
 
-  if (i & 0xffffffffU)
+  if (i & 1)
+    return c->u_mode.ocb.L[0];
+  else if (i & 2)
+    return c->u_mode.ocb.L[1];
+  else if (i & 0xffffffffU)
     {
       asm ("rep;bsf %k[low], %k[ntz]\n\t"
            : [ntz] "=r" (ntz)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 4368c6d..eff59c2 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1246,13 +1246,7 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       for ( ;nblocks; nblocks-- )
         {
           u64 i = ++c->u_mode.ocb.data_nblocks;
-          unsigned int ntz = _gcry_ctz64 (i);
-          const unsigned char *l;
-
-          if (ntz < OCB_L_TABLE_SIZE)
-              l = c->u_mode.ocb.L[ntz];
-          else
-              l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i);
+          const unsigned char *l = ocb_get_l(c, l_tmp.x1, i);
 
           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
@@ -1277,13 +1271,7 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       for ( ;nblocks; nblocks-- )
         {
           u64 i = ++c->u_mode.ocb.data_nblocks;
-          unsigned int ntz = _gcry_ctz64 (i);
-          const unsigned char *l;
-
-          if (ntz < OCB_L_TABLE_SIZE)
-              l = c->u_mode.ocb.L[ntz];
-          else
-              l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i);
+          const unsigned char *l = ocb_get_l(c, l_tmp.x1, i);
 
           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
@@ -1343,13 +1331,7 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
       for ( ;nblocks; nblocks-- )
         {
           u64 i = ++c->u_mode.ocb.aad_nblocks;
-          unsigned int ntz = _gcry_ctz64 (i);
-          const unsigned char *l;
-
-          if (ntz < OCB_L_TABLE_SIZE)
-              l = c->u_mode.ocb.L[ntz];
-          else
-              l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i);
+          const unsigned char *l = ocb_get_l(c, l_tmp.x1, i);
 
           /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
           buf_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE);
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 0a54a17..a47a1b7 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1226,19 +1226,6 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
-#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
-static inline const unsigned char *
-get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
-{
-  unsigned int ntz = _gcry_ctz64 (i);
-
-  if (ntz < OCB_L_TABLE_SIZE)
-      return c->u_mode.ocb.L[ntz];
-  else
-      return _gcry_cipher_ocb_get_l (c, l_tmp, i);
-}
-#endif
-
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
 size_t
 _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -1265,17 +1252,41 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       const void *Ls[16];
       int i;
 
+      if (blkn % 16 == 0)
+	{
+	  for (i = 0; i < 16; i += 8)
+	    {
+	      Ls[i + 0] = c->u_mode.ocb.L[0];
+	      Ls[i + 1] = c->u_mode.ocb.L[1];
+	      Ls[i + 2] = c->u_mode.ocb.L[0];
+	      Ls[i + 3] = c->u_mode.ocb.L[2];
+	      Ls[i + 4] = c->u_mode.ocb.L[0];
+	      Ls[i + 5] = c->u_mode.ocb.L[1];
+	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[7] = c->u_mode.ocb.L[3];
+	}
+
       /* Process data in 16 block chunks. */
       while (nblocks >= 16)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 16; i += 4)
+	  if (blkn % 16 == 0)
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      blkn += 16;
+	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
+	    {
+	      for (i = 0; i < 16; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  if (encrypt)
@@ -1308,17 +1319,36 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     const void *Ls[8];
     int i;
 
+    if (blkn % 8 == 0)
+      {
+	Ls[0] = c->u_mode.ocb.L[0];
+	Ls[1] = c->u_mode.ocb.L[1];
+	Ls[2] = c->u_mode.ocb.L[0];
+	Ls[3] = c->u_mode.ocb.L[2];
+	Ls[4] = c->u_mode.ocb.L[0];
+	Ls[5] = c->u_mode.ocb.L[1];
+	Ls[6] = c->u_mode.ocb.L[0];
+      }
+
     /* Process data in 8 block chunks. */
     while (nblocks >= 8)
       {
 	/* l_tmp will be used only every 65536-th block. */
-	for (i = 0; i < 8; i += 4)
+	if (blkn % 8 == 0)
 	  {
-	    Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	    Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	    Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	    Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	    blkn += 4;
+	    blkn += 8;
+	    Ls[7] = ocb_get_l(c, l_tmp, blkn);
+	  }
+	else
+	  {
+	    for (i = 0; i < 8; i += 4)
+	      {
+		Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		blkn += 4;
+	      }
 	  }
 
 	if (encrypt)
@@ -1352,17 +1382,36 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       const void *Ls[8];
       int i;
 
+      if (blkn % 8 == 0)
+	{
+	  Ls[0] = c->u_mode.ocb.L[0];
+	  Ls[1] = c->u_mode.ocb.L[1];
+	  Ls[2] = c->u_mode.ocb.L[0];
+	  Ls[3] = c->u_mode.ocb.L[2];
+	  Ls[4] = c->u_mode.ocb.L[0];
+	  Ls[5] = c->u_mode.ocb.L[1];
+	  Ls[6] = c->u_mode.ocb.L[0];
+	}
+
       /* Process data in 8 block chunks. */
       while (nblocks >= 8)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 8; i += 4)
+	  if (blkn % 8 == 0)
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      blkn += 8;
+	      Ls[7] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
+	    {
+	      for (i = 0; i < 8; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  if (encrypt)
@@ -1424,17 +1473,41 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       const void *Ls[16];
       int i;
 
+      if (blkn % 16 == 0)
+	{
+	  for (i = 0; i < 16; i += 8)
+	    {
+	      Ls[i + 0] = c->u_mode.ocb.L[0];
+	      Ls[i + 1] = c->u_mode.ocb.L[1];
+	      Ls[i + 2] = c->u_mode.ocb.L[0];
+	      Ls[i + 3] = c->u_mode.ocb.L[2];
+	      Ls[i + 4] = c->u_mode.ocb.L[0];
+	      Ls[i + 5] = c->u_mode.ocb.L[1];
+	      Ls[i + 6] = c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[7] = c->u_mode.ocb.L[3];
+	}
+
       /* Process data in 16 block chunks. */
       while (nblocks >= 16)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 16; i += 4)
+	  if (blkn % 16 == 0)
+	    {
+	      blkn += 16;
+	      Ls[15] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      for (i = 0; i < 16; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
@@ -1462,17 +1535,36 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     const void *Ls[8];
     int i;
 
+    if (blkn % 8 == 0)
+      {
+	Ls[0] = c->u_mode.ocb.L[0];
+	Ls[1] = c->u_mode.ocb.L[1];
+	Ls[2] = c->u_mode.ocb.L[0];
+	Ls[3] = c->u_mode.ocb.L[2];
+	Ls[4] = c->u_mode.ocb.L[0];
+	Ls[5] = c->u_mode.ocb.L[1];
+	Ls[6] = c->u_mode.ocb.L[0];
+      }
+
     /* Process data in 8 block chunks. */
     while (nblocks >= 8)
       {
 	/* l_tmp will be used only every 65536-th block. */
-	for (i = 0; i < 8; i += 4)
+	if (blkn % 8 == 0)
+	  {
+	    blkn += 8;
+	    Ls[7] = ocb_get_l(c, l_tmp, blkn);
+	  }
+	else
 	  {
-	    Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	    Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	    Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	    Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	    blkn += 4;
+	    for (i = 0; i < 8; i += 4)
+	      {
+		Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		blkn += 4;
+	      }
 	  }
 
 	_gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
@@ -1501,17 +1593,36 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
       const void *Ls[8];
       int i;
 
+      if (blkn % 8 == 0)
+	{
+	  Ls[0] = c->u_mode.ocb.L[0];
+	  Ls[1] = c->u_mode.ocb.L[1];
+	  Ls[2] = c->u_mode.ocb.L[0];
+	  Ls[3] = c->u_mode.ocb.L[2];
+	  Ls[4] = c->u_mode.ocb.L[0];
+	  Ls[5] = c->u_mode.ocb.L[1];
+	  Ls[6] = c->u_mode.ocb.L[0];
+	}
+
       /* Process data in 8 block chunks. */
       while (nblocks >= 8)
 	{
 	  /* l_tmp will be used only every 65536-th block. */
-	  for (i = 0; i < 8; i += 4)
+	  if (blkn % 8 == 0)
+	    {
+	      blkn += 8;
+	      Ls[7] = ocb_get_l(c, l_tmp, blkn);
+	    }
+	  else
 	    {
-	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
-	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
-	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
-	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
-	      blkn += 4;
+	      for (i = 0; i < 8; i += 4)
+		{
+		  Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+		  Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+		  Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+		  Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+		  blkn += 4;
+		}
 	    }
 
 	  _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 11e60a7..7f361c9 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -1247,19 +1247,6 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
   _gcry_burn_stack(burn_stack_depth);
 }
 
-#ifdef USE_AMD64_ASM
-static inline const unsigned char *
-get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
-{
-  unsigned int ntz = _gcry_ctz64 (i);
-
-  if (ntz < OCB_L_TABLE_SIZE)
-      return c->u_mode.ocb.L[ntz];
-  else
-      return _gcry_cipher_ocb_get_l (c, l_tmp, i);
-}
-#endif
-
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
 size_t
 _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -1280,9 +1267,9 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     while (nblocks >= 3)
       {
 	/* l_tmp will be used only every 65536-th block. */
-	Ls[0] = get_l(c, l_tmp, blkn + 1);
-	Ls[1] = get_l(c, l_tmp, blkn + 2);
-	Ls[2] = get_l(c, l_tmp, blkn + 3);
+	Ls[0] = ocb_get_l(c, l_tmp, blkn + 1);
+	Ls[1] = ocb_get_l(c, l_tmp, blkn + 2);
+	Ls[2] = ocb_get_l(c, l_tmp, blkn + 3);
 	blkn += 3;
 
 	if (encrypt)
@@ -1339,9 +1326,9 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     while (nblocks >= 3)
       {
 	/* l_tmp will be used only every 65536-th block. */
-	Ls[0] = get_l(c, l_tmp, blkn + 1);
-	Ls[1] = get_l(c, l_tmp, blkn + 2);
-	Ls[2] = get_l(c, l_tmp, blkn + 3);
+	Ls[0] = ocb_get_l(c, l_tmp, blkn + 1);
+	Ls[1] = ocb_get_l(c, l_tmp, blkn + 2);
+	Ls[2] = ocb_get_l(c, l_tmp, blkn + 3);
 	blkn += 3;
 
 	twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
