[PATCH] Optimize OCB offset calculation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Aug 7 19:21:04 CEST 2015
* cipher/cipher-internal.h (ocb_get_l): New.
* cipher/cipher-ocb.c (_gcry_cipher_ocb_authenticate)
(ocb_crypt): Use 'ocb_get_l' instead of '_gcry_cipher_ocb_get_l'.
* cipher/camellia-glue.c (get_l): Remove.
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Precalculate
offset array when block count matches parallel operation size; Use
'ocb_get_l' instead of 'get_l'.
* cipher/rijndael-aesni.c (get_l): Add fast path for the 75% most common
offsets.
(aesni_ocb_enc, aesni_ocb_dec, _gcry_aes_aesni_ocb_auth): Precalculate
offset array when block count matches parallel operation size.
* cipher/rijndael-ssse3-amd64.c (get_l): Add fast path for the 75% most
common offsets.
* cipher/rijndael.c (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth): Use
'ocb_get_l' instead of '_gcry_cipher_ocb_get_l'.
* cipher/serpent.c (get_l): Remove.
(_gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth): Precalculate
offset array when block count matches parallel operation size; Use
'ocb_get_l' instead of 'get_l'.
* cipher/twofish.c (get_l): Remove.
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Use 'ocb_get_l'
instead of 'get_l'.
--
The patch optimizes the OCB offset calculation for the generic code path and
for assembly implementations that process blocks in parallel.
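For reference, a minimal, self-contained sketch of the fast path added to
ocb_get_l() follows (this is not the libgcrypt code itself; the table name and
helper are stand-ins). The idea: for block number n, ntz(n) is 0 for every odd
n (50% of blocks) and 1 for half of the remaining even ones (another 25%), so
testing the two low bits resolves 75% of offsets without a ctz instruction:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the per-handle table c->u_mode.ocb.L[]; entries are just
   labels so the example can print which offset would be selected. */
static const char *L[] = { "L[0]", "L[1]", "L[2]", "L[3]", "L[4]" };
#define L_TABLE_SIZE (sizeof (L) / sizeof (L[0]))

static const char *
sketch_get_l (uint64_t n)
{
  if (n & 1)                    /* ntz(n) == 0: every odd n, 50% of blocks. */
    return L[0];
  else if (n & 2)               /* ntz(n) == 1: another 25% of blocks. */
    return L[1];
  else
    {
      unsigned int ntz = __builtin_ctzll (n);  /* remaining 25%: slow path. */
      return ntz < L_TABLE_SIZE ? L[ntz] : "computed on the fly";
    }
}

int
main (void)
{
  for (uint64_t n = 1; n <= 8; n++)
    printf ("block %2llu -> %s\n", (unsigned long long) n, sketch_get_l (n));
  return 0;
}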
Benchmark of OCB AES-NI on Intel Haswell:
$ tests/bench-slope --cpu-mhz 3201 cipher aes
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CTR enc | 0.274 ns/B 3483.9 MiB/s 0.876 c/B
CTR dec | 0.273 ns/B 3490.0 MiB/s 0.875 c/B
OCB enc | 0.289 ns/B 3296.1 MiB/s 0.926 c/B
OCB dec | 0.299 ns/B 3189.9 MiB/s 0.957 c/B
OCB auth | 0.260 ns/B 3670.0 MiB/s 0.832 c/B
After:
AES | nanosecs/byte mebibytes/sec cycles/byte
CTR enc | 0.273 ns/B 3489.4 MiB/s 0.875 c/B
CTR dec | 0.273 ns/B 3487.5 MiB/s 0.875 c/B
OCB enc | 0.248 ns/B 3852.8 MiB/s 0.792 c/B
OCB dec | 0.261 ns/B 3659.5 MiB/s 0.834 c/B
OCB auth | 0.227 ns/B 4205.5 MiB/s 0.726 c/B
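The other half of the change shows up in the bulk OCB paths: when the running
block counter is already a multiple of the parallel chunk size, the ntz
sequence inside the chunk is fixed (0,1,0,2,0,1,0,... for blocks +1..+8), so
all but the last entry of the Ls[] pointer array can be filled once before the
loop and only the final entry needs a lookup per chunk. A small stand-alone
illustration of that fixed pattern (plain C, names are illustrative only):

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint64_t blkn = 0;           /* chunk-aligned running block counter */

  /* For a chunk of 8 blocks starting at a multiple of 8, the ntz values of
     blkn+1 .. blkn+7 never change; only ntz(blkn+8) depends on the chunk. */
  for (int chunk = 0; chunk < 4; chunk++)
    {
      printf ("chunk %d:", chunk);
      for (uint64_t i = 1; i <= 8; i++)
        printf (" ntz(%llu)=%d", (unsigned long long) (blkn + i),
                __builtin_ctzll (blkn + i));
      printf ("\n");
      blkn += 8;
    }
  return 0;
}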
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-glue.c | 161 ++++++++++---
cipher/cipher-internal.h | 20 ++
cipher/cipher-ocb.c | 5
cipher/rijndael-aesni.c | 498 +++++++++++++++++++++++------------------
cipher/rijndael-ssse3-amd64.c | 6
cipher/rijndael.c | 24 --
cipher/serpent.c | 209 +++++++++++++----
cipher/twofish.c | 25 --
8 files changed, 597 insertions(+), 351 deletions(-)
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 99516fc..2d5dd20 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -604,19 +604,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
-static inline const unsigned char *
-get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
-{
- unsigned int ntz = _gcry_ctz64 (i);
-
- if (ntz < OCB_L_TABLE_SIZE)
- return c->u_mode.ocb.L[ntz];
- else
- return _gcry_cipher_ocb_get_l (c, l_tmp, i);
-}
-#endif
-
/* Bulk encryption/decryption of complete blocks in OCB mode. */
size_t
_gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -646,17 +633,43 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *Ls[32];
int i;
+ if (blkn % 32 == 0)
+ {
+ for (i = 0; i < 32; i += 8)
+ {
+ Ls[i + 0] = c->u_mode.ocb.L[0];
+ Ls[i + 1] = c->u_mode.ocb.L[1];
+ Ls[i + 2] = c->u_mode.ocb.L[0];
+ Ls[i + 3] = c->u_mode.ocb.L[2];
+ Ls[i + 4] = c->u_mode.ocb.L[0];
+ Ls[i + 5] = c->u_mode.ocb.L[1];
+ Ls[i + 6] = c->u_mode.ocb.L[0];
+ }
+
+ Ls[7] = c->u_mode.ocb.L[3];
+ Ls[15] = c->u_mode.ocb.L[4];
+ Ls[23] = c->u_mode.ocb.L[3];
+ }
+
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 32; i += 4)
+ if (blkn % 32 == 0)
+ {
+ blkn += 32;
+ Ls[31] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ for (i = 0; i < 32; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
if (encrypt)
@@ -692,17 +705,41 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *Ls[16];
int i;
+ if (blkn % 16 == 0)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ Ls[i + 0] = c->u_mode.ocb.L[0];
+ Ls[i + 1] = c->u_mode.ocb.L[1];
+ Ls[i + 2] = c->u_mode.ocb.L[0];
+ Ls[i + 3] = c->u_mode.ocb.L[2];
+ Ls[i + 4] = c->u_mode.ocb.L[0];
+ Ls[i + 5] = c->u_mode.ocb.L[1];
+ Ls[i + 6] = c->u_mode.ocb.L[0];
+ }
+
+ Ls[7] = c->u_mode.ocb.L[3];
+ }
+
/* Process data in 16 block chunks. */
while (nblocks >= 16)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 16; i += 4)
+ if (blkn % 16 == 0)
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ blkn += 16;
+ Ls[15] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
if (encrypt)
@@ -768,17 +805,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const void *Ls[32];
int i;
+ if (blkn % 32 == 0)
+ {
+ for (i = 0; i < 32; i += 8)
+ {
+ Ls[i + 0] = c->u_mode.ocb.L[0];
+ Ls[i + 1] = c->u_mode.ocb.L[1];
+ Ls[i + 2] = c->u_mode.ocb.L[0];
+ Ls[i + 3] = c->u_mode.ocb.L[2];
+ Ls[i + 4] = c->u_mode.ocb.L[0];
+ Ls[i + 5] = c->u_mode.ocb.L[1];
+ Ls[i + 6] = c->u_mode.ocb.L[0];
+ }
+
+ Ls[7] = c->u_mode.ocb.L[3];
+ Ls[15] = c->u_mode.ocb.L[4];
+ Ls[23] = c->u_mode.ocb.L[3];
+ }
+
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 32; i += 4)
+ if (blkn % 32 == 0)
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ blkn += 32;
+ Ls[31] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
+ {
+ for (i = 0; i < 32; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
_gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
@@ -809,17 +872,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const void *Ls[16];
int i;
+ if (blkn % 16 == 0)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ Ls[i + 0] = c->u_mode.ocb.L[0];
+ Ls[i + 1] = c->u_mode.ocb.L[1];
+ Ls[i + 2] = c->u_mode.ocb.L[0];
+ Ls[i + 3] = c->u_mode.ocb.L[2];
+ Ls[i + 4] = c->u_mode.ocb.L[0];
+ Ls[i + 5] = c->u_mode.ocb.L[1];
+ Ls[i + 6] = c->u_mode.ocb.L[0];
+ }
+
+ Ls[7] = c->u_mode.ocb.L[3];
+ }
+
/* Process data in 16 block chunks. */
while (nblocks >= 16)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 16; i += 4)
+ if (blkn % 16 == 0)
+ {
+ blkn += 16;
+ Ls[15] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ for (i = 0; i < 16; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
_gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index bb86d37..29c6f33 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -448,4 +448,24 @@ const unsigned char *_gcry_cipher_ocb_get_l
/* */ (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 n);
+/* Inline version of _gcry_cipher_ocb_get_l, with hard-coded fast paths for
+ most common cases. */
+static inline const unsigned char *
+ocb_get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 n)
+{
+ if (n & 1)
+ return c->u_mode.ocb.L[0];
+ else if (n & 2)
+ return c->u_mode.ocb.L[1];
+ else
+ {
+ unsigned int ntz = _gcry_ctz64 (n);
+
+ if (ntz < OCB_L_TABLE_SIZE)
+ return c->u_mode.ocb.L[ntz];
+ else
+ return _gcry_cipher_ocb_get_l (c, l_tmp, n);
+ }
+}
+
#endif /*G10_CIPHER_INTERNAL_H*/
diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c
index 096975a..a3a2c9b 100644
--- a/cipher/cipher-ocb.c
+++ b/cipher/cipher-ocb.c
@@ -280,7 +280,7 @@ _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
buf_xor_1 (c->u_mode.ocb.aad_offset,
- _gcry_cipher_ocb_get_l (c, l_tmp, c->u_mode.ocb.aad_nblocks),
+ ocb_get_l (c, l_tmp, c->u_mode.ocb.aad_nblocks),
OCB_BLOCK_LEN);
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN);
@@ -392,8 +392,7 @@ ocb_crypt (gcry_cipher_hd_t c, int encrypt,
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
buf_xor_1 (c->u_iv.iv,
- _gcry_cipher_ocb_get_l (c, l_tmp,
- c->u_mode.ocb.data_nblocks),
+ ocb_get_l (c, l_tmp, c->u_mode.ocb.data_nblocks),
OCB_BLOCK_LEN);
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
buf_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN);
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 910bc68..882cc79 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1307,7 +1307,11 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
const unsigned char *l;
unsigned int ntz;
- if (i & 0xffffffffU)
+ if (i & 1)
+ return c->u_mode.ocb.L[0];
+ else if (i & 2)
+ return c->u_mode.ocb.L[1];
+ else if (i & 0xffffffffU)
{
asm ("rep;bsf %k[low], %k[ntz]\n\t"
: [ntz] "=r" (ntz)
@@ -1372,6 +1376,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
+ const unsigned char *l[4] = {};
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1385,87 +1390,103 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
[ctr] "m" (*c->u_ctr.ctr)
: "memory" );
- for ( ;nblocks > 3 ; nblocks -= 4 )
+ if (nblocks > 3)
{
- const unsigned char *l[4];
-
- /* l_tmp will be used only every 65536-th block. */
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
- l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
- l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
- l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- "movdqu %%xmm5, %[outbuf0]\n\t"
- : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
- : [l0] "m" (*l[0]),
- [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[inbuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "movdqu %%xmm5, %[outbuf1]\n\t"
- : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
- : [l1] "m" (*l[1]),
- [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "movdqu %%xmm5, %[outbuf2]\n\t"
- : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*l[2]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_enc_vec4 (ctx);
-
- asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf0]\n\t"
- "movdqu %[outbuf1],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm2\n\t"
- "movdqu %%xmm2, %[outbuf1]\n\t"
- "movdqu %[outbuf2],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm3\n\t"
- "movdqu %%xmm3, %[outbuf2]\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- "movdqu %%xmm4, %[outbuf3]\n\t"
- : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
- [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
- [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
- [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
- :
- : "memory" );
-
- outbuf += 4*BLOCKSIZE;
- inbuf += 4*BLOCKSIZE;
+ if (n % 4 == 0)
+ {
+ l[0] = c->u_mode.ocb.L[0];
+ l[1] = c->u_mode.ocb.L[1];
+ l[2] = c->u_mode.ocb.L[0];
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ if (n % 4 == 0)
+ {
+ n += 4;
+ l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+ }
+ else
+ {
+ l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
+ l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
+ l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
+ l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
+ n += 4;
+ }
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+ : [l0] "m" (*l[0]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [l1] "m" (*l[1]),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+ : [l2] "m" (*l[2]),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l[3]),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf1],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %[outbuf2],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
}
+
for ( ;nblocks; nblocks-- )
{
- const unsigned char *l;
-
- l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+ l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Checksum_i = Checksum_{i-1} xor P_i */
@@ -1476,7 +1497,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm0, %%xmm6\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l),
+ : [l] "m" (*l[0]),
[inbuf] "m" (*inbuf)
: "memory" );
@@ -1516,6 +1537,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
+ const unsigned char *l[4] = {};
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1529,87 +1551,103 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[ctr] "m" (*c->u_ctr.ctr)
: "memory" );
- for ( ;nblocks > 3 ; nblocks -= 4 )
+ if (nblocks > 3)
{
- const unsigned char *l[4];
-
- /* l_tmp will be used only every 65536-th block. */
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
- l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
- l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
- l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- "movdqu %%xmm5, %[outbuf0]\n\t"
- : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
- : [l0] "m" (*l[0]),
- [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[inbuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "movdqu %%xmm5, %[outbuf1]\n\t"
- : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
- : [l1] "m" (*l[1]),
- [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "movdqu %%xmm5, %[outbuf2]\n\t"
- : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*l[2]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_dec_vec4 (ctx);
-
- asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf0]\n\t"
- "movdqu %[outbuf1],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm2\n\t"
- "movdqu %%xmm2, %[outbuf1]\n\t"
- "movdqu %[outbuf2],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm3\n\t"
- "movdqu %%xmm3, %[outbuf2]\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- "movdqu %%xmm4, %[outbuf3]\n\t"
- "pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
- [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
- [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
- [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
- :
- : "memory" );
-
- outbuf += 4*BLOCKSIZE;
- inbuf += 4*BLOCKSIZE;
+ if (n % 4 == 0)
+ {
+ l[0] = c->u_mode.ocb.L[0];
+ l[1] = c->u_mode.ocb.L[1];
+ l[2] = c->u_mode.ocb.L[0];
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ if (n % 4 == 0)
+ {
+ n += 4;
+ l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+ }
+ else
+ {
+ l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
+ l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
+ l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
+ l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
+ n += 4;
+ }
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+ : [l0] "m" (*l[0]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [l1] "m" (*l[1]),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+ : [l2] "m" (*l[2]),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l[3]),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf1],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %[outbuf2],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
}
+
for ( ;nblocks; nblocks-- )
{
- const unsigned char *l;
-
- l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+ l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
@@ -1619,7 +1657,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm1, %%xmm5\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l),
+ : [l] "m" (*l[0]),
[inbuf] "m" (*inbuf)
: "memory" );
@@ -1670,6 +1708,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
RIJNDAEL_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
+ const unsigned char *l[4] = {};
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1683,73 +1722,90 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
[ctr] "m" (*c->u_mode.ocb.aad_sum)
: "memory" );
- for ( ;nblocks > 3 ; nblocks -= 4 )
+ if (nblocks > 3)
{
- const unsigned char *l[4];
-
- /* l_tmp will be used only every 65536-th block. */
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[1] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[2] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[3] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[abuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- :
- : [l0] "m" (*l[0]),
- [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[abuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- :
- : [l1] "m" (*l[1]),
- [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[abuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- :
- : [l2] "m" (*l[2]),
- [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[abuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_enc_vec4 (ctx);
-
- asm volatile ("pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- :
- :
- : "memory" );
-
- abuf += 4*BLOCKSIZE;
+ if (n % 4 == 0)
+ {
+ l[0] = c->u_mode.ocb.L[0];
+ l[1] = c->u_mode.ocb.L[1];
+ l[2] = c->u_mode.ocb.L[0];
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ if (n % 4 == 0)
+ {
+ n += 4;
+ l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+ }
+ else
+ {
+ l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+ l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+ l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+ l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+ n += 4;
+ }
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[abuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ :
+ : [l0] "m" (*l[0]),
+ [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[abuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ :
+ : [l1] "m" (*l[1]),
+ [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[abuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ :
+ : [l2] "m" (*l[2]),
+ [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[abuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l[3]),
+ [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += 4*BLOCKSIZE;
+ }
}
+
for ( ;nblocks; nblocks-- )
{
- const unsigned char *l;
-
- l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
+ l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
@@ -1758,7 +1814,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
"pxor %%xmm1, %%xmm5\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l),
+ : [l] "m" (*l[0]),
[abuf] "m" (*abuf)
: "memory" );
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index 0cdb532..937d868 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -535,7 +535,11 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
const unsigned char *l;
unsigned int ntz;
- if (i & 0xffffffffU)
+ if (i & 1)
+ return c->u_mode.ocb.L[0];
+ else if (i & 2)
+ return c->u_mode.ocb.L[1];
+ else if (i & 0xffffffffU)
{
asm ("rep;bsf %k[low], %k[ntz]\n\t"
: [ntz] "=r" (ntz)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 4368c6d..eff59c2 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1246,13 +1246,7 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
for ( ;nblocks; nblocks-- )
{
u64 i = ++c->u_mode.ocb.data_nblocks;
- unsigned int ntz = _gcry_ctz64 (i);
- const unsigned char *l;
-
- if (ntz < OCB_L_TABLE_SIZE)
- l = c->u_mode.ocb.L[ntz];
- else
- l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i);
+ const unsigned char *l = ocb_get_l(c, l_tmp.x1, i);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
@@ -1277,13 +1271,7 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
for ( ;nblocks; nblocks-- )
{
u64 i = ++c->u_mode.ocb.data_nblocks;
- unsigned int ntz = _gcry_ctz64 (i);
- const unsigned char *l;
-
- if (ntz < OCB_L_TABLE_SIZE)
- l = c->u_mode.ocb.L[ntz];
- else
- l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i);
+ const unsigned char *l = ocb_get_l(c, l_tmp.x1, i);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE);
@@ -1343,13 +1331,7 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
for ( ;nblocks; nblocks-- )
{
u64 i = ++c->u_mode.ocb.aad_nblocks;
- unsigned int ntz = _gcry_ctz64 (i);
- const unsigned char *l;
-
- if (ntz < OCB_L_TABLE_SIZE)
- l = c->u_mode.ocb.L[ntz];
- else
- l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i);
+ const unsigned char *l = ocb_get_l(c, l_tmp.x1, i);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
buf_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE);
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 0a54a17..a47a1b7 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1226,19 +1226,6 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
-#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
-static inline const unsigned char *
-get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
-{
- unsigned int ntz = _gcry_ctz64 (i);
-
- if (ntz < OCB_L_TABLE_SIZE)
- return c->u_mode.ocb.L[ntz];
- else
- return _gcry_cipher_ocb_get_l (c, l_tmp, i);
-}
-#endif
-
/* Bulk encryption/decryption of complete blocks in OCB mode. */
size_t
_gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -1265,17 +1252,41 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *Ls[16];
int i;
+ if (blkn % 16 == 0)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ Ls[i + 0] = c->u_mode.ocb.L[0];
+ Ls[i + 1] = c->u_mode.ocb.L[1];
+ Ls[i + 2] = c->u_mode.ocb.L[0];
+ Ls[i + 3] = c->u_mode.ocb.L[2];
+ Ls[i + 4] = c->u_mode.ocb.L[0];
+ Ls[i + 5] = c->u_mode.ocb.L[1];
+ Ls[i + 6] = c->u_mode.ocb.L[0];
+ }
+
+ Ls[7] = c->u_mode.ocb.L[3];
+ }
+
/* Process data in 16 block chunks. */
while (nblocks >= 16)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 16; i += 4)
+ if (blkn % 16 == 0)
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ blkn += 16;
+ Ls[15] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
if (encrypt)
@@ -1308,17 +1319,36 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *Ls[8];
int i;
+ if (blkn % 8 == 0)
+ {
+ Ls[0] = c->u_mode.ocb.L[0];
+ Ls[1] = c->u_mode.ocb.L[1];
+ Ls[2] = c->u_mode.ocb.L[0];
+ Ls[3] = c->u_mode.ocb.L[2];
+ Ls[4] = c->u_mode.ocb.L[0];
+ Ls[5] = c->u_mode.ocb.L[1];
+ Ls[6] = c->u_mode.ocb.L[0];
+ }
+
/* Process data in 8 block chunks. */
while (nblocks >= 8)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 8; i += 4)
+ if (blkn % 8 == 0)
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ blkn += 8;
+ Ls[7] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
+ {
+ for (i = 0; i < 8; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
if (encrypt)
@@ -1352,17 +1382,36 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *Ls[8];
int i;
+ if (blkn % 8 == 0)
+ {
+ Ls[0] = c->u_mode.ocb.L[0];
+ Ls[1] = c->u_mode.ocb.L[1];
+ Ls[2] = c->u_mode.ocb.L[0];
+ Ls[3] = c->u_mode.ocb.L[2];
+ Ls[4] = c->u_mode.ocb.L[0];
+ Ls[5] = c->u_mode.ocb.L[1];
+ Ls[6] = c->u_mode.ocb.L[0];
+ }
+
/* Process data in 8 block chunks. */
while (nblocks >= 8)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 8; i += 4)
+ if (blkn % 8 == 0)
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ blkn += 8;
+ Ls[7] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
+ {
+ for (i = 0; i < 8; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
if (encrypt)
@@ -1424,17 +1473,41 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const void *Ls[16];
int i;
+ if (blkn % 16 == 0)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ Ls[i + 0] = c->u_mode.ocb.L[0];
+ Ls[i + 1] = c->u_mode.ocb.L[1];
+ Ls[i + 2] = c->u_mode.ocb.L[0];
+ Ls[i + 3] = c->u_mode.ocb.L[2];
+ Ls[i + 4] = c->u_mode.ocb.L[0];
+ Ls[i + 5] = c->u_mode.ocb.L[1];
+ Ls[i + 6] = c->u_mode.ocb.L[0];
+ }
+
+ Ls[7] = c->u_mode.ocb.L[3];
+ }
+
/* Process data in 16 block chunks. */
while (nblocks >= 16)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 16; i += 4)
+ if (blkn % 16 == 0)
+ {
+ blkn += 16;
+ Ls[15] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ for (i = 0; i < 16; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
_gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
@@ -1462,17 +1535,36 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const void *Ls[8];
int i;
+ if (blkn % 8 == 0)
+ {
+ Ls[0] = c->u_mode.ocb.L[0];
+ Ls[1] = c->u_mode.ocb.L[1];
+ Ls[2] = c->u_mode.ocb.L[0];
+ Ls[3] = c->u_mode.ocb.L[2];
+ Ls[4] = c->u_mode.ocb.L[0];
+ Ls[5] = c->u_mode.ocb.L[1];
+ Ls[6] = c->u_mode.ocb.L[0];
+ }
+
/* Process data in 8 block chunks. */
while (nblocks >= 8)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 8; i += 4)
+ if (blkn % 8 == 0)
+ {
+ blkn += 8;
+ Ls[7] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ for (i = 0; i < 8; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
_gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
@@ -1501,17 +1593,36 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const void *Ls[8];
int i;
+ if (blkn % 8 == 0)
+ {
+ Ls[0] = c->u_mode.ocb.L[0];
+ Ls[1] = c->u_mode.ocb.L[1];
+ Ls[2] = c->u_mode.ocb.L[0];
+ Ls[3] = c->u_mode.ocb.L[2];
+ Ls[4] = c->u_mode.ocb.L[0];
+ Ls[5] = c->u_mode.ocb.L[1];
+ Ls[6] = c->u_mode.ocb.L[0];
+ }
+
/* Process data in 8 block chunks. */
while (nblocks >= 8)
{
/* l_tmp will be used only every 65536-th block. */
- for (i = 0; i < 8; i += 4)
+ if (blkn % 8 == 0)
+ {
+ blkn += 8;
+ Ls[7] = ocb_get_l(c, l_tmp, blkn);
+ }
+ else
{
- Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
- blkn += 4;
+ for (i = 0; i < 8; i += 4)
+ {
+ Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
}
_gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 11e60a7..7f361c9 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -1247,19 +1247,6 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
_gcry_burn_stack(burn_stack_depth);
}
-#ifdef USE_AMD64_ASM
-static inline const unsigned char *
-get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
-{
- unsigned int ntz = _gcry_ctz64 (i);
-
- if (ntz < OCB_L_TABLE_SIZE)
- return c->u_mode.ocb.L[ntz];
- else
- return _gcry_cipher_ocb_get_l (c, l_tmp, i);
-}
-#endif
-
/* Bulk encryption/decryption of complete blocks in OCB mode. */
size_t
_gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
@@ -1280,9 +1267,9 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
while (nblocks >= 3)
{
/* l_tmp will be used only every 65536-th block. */
- Ls[0] = get_l(c, l_tmp, blkn + 1);
- Ls[1] = get_l(c, l_tmp, blkn + 2);
- Ls[2] = get_l(c, l_tmp, blkn + 3);
+ Ls[0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[2] = ocb_get_l(c, l_tmp, blkn + 3);
blkn += 3;
if (encrypt)
@@ -1339,9 +1326,9 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
while (nblocks >= 3)
{
/* l_tmp will be used only every 65536-th block. */
- Ls[0] = get_l(c, l_tmp, blkn + 1);
- Ls[1] = get_l(c, l_tmp, blkn + 2);
- Ls[2] = get_l(c, l_tmp, blkn + 3);
+ Ls[0] = ocb_get_l(c, l_tmp, blkn + 1);
+ Ls[1] = ocb_get_l(c, l_tmp, blkn + 2);
+ Ls[2] = ocb_get_l(c, l_tmp, blkn + 3);
blkn += 3;
twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,