[PATCH] Simplify OCB offset calculation for parallel implementations
Jussi Kivilinna
jussi.kivilinna at iki.fi
Tue Aug 11 22:12:44 CEST 2015
* cipher/camellia-glue.c (_gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): Precalculate Ls array always, instead of
just if 'blkn % <parallel blocks> == 0'.
* cipher/serpent.c (_gcry_serpent_ocb_crypt)
(_gcry_serpent_ocb_auth): Ditto.
* cipher/rijndael-aesni.c (get_l): Remove low-bit checks.
(aes_ocb_enc, aes_ocb_dec, _gcry_aes_aesni_ocb_auth): Handle leading
blocks until block counter is multiple of 4, so that parallel block
processing loop can use 'c->u_mode.ocb.L' array directly.
* tests/basic.c (check_ocb_cipher_largebuf): Rename to...
(check_ocb_cipher_largebuf_split): ...this and add option to process
large buffer as two split buffers.
(check_ocb_cipher_largebuf): New.
--
The patch simplifies the source and reduces object size.
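
For illustration, here is a minimal standalone sketch of the rotated
offset-table idea (my own example, not libgcrypt code): block i of an
OCB stream uses L[ntz(i)], and within any chunk of N consecutive block
numbers the ntz values repeat a fixed pattern, so the Ls table can be
filled once, rotated by n = N - (blkn % N), and only the slot that
lands on a multiple of N needs refreshing per chunk.  A simple ntz()
helper stands in for the ocb_get_l()/c->u_mode.ocb.L[] lookup of the
real code.

#include <stdio.h>

#define N 4   /* parallel width; the patch uses 4, 8, 16 and 32 */

/* Number of trailing zero bits; stands in for ocb_get_l().  */
static unsigned int
ntz (unsigned long long i)
{
  unsigned int n = 0;
  while ((i & 1) == 0)
    {
      i >>= 1;
      n++;
    }
  return n;
}

int
main (void)
{
  unsigned long long blkn = 6;   /* arbitrary, unaligned starting count */
  unsigned int n = N - (blkn % N);
  unsigned int Ls[N];            /* would hold L[] pointers in real code */
  unsigned int *l;
  unsigned int chunk, j;

  /* Fixed part of the pattern: of any N = 4 consecutive block numbers,
     three have ntz 0, 1, 0; the one that is a multiple of 4 varies and
     is refreshed inside the loop.  Rotating by n lines the table up
     with an arbitrary starting block number.  */
  Ls[(0 + n) % N] = 0;
  Ls[(1 + n) % N] = 1;
  Ls[(2 + n) % N] = 0;
  l = &Ls[(3 + n) % N];

  for (chunk = 0; chunk < 3; chunk++)
    {
      blkn += N;
      *l = ntz (blkn - blkn % N);  /* only slot that changes per chunk */

      for (j = 0; j < N; j++)      /* verify against direct computation */
        if (Ls[j] != ntz (blkn - N + 1 + j))
          printf ("mismatch at block %llu\n", blkn - N + 1 + j);
    }

  printf ("rotated table matches direct ntz computation\n");
  return 0;
}

The rijndael-aesni code takes the other route described in the
changelog: it processes leading blocks one at a time until the block
counter is a multiple of 4, after which the 4-way loop can read
c->u_mode.ocb.L[0] and L[1] directly and only fetch the fourth,
varying L value per chunk.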
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-glue.c | 254 +++++++++------------
cipher/rijndael-aesni.c | 562 ++++++++++++++++++++++++-----------------------
cipher/serpent.c | 370 +++++++++++++------------------
tests/basic.c | 48 ++++
4 files changed, 586 insertions(+), 648 deletions(-)
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 2d5dd20..dee0169 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -631,58 +631,47 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_aesni_avx2 = 0;
const void *Ls[32];
+ unsigned int n = 32 - (blkn % 32);
+ const void **l;
int i;
- if (blkn % 32 == 0)
+ if (nblocks >= 32)
{
for (i = 0; i < 32; i += 8)
{
- Ls[i + 0] = c->u_mode.ocb.L[0];
- Ls[i + 1] = c->u_mode.ocb.L[1];
- Ls[i + 2] = c->u_mode.ocb.L[0];
- Ls[i + 3] = c->u_mode.ocb.L[2];
- Ls[i + 4] = c->u_mode.ocb.L[0];
- Ls[i + 5] = c->u_mode.ocb.L[1];
- Ls[i + 6] = c->u_mode.ocb.L[0];
+ Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0];
}
- Ls[7] = c->u_mode.ocb.L[3];
- Ls[15] = c->u_mode.ocb.L[4];
- Ls[23] = c->u_mode.ocb.L[3];
- }
+ Ls[(7 + n) % 32] = c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
- /* Process data in 32 block chunks. */
- while (nblocks >= 32)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 32 == 0)
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 32;
- Ls[31] = ocb_get_l(c, l_tmp, blkn);
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 32);
+
+ if (encrypt)
+ _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 32;
+ outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
}
- else
- {
- for (i = 0; i < 32; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
-
- if (encrypt)
- _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-
- nblocks -= 32;
- outbuf += 32 * CAMELLIA_BLOCK_SIZE;
- inbuf += 32 * CAMELLIA_BLOCK_SIZE;
- did_use_aesni_avx2 = 1;
}
if (did_use_aesni_avx2)
@@ -703,56 +692,45 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_aesni_avx = 0;
const void *Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ const void **l;
int i;
- if (blkn % 16 == 0)
+ if (nblocks >= 16)
{
for (i = 0; i < 16; i += 8)
{
- Ls[i + 0] = c->u_mode.ocb.L[0];
- Ls[i + 1] = c->u_mode.ocb.L[1];
- Ls[i + 2] = c->u_mode.ocb.L[0];
- Ls[i + 3] = c->u_mode.ocb.L[2];
- Ls[i + 4] = c->u_mode.ocb.L[0];
- Ls[i + 5] = c->u_mode.ocb.L[1];
- Ls[i + 6] = c->u_mode.ocb.L[0];
+ Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
}
- Ls[7] = c->u_mode.ocb.L[3];
- }
+ Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 16 == 0)
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 16;
- Ls[15] = ocb_get_l(c, l_tmp, blkn);
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
}
- else
- {
- for (i = 0; i < 16; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
-
- if (encrypt)
- _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-
- nblocks -= 16;
- outbuf += 16 * CAMELLIA_BLOCK_SIZE;
- inbuf += 16 * CAMELLIA_BLOCK_SIZE;
- did_use_aesni_avx = 1;
}
if (did_use_aesni_avx)
@@ -803,53 +781,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_aesni_avx2 = 0;
const void *Ls[32];
+ unsigned int n = 32 - (blkn % 32);
+ const void **l;
int i;
- if (blkn % 32 == 0)
+ if (nblocks >= 32)
{
for (i = 0; i < 32; i += 8)
{
- Ls[i + 0] = c->u_mode.ocb.L[0];
- Ls[i + 1] = c->u_mode.ocb.L[1];
- Ls[i + 2] = c->u_mode.ocb.L[0];
- Ls[i + 3] = c->u_mode.ocb.L[2];
- Ls[i + 4] = c->u_mode.ocb.L[0];
- Ls[i + 5] = c->u_mode.ocb.L[1];
- Ls[i + 6] = c->u_mode.ocb.L[0];
+ Ls[(i + 0 + n) % 32] = c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = c->u_mode.ocb.L[0];
}
- Ls[7] = c->u_mode.ocb.L[3];
- Ls[15] = c->u_mode.ocb.L[4];
- Ls[23] = c->u_mode.ocb.L[3];
- }
+ Ls[(7 + n) % 32] = c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = c->u_mode.ocb.L[3];
+ l = &Ls[(31 + n) % 32];
- /* Process data in 32 block chunks. */
- while (nblocks >= 32)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 32 == 0)
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 32;
- Ls[31] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 32; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 32);
- _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
- nblocks -= 32;
- abuf += 32 * CAMELLIA_BLOCK_SIZE;
- did_use_aesni_avx2 = 1;
+ nblocks -= 32;
+ abuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
}
if (did_use_aesni_avx2)
@@ -870,51 +838,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_aesni_avx = 0;
const void *Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ const void **l;
int i;
- if (blkn % 16 == 0)
+ if (nblocks >= 16)
{
for (i = 0; i < 16; i += 8)
{
- Ls[i + 0] = c->u_mode.ocb.L[0];
- Ls[i + 1] = c->u_mode.ocb.L[1];
- Ls[i + 2] = c->u_mode.ocb.L[0];
- Ls[i + 3] = c->u_mode.ocb.L[2];
- Ls[i + 4] = c->u_mode.ocb.L[0];
- Ls[i + 5] = c->u_mode.ocb.L[1];
- Ls[i + 6] = c->u_mode.ocb.L[0];
+ Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
}
- Ls[7] = c->u_mode.ocb.L[3];
- }
+ Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 16 == 0)
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 16;
- Ls[15] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 16; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
- _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
- nblocks -= 16;
- abuf += 16 * CAMELLIA_BLOCK_SIZE;
- did_use_aesni_avx = 1;
+ nblocks -= 16;
+ abuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
}
if (did_use_aesni_avx)
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 882cc79..be57b3d 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1307,11 +1307,7 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
const unsigned char *l;
unsigned int ntz;
- if (i & 1)
- return c->u_mode.ocb.L[0];
- else if (i & 2)
- return c->u_mode.ocb.L[1];
- else if (i & 0xffffffffU)
+ if (i & 0xffffffffU)
{
asm ("rep;bsf %k[low], %k[ntz]\n\t"
: [ntz] "=r" (ntz)
@@ -1376,7 +1372,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
- const unsigned char *l[4] = {};
+ const unsigned char *l;
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1390,103 +1386,112 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
[ctr] "m" (*c->u_ctr.ctr)
: "memory" );
- if (nblocks > 3)
+
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
{
- if (n % 4 == 0)
- {
- l[0] = c->u_mode.ocb.L[0];
- l[1] = c->u_mode.ocb.L[1];
- l[2] = c->u_mode.ocb.L[0];
- }
-
- for ( ;nblocks > 3 ; nblocks -= 4 )
- {
- /* l_tmp will be used only every 65536-th block. */
- if (n % 4 == 0)
- {
- n += 4;
- l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
- }
- else
- {
- l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
- l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
- l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
- l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
- n += 4;
- }
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- "movdqu %%xmm5, %[outbuf0]\n\t"
- : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
- : [l0] "m" (*l[0]),
- [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[inbuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "movdqu %%xmm5, %[outbuf1]\n\t"
- : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
- : [l1] "m" (*l[1]),
- [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "movdqu %%xmm5, %[outbuf2]\n\t"
- : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*l[2]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_enc_vec4 (ctx);
-
- asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf0]\n\t"
- "movdqu %[outbuf1],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm2\n\t"
- "movdqu %%xmm2, %[outbuf1]\n\t"
- "movdqu %[outbuf2],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm3\n\t"
- "movdqu %%xmm3, %[outbuf2]\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- "movdqu %%xmm4, %[outbuf3]\n\t"
- : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
- [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
- [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
- [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
- :
- : "memory" );
-
- outbuf += 4*BLOCKSIZE;
- inbuf += 4*BLOCKSIZE;
- }
+ /* l_tmp will be used only every 65536-th block. */
+ n += 4;
+ l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+ : [l2] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf1],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %[outbuf2],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
}
for ( ;nblocks; nblocks-- )
{
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Checksum_i = Checksum_{i-1} xor P_i */
@@ -1497,7 +1502,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm0, %%xmm6\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l[0]),
+ : [l] "m" (*l),
[inbuf] "m" (*inbuf)
: "memory" );
@@ -1537,7 +1542,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
- const unsigned char *l[4] = {};
+ const unsigned char *l;
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1551,103 +1556,111 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[ctr] "m" (*c->u_ctr.ctr)
: "memory" );
- if (nblocks > 3)
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
{
- if (n % 4 == 0)
- {
- l[0] = c->u_mode.ocb.L[0];
- l[1] = c->u_mode.ocb.L[1];
- l[2] = c->u_mode.ocb.L[0];
- }
-
- for ( ;nblocks > 3 ; nblocks -= 4 )
- {
- /* l_tmp will be used only every 65536-th block. */
- if (n % 4 == 0)
- {
- n += 4;
- l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
- }
- else
- {
- l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
- l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
- l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
- l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
- n += 4;
- }
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- "movdqu %%xmm5, %[outbuf0]\n\t"
- : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
- : [l0] "m" (*l[0]),
- [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[inbuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "movdqu %%xmm5, %[outbuf1]\n\t"
- : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
- : [l1] "m" (*l[1]),
- [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "movdqu %%xmm5, %[outbuf2]\n\t"
- : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*l[2]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_dec_vec4 (ctx);
-
- asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf0]\n\t"
- "movdqu %[outbuf1],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm2\n\t"
- "movdqu %%xmm2, %[outbuf1]\n\t"
- "movdqu %[outbuf2],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm3\n\t"
- "movdqu %%xmm3, %[outbuf2]\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- "movdqu %%xmm4, %[outbuf3]\n\t"
- "pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
- [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
- [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
- [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
- :
- : "memory" );
-
- outbuf += 4*BLOCKSIZE;
- inbuf += 4*BLOCKSIZE;
- }
+ /* l_tmp will be used only every 65536-th block. */
+ n += 4;
+ l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+ : [l2] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf1],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %[outbuf2],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
}
for ( ;nblocks; nblocks-- )
{
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
@@ -1657,7 +1670,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm1, %%xmm5\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l[0]),
+ : [l] "m" (*l),
[inbuf] "m" (*inbuf)
: "memory" );
@@ -1708,7 +1721,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
RIJNDAEL_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
- const unsigned char *l[4] = {};
+ const unsigned char *l;
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1722,90 +1735,91 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
[ctr] "m" (*c->u_mode.ocb.aad_sum)
: "memory" );
- if (nblocks > 3)
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[abuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [abuf] "m" (*abuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += BLOCKSIZE;
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
{
- if (n % 4 == 0)
- {
- l[0] = c->u_mode.ocb.L[0];
- l[1] = c->u_mode.ocb.L[1];
- l[2] = c->u_mode.ocb.L[0];
- }
-
- for ( ;nblocks > 3 ; nblocks -= 4 )
- {
- /* l_tmp will be used only every 65536-th block. */
- if (n % 4 == 0)
- {
- n += 4;
- l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- }
- else
- {
- l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- n += 4;
- }
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[abuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- :
- : [l0] "m" (*l[0]),
- [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[abuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- :
- : [l1] "m" (*l[1]),
- [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[abuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- :
- : [l2] "m" (*l[2]),
- [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[abuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_enc_vec4 (ctx);
-
- asm volatile ("pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- :
- :
- : "memory" );
-
- abuf += 4*BLOCKSIZE;
- }
+ /* l_tmp will be used only every 65536-th block. */
+ n += 4;
+ l = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[abuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[abuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ :
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[abuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ :
+ : [l2] "m" (*c->u_mode.ocb.L[0]),
+ [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[abuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += 4*BLOCKSIZE;
}
for ( ;nblocks; nblocks-- )
{
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
+ l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
@@ -1814,7 +1828,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
"pxor %%xmm1, %%xmm5\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l[0]),
+ : [l] "m" (*l),
[abuf] "m" (*abuf)
: "memory" );
diff --git a/cipher/serpent.c b/cipher/serpent.c
index a47a1b7..fc3afa6 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1250,56 +1250,45 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_avx2 = 0;
const void *Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ const void **l;
int i;
- if (blkn % 16 == 0)
+ if (nblocks >= 16)
{
for (i = 0; i < 16; i += 8)
{
- Ls[i + 0] = c->u_mode.ocb.L[0];
- Ls[i + 1] = c->u_mode.ocb.L[1];
- Ls[i + 2] = c->u_mode.ocb.L[0];
- Ls[i + 3] = c->u_mode.ocb.L[2];
- Ls[i + 4] = c->u_mode.ocb.L[0];
- Ls[i + 5] = c->u_mode.ocb.L[1];
- Ls[i + 6] = c->u_mode.ocb.L[0];
+ Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
}
- Ls[7] = c->u_mode.ocb.L[3];
- }
+ Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 16 == 0)
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 16;
- Ls[15] = ocb_get_l(c, l_tmp, blkn);
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
}
- else
- {
- for (i = 0; i < 16; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
-
- if (encrypt)
- _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-
- nblocks -= 16;
- outbuf += 16 * sizeof(serpent_block_t);
- inbuf += 16 * sizeof(serpent_block_t);
- did_use_avx2 = 1;
}
if (did_use_avx2)
@@ -1317,51 +1306,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_sse2 = 0;
const void *Ls[8];
- int i;
+ unsigned int n = 8 - (blkn % 8);
+ const void **l;
- if (blkn % 8 == 0)
+ if (nblocks >= 8)
{
- Ls[0] = c->u_mode.ocb.L[0];
- Ls[1] = c->u_mode.ocb.L[1];
- Ls[2] = c->u_mode.ocb.L[0];
- Ls[3] = c->u_mode.ocb.L[2];
- Ls[4] = c->u_mode.ocb.L[0];
- Ls[5] = c->u_mode.ocb.L[1];
- Ls[6] = c->u_mode.ocb.L[0];
- }
-
- /* Process data in 8 block chunks. */
- while (nblocks >= 8)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 8 == 0)
+ Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 8;
- Ls[7] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 8; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
+
+ if (encrypt)
+ _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
}
-
- if (encrypt)
- _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-
- nblocks -= 8;
- outbuf += 8 * sizeof(serpent_block_t);
- inbuf += 8 * sizeof(serpent_block_t);
- did_use_sse2 = 1;
}
if (did_use_sse2)
@@ -1380,51 +1357,39 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_neon = 0;
const void *Ls[8];
- int i;
+ unsigned int n = 8 - (blkn % 8);
+ const void **l;
- if (blkn % 8 == 0)
+ if (nblocks >= 8)
{
- Ls[0] = c->u_mode.ocb.L[0];
- Ls[1] = c->u_mode.ocb.L[1];
- Ls[2] = c->u_mode.ocb.L[0];
- Ls[3] = c->u_mode.ocb.L[2];
- Ls[4] = c->u_mode.ocb.L[0];
- Ls[5] = c->u_mode.ocb.L[1];
- Ls[6] = c->u_mode.ocb.L[0];
- }
-
- /* Process data in 8 block chunks. */
- while (nblocks >= 8)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 8 == 0)
+ Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 8;
- Ls[7] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 8; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
+
+ if (encrypt)
+ _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
}
-
- if (encrypt)
- _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-
- nblocks -= 8;
- outbuf += 8 * sizeof(serpent_block_t);
- inbuf += 8 * sizeof(serpent_block_t);
- did_use_neon = 1;
}
if (did_use_neon)
@@ -1471,51 +1436,40 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_avx2 = 0;
const void *Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ const void **l;
int i;
- if (blkn % 16 == 0)
+ if (nblocks >= 16)
{
for (i = 0; i < 16; i += 8)
{
- Ls[i + 0] = c->u_mode.ocb.L[0];
- Ls[i + 1] = c->u_mode.ocb.L[1];
- Ls[i + 2] = c->u_mode.ocb.L[0];
- Ls[i + 3] = c->u_mode.ocb.L[2];
- Ls[i + 4] = c->u_mode.ocb.L[0];
- Ls[i + 5] = c->u_mode.ocb.L[1];
- Ls[i + 6] = c->u_mode.ocb.L[0];
+ Ls[(i + 0 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = c->u_mode.ocb.L[0];
}
- Ls[7] = c->u_mode.ocb.L[3];
- }
+ Ls[(7 + n) % 16] = c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
- /* Process data in 16 block chunks. */
- while (nblocks >= 16)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 16 == 0)
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 16;
- Ls[15] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 16; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 16);
- _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
- nblocks -= 16;
- abuf += 16 * sizeof(serpent_block_t);
- did_use_avx2 = 1;
+ nblocks -= 16;
+ abuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
}
if (did_use_avx2)
@@ -1533,46 +1487,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_sse2 = 0;
const void *Ls[8];
- int i;
+ unsigned int n = 8 - (blkn % 8);
+ const void **l;
- if (blkn % 8 == 0)
+ if (nblocks >= 8)
{
- Ls[0] = c->u_mode.ocb.L[0];
- Ls[1] = c->u_mode.ocb.L[1];
- Ls[2] = c->u_mode.ocb.L[0];
- Ls[3] = c->u_mode.ocb.L[2];
- Ls[4] = c->u_mode.ocb.L[0];
- Ls[5] = c->u_mode.ocb.L[1];
- Ls[6] = c->u_mode.ocb.L[0];
- }
-
- /* Process data in 8 block chunks. */
- while (nblocks >= 8)
- {
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 8 == 0)
+ Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 8;
- Ls[7] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 8; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
- _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
- nblocks -= 8;
- abuf += 8 * sizeof(serpent_block_t);
- did_use_sse2 = 1;
+ nblocks -= 8;
+ abuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
}
if (did_use_sse2)
@@ -1591,46 +1533,34 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_neon = 0;
const void *Ls[8];
- int i;
-
- if (blkn % 8 == 0)
- {
- Ls[0] = c->u_mode.ocb.L[0];
- Ls[1] = c->u_mode.ocb.L[1];
- Ls[2] = c->u_mode.ocb.L[0];
- Ls[3] = c->u_mode.ocb.L[2];
- Ls[4] = c->u_mode.ocb.L[0];
- Ls[5] = c->u_mode.ocb.L[1];
- Ls[6] = c->u_mode.ocb.L[0];
- }
+ unsigned int n = 8 - (blkn % 8);
+ const void **l;
- /* Process data in 8 block chunks. */
- while (nblocks >= 8)
+ if (nblocks >= 8)
{
- /* l_tmp will be used only every 65536-th block. */
- if (blkn % 8 == 0)
+ Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
+ l = &Ls[(7 + n) % 8];
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
{
+ /* l_tmp will be used only every 65536-th block. */
blkn += 8;
- Ls[7] = ocb_get_l(c, l_tmp, blkn);
- }
- else
- {
- for (i = 0; i < 8; i += 4)
- {
- Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1);
- Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2);
- Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3);
- Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4);
- blkn += 4;
- }
- }
+ *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);
- _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
- nblocks -= 8;
- abuf += 8 * sizeof(serpent_block_t);
- did_use_neon = 1;
+ nblocks -= 8;
+ abuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
}
if (did_use_neon)
diff --git a/tests/basic.c b/tests/basic.c
index c1aa76a..4ea91a9 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3153,7 +3153,8 @@ do_check_ocb_cipher (int inplace)
static void
-check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
+check_ocb_cipher_largebuf_split (int algo, int keylen, const char *tagexpect,
+ unsigned int splitpos)
{
static const unsigned char key[32] =
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
@@ -3219,7 +3220,14 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
goto out_free;
}
- err = gcry_cipher_authenticate (hde, inbuf, buflen);
+ if (splitpos)
+ {
+ err = gcry_cipher_authenticate (hde, inbuf, splitpos);
+ }
+ if (!err)
+ {
+ err = gcry_cipher_authenticate (hde, inbuf + splitpos, buflen - splitpos);
+ }
if (err)
{
fail ("cipher-ocb, gcry_cipher_authenticate failed (large, algo %d): %s\n",
@@ -3229,10 +3237,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
goto out_free;
}
- err = gcry_cipher_final (hde);
+ if (splitpos)
+ {
+ err = gcry_cipher_encrypt (hde, outbuf, splitpos, inbuf, splitpos);
+ }
if (!err)
{
- err = gcry_cipher_encrypt (hde, outbuf, buflen, inbuf, buflen);
+ err = gcry_cipher_final (hde);
+ if (!err)
+ {
+ err = gcry_cipher_encrypt (hde, outbuf + splitpos, buflen - splitpos,
+ inbuf + splitpos, buflen - splitpos);
+ }
}
if (err)
{
@@ -3267,10 +3283,18 @@ check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
}
/* Now for the decryption. */
- err = gcry_cipher_final (hdd);
+ if (splitpos)
+ {
+ err = gcry_cipher_decrypt (hdd, outbuf, splitpos, NULL, 0);
+ }
if (!err)
{
- err = gcry_cipher_decrypt (hdd, outbuf, buflen, NULL, 0);
+ err = gcry_cipher_final (hdd);
+ if (!err)
+ {
+ err = gcry_cipher_decrypt (hdd, outbuf + splitpos, buflen - splitpos,
+ NULL, 0);
+ }
}
if (err)
{
@@ -3319,6 +3343,18 @@ out_free:
static void
+check_ocb_cipher_largebuf (int algo, int keylen, const char *tagexpect)
+{
+ unsigned int split;
+
+ for (split = 0; split < 32 * 16; split = split * 2 + 16)
+ {
+ check_ocb_cipher_largebuf_split(algo, keylen, tagexpect, split);
+ }
+}
+
+
+static void
check_ocb_cipher (void)
{
/* Check OCB cipher with separate destination and source buffers for