[PATCH 1/6] Add OCB bulk mode for AES SSSE3 implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Jul 27 11:04:15 CEST 2015
* cipher/rijndael-ssse3-amd64.c (SSSE3_STATE_SIZE): New.
[HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (vpaes_ssse3_prepare): Use
'ssse3_state' for storing current SSSE3 state.
[HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]
(vpaes_ssse3_cleanup): Restore SSSE3 state from 'ssse3_state'.
(_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption)
(_gcry_aes_ssse3_encrypt, _gcry_aes_ssse3_cfb_enc)
(_gcry_aes_ssse3_cbc_enc, _gcry_aes_ssse3_ctr_enc)
(_gcry_aes_ssse3_decrypt, _gcry_aes_ssse3_cfb_dec)
(_gcry_aes_ssse3_cbc_dec): Add 'ssse3_state'
array.
(get_l, ssse3_ocb_enc, ssse3_ocb_dec, _gcry_aes_ssse3_ocb_crypt)
(_gcry_aes_ssse3_ocb_auth): New.
* cipher/rijndael.c (_gcry_aes_ssse3_ocb_crypt)
(_gcry_aes_ssse3_ocb_auth): New.
(_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth) [USE_SSSE3]: Use SSSE3
implementation for OCB.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-ssse3-amd64.c | 305 ++++++++++++++++++++++++++++++++++++++++-
cipher/rijndael.c | 19 +++
2 files changed, 320 insertions(+), 4 deletions(-)
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index 21438dc..0cdb532 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -45,6 +45,7 @@
#include "bufhelp.h"
#include "cipher-selftest.h"
#include "rijndael-internal.h"
+#include "./cipher-internal.h"
#ifdef USE_SSSE3
@@ -62,9 +63,9 @@
SSE registers are cleared and won't reveal any information about
the key or the data. */
#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define SSSE3_STATE_SIZE (16 * 10)
/* XMM6-XMM15 are callee-saved registers on WIN64. */
# define vpaes_ssse3_prepare() \
- char win64tmp[16 * 10]; \
asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" \
"movdqu %%xmm7, 1*16(%0)\n\t" \
"movdqu %%xmm8, 2*16(%0)\n\t" \
@@ -76,7 +77,7 @@
"movdqu %%xmm14, 8*16(%0)\n\t" \
"movdqu %%xmm15, 9*16(%0)\n\t" \
: \
- : "r" (win64tmp) \
+ : "r" (ssse3_state) \
: "memory" )
# define vpaes_ssse3_cleanup() \
asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \
@@ -96,10 +97,11 @@
"movdqu 8*16(%0), %%xmm14 \n\t" \
"movdqu 9*16(%0), %%xmm15 \n\t" \
: \
- : "r" (win64tmp) \
+ : "r" (ssse3_state) \
: "memory" )
#else
-# define vpaes_ssse3_prepare() /*_*/
+# define SSSE3_STATE_SIZE 1
+# define vpaes_ssse3_prepare() (void)ssse3_state
# define vpaes_ssse3_cleanup() \
asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \
"pxor %%xmm1, %%xmm1 \n\t" \
@@ -148,6 +150,7 @@ void
_gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
{
unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare();
@@ -178,6 +181,7 @@ void
_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
{
unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare();
@@ -238,6 +242,7 @@ _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
{
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare_enc (aes_const_ptr);
asm volatile ("movdqu %[src], %%xmm0\n\t"
@@ -261,6 +266,7 @@ _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
{
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare_enc (aes_const_ptr);
@@ -300,6 +306,7 @@ _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
{
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare_enc (aes_const_ptr);
@@ -347,6 +354,7 @@ _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
u64 ctrlow;
vpaes_ssse3_prepare_enc (aes_const_ptr);
@@ -411,6 +419,7 @@ _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
{
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare_dec (aes_const_ptr);
asm volatile ("movdqu %[src], %%xmm0\n\t"
@@ -434,6 +443,7 @@ _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
{
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare_enc (aes_const_ptr);
@@ -474,6 +484,7 @@ _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
{
unsigned int nrounds = ctx->rounds;
const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
vpaes_ssse3_prepare_dec (aes_const_ptr);
@@ -516,6 +527,292 @@ _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
}
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
+ unsigned char *ctr, const void **aes_const_ptr,
+ byte ssse3_state[SSSE3_STATE_SIZE], int encrypt)
+{
+ const unsigned char *l;
+ unsigned int ntz;
+
+ if (i & 0xffffffffU)
+ {
+ asm ("rep;bsf %k[low], %k[ntz]\n\t"
+ : [ntz] "=r" (ntz)
+ : [low] "r" (i & 0xffffffffU)
+ : "cc");
+ }
+ else
+ {
+ if (OCB_L_TABLE_SIZE < 32)
+ {
+ ntz = 32;
+ }
+ else if (i)
+ {
+ asm ("rep;bsf %k[high], %k[ntz]\n\t"
+ : [ntz] "=r" (ntz)
+ : [high] "r" (i >> 32)
+ : "cc");
+ ntz += 32;
+ }
+ else
+ {
+ ntz = 64;
+ }
+ }
+
+ if (ntz < OCB_L_TABLE_SIZE)
+ {
+ l = c->u_mode.ocb.L[ntz];
+ }
+ else
+ {
+ /* Store Offset & Checksum before calling external function */
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*iv),
+ [ctr] "=m" (*ctr)
+ :
+ : "memory" );
+
+ /* Restore SSSE3 state. */
+ vpaes_ssse3_cleanup();
+
+ l = _gcry_cipher_ocb_get_l (c, l_tmp, i);
+
+ /* Save SSSE3 state. */
+ if (encrypt)
+ {
+ vpaes_ssse3_prepare_enc (*aes_const_ptr);
+ }
+ else
+ {
+ vpaes_ssse3_prepare_dec (*aes_const_ptr);
+ }
+
+ /* Restore Offset & Checksum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*iv),
+ [ctr] "m" (*ctr)
+ : "memory" );
+ }
+
+ return l;
+}
+
+
+static void
+ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc (aes_const_ptr);
+
+ /* Preload Offset and Checksum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ const unsigned char *l;
+
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr, &aes_const_ptr,
+ ssse3_state, 1);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
+ :
+ : "memory" );
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+ vpaes_ssse3_cleanup ();
+}
+
+static void
+ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_dec (aes_const_ptr);
+
+ /* Preload Offset and Checksum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ const unsigned char *l;
+
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr, &aes_const_ptr,
+ ssse3_state, 0);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
+ :
+ : "memory" );
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+ vpaes_ssse3_cleanup ();
+}
+
+
+void
+_gcry_aes_ssse3_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ ssse3_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+ else
+ ssse3_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
+}
+
+
+void
+_gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+ union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u64 n = c->u_mode.ocb.aad_nblocks;
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+ byte ssse3_state[SSSE3_STATE_SIZE];
+
+ vpaes_ssse3_prepare_enc (aes_const_ptr);
+
+ /* Preload Offset and Sum */
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ "movdqu %[ctr], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_mode.ocb.aad_offset),
+ [ctr] "m" (*c->u_mode.ocb.aad_sum)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ const unsigned char *l;
+
+ l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, &aes_const_ptr, ssse3_state, 1);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[abuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [abuf] "m" (*abuf)
+ : "memory" );
+
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.aad_nblocks = n;
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ "movdqu %%xmm6, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_mode.ocb.aad_offset),
+ [ctr] "=m" (*c->u_mode.ocb.aad_sum)
+ :
+ : "memory" );
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+ vpaes_ssse3_cleanup ();
+}
+
+
#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define X(...)
#else
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 4f063c4..1fe16d6 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -137,6 +137,11 @@ extern void _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx,
unsigned char *outbuf,
const unsigned char *inbuf,
unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern void _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
#endif
#ifdef USE_PADLOCK
@@ -1226,6 +1231,13 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else if (encrypt)
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
@@ -1314,6 +1326,13 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_ocb_auth (c, abuf, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
More information about the Gcrypt-devel
mailing list