[PATCH] Add AES-NI acceleration for AES-XTS
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Aug 6 14:09:43 CEST 2017
* cipher/cipher-internal.h (gcry_cipher_handle): Change bulk
XTS function to take cipher context.
* cipher/cipher-xts.c (_gcry_cipher_xts_crypt): Ditto.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup AES-NI
XTS bulk function.
* cipher/rijndael-aesni.c (xts_gfmul_const, _gcry_aes_aesni_xts_enc)
(_gcry_aes_aesni_xts_dec, _gcry_aes_aesni_xts_crypt): New.
* cipher/rijndael.c (_gcry_aes_aesni_xts_crypt)
(_gcry_aes_xts_crypt): New.
* src/cipher.h (_gcry_aes_xts_crypt): New.
--
Benchmarks on Intel Core i7-4790K, 4.0 GHz (no turbo):
Before:
XTS enc | 1.66 ns/B 575.7 MiB/s 6.63 c/B
XTS dec | 1.66 ns/B 575.5 MiB/s 6.63 c/B
After (~6x faster):
XTS enc | 0.270 ns/B 3528.5 MiB/s 1.08 c/B
XTS dec | 0.272 ns/B 3511.5 MiB/s 1.09 c/B
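For reference: XTS handles each 16-byte block as C = T xor AES-K1(P xor T),
where the tweak T is multiplied by x in GF(2^128) between consecutive
blocks, reducing modulo the polynomial represented by 0x87. A minimal C
sketch of that per-block tweak update, matching the branchless carry
handling used by the generic fallback in this patch (function name is
illustrative only):

  #include <stdint.h>

  /* Multiply a 128-bit tweak, held as two little-endian 64-bit
   * halves, by x in GF(2^128); 0x87 is the reduction constant. */
  static void xts_next_tweak (uint64_t *lo, uint64_t *hi)
  {
    /* -(*hi >> 63) is all-ones iff bit 127 is set; AND keeps 0x87. */
    uint64_t carry = -(*hi >> 63) & 0x87;

    *hi = (*hi << 1) + (*lo >> 63); /* shift, carrying bit 63 upward */
    *lo = (*lo << 1) ^ carry;       /* xor the reduction into low half */
  }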
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index b7481255..8c897d7b 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -146,7 +146,7 @@ struct gcry_cipher_handle
const void *inbuf_arg, size_t nblocks, int encrypt);
size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks);
- void (*xts_crypt)(gcry_cipher_hd_t c, unsigned char *tweak,
+ void (*xts_crypt)(void *context, unsigned char *tweak,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks, int encrypt);
} bulk;
diff --git a/cipher/cipher-xts.c b/cipher/cipher-xts.c
index 4da89e55..06cefbe0 100644
--- a/cipher/cipher-xts.c
+++ b/cipher/cipher-xts.c
@@ -93,7 +93,8 @@ _gcry_cipher_xts_crypt (gcry_cipher_hd_t c,
/* Use a bulk method if available. */
if (nblocks && c->bulk.xts_crypt)
{
- c->bulk.xts_crypt (c, c->u_ctr.ctr, outbuf, inbuf, nblocks, encrypt);
+ c->bulk.xts_crypt (&c->context.c, c->u_ctr.ctr, outbuf, inbuf, nblocks,
+ encrypt);
inbuf += nblocks * GCRY_XTS_BLOCK_LEN;
outbuf += nblocks * GCRY_XTS_BLOCK_LEN;
inbuflen -= nblocks * GCRY_XTS_BLOCK_LEN;
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 98127386..063c13da 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -532,6 +532,7 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
h->bulk.ctr_enc = _gcry_aes_ctr_enc;
h->bulk.ocb_crypt = _gcry_aes_ocb_crypt;
h->bulk.ocb_auth = _gcry_aes_ocb_auth;
+ h->bulk.xts_crypt = _gcry_aes_xts_crypt;
break;
#endif /*USE_AES*/
#ifdef USE_BLOWFISH
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 3d323cf0..50a0745b 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -3007,4 +3007,295 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
}
+static const u64 xts_gfmul_const[2] __attribute__ ((aligned (16))) =
+ { 0x87, 0x01 };
+
+
+static void
+_gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_6_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_6 ();
+
+ /* Preload Tweak */
+ asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+ "movdqa %[gfmul], %%xmm6\n\t"
+ :
+ : [tweak] "m" (*tweak),
+ [gfmul] "m" (*xts_gfmul_const)
+ : "memory" );
+
+ for ( ;nblocks >= 4; nblocks -= 4 )
+ {
+ asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * 16))
+ : [inbuf0] "m" (*(inbuf + 0 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * 16))
+ : [inbuf1] "m" (*(inbuf + 1 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * 16))
+ : [inbuf2] "m" (*(inbuf + 2 * 16))
+ : "memory" );
+
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm5, %[outbuf3]\n\t"
+
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf3] "=m" (*(outbuf + 3 * 16))
+ : [inbuf3] "m" (*(inbuf + 3 * 16))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %[outbuf1], %%xmm0\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf2], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %[outbuf3], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+ [outbuf1] "+m" (*(outbuf + 1 * 16)),
+ [outbuf2] "+m" (*(outbuf + 2 * 16)),
+ [outbuf3] "+m" (*(outbuf + 3 * 16))
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE * 4;
+ inbuf += BLOCKSIZE * 4;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "movdqa %%xmm5, %%xmm4\n\t"
+
+ "pshufd $0x13, %%xmm5, %%xmm1\n\t"
+ "psrad $31, %%xmm1\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+ : [tweak] "=m" (*tweak)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+}
+
+
+static void
+_gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ aesni_prepare_2_6_variable;
+
+ aesni_prepare ();
+ aesni_prepare_2_6 ();
+
+ /* Preload Tweak */
+ asm volatile ("movdqu %[tweak], %%xmm5\n\t"
+ "movdqa %[gfmul], %%xmm6\n\t"
+ :
+ : [tweak] "m" (*tweak),
+ [gfmul] "m" (*xts_gfmul_const)
+ : "memory" );
+
+ for ( ;nblocks >= 4; nblocks -= 4 )
+ {
+ asm volatile ("pshufd $0x13, %%xmm5, %%xmm4\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * 16))
+ : [inbuf0] "m" (*(inbuf + 0 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * 16))
+ : [inbuf1] "m" (*(inbuf + 1 * 16))
+ : "memory" );
+
+ asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+
+ "movdqa %%xmm4, %%xmm0\n\t"
+ "paddd %%xmm4, %%xmm4\n\t"
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * 16))
+ : [inbuf2] "m" (*(inbuf + 2 * 16))
+ : "memory" );
+
+ asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm5, %[outbuf3]\n\t"
+
+ "psrad $31, %%xmm0\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ : [outbuf3] "=m" (*(outbuf + 3 * 16))
+ : [inbuf3] "m" (*(inbuf + 3 * 16))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %[outbuf1], %%xmm0\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf2], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %[outbuf3], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm4\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * 16)),
+ [outbuf1] "+m" (*(outbuf + 1 * 16)),
+ [outbuf2] "+m" (*(outbuf + 2 * 16)),
+ [outbuf3] "+m" (*(outbuf + 3 * 16))
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE * 4;
+ inbuf += BLOCKSIZE * 4;
+ }
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "movdqa %%xmm5, %%xmm4\n\t"
+
+ "pshufd $0x13, %%xmm5, %%xmm1\n\t"
+ "psrad $31, %%xmm1\n\t"
+ "paddq %%xmm5, %%xmm5\n\t"
+ "pand %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm5, %[tweak]\n\t"
+ : [tweak] "=m" (*tweak)
+ :
+ : "memory" );
+
+ aesni_cleanup ();
+ aesni_cleanup_2_6 ();
+}
+
+
+void
+_gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ _gcry_aes_aesni_xts_enc (ctx, tweak, outbuf, inbuf, nblocks);
+ else
+ _gcry_aes_aesni_xts_dec (ctx, tweak, outbuf, inbuf, nblocks);
+}
+
#endif /* USE_AESNI */
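The tweak-update sequence in the assembly blocks above is a branchless
GF(2^128) multiply-by-x: pshufd $0x13 places the dwords holding bits 127
and 63 where psrad $31 can smear their sign bits into dword-wide masks,
pand against xts_gfmul_const = { 0x87, 0x01 } turns those masks into the
reduction constant and the half-to-half carry, paddq doubles both 64-bit
halves, and pxor folds the masked values back in. An equivalent SSE2
intrinsics sketch (illustrative only, not part of the patch):

  #include <emmintrin.h>

  static __m128i xts_double_tweak (__m128i t)
  {
    /* Low lane 0x87 (reduction), high lane 0x01 (inter-half carry). */
    const __m128i gf = _mm_set_epi64x (0x01, 0x87);

    __m128i mask = _mm_shuffle_epi32 (t, 0x13); /* like pshufd $0x13 */
    mask = _mm_srai_epi32 (mask, 31);           /* like psrad $31 */
    mask = _mm_and_si128 (mask, gf);            /* like pand */
    t = _mm_add_epi64 (t, t);                   /* like paddq t,t */
    return _mm_xor_si128 (t, mask);             /* like pxor */
  }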
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 8637195a..548bfa09 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -103,6 +103,11 @@ extern void _gcry_aes_aesni_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
int encrypt);
extern void _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks);
+extern void _gcry_aes_aesni_xts_crypt (RIJNDAEL_context *ctx,
+ unsigned char *tweak,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ size_t nblocks, int encrypt);
#endif
#ifdef USE_SSSE3
@@ -1467,6 +1472,85 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
}
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+void
+_gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int burn_depth = 0;
+ rijndael_cryptfn_t crypt_fn;
+ u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+
+ if (encrypt)
+ {
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ crypt_fn = ctx->encrypt_fn;
+ }
+ else
+ {
+ check_decryption_preparation (ctx);
+
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
+ crypt_fn = ctx->decrypt_fn;
+ }
+
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
+ {
+ _gcry_aes_aesni_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+ burn_depth = 0;
+ }
+#endif /*USE_AESNI*/
+ else
+ {
+ tweak_next_lo = buf_get_le64 (tweak + 0);
+ tweak_next_hi = buf_get_le64 (tweak + 8);
+
+ while (nblocks)
+ {
+ tweak_lo = tweak_next_lo;
+ tweak_hi = tweak_next_hi;
+
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ tmp_lo = buf_get_le64 (inbuf + 0) ^ tweak_lo;
+ tmp_hi = buf_get_le64 (inbuf + 8) ^ tweak_hi;
+
+ buf_put_le64 (outbuf + 0, tmp_lo);
+ buf_put_le64 (outbuf + 8, tmp_hi);
+
+ /* Generate next tweak. */
+ carry = -(tweak_next_hi >> 63) & 0x87;
+ tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+ tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+ burn_depth = crypt_fn (ctx, outbuf, outbuf);
+
+ buf_put_le64 (outbuf + 0, buf_get_le64 (outbuf + 0) ^ tweak_lo);
+ buf_put_le64 (outbuf + 8, buf_get_le64 (outbuf + 8) ^ tweak_hi);
+
+ outbuf += GCRY_XTS_BLOCK_LEN;
+ inbuf += GCRY_XTS_BLOCK_LEN;
+ nblocks--;
+ }
+
+ buf_put_le64 (tweak + 0, tweak_next_lo);
+ buf_put_le64 (tweak + 8, tweak_next_hi);
+ }
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 5 * sizeof(void *));
+}
+
/* Run the self-tests for AES 128. Returns NULL on success. */
static const char*
diff --git a/src/cipher.h b/src/cipher.h
index f2acb556..d9e0ac6a 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -158,6 +158,9 @@ size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks, int encrypt);
size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks);
+void _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
/*-- blowfish.c --*/
void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv,
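With the patch applied, the bulk path is reached through the regular
libgcrypt cipher API whenever the caller supplies whole 16-byte blocks.
A minimal caller sketch (illustrative only; error checking omitted; XTS
mode requires libgcrypt 1.8 or newer):

  #include <gcrypt.h>

  int main (void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[32] = { 0 };   /* K1 || K2 for AES-128 in XTS */
    unsigned char tweak[16] = { 0 }; /* per-sector tweak, set as IV */
    unsigned char sector[4096] = { 0 };

    gcry_check_version (NULL);
    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_XTS, 0);
    gcry_cipher_setkey (hd, key, sizeof key); /* both subkeys at once */
    gcry_cipher_setiv (hd, tweak, sizeof tweak);
    gcry_cipher_encrypt (hd, sector, sizeof sector, NULL, NULL);
    gcry_cipher_close (hd);
    return 0;
  }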