[PATCH 05/10] Add parallelized AES-NI CBC decryption
Jussi Kivilinna
jussi.kivilinna at mbnet.fi
Fri Nov 23 18:22:14 CET 2012
* cipher/rijndael.c [USE_AESNI] (aesni_cleanup_5): New macro.
[USE_AESNI] (do_aesni_dec_vec4): New function.
(_gcry_aes_cbc_dec) [USE_AESNI]: Add parallelized CBC loop.
(_gcry_aes_cbc_dec) [USE_AESNI]: Change IV storage register from xmm3
to xmm5.
--
This gives ~60% improvement in CBC decryption speed on sandy-bridge (x86-64).
Overall speed improvement with this and previous CBC patches is over 400%.
Before:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 670ms 770ms 2920ms 720ms 1900ms 660ms 2260ms 2250ms 480ms 500ms
AES192 860ms 930ms 3250ms 870ms 2210ms 830ms 2580ms 2580ms 570ms 570ms
AES256 1020ms 1080ms 3580ms 1030ms 2550ms 970ms 2880ms 2870ms 660ms 660ms
After:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 670ms 770ms 2130ms 450ms 1880ms 670ms 2250ms 2280ms 490ms 490ms
AES192 880ms 920ms 2460ms 540ms 2210ms 830ms 2580ms 2570ms 580ms 570ms
AES256 1020ms 1070ms 2800ms 620ms 2560ms 970ms 2880ms 2880ms 660ms 650ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
---
cipher/rijndael.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 152 insertions(+), 9 deletions(-)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 69e1df1..34a0f8c 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -822,6 +822,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
}
+/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static void
+do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
+#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
+#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
+#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
+#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
+#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
+#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
+#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmp $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmp $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm0_xmm1
+ aesdeclast_xmm0_xmm2
+ aesdeclast_xmm0_xmm3
+ aesdeclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesdec_xmm0_xmm1
+#undef aesdec_xmm0_xmm2
+#undef aesdec_xmm0_xmm3
+#undef aesdec_xmm0_xmm4
+#undef aesdeclast_xmm0_xmm1
+#undef aesdeclast_xmm0_xmm2
+#undef aesdeclast_xmm0_xmm3
+#undef aesdeclast_xmm0_xmm4
+}
+
+
/* Perform a CFB encryption or decryption round using the
initialization vector IV and the input block A. Write the result
to the output block B and update IV. IV needs to be 16 byte
@@ -1623,17 +1732,51 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
ctx->decryption_prepared = 1;
}
- /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
- storage, out-of-order CPUs see parallellism even over loop iterations
- and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
- improvements are possible with do_aesni_cbc_dec_4() when implemented.
- */
asm volatile
- ("movdqu %[iv], %%xmm3\n\t" /* use xmm3 as fast IV storage */
+ ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */
: /* No output */
: [iv] "m" (*iv)
: "memory");
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile
+ ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */
+ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
for ( ;nblocks; nblocks-- )
{
asm volatile
@@ -1647,9 +1790,9 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
asm volatile
("movdqu %[outbuf], %%xmm0\n\t"
- "pxor %%xmm3, %%xmm0\n\t" /* xor IV with output */
+ "pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */
"movdqu %%xmm0, %[outbuf]\n\t"
- "movdqu %%xmm2, %%xmm3\n\t" /* store savebuf as new IV */
+ "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */
: /* No output */
: [outbuf] "m" (*outbuf)
: "memory");
@@ -1659,7 +1802,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
}
asm volatile
- ("movdqu %%xmm3, %[iv]\n\t" /* store IV */
+ ("movdqu %%xmm5, %[iv]\n\t" /* store IV */
: /* No output */
: [iv] "m" (*iv)
: "memory");
More information about the Gcrypt-devel
mailing list