[PATCH] Optimize AES-NI CTR mode.
Jussi Kivilinna
jussi.kivilinna at mbnet.fi
Thu Nov 29 16:31:03 CET 2012
* cipher/rijndael.c [USE_AESNI] (do_aesni_ctr, do_aesni_ctr_4): Make
handling of 64-bit overflow and carry conditional. Avoid passing the
value '1' from a general-purpose register to a vector register;
generate and use '-1' instead.
--
We only need to handle the 64-bit carry in a few special cases that happen
very rarely, so move carry handling to a slow path and only detect the need
for it on the fast path. Also avoid moving '1' from a general-purpose
register to a vector register, as that might be slow on some CPUs. Instead,
generate '-1' with SSE2 instructions and increment the IV by subtraction
rather than addition. Overall this gives an ~8% speed improvement for AES
CTR mode on Intel Sandy Bridge.
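
For illustration only (not part of the patch), below is a minimal standalone
sketch of the same counter-increment trick written with SSE2/SSSE3 intrinsics
instead of the inline assembly used in rijndael.c. The function name
ctr_increment_sketch and the local be_mask are made up for this example; the
idea is the same: build '-1' with pcmpeqd/psrldq, increment the byte-swapped
counter with psubq, and take a slow path only when the low 64 bits of the
big-endian counter are all ones.

/* Minimal sketch of the CTR increment used by the patch, expressed with
 * intrinsics. Compile with -mssse3 (pshufb). Illustrative names only. */
#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 (pshufb) */
#include <stdint.h>
#include <string.h>

/* Byte-swap mask turning the big-endian CTR block into little-endian qwords,
 * analogous to the be_mask used by do_aesni_ctr(). */
static const uint8_t be_mask[16] =
  { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

static void
ctr_increment_sketch (uint8_t ctr[16])
{
  __m128i bswap = _mm_loadu_si128 ((const __m128i *) be_mask);
  __m128i block = _mm_loadu_si128 ((const __m128i *) ctr);

  /* Generate { low qword = -1, high qword = 0 } without touching a
   * general-purpose register: all-ones via pcmpeqd, then shift the upper
   * 64 bits out with psrldq. */
  __m128i minus_one = _mm_cmpeq_epi32 (_mm_setzero_si128 (),
                                       _mm_setzero_si128 ());
  minus_one = _mm_srli_si128 (minus_one, 8);

  /* Byte-swap to little endian; "x - (-1)" then increments the low qword. */
  __m128i le = _mm_shuffle_epi8 (block, bswap);
  le = _mm_sub_epi64 (le, minus_one);

  /* Slow path: only when the low 64 bits of the big-endian counter (bytes
   * 8..15) are all ones does the increment wrap, so propagate the carry by
   * also subtracting -1 from the high qword (minus_one moved up via pslldq). */
  uint32_t w2, w3;
  memcpy (&w2, ctr + 8, 4);
  memcpy (&w3, ctr + 12, 4);
  if (w2 == 0xffffffffU && w3 == 0xffffffffU)
    le = _mm_sub_epi64 (le, _mm_slli_si128 (minus_one, 8));

  /* Swap back to big endian and store the updated counter. */
  block = _mm_shuffle_epi8 (le, bswap);
  _mm_storeu_si128 ((__m128i *) ctr, block);
}

The fast path does one compare pair against the stored counter and skips the
carry propagation entirely, which mirrors how the patch keeps the carry
handling off the common path.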
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
---
cipher/rijndael.c | 90 +++++++++++++++++++++++------------------------------
1 file changed, 39 insertions(+), 51 deletions(-)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index cc7f8d6..6313ab2 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1015,24 +1015,20 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
"movaps %%xmm0, %%xmm2\n\t"
- "mov $1, %%esi\n\t" /* xmm2++ (big-endian) */
- "movd %%esi, %%xmm1\n\t"
-
- "movl 12(%[ctr]), %%esi\n\t" /* load lower parts of CTR */
- "bswapl %%esi\n\t"
- "movl 8(%[ctr]), %%edi\n\t"
- "bswapl %%edi\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
"pshufb %[mask], %%xmm2\n\t"
- "paddq %%xmm1, %%xmm2\n\t"
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ (big endian) */
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jnc .Lno_carry%=\n\t"
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "cmpl $0xffffffff, 12(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
- /* swap upper and lower halfs */
- "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
- "paddq %%xmm1, %%xmm2\n\t" /* add carry to upper 64bits */
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "psubq %%xmm1, %%xmm2\n\t" /* add carry to upper 64bits */
".Lno_carry%=:\n\t"
@@ -1085,7 +1081,7 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
[key] "r" (ctx->keyschenc),
[rounds] "g" (ctx->rounds),
[mask] "m" (*be_mask)
- : "%esi", "%edi", "cc", "memory");
+ : "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
@@ -1120,48 +1116,40 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
"movaps %%xmm0, %%xmm2\n\t"
- "mov $1, %%esi\n\t" /* xmm1 := 1 */
- "movd %%esi, %%xmm1\n\t"
-
- "movl 12(%[ctr]), %%esi\n\t" /* load lower parts of CTR */
- "bswapl %%esi\n\t"
- "movl 8(%[ctr]), %%edi\n\t"
- "bswapl %%edi\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
"pshufb %[mask], %%xmm2\n\t" /* xmm2 := le(xmm2) */
- "paddq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
"movaps %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
- "paddq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
"movaps %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
- "paddq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
"movaps %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
- "paddq %%xmm1, %%xmm5\n\t" /* xmm5++ */
-
- /* swap upper and lower halfs */
- "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
-
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm2%=\n\t"
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm3%=\n\t"
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm4%=\n\t"
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm5%=\n\t"
- "jmp .Lno_carry%=\n\t"
-
- ".Lcarry_xmm2%=:\n\t"
- "paddq %%xmm1, %%xmm2\n\t"
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "movl 12(%[ctr]), %%esi\n\t"
+ "bswapl %%esi\n\t"
+ "cmpl $0xfffffffc, %%esi\n\t"
+ "jb .Lno_carry%=\n\t" /* no carry */
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */
+ "cmpl $0xfffffffe, %%esi\n\t"
+ "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */
+ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */
+ /* esi == 0xffffffff */
+
+ "psubq %%xmm1, %%xmm2\n\t"
".Lcarry_xmm3%=:\n\t"
- "paddq %%xmm1, %%xmm3\n\t"
+ "psubq %%xmm1, %%xmm3\n\t"
".Lcarry_xmm4%=:\n\t"
- "paddq %%xmm1, %%xmm4\n\t"
+ "psubq %%xmm1, %%xmm4\n\t"
".Lcarry_xmm5%=:\n\t"
- "paddq %%xmm1, %%xmm5\n\t"
+ "psubq %%xmm1, %%xmm5\n\t"
".Lno_carry%=:\n\t"
"pshufb %[mask], %%xmm2\n\t" /* xmm2 := be(xmm2) */
@@ -1170,7 +1158,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
"pshufb %[mask], %%xmm5\n\t" /* xmm5 := be(xmm5) */
"movdqa %%xmm5, (%[ctr])\n" /* Update CTR. */
- "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
"pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
"pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
@@ -1275,7 +1263,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
[key] "r" (ctx->keyschenc),
[rounds] "g" (ctx->rounds),
[mask] "m" (*be_mask)
- : "%esi", "%edi", "cc", "memory");
+ : "%esi", "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenc_xmm1_xmm2
#undef aesenc_xmm1_xmm3