[PATCH] rijndael-aesni: tweak x86_64 AES-NI for better performance on AMD Zen2

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Sep 18 17:14:17 CEST 2020


* cipher/rijndael-aesni.c (do_aesni_enc_vec8, do_aesni_dec_vec8): Move
first round key xoring and last round out to caller.
(do_aesni_ctr_4): Change low 8-bit counter overflow check to an 8-bit
addition on the counter's low byte, detecting overflow from the carry
flag (see the first sketch below); Adjust slow path to restore the
counter.
(do_aesni_ctr_8): Same as above; Interleave first round key xoring and
first round with CTR generation on fast path; Interleave last round
with output xoring.
(_gcry_aes_aesni_cfb_dec, _gcry_aes_aesni_cbc_dec): Add first round
key xoring; Change order of last round xoring and output xoring to
shorten the dependency chain (see the second sketch below).
(_gcry_aes_aesni_ocb_auth): Add first round key xoring and last round
handling.
--
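
The do_aesni_ctr_4/do_aesni_ctr_8 fast path now adds the block count
directly to the counter's low byte and branches on the carry flag,
instead of comparing the byte against a constant (0xfb or 0xf7).  A
minimal C sketch of the idea (function and variable names are
illustrative only; the patch does this with "addb"/"jc" in inline
assembly):

  #include <stdint.h>

  /* Sketch: add `nblocks` to the low byte of the big-endian counter.
     On the fast path the addition itself already yields the updated
     counter byte, so no separate store is needed; on overflow (carry
     set) the byte is restored and the caller falls back to the wide
     add with full carry propagation.  */
  static int
  ctr_add_blocks_fast (uint8_t ctr[16], uint8_t nblocks)
  {
    uint8_t old = ctr[15];

    ctr[15] = (uint8_t)(old + nblocks);   /* "addb $n, 15(%[ctr])" */
    if (ctr[15] < old)                    /* carry => take slow path */
      {
        ctr[15] = old;                    /* "Restore CTR." */
        return 0;
      }
    return 1;
  }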

Benchmark on Ryzen 7 3700X:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     0.113 ns/B      8445 MiB/s     0.407 c/B
        CFB dec |     0.114 ns/B      8337 MiB/s     0.412 c/B
        CTR enc |     0.112 ns/B      8505 MiB/s     0.404 c/B
        CTR dec |     0.113 ns/B      8476 MiB/s     0.405 c/B

After (CBC-dec +21%, CFB-dec +24%, CTR +8% faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC dec |     0.093 ns/B     10277 MiB/s     0.334 c/B
        CFB dec |     0.092 ns/B     10372 MiB/s     0.331 c/B
        CTR enc |     0.104 ns/B      9209 MiB/s     0.373 c/B
        CTR dec |     0.104 ns/B      9192 MiB/s     0.373 c/B

Performance remains the same on Intel Skylake.
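
The CBC-dec/CFB-dec gain comes from folding the final output xor into
the last AES round: AESENCLAST/AESDECLAST end with an xor of the round
key, so aesdeclast(state, key ^ x) == aesdeclast(state, key) ^ x, and
the IV/ciphertext block can be xored with the last round key while the
earlier rounds are still in flight.  Moving the first round key xor
and the last round out of do_aesni_enc_vec8/do_aesni_dec_vec8 into the
callers is what enables this.  A single-block intrinsics sketch of the
reordering (illustrative only; the patch does the same across eight
blocks in inline assembly):

  #include <wmmintrin.h>  /* AES-NI intrinsics; compile with -maes */

  /* Old order: finish the last round, then xor in the IV/previous
     ciphertext block -- the xor must wait for aesdeclast.  */
  static __m128i
  cbc_dec_last_old (__m128i state, __m128i lastkey, __m128i prev)
  {
    state = _mm_aesdeclast_si128 (state, lastkey);
    return _mm_xor_si128 (state, prev);
  }

  /* New order: pre-xor the last round key into the IV/previous
     ciphertext; aesdeclast then applies both xors at once, which
     shortens the dependency chain at the end of the block.  */
  static __m128i
  cbc_dec_last_new (__m128i state, __m128i lastkey, __m128i prev)
  {
    return _mm_aesdeclast_si128 (state, _mm_xor_si128 (prev, lastkey));
  }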

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-aesni.c | 313 +++++++++++++++++++++++-----------------
 1 file changed, 183 insertions(+), 130 deletions(-)

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index b26449a7..747ef662 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -788,17 +788,7 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 static ASM_FUNC_ATTR_INLINE void
 do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
 {
-  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
-                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
-                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
-                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
-                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
-                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
-                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
-                "pxor   %%xmm0, %%xmm10\n\t"     /* xmm10 ^= key[0] */
-                "pxor   %%xmm0, %%xmm11\n\t"     /* xmm11 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm0\n\t"
-                "cmpl $12, %[rounds]\n\t"
+  asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
                 "aesenc %%xmm0, %%xmm1\n\t"
                 "aesenc %%xmm0, %%xmm2\n\t"
                 "aesenc %%xmm0, %%xmm3\n\t"
@@ -808,6 +798,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "aesenc %%xmm0, %%xmm10\n\t"
                 "aesenc %%xmm0, %%xmm11\n\t"
                 "movdqa 0x20(%[key]), %%xmm0\n\t"
+                "cmpl $12, %[rounds]\n\t"
                 "aesenc %%xmm0, %%xmm1\n\t"
                 "aesenc %%xmm0, %%xmm2\n\t"
                 "aesenc %%xmm0, %%xmm3\n\t"
@@ -920,14 +911,6 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
                 "movdqa 0xe0(%[key]), %%xmm0\n"
 
                 ".Ldeclast%=:\n\t"
-                "aesenclast %%xmm0, %%xmm1\n\t"
-                "aesenclast %%xmm0, %%xmm2\n\t"
-                "aesenclast %%xmm0, %%xmm3\n\t"
-                "aesenclast %%xmm0, %%xmm4\n\t"
-                "aesenclast %%xmm0, %%xmm8\n\t"
-                "aesenclast %%xmm0, %%xmm9\n\t"
-                "aesenclast %%xmm0, %%xmm10\n\t"
-                "aesenclast %%xmm0, %%xmm11\n\t"
                 : /* no output */
                 : [key] "r" (ctx->keyschenc),
                   [rounds] "r" (ctx->rounds)
@@ -940,16 +923,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx)
 static ASM_FUNC_ATTR_INLINE void
 do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
 {
-  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
-                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
-                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
-                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
-                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
-                "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
-                "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
-                "pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
-                "pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm0\n\t"
+  asm volatile ("movdqa 0x10(%[key]), %%xmm0\n\t"
                 "cmpl $12, %[rounds]\n\t"
                 "aesdec %%xmm0, %%xmm1\n\t"
                 "aesdec %%xmm0, %%xmm2\n\t"
@@ -1072,14 +1046,6 @@ do_aesni_dec_vec8 (const RIJNDAEL_context *ctx)
                 "movdqa 0xe0(%[key]), %%xmm0\n"
 
                 ".Ldeclast%=:\n\t"
-                "aesdeclast %%xmm0, %%xmm1\n\t"
-                "aesdeclast %%xmm0, %%xmm2\n\t"
-                "aesdeclast %%xmm0, %%xmm3\n\t"
-                "aesdeclast %%xmm0, %%xmm4\n\t"
-                "aesdeclast %%xmm0, %%xmm8\n\t"
-                "aesdeclast %%xmm0, %%xmm9\n\t"
-                "aesdeclast %%xmm0, %%xmm10\n\t"
-                "aesdeclast %%xmm0, %%xmm11\n\t"
                 : /* no output */
                 : [key] "r" (ctx->keyschdec),
                   [rounds] "r" (ctx->rounds)
@@ -1204,8 +1170,8 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
    */
 
   asm volatile (/* detect if 8-bit carry handling is needed */
-                "cmpb   $0xfb, 15(%[ctr])\n\t"
-                "ja     .Ladd32bit%=\n\t"
+                "addb   $4, 15(%[ctr])\n\t"
+                "jc     .Ladd32bit%=\n\t"
 
                 "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
                 "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
@@ -1217,9 +1183,10 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                 "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
                 "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(4) + CTR (xmm0) */
                 "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
-                "jmp    .Lstore_ctr%=\n\t"
+                "jmp    .Ldone_ctr%=\n\t"
 
                 ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Restore CTR.  */
                 "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                 "movdqa %%xmm0, %%xmm2\n\t"
                 "pcmpeqd %%xmm1, %%xmm1\n\t"
@@ -1265,8 +1232,9 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                 "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
                 "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
 
-                ".Lstore_ctr%=:\n\t"
                 "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
+
+                ".Ldone_ctr%=:\n\t"
                 :
                 : [ctr] "r" (ctr),
                   [key] "r" (ctx->keyschenc),
@@ -1428,30 +1396,50 @@ do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
    */
 
   asm volatile (/* detect if 8-bit carry handling is needed */
-                "cmpb   $0xf7, 15(%[ctr])\n\t"
-                "ja     .Ladd32bit%=\n\t"
+                "addb   $8, 15(%[ctr])\n\t"
+                "jc     .Ladd32bit%=\n\t"
 
-                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
-                "movdqa 0*16(%[addb]), %%xmm2\n\t"  /* xmm2 := be(1) */
-                "movdqa 1*16(%[addb]), %%xmm3\n\t"  /* xmm3 := be(2) */
-                "movdqa 2*16(%[addb]), %%xmm4\n\t"  /* xmm4 := be(3) */
-                "movdqa 3*16(%[addb]), %%xmm8\n\t"  /* xmm8 := be(4) */
-                "movdqa 4*16(%[addb]), %%xmm9\n\t"  /* xmm9 := be(5) */
-                "movdqa 5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) */
-                "movdqa 6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) */
-                "movdqa 7*16(%[addb]), %%xmm5\n\t"  /* xmm5 := be(8) */
                 "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
-                "paddb  %%xmm0, %%xmm2\n\t"     /* xmm2 := be(1) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm3\n\t"     /* xmm3 := be(2) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm4\n\t"     /* xmm4 := be(3) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm8\n\t"     /* xmm8 := be(4) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm9\n\t"     /* xmm9 := be(5) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm10\n\t"    /* xmm10 := be(6) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm11\n\t"    /* xmm11 := be(7) + CTR (xmm0) */
-                "paddb  %%xmm0, %%xmm5\n\t"     /* xmm5 := be(8) + CTR (xmm0) */
-                "jmp    .Lstore_ctr%=\n\t"
+                "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */
+
+                "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0 := CTR (xmm5) */
+                "movdqa %%xmm5, %%xmm2\n\t"     /* xmm2 := CTR (xmm5) */
+                "movdqa %%xmm5, %%xmm3\n\t"     /* xmm3 := CTR (xmm5) */
+                "movdqa %%xmm5, %%xmm4\n\t"     /* xmm4 := CTR (xmm5) */
+                "paddb  0*16(%[addb]), %%xmm2\n\t" /* xmm2 := be(1) + CTR */
+                "paddb  1*16(%[addb]), %%xmm3\n\t" /* xmm3 := be(2) + CTR */
+                "paddb  2*16(%[addb]), %%xmm4\n\t" /* xmm4 := be(3) + CTR */
+                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
+                "aesenc %%xmm7, %%xmm0\n\t"
+                "aesenc %%xmm7, %%xmm2\n\t"
+                "aesenc %%xmm7, %%xmm3\n\t"
+                "aesenc %%xmm7, %%xmm4\n\t"
+                "movdqa %%xmm5, %%xmm8\n\t"     /* xmm8 := CTR (xmm5) */
+                "movdqa %%xmm5, %%xmm9\n\t"     /* xmm9 := CTR (xmm5) */
+                "movdqa %%xmm5, %%xmm10\n\t"    /* xmm10 := CTR (xmm5) */
+                "movdqa %%xmm5, %%xmm11\n\t"    /* xmm11 := CTR (xmm5) */
+                "paddb  3*16(%[addb]), %%xmm8\n\t"  /* xmm8 := be(4) + CTR */
+                "paddb  4*16(%[addb]), %%xmm9\n\t"  /* xmm9 := be(5) + CTR */
+                "paddb  5*16(%[addb]), %%xmm10\n\t" /* xmm10 := be(6) + CTR */
+                "paddb  6*16(%[addb]), %%xmm11\n\t" /* xmm11 := be(7) + CTR */
+                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
+                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
+                "aesenc %%xmm7, %%xmm8\n\t"
+                "aesenc %%xmm7, %%xmm9\n\t"
+                "aesenc %%xmm7, %%xmm10\n\t"
+                "aesenc %%xmm7, %%xmm11\n\t"
+
+                "paddb  7*16(%[addb]), %%xmm5\n\t" /* xmm5 := be(8) + CTR */
+
+                "jmp    .Ldone_ctr%=\n\t"
 
                 ".Ladd32bit%=:\n\t"
+                "movdqa %%xmm5, (%[ctr])\n\t"   /* Restore CTR. */
                 "movdqa %%xmm5, %%xmm0\n\t"     /* xmm0, xmm2 := CTR (xmm5) */
                 "movdqa %%xmm0, %%xmm2\n\t"
                 "pcmpeqd %%xmm1, %%xmm1\n\t"
@@ -1512,44 +1500,50 @@ do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
                 "psubq   %%xmm1, %%xmm5\n\t"
 
                 ".Lno_carry%=:\n\t"
-                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0] */
+                "movdqa 16(%[key]), %%xmm7\n\t" /* xmm7 := key[1] */
 
                 "pshufb %%xmm6, %%xmm2\n\t"     /* xmm2 := be(xmm2) */
                 "pshufb %%xmm6, %%xmm3\n\t"     /* xmm3 := be(xmm3) */
                 "pshufb %%xmm6, %%xmm4\n\t"     /* xmm4 := be(xmm4) */
-                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
+                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
+                "aesenc %%xmm7, %%xmm0\n\t"
+                "aesenc %%xmm7, %%xmm2\n\t"
+                "aesenc %%xmm7, %%xmm3\n\t"
+                "aesenc %%xmm7, %%xmm4\n\t"
                 "pshufb %%xmm6, %%xmm8\n\t"     /* xmm8 := be(xmm8) */
                 "pshufb %%xmm6, %%xmm9\n\t"     /* xmm9 := be(xmm9) */
                 "pshufb %%xmm6, %%xmm10\n\t"    /* xmm10 := be(xmm10) */
                 "pshufb %%xmm6, %%xmm11\n\t"    /* xmm11 := be(xmm11) */
+                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
+                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
+                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
+                "aesenc %%xmm7, %%xmm8\n\t"
+                "aesenc %%xmm7, %%xmm9\n\t"
+                "aesenc %%xmm7, %%xmm10\n\t"
+                "aesenc %%xmm7, %%xmm11\n\t"
 
-                ".Lstore_ctr%=:\n\t"
+                "pshufb %%xmm6, %%xmm5\n\t"     /* xmm5 := be(xmm5) */
                 "movdqa %%xmm5, (%[ctr])\n\t"   /* Update CTR (mem).  */
+
+                ".align 16\n\t"
+                ".Ldone_ctr%=:\n\t"
                 :
                 : [ctr] "r" (ctr),
                   [key] "r" (ctx->keyschenc),
                   [addb] "r" (bige_addb)
                 : "%esi", "cc", "memory");
 
-  asm volatile ("pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm2\n\t"     /* xmm2 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm3\n\t"     /* xmm3 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm4\n\t"     /* xmm4 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm8\n\t"     /* xmm8 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm9\n\t"     /* xmm9 ^= key[0]    */
-                "pxor   %%xmm1, %%xmm10\n\t"    /* xmm10 ^= key[0]   */
-                "pxor   %%xmm1, %%xmm11\n\t"    /* xmm11 ^= key[0]   */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "aesenc %%xmm1, %%xmm0\n\t"
-                "aesenc %%xmm1, %%xmm2\n\t"
-                "aesenc %%xmm1, %%xmm3\n\t"
-                "aesenc %%xmm1, %%xmm4\n\t"
-                "aesenc %%xmm1, %%xmm8\n\t"
-                "aesenc %%xmm1, %%xmm9\n\t"
-                "aesenc %%xmm1, %%xmm10\n\t"
-                "aesenc %%xmm1, %%xmm11\n\t"
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
+  asm volatile ("movdqa 0x20(%[key]), %%xmm1\n\t"
+                "movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
+                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
+                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
+                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
+                "movdqu 4*16(%[src]), %%xmm7\n\t"  /* Get block 5.      */
                 "aesenc %%xmm1, %%xmm0\n\t"
                 "aesenc %%xmm1, %%xmm2\n\t"
                 "aesenc %%xmm1, %%xmm3\n\t"
@@ -1558,6 +1552,7 @@ do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
                 "aesenc %%xmm1, %%xmm9\n\t"
                 "aesenc %%xmm1, %%xmm10\n\t"
                 "aesenc %%xmm1, %%xmm11\n\t"
+                "cmpl $12, %[rounds]\n\t"
                 "movdqa 0x30(%[key]), %%xmm1\n\t"
                 "aesenc %%xmm1, %%xmm0\n\t"
                 "aesenc %%xmm1, %%xmm2\n\t"
@@ -1664,38 +1659,33 @@ do_aesni_ctr_8 (const RIJNDAEL_context *ctx,
                 ".Lenclast%=:\n\t"
                 :
                 : [key] "r" (ctx->keyschenc),
-                  [rounds] "r" (ctx->rounds)
+                  [rounds] "r" (ctx->rounds),
+                  [src] "r" (a)
                 : "cc", "memory");
 
-  asm volatile ("movdqu 0*16(%[src]), %%xmm12\n\t" /* Get block 1.      */
-                "movdqu 1*16(%[src]), %%xmm13\n\t" /* Get block 2.      */
-                "movdqu 2*16(%[src]), %%xmm14\n\t" /* Get block 3.      */
-                "movdqu 3*16(%[src]), %%xmm15\n\t" /* Get block 4.      */
-                "movdqu 4*16(%[src]), %%xmm7\n\t"  /* Get block 5.      */
-                "pxor %%xmm1, %%xmm12\n\t"         /* block1 ^= lastkey */
-                "aesenclast %%xmm12, %%xmm0\n\t"
-                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
+  asm volatile ("pxor %%xmm1, %%xmm12\n\t"         /* block1 ^= lastkey */
                 "pxor %%xmm1, %%xmm13\n\t"         /* block2 ^= lastkey */
-                "aesenclast %%xmm13, %%xmm2\n\t"
-                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
                 "pxor %%xmm1, %%xmm14\n\t"         /* block3 ^= lastkey */
-                "aesenclast %%xmm14, %%xmm3\n\t"
-                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
                 "pxor %%xmm1, %%xmm15\n\t"         /* block4 ^= lastkey */
+                "aesenclast %%xmm12, %%xmm0\n\t"
+                "aesenclast %%xmm13, %%xmm2\n\t"
+                "aesenclast %%xmm14, %%xmm3\n\t"
                 "aesenclast %%xmm15, %%xmm4\n\t"
-                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
-                "pxor %%xmm1,  %%xmm7\n\t"         /* block5 ^= lastkey */
-                "aesenclast %%xmm7, %%xmm8\n\t"
-                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1     */
-                "pxor %%xmm1, %%xmm12\n\t"         /* block6 ^= lastkey */
-                "aesenclast %%xmm12, %%xmm9\n\t"
+                "movdqu 5*16(%[src]), %%xmm12\n\t" /* Get block 6.      */
+                "movdqu 6*16(%[src]), %%xmm13\n\t" /* Get block 7.      */
+                "movdqu 7*16(%[src]), %%xmm14\n\t" /* Get block 8.      */
+                "movdqu %%xmm0, 0*16(%[dst])\n\t"  /* Store block 1.    */
                 "movdqu %%xmm2, 1*16(%[dst])\n\t"  /* Store block 2.    */
-                "pxor %%xmm1, %%xmm13\n\t"         /* block7 ^= lastkey */
-                "aesenclast %%xmm13, %%xmm10\n\t"
                 "movdqu %%xmm3, 2*16(%[dst])\n\t"  /* Store block 3.    */
+                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
+                "pxor %%xmm1, %%xmm7\n\t"          /* block5 ^= lastkey */
+                "pxor %%xmm1, %%xmm12\n\t"         /* block6 ^= lastkey */
+                "pxor %%xmm1, %%xmm13\n\t"         /* block7 ^= lastkey */
                 "pxor %%xmm1, %%xmm14\n\t"         /* block8 ^= lastkey */
+                "aesenclast %%xmm7, %%xmm8\n\t"
+                "aesenclast %%xmm12, %%xmm9\n\t"
+                "aesenclast %%xmm13, %%xmm10\n\t"
                 "aesenclast %%xmm14, %%xmm11\n\t"
-                "movdqu %%xmm4, 3*16(%[dst])\n\t"  /* Store block 4.    */
                 "movdqu %%xmm8, 4*16(%[dst])\n\t"  /* Store block 8.    */
                 "movdqu %%xmm9, 5*16(%[dst])\n\t"  /* Store block 9.    */
                 "movdqu %%xmm10, 6*16(%[dst])\n\t" /* Store block 10.   */
@@ -1910,7 +1900,9 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
       for ( ;nblocks >= 8; nblocks -= 8)
 	{
 	  asm volatile
-	    ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
+	    ("movdqa (%[key]), %%xmm0\n\t"
+
+	     "movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
 	     "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
 	     "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
 	     "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
@@ -1925,30 +1917,50 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 	     "movdqa %%xmm3, %%xmm13\n\t"
 	     "movdqa %%xmm4, %%xmm14\n\t"
 	     "movdqa %%xmm8, %%xmm15\n\t"
+
+             "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+             "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+             "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+             "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+             "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+             "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+             "pxor   %%xmm0, %%xmm10\n\t"     /* xmm10 ^= key[0] */
+             "pxor   %%xmm0, %%xmm11\n\t"     /* xmm11 ^= key[0] */
 	     : /* No output */
-	     : [inbuf] "r" (inbuf)
+	     : [inbuf] "r" (inbuf),
+	       [key] "r" (ctx->keyschenc)
 	     : "memory");
 
 	  do_aesni_enc_vec8 (ctx);
 
 	  asm volatile
 	    (
-	     "pxor %%xmm12, %%xmm1\n\t"
+	     "pxor %%xmm0, %%xmm12\n\t"
+	     "pxor %%xmm0, %%xmm13\n\t"
+	     "pxor %%xmm0, %%xmm14\n\t"
+	     "pxor %%xmm0, %%xmm15\n\t"
+	     "aesenclast %%xmm12, %%xmm1\n\t"
+	     "aesenclast %%xmm13, %%xmm2\n\t"
+	     "aesenclast %%xmm14, %%xmm3\n\t"
+	     "aesenclast %%xmm15, %%xmm4\n\t"
+
 	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
-	     "pxor %%xmm13, %%xmm2\n\t"
 	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
-	     "pxor %%xmm14, %%xmm3\n\t"
 	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
-	     "pxor %%xmm15, %%xmm4\n\t"
 	     "movdqu 7*16(%[inbuf]), %%xmm15\n\t"
+	     "pxor %%xmm0, %%xmm12\n\t"
+	     "pxor %%xmm0, %%xmm13\n\t"
+	     "pxor %%xmm0, %%xmm14\n\t"
+	     "pxor %%xmm0, %%xmm15\n\t"
+
+	     "aesenclast %%xmm12, %%xmm8\n\t"
+	     "aesenclast %%xmm13, %%xmm9\n\t"
+	     "aesenclast %%xmm14, %%xmm10\n\t"
+	     "aesenclast %%xmm15, %%xmm11\n\t"
 
-	     "pxor %%xmm12, %%xmm8\n\t"
 	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
-	     "pxor %%xmm13, %%xmm9\n\t"
 	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
-	     "pxor %%xmm14, %%xmm10\n\t"
 	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
-	     "pxor %%xmm15, %%xmm11\n\t"
 	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
 
 	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
@@ -2070,7 +2082,9 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
       for ( ;nblocks >= 8 ; nblocks -= 8 )
 	{
 	  asm volatile
-	    ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
+	    ("movdqa (%[key]), %%xmm0\n\t"
+
+	     "movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
 	     "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
 	     "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
 	     "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
@@ -2084,31 +2098,50 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 	     "movdqa %%xmm3, %%xmm14\n\t"
 	     "movdqa %%xmm4, %%xmm15\n\t"
 
+	     "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
+	     "pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
+
 	     : /* No output */
-	     : [inbuf] "r" (inbuf)
+	     : [inbuf] "r" (inbuf),
+	       [key] "r" (ctx->keyschdec)
 	     : "memory");
 
 	  do_aesni_dec_vec8 (ctx);
 
 	  asm volatile
-	    ("pxor %%xmm5, %%xmm1\n\t"		/* xor IV with output */
+	    (
+	     "pxor %%xmm0, %%xmm5\n\t"			/* xor IV with key */
+	     "pxor %%xmm0, %%xmm12\n\t"			/* xor IV with key */
+	     "pxor %%xmm0, %%xmm13\n\t"			/* xor IV with key */
+	     "pxor %%xmm0, %%xmm14\n\t"			/* xor IV with key */
+	     "pxor %%xmm0, %%xmm15\n\t"			/* xor IV with key */
 
-	     "pxor %%xmm12, %%xmm2\n\t"		/* xor IV with output */
-	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
+	     "aesdeclast %%xmm5, %%xmm1\n\t"
+	     "aesdeclast %%xmm12, %%xmm2\n\t"
+	     "aesdeclast %%xmm13, %%xmm3\n\t"
+	     "aesdeclast %%xmm14, %%xmm4\n\t"
 
-	     "pxor %%xmm13, %%xmm3\n\t"		/* xor IV with output */
+	     "movdqu 4*16(%[inbuf]), %%xmm12\n\t"
 	     "movdqu 5*16(%[inbuf]), %%xmm13\n\t"
-
-	     "pxor %%xmm14, %%xmm4\n\t"		/* xor IV with output */
 	     "movdqu 6*16(%[inbuf]), %%xmm14\n\t"
-
-	     "pxor %%xmm15, %%xmm8\n\t"		/* xor IV with output */
 	     "movdqu 7*16(%[inbuf]), %%xmm5\n\t"
-	     "pxor %%xmm12, %%xmm9\n\t"		/* xor IV with output */
+	     "pxor %%xmm0, %%xmm12\n\t"			/* xor IV with key */
+	     "pxor %%xmm0, %%xmm13\n\t"			/* xor IV with key */
+	     "pxor %%xmm0, %%xmm14\n\t"			/* xor IV with key */
+
+	     "aesdeclast %%xmm15, %%xmm8\n\t"
+	     "aesdeclast %%xmm12, %%xmm9\n\t"
+	     "aesdeclast %%xmm13, %%xmm10\n\t"
+	     "aesdeclast %%xmm14, %%xmm11\n\t"
+
 	     "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
-	     "pxor %%xmm13, %%xmm10\n\t"		/* xor IV with output */
 	     "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
-	     "pxor %%xmm14, %%xmm11\n\t"		/* xor IV with output */
 	     "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
 	     "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
 	     "movdqu %%xmm8, 4*16(%[outbuf])\n\t"
@@ -3452,8 +3485,14 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 			"pxor   %%xmm5,    %%xmm3\n\t"
 
 			"pxor   %%xmm0,    %%xmm5\n\t"
+			"movdqa (%[key]),  %%xmm0\n\t"
 			"pxor   %%xmm5,    %%xmm4\n\t"
 
+			"pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+			"pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+			"pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+			"pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+
 			"pxor   %%xmm7,    %%xmm8\n\t"
 			"pxor   %%xmm5,    %%xmm8\n\t"
 
@@ -3465,13 +3504,27 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
 			"pxor   %%xmm14,   %%xmm5\n\t"
 			"pxor   %%xmm5,    %%xmm11\n\t"
+
+			"pxor   %%xmm0, %%xmm8\n\t"     /* xmm8 ^= key[0] */
+			"pxor   %%xmm0, %%xmm9\n\t"     /* xmm9 ^= key[0] */
+			"pxor   %%xmm0, %%xmm10\n\t"    /* xmm10 ^= key[0] */
+			"pxor   %%xmm0, %%xmm11\n\t"    /* xmm11 ^= key[0] */
 			:
-			:
+			: [key] "r" (ctx->keyschenc)
 			: "memory" );
 
 	  do_aesni_enc_vec8 (ctx);
 
-	  asm volatile ("pxor   %%xmm2,   %%xmm1\n\t"
+	  asm volatile (
+			"aesenclast %%xmm0, %%xmm1\n\t"
+			"aesenclast %%xmm0, %%xmm2\n\t"
+			"aesenclast %%xmm0, %%xmm3\n\t"
+			"aesenclast %%xmm0, %%xmm4\n\t"
+			"aesenclast %%xmm0, %%xmm8\n\t"
+			"aesenclast %%xmm0, %%xmm9\n\t"
+			"aesenclast %%xmm0, %%xmm10\n\t"
+			"aesenclast %%xmm0, %%xmm11\n\t"
+			"pxor   %%xmm2,   %%xmm1\n\t"
 			"pxor   %%xmm3,   %%xmm1\n\t"
 			"pxor   %%xmm4,   %%xmm1\n\t"
 			"pxor   %%xmm8,   %%xmm1\n\t"
-- 
2.25.1



