[PATCH] rijndael: further optimizations for AES-NI accelerated CBC and CFB bulk modes
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sat Dec 6 14:22:53 CET 2014
* cipher/rijndael-aesni.c (do_aesni_enc, do_aesni_dec): Pass
input/output through SSE register XMM0.
(do_aesni_cfb): Remove.
(_gcry_aes_aesni_encrypt, _gcry_aes_aesni_decrypt): Add loading/storing
input/output to/from XMM0.
(_gcry_aes_aesni_cfb_enc, _gcry_aes_aesni_cbc_enc)
(_gcry_aes_aesni_cfb_dec): Update to use the reworked 'do_aesni_enc' and
move IV loading/storing outside the loop.
(_gcry_aes_aesni_cbc_dec): Update to use the reworked 'do_aesni_dec'.
--
CBC encryption speed is improved by ~16% on Intel Haswell, and CFB encryption by ~8%.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-aesni.c | 244 ++++++++++++++++++++---------------------------
1 file changed, 104 insertions(+), 140 deletions(-)
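
The heart of the change is a new calling convention for the one-block
helpers: the block stays in xmm0 across the call, so the bulk loops can
keep the chaining value in an SSE register for the whole run instead of
bouncing it through memory on every iteration. A rough sketch of the
pattern follows; it relies on the same assumption the existing code
already makes (gcc leaves the xmm registers alone between consecutive
asm statements, since we cannot use the "x" constraint without -msse),
and the helper names here are illustrative, not the patch's:

/* Encrypt whatever is in xmm0, in place.  ROUNDS is 10/12/14 for
   AES-128/192/256; KEYSCHED points to 16-byte-aligned round keys. */
static inline void
enc_in_xmm0 (const unsigned char *keysched, int rounds)
{
  int i;

  asm volatile ("movdqa (%0), %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0"          /* whitening */
                : : "r" (keysched) : "cc", "memory");
  for (i = 1; i < rounds; i++)
    asm volatile ("movdqa (%0), %%xmm1\n\t"
                  "aesenc %%xmm1, %%xmm0"
                  : : "r" (keysched + 16*i) : "cc", "memory");
  asm volatile ("movdqa (%0), %%xmm1\n\t"
                "aesenclast %%xmm1, %%xmm0"
                : : "r" (keysched + 16*rounds) : "cc", "memory");
}

/* Single-block entry point: load once, encrypt in-register, store
   once.  The bulk modes move the load/store outside their loops. */
static void
encrypt_block (const unsigned char *keysched, int rounds,
               unsigned char *dst, const unsigned char *src)
{
  asm volatile ("movdqu %[src], %%xmm0" : : [src] "m" (*src) : "memory");
  enc_in_xmm0 (keysched, rounds);
  asm volatile ("movdqu %%xmm0, %[dst]" : [dst] "=m" (*dst) : : "memory");
}

(The real do_aesni_enc unrolls the rounds and uses .byte encodings so it
also assembles on binutils without AES-NI support; the sketch uses the
mnemonics for readability.)
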
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index e6c1051..3c367ce 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -340,33 +340,14 @@ _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
}
-/* Encrypt one block using the Intel AES-NI instructions. A and B may
- be the same.
-
- Our problem here is that gcc does not allow the "x" constraint for
- SSE registers in asm unless you compile with -msse. The common
- wisdom is to use a separate file for SSE instructions and build it
- separately. This would require a lot of extra build system stuff,
- similar to what we do in mpi/ for the asm stuff. What we do
- instead is to use standard registers and a bit more of plain asm
- which copies the data and key stuff to the SSE registers and later
- back. If we decide to implement some block modes with parallelized
- AES instructions, it might indeed be better to use plain asm ala
- mpi/. */
+/* Encrypt one block using the Intel AES-NI instructions. Block is input
+ * and output through SSE register xmm0. */
static inline void
-do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
- const unsigned char *a)
+do_aesni_enc (const RIJNDAEL_context *ctx)
{
#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
- /* Note: For now we relax the alignment requirement for A and B: It
- does not make much difference because in many case we would need
- to memcpy them to an extra buffer; using the movdqu is much faster
- that memcpy and movdqa. For CFB we know that the IV is properly
- aligned but that is a special case. We should better implement
- CFB direct in asm. */
- asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */
- "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ asm volatile ("movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
"movdqa 0x10(%[key]), %%xmm1\n\t"
aesenc_xmm1_xmm0
@@ -402,10 +383,9 @@ do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
".Lenclast%=:\n\t"
aesenclast_xmm1_xmm0
- "movdqu %%xmm0, %[dst]\n"
- : [dst] "=m" (*b)
- : [src] "m" (*a),
- [key] "r" (ctx->keyschenc),
+ "\n"
+ :
+ : [key] "r" (ctx->keyschenc),
[rounds] "r" (ctx->rounds)
: "cc", "memory");
#undef aesenc_xmm1_xmm0
@@ -413,14 +393,14 @@ do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
}
+/* Decrypt one block using the Intel AES-NI instructions. Block is input
+ * and output through SSE register xmm0. */
static inline void
-do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
- const unsigned char *a)
+do_aesni_dec (const RIJNDAEL_context *ctx)
{
#define aesdec_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
#define aesdeclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
- asm volatile ("movdqu %[src], %%xmm0\n\t" /* xmm0 := *a */
- "movdqa (%[key]), %%xmm1\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm1\n\t"
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
"movdqa 0x10(%[key]), %%xmm1\n\t"
aesdec_xmm1_xmm0
@@ -456,10 +436,9 @@ do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
".Ldeclast%=:\n\t"
aesdeclast_xmm1_xmm0
- "movdqu %%xmm0, %[dst]\n"
- : [dst] "=m" (*b)
- : [src] "m" (*a),
- [key] "r" (ctx->keyschdec),
+ "\n"
+ :
+ : [key] "r" (ctx->keyschdec),
[rounds] "r" (ctx->rounds)
: "cc", "memory");
#undef aesdec_xmm1_xmm0
@@ -685,74 +664,6 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
}
-/* Perform a CFB encryption or decryption round using the
- initialization vector IV and the input block A. Write the result
- to the output block B and update IV. IV needs to be 16 byte
- aligned. */
-static inline void
-do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag,
- unsigned char *iv, unsigned char *b, const unsigned char *a)
-{
-#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
-#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
- asm volatile ("movdqa %[iv], %%xmm0\n\t" /* xmm0 := IV */
- "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
- "pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
- "movdqa 0x10(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x20(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x30(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x40(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x50(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x60(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x70(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x80(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0x90(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0xa0(%[key]), %%xmm1\n\t"
- "cmpl $10, %[rounds]\n\t"
- "jz .Lenclast%=\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0xb0(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0xc0(%[key]), %%xmm1\n\t"
- "cmpl $12, %[rounds]\n\t"
- "jz .Lenclast%=\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0xd0(%[key]), %%xmm1\n\t"
- aesenc_xmm1_xmm0
- "movdqa 0xe0(%[key]), %%xmm1\n"
-
- ".Lenclast%=:\n\t"
- aesenclast_xmm1_xmm0
- "movdqu %[src], %%xmm1\n\t" /* Save input. */
- "pxor %%xmm1, %%xmm0\n\t" /* xmm0 = input ^ IV */
-
- "cmpl $1, %[decrypt]\n\t"
- "jz .Ldecrypt_%=\n\t"
- "movdqa %%xmm0, %[iv]\n\t" /* [encrypt] Store IV. */
- "jmp .Lleave_%=\n"
- ".Ldecrypt_%=:\n\t"
- "movdqa %%xmm1, %[iv]\n" /* [decrypt] Store IV. */
- ".Lleave_%=:\n\t"
- "movdqu %%xmm0, %[dst]\n" /* Store output. */
- : [iv] "+m" (*iv), [dst] "=m" (*b)
- : [src] "m" (*a),
- [key] "r" (ctx->keyschenc),
- [rounds] "g" (ctx->rounds),
- [decrypt] "m" (decrypt_flag)
- : "cc", "memory");
-#undef aesenc_xmm1_xmm0
-#undef aesenclast_xmm1_xmm0
-}
-
/* Perform a CTR encryption round using the counter CTR and the input
block A. Write the result to the output block B and update CTR.
CTR needs to be a 16 byte aligned little-endian value. */
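
do_aesni_cfb can go because CFB uses the block cipher only in the
forward direction for both encryption and decryption; the two directions
differ only in which block becomes the next chaining value. The whole
round, in portable C (hypothetical aes_encrypt_block standing in for
do_aesni_enc with the key schedule bound):

#include <stdint.h>
#include <string.h>

extern void aes_encrypt_block (uint8_t out[16], const uint8_t in[16]);

void
cfb_round (int decrypt, uint8_t iv[16], uint8_t *out, const uint8_t *in)
{
  uint8_t ek[16], save[16];
  int i;

  memcpy (save, in, 16);           /* keep the input; out may alias in */
  aes_encrypt_block (ek, iv);      /* forward direction in both cases */
  for (i = 0; i < 16; i++)
    out[i] = ek[i] ^ save[i];
  /* Next chaining value is the ciphertext: the block just produced
     when encrypting, the input block when decrypting. */
  memcpy (iv, decrypt ? save : out, 16);
}

So the dedicated helper buys nothing over do_aesni_enc plus a pxor in
the caller, and dropping it also removes the per-block branch on
decrypt_flag.
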
@@ -1026,7 +937,15 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
{
aesni_prepare ();
- do_aesni_enc (ctx, dst, src);
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_aesni_enc (ctx);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
aesni_cleanup ();
return 0;
}
@@ -1038,12 +957,32 @@ _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
aesni_prepare ();
+
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
for ( ;nblocks; nblocks-- )
{
- do_aesni_cfb (ctx, 0, iv, outbuf, inbuf);
+ do_aesni_enc (ctx);
+
+ asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
outbuf += BLOCKSIZE;
inbuf += BLOCKSIZE;
}
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
aesni_cleanup ();
}
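
In the rewritten loop the chaining value never leaves xmm0: do_aesni_enc
turns it into the keystream block, the plaintext is XORed in, and the
result is both the output and the next IV. The structure, in portable
terms (same hypothetical aes_encrypt_block as above):

#include <stdint.h>
#include <string.h>

extern void aes_encrypt_block (uint8_t out[16], const uint8_t in[16]);

/* CFB encryption is inherently serial: block i needs ciphertext i-1,
   so the win comes from keeping the chaining value in a register and
   touching the IV in memory only once at each end. */
void
cfb_encrypt (uint8_t iv[16], uint8_t *out, const uint8_t *in,
             size_t nblocks)
{
  uint8_t cv[16];                  /* lives in xmm0 in the patch */
  int i;

  memcpy (cv, iv, 16);             /* load IV once, before the loop */
  for (; nblocks; nblocks--, in += 16, out += 16)
    {
      aes_encrypt_block (cv, cv);  /* cv := E_K(cv) */
      for (i = 0; i < 16; i++)
        cv[i] ^= in[i];            /* cv := E_K(cv) ^ P_i = C_i */
      memcpy (out, cv, 16);
    }
  memcpy (iv, cv, 16);             /* store IV once, after the loop */
}
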
@@ -1053,45 +992,41 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
const unsigned char *inbuf, unsigned char *iv,
size_t nblocks, int cbc_mac)
{
- unsigned char *last_iv;
-
aesni_prepare ();
- last_iv = iv;
+ asm volatile ("movdqu %[iv], %%xmm5\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
for ( ;nblocks; nblocks-- )
{
- /* ~35% speed up on Sandy-Bridge when doing xoring and copying with
- SSE registers. */
- asm volatile ("movdqu %[iv], %%xmm0\n\t"
- "movdqu %[inbuf], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf]\n\t"
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
: /* No output */
- : [iv] "m" (*last_iv),
- [inbuf] "m" (*inbuf),
- [outbuf] "m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
: "memory" );
- do_aesni_enc (ctx, outbuf, outbuf);
+ do_aesni_enc (ctx);
+
+ asm volatile ("movdqa %%xmm0, %%xmm5\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
- last_iv = outbuf;
inbuf += BLOCKSIZE;
if (!cbc_mac)
outbuf += BLOCKSIZE;
}
- if (last_iv != iv)
- {
- asm volatile ("movdqu %[last], %%xmm0\n\t"
- "movdqu %%xmm0, %[iv]\n\t"
- : /* No output */
- : [last] "m" (*last_iv),
- [iv] "m" (*iv)
- : "memory" );
- }
+ asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
aesni_cleanup ();
+ aesni_cleanup_2_6 ();
}
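
The last_iv bookkeeping disappears: previously the prior ciphertext
block was re-read from memory each iteration (and for cbc_mac, where
outbuf never advances, last_iv had to keep pointing at the single output
block). With the chaining value parked in xmm5 none of that is needed;
the added aesni_cleanup_2_6() wipes xmm5 on exit since it held IV state.
Roughly:

#include <stdint.h>
#include <string.h>

extern void aes_encrypt_block (uint8_t out[16], const uint8_t in[16]);

void
cbc_encrypt (uint8_t iv[16], uint8_t *out, const uint8_t *in,
             size_t nblocks, int cbc_mac)
{
  uint8_t cv[16];                  /* xmm5 in the patch */
  int i;

  memcpy (cv, iv, 16);
  for (; nblocks; nblocks--)
    {
      for (i = 0; i < 16; i++)
        cv[i] ^= in[i];            /* P_i ^ C_{i-1} */
      aes_encrypt_block (cv, cv);  /* C_i, which is also the next IV */
      memcpy (out, cv, 16);
      in += 16;
      if (!cbc_mac)                /* CBC-MAC overwrites one output block */
        out += 16;
    }
  memcpy (iv, cv, 16);
}
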
@@ -1134,7 +1069,15 @@ _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
{
aesni_prepare ();
- do_aesni_dec (ctx, dst, src);
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_aesni_dec (ctx);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
aesni_cleanup ();
return 0;
}
@@ -1147,19 +1090,23 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
{
aesni_prepare ();
+ asm volatile ("movdqu %[iv], %%xmm6\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
/* CFB decryption can be parallelized */
for ( ;nblocks >= 4; nblocks -= 4)
{
asm volatile
- ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */
+ ("movdqu %%xmm6, %%xmm1\n\t" /* load input blocks */
"movdqu 0*16(%[inbuf]), %%xmm2\n\t"
"movdqu 1*16(%[inbuf]), %%xmm3\n\t"
"movdqu 2*16(%[inbuf]), %%xmm4\n\t"
- "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
- "movdqu %%xmm0, (%[iv])\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */
: /* No output */
- : [inbuf] "r" (inbuf), [iv] "r" (iv)
+ : [inbuf] "r" (inbuf)
: "memory");
do_aesni_enc_vec4 (ctx);
@@ -1190,12 +1137,29 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
inbuf += 4*BLOCKSIZE;
}
+ asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc");
+
for ( ;nblocks; nblocks-- )
{
- do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
+ do_aesni_enc (ctx);
+
+ asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm6, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
outbuf += BLOCKSIZE;
inbuf += BLOCKSIZE;
}
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
aesni_cleanup ();
aesni_cleanup_2_6 ();
}
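
CFB decryption, unlike encryption, is parallel-friendly: every keystream
block is E_K of a ciphertext block already in hand, so the four-way
do_aesni_enc_vec4 path can keep four AES pipelines busy, and the scalar
tail reuses do_aesni_enc with the running IV handed over via xmm6/xmm0.
The data flow, shown serially and portably:

#include <stdint.h>
#include <string.h>

extern void aes_encrypt_block (uint8_t out[16], const uint8_t in[16]);

/* P_i = E_K(C_{i-1}) ^ C_i.  Every aes_encrypt_block input is known in
   advance, which is what the 4-way path exploits. */
void
cfb_decrypt (uint8_t iv[16], uint8_t *out, const uint8_t *in,
             size_t nblocks)
{
  uint8_t cv[16], ek[16];
  int i;

  memcpy (cv, iv, 16);
  for (; nblocks; nblocks--, in += 16, out += 16)
    {
      aes_encrypt_block (ek, cv);
      memcpy (cv, in, 16);         /* save C_i first; out may alias in */
      for (i = 0; i < 16; i++)
        out[i] = ek[i] ^ cv[i];
    }
  memcpy (iv, cv, 16);
}
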
@@ -1256,21 +1220,21 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
for ( ;nblocks; nblocks-- )
{
asm volatile
- ("movdqu %[inbuf], %%xmm2\n\t" /* use xmm2 as savebuf */
+ ("movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm2\n\t" /* use xmm2 as savebuf */
: /* No output */
: [inbuf] "m" (*inbuf)
: "memory");
/* uses only xmm0 and xmm1 */
- do_aesni_dec (ctx, outbuf, inbuf);
+ do_aesni_dec (ctx);
asm volatile
- ("movdqu %[outbuf], %%xmm0\n\t"
- "pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */
+ ("pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */
"movdqu %%xmm0, %[outbuf]\n\t"
"movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */
- : /* No output */
- : [outbuf] "m" (*outbuf)
+ : [outbuf] "=m" (*outbuf)
+ :
: "memory");
outbuf += BLOCKSIZE;
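
The CBC decryption loop gets the same treatment: the ciphertext block is
copied aside (xmm2) before the in-place decrypt in xmm0, because it is
the chaining value for the next block, while the IV stays in xmm5
throughout. In portable terms (hypothetical aes_decrypt_block for the
inverse cipher):

#include <stdint.h>
#include <string.h>

extern void aes_decrypt_block (uint8_t out[16], const uint8_t in[16]);

void
cbc_decrypt (uint8_t iv[16], uint8_t *out, const uint8_t *in,
             size_t nblocks)
{
  uint8_t cv[16], save[16], buf[16];
  int i;

  memcpy (cv, iv, 16);
  for (; nblocks; nblocks--, in += 16, out += 16)
    {
      memcpy (save, in, 16);       /* savebuf: C_i is needed after D_K */
      aes_decrypt_block (buf, save);
      for (i = 0; i < 16; i++)
        out[i] = buf[i] ^ cv[i];   /* P_i = D_K(C_i) ^ C_{i-1} */
      memcpy (cv, save, 16);       /* C_i becomes the next IV */
    }
  memcpy (iv, cv, 16);
}
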