[PATCH 3/4] rijndael-aesni: add 8-block parallel code path for XTS

Jussi Kivilinna <jussi.kivilinna at iki.fi>
Sun Feb 28 18:35:58 CET 2021


* cipher/rijndael-aesni.c (xts_gfmul_const): Fix array size from 16
to 2.
(_gcry_aes_aesni_xts_enc, _gcry_aes_aesni_xts_dec) [__x86_64__]: Add
8-block parallel code paths.
--

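For reviewers: each pshufd/psrad/paddq/pand/pxor group in the new 8-block
path updates the tweak the same way as the existing 4-block path, i.e. it
multiplies the 128-bit tweak by x in GF(2^128) using the XTS reduction
polynomial held in xts_gfmul_const.  A scalar C sketch of that single step
follows; it is illustrative only and not part of the patch, it assumes a
little-endian host (as on x86), and the helper name is made up.

#include <stdint.h>
#include <string.h>

/* Multiply the 128-bit XTS tweak by x in GF(2^128), reduction polynomial
   x^128 + x^7 + x^2 + x + 1 (constant 0x87).  The tweak is stored as a
   little-endian 128-bit value, matching the layout in xmm5 above.  */
static void
xts_tweak_mul_x (unsigned char tweak[16])
{
  uint64_t lo, hi, carry_lo, carry_hi;

  memcpy (&lo, tweak + 0, 8);        /* low 64 bits (little-endian host)  */
  memcpy (&hi, tweak + 8, 8);        /* high 64 bits                      */

  carry_lo = (hi >> 63) ? 0x87 : 0;  /* selected by "pand" with the low
                                        qword of xts_gfmul_const          */
  carry_hi = (lo >> 63) ? 0x01 : 0;  /* carry from low qword into high,
                                        the high qword of xts_gfmul_const */

  lo = (lo << 1) ^ carry_lo;         /* "paddq %xmm5, %xmm5" doubles both
                                        qwords ...                        */
  hi = (hi << 1) ^ carry_hi;         /* ... and "pxor" folds the carries  */

  memcpy (tweak + 0, &lo, 8);
  memcpy (tweak + 8, &hi, 8);
}
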
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-aesni.c | 596 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 595 insertions(+), 1 deletion(-)

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 95ec4c2b..9dde0489 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -3661,7 +3661,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 }
 
 
-static const u64 xts_gfmul_const[16] __attribute__ ((aligned (16))) =
+static const u64 xts_gfmul_const[2] __attribute__ ((aligned (16))) =
   { 0x87, 0x01 };
 
 
@@ -3683,6 +3683,303 @@ _gcry_aes_aesni_xts_enc (RIJNDAEL_context *ctx, unsigned char *tweak,
 		  [gfmul] "m" (*xts_gfmul_const)
 		: "memory" );
 
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_8_15_variable;
+
+      aesni_prepare_8_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm11\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqa %%xmm5,    %%xmm7\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf0] "m" (*(inbuf + 0 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqa %%xmm5,    %%xmm12\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf1] "m" (*(inbuf + 1 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqa %%xmm5,    %%xmm13\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf2] "m" (*(inbuf + 2 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqa %%xmm5,    %%xmm14\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf3] "m" (*(inbuf + 3 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			"movdqa %%xmm5,    %%xmm15\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf4] "m" (*(inbuf + 4 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			"movdqu %%xmm5,    %[outbuf5]\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			: [outbuf5] "=m" (*(outbuf + 5 * 16))
+			: [inbuf5] "m" (*(inbuf + 5 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			"movdqu %%xmm5,    %[outbuf6]\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			: [outbuf6] "=m" (*(outbuf + 6 * 16))
+			: [inbuf6] "m" (*(inbuf + 6 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqa %%xmm11,   %%xmm0\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			"movdqu %%xmm5,    %[outbuf7]\n\t"
+
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			: [outbuf7] "=m" (*(outbuf + 7 * 16))
+			: [inbuf7] "m" (*(inbuf + 7 * 16))
+			: "memory" );
+
+	  asm volatile ("cmpl $12, %[rounds]\n\t"
+			"movdqa (%[key]), %%xmm0\n\t"
+			"pxor %%xmm0, %%xmm1\n\t"
+			"pxor %%xmm0, %%xmm2\n\t"
+			"pxor %%xmm0, %%xmm3\n\t"
+			"pxor %%xmm0, %%xmm4\n\t"
+			"pxor %%xmm0, %%xmm8\n\t"
+			"pxor %%xmm0, %%xmm9\n\t"
+			"pxor %%xmm0, %%xmm10\n\t"
+			"pxor %%xmm0, %%xmm11\n\t"
+			"movdqa 0x10(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x20(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x30(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x40(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x50(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x60(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x70(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x80(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0x90(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0xa0(%[key]), %%xmm0\n\t"
+			"jb .Lenclast%=\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0xb0(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0xc0(%[key]), %%xmm0\n\t"
+			"je .Lenclast%=\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0xd0(%[key]), %%xmm0\n\t"
+			"aesenc %%xmm0, %%xmm1\n\t"
+			"aesenc %%xmm0, %%xmm2\n\t"
+			"aesenc %%xmm0, %%xmm3\n\t"
+			"aesenc %%xmm0, %%xmm4\n\t"
+			"aesenc %%xmm0, %%xmm8\n\t"
+			"aesenc %%xmm0, %%xmm9\n\t"
+			"aesenc %%xmm0, %%xmm10\n\t"
+			"aesenc %%xmm0, %%xmm11\n\t"
+			"movdqa 0xe0(%[key]), %%xmm0\n\t"
+
+			".Lenclast%=:\n\t"
+			:
+			: [key] "r" (ctx->keyschenc),
+			  [rounds] "rm" (ctx->rounds)
+			: "cc", "memory");
+
+	  asm volatile ("pxor %%xmm0, %%xmm7\n\t"
+			"pxor %%xmm0, %%xmm12\n\t"
+			"pxor %%xmm0, %%xmm13\n\t"
+			"pxor %%xmm0, %%xmm14\n\t"
+			"aesenclast %%xmm7, %%xmm1\n\t"
+			"aesenclast %%xmm12, %%xmm2\n\t"
+			"aesenclast %%xmm13, %%xmm3\n\t"
+			"aesenclast %%xmm14, %%xmm4\n\t"
+			"movdqu 5*16(%[outbuf]), %%xmm12\n\t"
+			"movdqu 6*16(%[outbuf]), %%xmm13\n\t"
+			"movdqu 7*16(%[outbuf]), %%xmm14\n\t"
+			"pxor %%xmm0, %%xmm15\n\t"
+			"pxor %%xmm0, %%xmm12\n\t"
+			"pxor %%xmm0, %%xmm13\n\t"
+			"pxor %%xmm0, %%xmm14\n\t"
+			"aesenclast %%xmm15, %%xmm8\n\t"
+			"aesenclast %%xmm12, %%xmm9\n\t"
+			"aesenclast %%xmm13, %%xmm10\n\t"
+			"aesenclast %%xmm14, %%xmm11\n\t"
+			"movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+			"movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+			"movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+			"movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+			"movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+			"movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+			"movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+			"movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+			:
+			: [outbuf] "r" (outbuf)
+			: "memory" );
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_8_15();
+    }
+#endif
+
   for ( ;nblocks >= 4; nblocks -= 4 )
     {
       asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
@@ -3827,6 +4124,303 @@ _gcry_aes_aesni_xts_dec (RIJNDAEL_context *ctx, unsigned char *tweak,
 		  [gfmul] "m" (*xts_gfmul_const)
 		: "memory" );
 
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      aesni_prepare_8_15_variable;
+
+      aesni_prepare_8_15();
+
+      for ( ;nblocks >= 8 ; nblocks -= 8 )
+	{
+	  asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm11\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
+			"pxor   %%xmm5,    %%xmm1\n\t"
+			"movdqa %%xmm5,    %%xmm7\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf0] "m" (*(inbuf + 0 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
+			"pxor   %%xmm5,    %%xmm2\n\t"
+			"movdqa %%xmm5,    %%xmm12\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf1] "m" (*(inbuf + 1 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf2], %%xmm3\n\t"
+			"pxor   %%xmm5,    %%xmm3\n\t"
+			"movdqa %%xmm5,    %%xmm13\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf2] "m" (*(inbuf + 2 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm5,    %%xmm4\n\t"
+			"movdqa %%xmm5,    %%xmm14\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf3] "m" (*(inbuf + 3 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm5,    %%xmm8\n\t"
+			"movdqa %%xmm5,    %%xmm15\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			:
+			: [inbuf4] "m" (*(inbuf + 4 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf5], %%xmm9\n\t"
+			"pxor   %%xmm5,    %%xmm9\n\t"
+			"movdqu %%xmm5,    %[outbuf5]\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			: [outbuf5] "=m" (*(outbuf + 5 * 16))
+			: [inbuf5] "m" (*(inbuf + 5 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
+			"pxor   %%xmm5,    %%xmm10\n\t"
+			"movdqu %%xmm5,    %[outbuf6]\n\t"
+
+			"movdqa %%xmm11,   %%xmm0\n\t"
+			"paddd  %%xmm11,   %%xmm11\n\t"
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			: [outbuf6] "=m" (*(outbuf + 6 * 16))
+			: [inbuf6] "m" (*(inbuf + 6 * 16))
+			: "memory" );
+
+	  asm volatile ("movdqa %%xmm11,   %%xmm0\n\t"
+			"movdqu %[inbuf7], %%xmm11\n\t"
+			"pxor   %%xmm5,    %%xmm11\n\t"
+			"movdqu %%xmm5,    %[outbuf7]\n\t"
+
+			"psrad  $31,       %%xmm0\n\t"
+			"paddq  %%xmm5,    %%xmm5\n\t"
+			"pand   %%xmm6,    %%xmm0\n\t"
+			"pxor   %%xmm0,    %%xmm5\n\t"
+			: [outbuf7] "=m" (*(outbuf + 7 * 16))
+			: [inbuf7] "m" (*(inbuf + 7 * 16))
+			: "memory" );
+
+	  asm volatile ("cmpl $12, %[rounds]\n\t"
+			"movdqa (%[key]), %%xmm0\n\t"
+			"pxor %%xmm0, %%xmm1\n\t"
+			"pxor %%xmm0, %%xmm2\n\t"
+			"pxor %%xmm0, %%xmm3\n\t"
+			"pxor %%xmm0, %%xmm4\n\t"
+			"pxor %%xmm0, %%xmm8\n\t"
+			"pxor %%xmm0, %%xmm9\n\t"
+			"pxor %%xmm0, %%xmm10\n\t"
+			"pxor %%xmm0, %%xmm11\n\t"
+			"movdqa 0x10(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x20(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x30(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x40(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x50(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x60(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x70(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x80(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0x90(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0xa0(%[key]), %%xmm0\n\t"
+			"jb .Ldeclast%=\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0xb0(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0xc0(%[key]), %%xmm0\n\t"
+			"je .Ldeclast%=\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0xd0(%[key]), %%xmm0\n\t"
+			"aesdec %%xmm0, %%xmm1\n\t"
+			"aesdec %%xmm0, %%xmm2\n\t"
+			"aesdec %%xmm0, %%xmm3\n\t"
+			"aesdec %%xmm0, %%xmm4\n\t"
+			"aesdec %%xmm0, %%xmm8\n\t"
+			"aesdec %%xmm0, %%xmm9\n\t"
+			"aesdec %%xmm0, %%xmm10\n\t"
+			"aesdec %%xmm0, %%xmm11\n\t"
+			"movdqa 0xe0(%[key]), %%xmm0\n\t"
+
+			".Ldeclast%=:\n\t"
+			:
+			: [key] "r" (ctx->keyschdec),
+			  [rounds] "rm" (ctx->rounds)
+			: "cc", "memory");
+
+	  asm volatile ("pxor %%xmm0, %%xmm7\n\t"
+			"pxor %%xmm0, %%xmm12\n\t"
+			"pxor %%xmm0, %%xmm13\n\t"
+			"pxor %%xmm0, %%xmm14\n\t"
+			"aesdeclast %%xmm7, %%xmm1\n\t"
+			"aesdeclast %%xmm12, %%xmm2\n\t"
+			"aesdeclast %%xmm13, %%xmm3\n\t"
+			"aesdeclast %%xmm14, %%xmm4\n\t"
+			"movdqu 5*16(%[outbuf]), %%xmm12\n\t"
+			"movdqu 6*16(%[outbuf]), %%xmm13\n\t"
+			"movdqu 7*16(%[outbuf]), %%xmm14\n\t"
+			"pxor %%xmm0, %%xmm15\n\t"
+			"pxor %%xmm0, %%xmm12\n\t"
+			"pxor %%xmm0, %%xmm13\n\t"
+			"pxor %%xmm0, %%xmm14\n\t"
+			"aesdeclast %%xmm15, %%xmm8\n\t"
+			"aesdeclast %%xmm12, %%xmm9\n\t"
+			"aesdeclast %%xmm13, %%xmm10\n\t"
+			"aesdeclast %%xmm14, %%xmm11\n\t"
+			"movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+			"movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+			"movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+			"movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+			"movdqu %%xmm8, 4*16(%[outbuf])\n\t"
+			"movdqu %%xmm9, 5*16(%[outbuf])\n\t"
+			"movdqu %%xmm10, 6*16(%[outbuf])\n\t"
+			"movdqu %%xmm11, 7*16(%[outbuf])\n\t"
+			:
+			: [outbuf] "r" (outbuf)
+			: "memory" );
+
+	  outbuf += 8*BLOCKSIZE;
+	  inbuf  += 8*BLOCKSIZE;
+	}
+
+      aesni_cleanup_8_15();
+    }
+#endif
+
   for ( ;nblocks >= 4; nblocks -= 4 )
     {
       asm volatile ("pshufd $0x13,     %%xmm5,  %%xmm4\n\t"
-- 
2.27.0