[PATCH] rijndael-aesni: use inline checksumming for OCB decryption

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun May 28 16:53:55 CEST 2023


* cipher/rijndael-aesni.c (aesni_ocb_checksum): Remove.
(aesni_ocb_dec): Add inline checksumming.
--

Inline checksumming is far faster than two-pass checksumming on Ryzen
processors in i386 builds.

Benchmark on AMD Ryzen 9 7900X (i386):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB dec |     0.180 ns/B      5292 MiB/s     0.847 c/B      4700

After (~2x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB dec |     0.091 ns/B     10491 MiB/s     0.427 c/B      4700

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-aesni.c | 220 ++++++++--------------------------------
 1 file changed, 43 insertions(+), 177 deletions(-)

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 906737a6..b33ef7ed 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -2710,174 +2710,6 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
 }
 
 
-static ASM_FUNC_ATTR_INLINE void
-aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
-		    size_t nblocks)
-{
-  RIJNDAEL_context *ctx = (void *)&c->context.c;
-
-  /* Calculate checksum */
-  asm volatile ("movdqu %[checksum], %%xmm6\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                :
-                :[checksum] "m" (*c->u_ctr.ctr)
-                : "memory" );
-
-  if (0) {}
-#if defined(HAVE_GCC_INLINE_ASM_AVX2)
-  else if (nblocks >= 16 && ctx->use_avx2)
-    {
-      /* Use wider 256-bit registers for fast xoring of plaintext. */
-      asm volatile ("vzeroupper\n\t"
-		    "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
-		    "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
-		    "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
-		    "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
-                    :
-                    :
-                    : "memory");
-
-      for (;nblocks >= 16; nblocks -= 16)
-	{
-	  asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
-			"vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
-			"vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
-			"vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
-			:
-			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
-			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
-			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
-			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
-			: "memory" );
-	  asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
-			"vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
-			"vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
-			"vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
-			:
-			: [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
-			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
-			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
-			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
-			: "memory" );
-	  plaintext += BLOCKSIZE * 16;
-	}
-
-      asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
-		    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
-		    "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
-		    "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
-		    "vextracti128 $1, %%ymm6, %%xmm0\n\t"
-		    "vextracti128 $1, %%ymm1, %%xmm4\n\t"
-		    "vextracti128 $1, %%ymm2, %%xmm5\n\t"
-		    "vextracti128 $1, %%ymm3, %%xmm7\n\t"
-		    "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
-		    "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
-		    "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
-		    "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
-		    "vzeroupper\n\t"
-		    :
-		    :
-		    : "memory" );
-    }
-#endif
-#if defined(HAVE_GCC_INLINE_ASM_AVX)
-  else if (nblocks >= 16 && ctx->use_avx)
-    {
-      /* Same as AVX2, except using 256-bit floating point instructions. */
-      asm volatile ("vzeroupper\n\t"
-		    "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
-		    "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
-		    "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
-		    "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
-                    :
-                    :
-                    : "memory");
-
-      for (;nblocks >= 16; nblocks -= 16)
-	{
-	  asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
-			"vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
-			"vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
-			"vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
-			:
-			: [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
-			  [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
-			  [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
-			  [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
-			: "memory" );
-	  asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
-			"vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
-			"vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
-			"vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
-			:
-			: [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
-			  [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
-			  [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
-			  [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
-			: "memory" );
-	  plaintext += BLOCKSIZE * 16;
-	}
-
-      asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
-		    "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
-		    "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
-		    "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
-		    "vextractf128 $1, %%ymm6, %%xmm0\n\t"
-		    "vextractf128 $1, %%ymm1, %%xmm4\n\t"
-		    "vextractf128 $1, %%ymm2, %%xmm5\n\t"
-		    "vextractf128 $1, %%ymm3, %%xmm7\n\t"
-		    "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
-		    "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
-		    "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
-		    "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
-		    "vzeroupper\n\t"
-		    :
-		    :
-		    : "memory" );
-    }
-#endif
-
-  for (;nblocks >= 4; nblocks -= 4)
-    {
-      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
-		    "movdqu %[ptr1], %%xmm4\n\t"
-		    "movdqu %[ptr2], %%xmm5\n\t"
-		    "movdqu %[ptr3], %%xmm7\n\t"
-		    "pxor %%xmm0, %%xmm6\n\t"
-		    "pxor %%xmm4, %%xmm1\n\t"
-		    "pxor %%xmm5, %%xmm2\n\t"
-		    "pxor %%xmm7, %%xmm3\n\t"
-		    :
-		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
-		      [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
-		      [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
-		      [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
-		    : "memory" );
-      plaintext += BLOCKSIZE * 4;
-    }
-
-  for (;nblocks >= 1; nblocks -= 1)
-    {
-      asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
-		    "pxor %%xmm0, %%xmm6\n\t"
-		    :
-		    : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
-		    : "memory" );
-      plaintext += BLOCKSIZE;
-    }
-
-  asm volatile ("pxor %%xmm1, %%xmm6\n\t"
-		"pxor %%xmm2, %%xmm6\n\t"
-		"pxor %%xmm3, %%xmm6\n\t"
-		"movdqu %%xmm6, %[checksum]\n\t"
-		: [checksum] "=m" (*c->u_ctr.ctr)
-		:
-		: "memory" );
-}
-
-
 static unsigned int ASM_FUNC_ATTR_NOINLINE
 aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
                const void *inbuf_arg, size_t nblocks)
@@ -3401,9 +3233,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
   /* Preload Offset */
   asm volatile ("movdqu %[iv], %%xmm5\n\t"
-                : /* No output */
-                : [iv] "m" (*c->u_iv.iv)
-                : "memory" );
+		"movdqu %[ctr], %%xmm7\n\t"
+		: /* No output */
+		: [iv] "m" (*c->u_iv.iv),
+		  [ctr] "m" (*c->u_ctr.ctr)
+		: "memory" );
 
   for ( ;nblocks && n % 4; nblocks-- )
     {
@@ -3424,6 +3258,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
                     "movdqu %%xmm0, %[outbuf]\n\t"
+		    "pxor   %%xmm0, %%xmm7\n\t"
                     : [outbuf] "=m" (*outbuf)
                     :
                     : "memory" );
@@ -3452,6 +3287,15 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 		    "pxor %[first_key], %%xmm5\n\t"
 		    "pxor %[first_key], %%xmm0\n\t"
 		    "movdqa %%xmm0, %[lxfkey]\n\t"
+		    /* Clear plaintext blocks */
+		    "pxor   %%xmm1,    %%xmm1\n\t"
+		    "pxor   %%xmm2,    %%xmm2\n\t"
+		    "pxor   %%xmm3,    %%xmm3\n\t"
+		    "pxor   %%xmm4,    %%xmm4\n\t"
+		    "pxor   %%xmm8,    %%xmm8\n\t"
+		    "pxor   %%xmm9,    %%xmm9\n\t"
+		    "pxor   %%xmm10,   %%xmm10\n\t"
+		    "pxor   %%xmm11,   %%xmm11\n\t"
 		    : [lxfkey] "=m" (*lxf_key)
 		    : [l0] "m" (*c->u_mode.ocb.L[0]),
 		      [last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]),
@@ -3463,7 +3307,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 	  n += 4;
 	  l = aes_ocb_get_l(c, n);
 
-	  asm volatile ("movdqu %[l0l1],   %%xmm10\n\t"
+	  asm volatile ("pxor   %%xmm10,   %%xmm1\n\t"
+			"pxor   %%xmm11,   %%xmm2\n\t"
+			"movdqu %[l0l1],   %%xmm10\n\t"
 			"movdqu %[l1],     %%xmm11\n\t"
 			"movdqu %[l3],     %%xmm15\n\t"
 			:
@@ -3477,7 +3323,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
 	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 	  /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i)  */
-	  asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+	  asm volatile ("pxor   %%xmm1,    %%xmm4\n\t"
+			"pxor   %%xmm2,    %%xmm8\n\t"
+			"pxor   %%xmm3,    %%xmm9\n\t"
+			"movdqu %[inbuf0], %%xmm1\n\t"
 			"movdqu %[inbuf1], %%xmm2\n\t"
 			"movdqu %[inbuf2], %%xmm3\n\t"
 			:
@@ -3485,8 +3334,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 			  [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
 			  [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
 			: "memory" );
-	  asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+	  asm volatile ("pxor   %%xmm4,    %%xmm7\n\t"
+			"movdqu %[inbuf3], %%xmm4\n\t"
+			"pxor   %%xmm8,    %%xmm7\n\t"
 			"movdqu %[inbuf4], %%xmm8\n\t"
+			"pxor   %%xmm9,    %%xmm7\n\t"
 			"movdqu %[inbuf5], %%xmm9\n\t"
 			:
 			: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
@@ -3722,6 +3574,15 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
       asm volatile ("pxor %[first_key], %%xmm5\n\t"
 		    "pxor %%xmm0, %%xmm0\n\t"
 		    "movdqu %%xmm0, %[lxfkey]\n\t"
+		    /* Add plaintext blocks to checksum */
+		    "pxor   %%xmm1,    %%xmm2\n\t"
+		    "pxor   %%xmm3,    %%xmm4\n\t"
+		    "pxor   %%xmm9,    %%xmm8\n\t"
+		    "pxor   %%xmm11,   %%xmm10\n\t"
+		    "pxor   %%xmm2,    %%xmm4\n\t"
+		    "pxor   %%xmm8,    %%xmm10\n\t"
+		    "pxor   %%xmm4,    %%xmm7\n\t"
+		    "pxor   %%xmm10,   %%xmm7\n\t"
 		    : [lxfkey] "=m" (*lxf_key)
 		    : [first_key] "m" (ctx->keyschdec[0][0][0])
 		    : "memory" );
@@ -3782,8 +3643,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
       asm volatile ("pxor   %[tmpbuf0],%%xmm1\n\t"
 		    "movdqu %%xmm1,    %[outbuf0]\n\t"
+		    "pxor   %%xmm1,    %%xmm7\n\t"
 		    "pxor   %[tmpbuf1],%%xmm2\n\t"
 		    "movdqu %%xmm2,    %[outbuf1]\n\t"
+		    "pxor   %%xmm2,    %%xmm7\n\t"
 		    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
 		      [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
 		    : [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
@@ -3791,8 +3654,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 		    : "memory" );
       asm volatile ("pxor   %[tmpbuf2],%%xmm3\n\t"
 		    "movdqu %%xmm3,    %[outbuf2]\n\t"
+		    "pxor   %%xmm3,    %%xmm7\n\t"
 		    "pxor   %%xmm5,    %%xmm4\n\t"
 		    "movdqu %%xmm4,    %[outbuf3]\n\t"
+		    "pxor   %%xmm4,    %%xmm7\n\t"
 		    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
 		      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
 		    : [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
@@ -3822,6 +3687,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
       asm volatile ("pxor   %%xmm5, %%xmm0\n\t"
                     "movdqu %%xmm0, %[outbuf]\n\t"
+		    "pxor   %%xmm0, %%xmm7\n\t"
                     : [outbuf] "=m" (*outbuf)
                     :
                     : "memory" );
@@ -3832,7 +3698,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = n;
   asm volatile ("movdqu %%xmm5, %[iv]\n\t"
-                : [iv] "=m" (*c->u_iv.iv)
+                "movdqu %%xmm7, %[ctr]\n\t"
+		: [iv] "=m" (*c->u_iv.iv),
+		  [ctr] "=m" (*c->u_ctr.ctr)
                 :
                 : "memory" );
 
@@ -3846,8 +3714,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
                 :
                 : "memory" );
 
-  aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
-
   aesni_cleanup ();
   aesni_cleanup_2_7 ();
 
-- 
2.39.2




More information about the Gcrypt-devel mailing list