[PATCH 1/2] rijndael-ppc: small speed-up for CBC and CFB encryption

Jussi Kivilinna jussi.kivilinna at iki.fi
Wed Jul 27 22:16:31 CEST 2022


* cipher/rijndael-ppc-common.h (AES_ENCRYPT_ALL): Remove.
* cipher/rijndael-ppc-functions.h (CFB_ENC_FUNC)
(CBC_ENC_FUNC): Remove two-block unrolled loop; optimize single-block
loop for a shorter critical path.
--

This patch gives a small ~3% performance increase for CBC and CFB
encryption, as tested on POWER8.
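
The gain comes from taking the initial AddRoundKey off the per-block
critical path: the chaining value is kept pre-XORed with rkey0, and
rkey0 (plus the plaintext) is folded into the rkeylast value fed to
the final vcipherlast. The program below is only an illustrative
scalar sketch of that algebra, not the PPC vector code: toy_round and
toy_lastround are made-up stand-ins for vcipher/vcipherlast and the
64-bit block size is a toy choice; it merely checks that the folded
CFB loop used by the patch produces the same ciphertext as the
textbook loop.

/* Scalar model of the CFB rkey0-folding; toy_round/toy_lastround are
   made-up stand-ins for vcipher/vcipherlast, not real AES.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NROUNDS 10
#define NBLOCKS 8

typedef uint64_t blk;

/* Stand-in for a middle round (vcipher): any fixed permutation
   followed by an XOR with the round key.  */
static blk
toy_round (blk b, blk rk)
{
  b ^= b << 13;  b ^= b >> 7;  b ^= b << 17;
  return b ^ rk;
}

/* Stand-in for the last round (vcipherlast): also ends in a plain
   XOR with the round key, which is what makes the fold possible.  */
static blk
toy_lastround (blk b, blk rk)
{
  b ^= b << 25;  b ^= b >> 27;
  return b ^ rk;
}

static blk
toy_encrypt (blk b, const blk *rk)
{
  int r;
  b ^= rk[0];                       /* initial whitening */
  for (r = 1; r < NROUNDS; r++)
    b = toy_round (b, rk[r]);
  return toy_lastround (b, rk[NROUNDS]);
}

int
main (void)
{
  blk rk[NROUNDS + 1], pt[NBLOCKS], ref[NBLOCKS], opt[NBLOCKS];
  blk iv0 = 0x0123456789abcdefULL, iv, key0_xor_keylast;
  int i, r;

  for (r = 0; r <= NROUNDS; r++)
    rk[r] = 0x9e3779b97f4a7c15ULL * (blk)(r + 1);
  for (i = 0; i < NBLOCKS; i++)
    pt[i] = 0x0101010101010101ULL * (blk)(i + 1);

  /* Reference CFB encryption: ct[i] = E(iv) ^ pt[i]; iv = ct[i].  */
  iv = iv0;
  for (i = 0; i < NBLOCKS; i++)
    ref[i] = iv = toy_encrypt (iv, rk) ^ pt[i];

  /* Folded variant mirroring the patch: iv is kept pre-XORed with
     rk[0], and rk[0] ^ pt[i] is folded into the last round key, so
     the round chain starts immediately and its result is already the
     next iteration's pre-whitened iv.  */
  key0_xor_keylast = rk[0] ^ rk[NROUNDS];
  iv = iv0 ^ rk[0];
  for (i = 0; i < NBLOCKS; i++)
    {
      blk rkeylast = key0_xor_keylast ^ pt[i];
      blk b = iv;
      for (r = 1; r < NROUNDS; r++)
        b = toy_round (b, rk[r]);
      iv = toy_lastround (b, rkeylast);  /* == ct[i] ^ rk[0] */
      opt[i] = iv ^ rk[0];               /* ciphertext, store path only */
    }

  for (i = 0; i < NBLOCKS; i++)
    assert (ref[i] == opt[i]);
  printf ("folded CFB loop matches the reference\n");
  return 0;
}

With the fold, the serial dependency per block is just the vcipher
rounds plus one vcipherlast; the extra XOR with rkey0 sits only on
the store path.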

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
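For CBC the same rkey0 fold applies, and the inline asm in the
CBC_ENC_FUNC hunk below adds one more trick: the last round is issued
twice from the same pre-last-round state, once with rkeylast to get
the ciphertext for the store, and once with rkeylast ^ rkey0 ^ the
next plaintext block to get the next block's pre-whitened input, so
the dependency chain never waits for the ciphertext. A minimal
stand-alone check of that identity (again with a toy stand-in for
vcipherlast, nothing PPC-specific):

/* Stand-alone check of the CBC split-last-round identity; the toy
   permutation is a stand-in for vcipherlast, not real AES.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t blk;

/* Stand-in for vcipherlast: a fixed permutation followed by a plain
   XOR with the round key -- only that final XOR matters here.  */
static blk
toy_lastround (blk s, blk rk)
{
  s ^= s << 25;  s ^= s >> 27;
  return s ^ rk;
}

int
main (void)
{
  blk s       = 0x0123456789abcdefULL; /* state after penultimate round */
  blk rk0     = 0x1111111111111111ULL; /* first round key (whitening) */
  blk rklast  = 0x2222222222222222ULL; /* last round key */
  blk next_pt = 0x3333333333333333ULL; /* next plaintext block */

  /* Store path: ciphertext of the current block.  */
  blk ct = toy_lastround (s, rklast);

  /* Critical path: the next block's pre-whitened input, computed
     directly from s with a folded key, without waiting for ct.  */
  blk next_in = toy_lastround (s, rklast ^ rk0 ^ next_pt);

  /* Agrees with textbook CBC chaining (next input = ct ^ next_pt),
     kept pre-XORed with rk0 as in the patch.  */
  assert (next_in == (ct ^ next_pt ^ rk0));
  printf ("CBC split last round checks out\n");
  return 0;
}
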
 cipher/rijndael-ppc-common.h    |  25 ------
 cipher/rijndael-ppc-functions.h | 131 ++++++++++++++++++--------------
 2 files changed, 74 insertions(+), 82 deletions(-)

diff --git a/cipher/rijndael-ppc-common.h b/cipher/rijndael-ppc-common.h
index bbbeaac0..3fa9a0b9 100644
--- a/cipher/rijndael-ppc-common.h
+++ b/cipher/rijndael-ppc-common.h
@@ -158,31 +158,6 @@ typedef union
     rkeylast = ALIGNED_LOAD (rk, nrounds); \
   } while (0)
 
-#define AES_ENCRYPT_ALL(blk, nrounds) \
-  do { \
-    blk ^= rkey0; \
-    blk = asm_cipher_be (blk, rkey1); \
-    blk = asm_cipher_be (blk, rkey2); \
-    blk = asm_cipher_be (blk, rkey3); \
-    blk = asm_cipher_be (blk, rkey4); \
-    blk = asm_cipher_be (blk, rkey5); \
-    blk = asm_cipher_be (blk, rkey6); \
-    blk = asm_cipher_be (blk, rkey7); \
-    blk = asm_cipher_be (blk, rkey8); \
-    blk = asm_cipher_be (blk, rkey9); \
-    if (nrounds >= 12) \
-      { \
-	blk = asm_cipher_be (blk, rkey10); \
-	blk = asm_cipher_be (blk, rkey11); \
-	if (rounds > 12) \
-	  { \
-	    blk = asm_cipher_be (blk, rkey12); \
-	    blk = asm_cipher_be (blk, rkey13); \
-	  } \
-      } \
-    blk = asm_cipherlast_be (blk, rkeylast); \
-  } while (0)
-
 
 static ASM_FUNC_ATTR_INLINE block
 asm_aligned_ld(unsigned long offset, const void *ptr)
diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
index 72f31852..f95489d9 100644
--- a/cipher/rijndael-ppc-functions.h
+++ b/cipher/rijndael-ppc-functions.h
@@ -76,43 +76,46 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
   u128_t *out = (u128_t *)outbuf_arg;
   int rounds = ctx->rounds;
   ROUND_KEY_VARIABLES_ALL;
-  block rkeylast_orig;
-  block iv;
+  block key0_xor_keylast;
+  block iv, outiv;
 
   iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+  outiv = iv;
 
   PRELOAD_ROUND_KEYS_ALL (rounds);
-  rkeylast_orig = rkeylast;
-
-  for (; nblocks >= 2; nblocks -= 2)
-    {
-      block in2, iv1;
-
-      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
-      in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
-      in += 2;
-
-      AES_ENCRYPT_ALL (iv, rounds);
-
-      iv1 = iv;
-      rkeylast = rkeylast_orig ^ in2;
-
-      AES_ENCRYPT_ALL (iv, rounds);
-
-      VEC_STORE_BE (out++, 0, iv1, bige_const);
-      VEC_STORE_BE (out++, 0, iv, bige_const);
-    }
+  key0_xor_keylast = rkey0 ^ rkeylast;
+  iv ^= rkey0;
 
   for (; nblocks; nblocks--)
     {
-      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in++, 0, bige_const);
-
-      AES_ENCRYPT_ALL (iv, rounds);
+      rkeylast = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+      iv = asm_cipher_be (iv, rkey1);
+      iv = asm_cipher_be (iv, rkey2);
+      iv = asm_cipher_be (iv, rkey3);
+      iv = asm_cipher_be (iv, rkey4);
+      iv = asm_cipher_be (iv, rkey5);
+      iv = asm_cipher_be (iv, rkey6);
+      iv = asm_cipher_be (iv, rkey7);
+      iv = asm_cipher_be (iv, rkey8);
+      iv = asm_cipher_be (iv, rkey9);
+      if (rounds >= 12)
+	{
+	  iv = asm_cipher_be (iv, rkey10);
+	  iv = asm_cipher_be (iv, rkey11);
+	  if (rounds > 12)
+	    {
+	      iv = asm_cipher_be (iv, rkey12);
+	      iv = asm_cipher_be (iv, rkey13);
+	    }
+	}
+      iv = asm_cipherlast_be (iv, rkeylast);
 
-      VEC_STORE_BE (out++, 0, iv, bige_const);
+      outiv = rkey0 ^ iv;
+      VEC_STORE_BE (out++, 0, outiv, bige_const);
     }
 
-  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+  VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
 }
 
 void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
@@ -324,47 +327,61 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg,
   byte *out = (byte *)outbuf_arg;
   int rounds = ctx->rounds;
   ROUND_KEY_VARIABLES_ALL;
-  block lastiv, b;
+  block iv, key0_xor_keylast, nextiv, outiv;
   unsigned int outadd = -(!cbc_mac) & 16;
 
-  lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS_ALL (rounds);
-
-  for (; nblocks >= 2; nblocks -= 2)
-    {
-      block in2, lastiv1;
-
-      b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);
-      in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
-      in += 2;
-
-      AES_ENCRYPT_ALL (b, rounds);
+  if (nblocks == 0) /* CMAC may call with nblocks 0. */
+    return;
 
-      lastiv1 = b;
-      b = lastiv1 ^ in2;
+  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
 
-      AES_ENCRYPT_ALL (b, rounds);
+  PRELOAD_ROUND_KEYS_ALL (rounds);
+  key0_xor_keylast = rkey0 ^ rkeylast;
 
-      lastiv = b;
-      VEC_STORE_BE ((u128_t *)out, 0, lastiv1, bige_const);
-      out += outadd;
-      VEC_STORE_BE ((u128_t *)out, 0, lastiv, bige_const);
-      out += outadd;
-    }
+  nextiv = VEC_LOAD_BE (in++, 0, bige_const);
+  iv ^= rkey0 ^ nextiv;
 
-  for (; nblocks; nblocks--)
+  do
     {
-      b = lastiv ^ VEC_LOAD_BE (in++, 0, bige_const);
-
-      AES_ENCRYPT_ALL (b, rounds);
+      if (--nblocks)
+	{
+	  nextiv = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const);
+	}
 
-      lastiv = b;
-      VEC_STORE_BE ((u128_t *)out, 0, b, bige_const);
+      iv = asm_cipher_be (iv, rkey1);
+      iv = asm_cipher_be (iv, rkey2);
+      iv = asm_cipher_be (iv, rkey3);
+      iv = asm_cipher_be (iv, rkey4);
+      iv = asm_cipher_be (iv, rkey5);
+      iv = asm_cipher_be (iv, rkey6);
+      iv = asm_cipher_be (iv, rkey7);
+      iv = asm_cipher_be (iv, rkey8);
+      iv = asm_cipher_be (iv, rkey9);
+      if (rounds >= 12)
+	{
+	  iv = asm_cipher_be (iv, rkey10);
+	  iv = asm_cipher_be (iv, rkey11);
+	  if (rounds > 12)
+	    {
+	      iv = asm_cipher_be (iv, rkey12);
+	      iv = asm_cipher_be (iv, rkey13);
+	    }
+	}
+      outiv = iv;
+      /* The order of the following instructions is important for best
+       * performance on POWER8: the output-path vcipherlast needs to be
+       * the last one. */
+      __asm__ volatile ("vcipherlast %0, %0, %2\n\t"
+			"vcipherlast %1, %1, %3\n\t"
+			: "+v" (iv), "+v" (outiv)
+			: "v" (nextiv), "v" (rkeylast));
+
+      VEC_STORE_BE ((u128_t *)out, 0, outiv, bige_const);
       out += outadd;
     }
+  while (nblocks);
 
-  VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
+  VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
 }
 
 void CBC_DEC_FUNC (void *context, unsigned char *iv_arg,
-- 
2.34.1