[PATCH 3/5] aes-ppc: add CTR32LE bulk acceleration

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Feb 26 14:00:35 CET 2023


* cipher/rijndael-ppc-functions.h (CTR32LE_ENC_FUNC): New.
* cipher/rijndael-ppc.c (_gcry_aes_ppc8_ctr32le_enc): New.
* cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ctr32le_enc): New.
* cipher/rijndael.c (_gcry_aes_ppc8_ctr32le_enc)
(_gcry_aes_ppc9le_ctr32le_enc): New.
(do_setkey): Set up _gcry_aes_ppc8_ctr32le_enc for POWER8 and
_gcry_aes_ppc9le_ctr32le_enc for POWER9.
--

Benchmark on POWER9:

 Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
    GCM-SIV enc |      1.42 ns/B     672.2 MiB/s      3.26 c/B

 After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
    GCM-SIV enc |     0.725 ns/B      1316 MiB/s      1.67 c/B
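
 For reference, the semantics of the new bulk function -- the "CTR32LE"
 counter mode used by GCM-SIV (RFC 8452) -- can be written as a plain
 scalar loop.  The sketch below is only illustrative and is not part of
 this patch; the encrypt callback and all names are hypothetical:

   /* Illustrative scalar reference for the ctr32le_enc contract:
      encrypt the counter block, XOR with the input, then increment
      only the low 32 bits of the counter, little-endian, wrapping
      modulo 2^32 (bytes 4..15 never change).  */
   #include <stddef.h>
   #include <stdint.h>

   typedef void (*enc_fn_t) (const void *key, uint8_t dst[16],
                             const uint8_t src[16]);

   static void
   ctr32le_enc_ref (enc_fn_t encrypt, const void *key, uint8_t ctr[16],
                    uint8_t *out, const uint8_t *in, size_t nblocks)
   {
     uint8_t ks[16];
     uint32_t c;
     size_t i;

     for (; nblocks; nblocks--, in += 16, out += 16)
       {
         encrypt (key, ks, ctr);        /* E_K(counter block) */
         for (i = 0; i < 16; i++)
           out[i] = in[i] ^ ks[i];      /* XOR with keystream */

         /* 32-bit little-endian increment of the low word only.  */
         c = (uint32_t)ctr[0] | ((uint32_t)ctr[1] << 8)
             | ((uint32_t)ctr[2] << 16) | ((uint32_t)ctr[3] << 24);
         c++;
         ctr[0] = (uint8_t)c;
         ctr[1] = (uint8_t)(c >> 8);
         ctr[2] = (uint8_t)(c >> 16);
         ctr[3] = (uint8_t)(c >> 24);
       }
   }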

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-ppc-functions.h | 245 ++++++++++++++++++++++++++++++++
 cipher/rijndael-ppc.c           |   1 +
 cipher/rijndael-ppc9le.c        |   1 +
 cipher/rijndael.c               |  11 ++
 4 files changed, 258 insertions(+)
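
 The key trick in the new function is keeping the counter in native
 32-bit lanes so that each per-block increment is a single vector add,
 byte-reversing only when forming the block fed into the cipher rounds.
 Below is a minimal stand-alone model of the VEC_ADD_CTRLE32 macro for
 the !WORDS_BIGENDIAN lane order (typedefs mirror those in
 rijndael-ppc-common.h; this is a sketch, not code from the patch):

   #include <altivec.h>

   typedef vector unsigned int vec_u32;
   typedef vector unsigned char block;

   static inline block
   ctr32le_block (vec_u32 ctr, unsigned int n)
   {
     /* Add the block index into lane 0 (the little-endian low 32-bit
        word); the lane wraps modulo 2^32 and no carry can reach the
        other lanes.  vec_reve then byte-reverses the 16-byte vector
        into the order the big-endian vcipher pipeline expects.  */
     const vec_u32 add = { n, 0, 0, 0 };
     return vec_reve ((block)(ctr + add));
   }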

diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
index 8a05d3c9..79eca7a2 100644
--- a/cipher/rijndael-ppc-functions.h
+++ b/cipher/rijndael-ppc-functions.h
@@ -2292,3 +2292,248 @@ void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
 
 #undef GEN_TWEAK
 }
+
+
+void CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg,
+		      const void *inbuf_arg, size_t nblocks)
+{
+#ifndef WORDS_BIGENDIAN
+  static const vec_u32 vec_u32_one = { 1, 0, 0, 0 };
+#else
+  static const vec_u32 vec_u32_one = { 0, 0, 0, 1 };
+#endif
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block rkeylast_orig;
+  block b;
+  vec_u32 ctr, one;
+
+  ctr = (vec_u32)vec_reve (VEC_LOAD_BE (ctr_arg, 0, bige_const));
+  one = vec_u32_one;
+
+  PRELOAD_ROUND_KEYS (rounds);
+  rkeylast_orig = rkeylast;
+
+#define VEC_ADD_CTRLE32(ctrv_u32, addv_u32) \
+      vec_reve((block)((ctrv_u32) + (addv_u32)))
+
+  if (nblocks >= 4)
+    {
+      block in0, in1, in2, in3, in4, in5, in6, in7;
+      block b0, b1, b2, b3, b4, b5, b6, b7;
+      vec_u32 two, three, four, five, six, seven, eight;
+      block rkey;
+
+      two   = one + one;
+      three = two + one;
+      four  = two + two;
+      five  = three + two;
+      six   = three + three;
+      seven = four + three;
+      eight = four + four;
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b1 = VEC_ADD_CTRLE32 (ctr, one);
+	  b2 = VEC_ADD_CTRLE32 (ctr, two);
+	  b3 = VEC_ADD_CTRLE32 (ctr, three);
+	  b4 = VEC_ADD_CTRLE32 (ctr, four);
+	  b5 = VEC_ADD_CTRLE32 (ctr, five);
+	  b6 = VEC_ADD_CTRLE32 (ctr, six);
+	  b7 = VEC_ADD_CTRLE32 (ctr, seven);
+	  b0 = asm_xor (rkey0, vec_reve((block)ctr));
+	  rkey = ALIGNED_LOAD (rk, 1);
+	  ctr = ctr + eight;
+	  b1 = asm_xor (rkey0, b1);
+	  b2 = asm_xor (rkey0, b2);
+	  b3 = asm_xor (rkey0, b3);
+	  b0 = asm_cipher_be (b0, rkey);
+	  b1 = asm_cipher_be (b1, rkey);
+	  b2 = asm_cipher_be (b2, rkey);
+	  b3 = asm_cipher_be (b3, rkey);
+	  b4 = asm_xor (rkey0, b4);
+	  b5 = asm_xor (rkey0, b5);
+	  b6 = asm_xor (rkey0, b6);
+	  b7 = asm_xor (rkey0, b7);
+	  b4 = asm_cipher_be (b4, rkey);
+	  b5 = asm_cipher_be (b5, rkey);
+	  b6 = asm_cipher_be (b6, rkey);
+	  b7 = asm_cipher_be (b7, rkey);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey); \
+	      b4 = asm_cipher_be (b4, rkey); \
+	      b5 = asm_cipher_be (b5, rkey); \
+	      b6 = asm_cipher_be (b6, rkey); \
+	      b7 = asm_cipher_be (b7, rkey);
+
+	  in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+	  DO_ROUND(2);
+	  in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+	  DO_ROUND(3);
+	  in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+	  DO_ROUND(4);
+	  in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+	  DO_ROUND(5);
+	  in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+	  DO_ROUND(6);
+	  in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+	  DO_ROUND(7);
+	  in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+	  DO_ROUND(8);
+	  in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+	  in += 8;
+	  DO_ROUND(9);
+
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  in0 = VEC_BE_SWAP (in0, bige_const);
+	  in1 = VEC_BE_SWAP (in1, bige_const);
+	  in2 = VEC_BE_SWAP (in2, bige_const);
+	  in3 = VEC_BE_SWAP (in3, bige_const);
+	  in4 = VEC_BE_SWAP (in4, bige_const);
+	  in5 = VEC_BE_SWAP (in5, bige_const);
+	  in6 = VEC_BE_SWAP (in6, bige_const);
+	  in7 = VEC_BE_SWAP (in7, bige_const);
+
+	  in0 = asm_xor (rkeylast, in0);
+	  in1 = asm_xor (rkeylast, in1);
+	  in2 = asm_xor (rkeylast, in2);
+	  in3 = asm_xor (rkeylast, in3);
+	  b0 = asm_cipherlast_be (b0, in0);
+	  b1 = asm_cipherlast_be (b1, in1);
+	  in4 = asm_xor (rkeylast, in4);
+	  in5 = asm_xor (rkeylast, in5);
+	  b2 = asm_cipherlast_be (b2, in2);
+	  b3 = asm_cipherlast_be (b3, in3);
+	  in6 = asm_xor (rkeylast, in6);
+	  in7 = asm_xor (rkeylast, in7);
+	  b4 = asm_cipherlast_be (b4, in4);
+	  b5 = asm_cipherlast_be (b5, in5);
+	  b6 = asm_cipherlast_be (b6, in6);
+	  b7 = asm_cipherlast_be (b7, in7);
+
+	  b0 = VEC_BE_SWAP (b0, bige_const);
+	  b1 = VEC_BE_SWAP (b1, bige_const);
+	  b2 = VEC_BE_SWAP (b2, bige_const);
+	  b3 = VEC_BE_SWAP (b3, bige_const);
+	  b4 = VEC_BE_SWAP (b4, bige_const);
+	  b5 = VEC_BE_SWAP (b5, bige_const);
+	  b6 = VEC_BE_SWAP (b6, bige_const);
+	  b7 = VEC_BE_SWAP (b7, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 0, b0);
+	  VEC_STORE_BE_NOSWAP (out, 1, b1);
+	  VEC_STORE_BE_NOSWAP (out, 2, b2);
+	  VEC_STORE_BE_NOSWAP (out, 3, b3);
+	  VEC_STORE_BE_NOSWAP (out, 4, b4);
+	  VEC_STORE_BE_NOSWAP (out, 5, b5);
+	  VEC_STORE_BE_NOSWAP (out, 6, b6);
+	  VEC_STORE_BE_NOSWAP (out, 7, b7);
+	  out += 8;
+	}
+
+      if (nblocks >= 4)
+	{
+	  b1 = VEC_ADD_CTRLE32 (ctr, one);
+	  b2 = VEC_ADD_CTRLE32 (ctr, two);
+	  b3 = VEC_ADD_CTRLE32 (ctr, three);
+	  b0 = asm_xor (rkey0, vec_reve((block)ctr));
+	  ctr = ctr + four;
+	  b1 = asm_xor (rkey0, b1);
+	  b2 = asm_xor (rkey0, b2);
+	  b3 = asm_xor (rkey0, b3);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+
+	  in0 = VEC_LOAD_BE (in, 0, bige_const);
+	  in1 = VEC_LOAD_BE (in, 1, bige_const);
+	  in2 = VEC_LOAD_BE (in, 2, bige_const);
+	  in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  in0 = asm_xor (rkeylast, in0);
+	  in1 = asm_xor (rkeylast, in1);
+	  in2 = asm_xor (rkeylast, in2);
+	  in3 = asm_xor (rkeylast, in3);
+
+	  b0 = asm_cipherlast_be (b0, in0);
+	  b1 = asm_cipherlast_be (b1, in1);
+	  b2 = asm_cipherlast_be (b2, in2);
+	  b3 = asm_cipherlast_be (b3, in3);
+
+	  VEC_STORE_BE (out, 0, b0, bige_const);
+	  VEC_STORE_BE (out, 1, b1, bige_const);
+	  VEC_STORE_BE (out, 2, b2, bige_const);
+	  VEC_STORE_BE (out, 3, b3, bige_const);
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
+	}
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      b = vec_reve((block)ctr);
+      ctr = ctr + one;
+      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+
+      AES_ENCRYPT (b, rounds);
+
+      VEC_STORE_BE (out, 0, b, bige_const);
+
+      out++;
+      in++;
+    }
+
+#undef VEC_ADD_CTRLE32
+
+  VEC_STORE_BE (ctr_arg, 0, vec_reve((block)ctr), bige_const);
+}
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index 53c4f126..d16fbb40 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -198,6 +198,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 #define OCB_CRYPT_FUNC		_gcry_aes_ppc8_ocb_crypt
 #define OCB_AUTH_FUNC		_gcry_aes_ppc8_ocb_auth
 #define XTS_CRYPT_FUNC		_gcry_aes_ppc8_xts_crypt
+#define CTR32LE_ENC_FUNC	_gcry_aes_ppc8_ctr32le_enc
 
 #include <rijndael-ppc-functions.h>
 
diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c
index 9ce9c224..f7055290 100644
--- a/cipher/rijndael-ppc9le.c
+++ b/cipher/rijndael-ppc9le.c
@@ -97,6 +97,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
 #define OCB_CRYPT_FUNC		_gcry_aes_ppc9le_ocb_crypt
 #define OCB_AUTH_FUNC		_gcry_aes_ppc9le_ocb_auth
 #define XTS_CRYPT_FUNC		_gcry_aes_ppc9le_xts_crypt
+#define CTR32LE_ENC_FUNC	_gcry_aes_ppc9le_ctr32le_enc
 
 #include <rijndael-ppc-functions.h>
 
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 071d4a16..b49a0642 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -280,6 +280,10 @@ extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
 				      void *outbuf_arg,
 				      const void *inbuf_arg,
 				      size_t nblocks, int encrypt);
+
+extern void _gcry_aes_ppc8_ctr32le_enc (void *context, unsigned char *ctr,
+					void *outbuf_arg, const void *inbuf_arg,
+					size_t nblocks);
 #endif /*USE_PPC_CRYPTO*/
 
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
@@ -322,6 +326,11 @@ extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak,
 					const void *inbuf_arg,
 					size_t nblocks, int encrypt);
 
+extern void _gcry_aes_ppc9le_ctr32le_enc (void *context, unsigned char *ctr,
+					  void *outbuf_arg,
+					  const void *inbuf_arg,
+					  size_t nblocks);
+
 extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 					 const void *inbuf_arg,
 					 size_t nblocks, int encrypt);
@@ -633,6 +642,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+      bulk_ops->ctr32le_enc = _gcry_aes_ppc9le_ctr32le_enc;
       if (hwfeatures & HWF_PPC_ARCH_3_10)  /* for P10 */
         bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
 # ifdef ENABLE_FORCE_SOFT_HWFEATURES
@@ -663,6 +673,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt;
+      bulk_ops->ctr32le_enc = _gcry_aes_ppc8_ctr32le_enc;
     }
 #endif
 #ifdef USE_S390X_CRYPTO
-- 
2.37.2