[PATCH 5/5] chacha20-ppc: use target and optimize attributes for P8 and P9

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Feb 26 14:00:37 CET 2023


* cipher/chacha20-ppc.c (_gcry_chacha20_ppc8_blocks1): Rename to...
(chacha20_ppc_blocks1): ...this; Add 'always inline' attribute.
(_gcry_chacha20_ppc8_blocks4): Rename to...
(chacha20_ppc_blocks4): ...this; Add 'always inline' attribute.
(_gcry_chacha20_poly1305_ppc8_blocks4): Rename to...
(chacha20_poly1305_ppc_blocks4): ...this; Add 'always inline'
attribute.
(FUNC_ATTR_OPT_O2, FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
(_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4)
(_gcry_chacha20_poly1305_ppc8_blocks4): New.
(_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
(_gcry_chacha20_poly1305_ppc9_blocks4): New.
* cipher/chacha20.c (CHACHA20_context_t): Add 'use_p9'.
(_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
(_gcry_chacha20_poly1305_ppc9_blocks4): New.
(chacha20_do_setkey): Set 'use_p9' if HW has HWF_PPC_ARCH_3_00.
(chacha20_blocks, do_chacha20_encrypt_stream_tail)
(_gcry_chacha20_poly1305_encrypt)
(_gcry_chacha20_poly1305_decrypt) [USE_PPC_VEC]: Add 'use_p9' paths.
--

This change makes sure that chacha20-ppc gets compiled
with the proper optimization level and the right target setting.

Benchmark on POWER9:

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      1.11 ns/B     856.0 MiB/s      2.56 c/B
     STREAM dec |      1.11 ns/B     856.0 MiB/s      2.56 c/B
   POLY1305 enc |      1.57 ns/B     606.2 MiB/s      3.62 c/B
   POLY1305 dec |      1.56 ns/B     610.4 MiB/s      3.59 c/B
  POLY1305 auth |     0.876 ns/B      1089 MiB/s      2.02 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/chacha20-ppc.c | 118 ++++++++++++++++++++++++++++++++++++++----
 cipher/chacha20.c     |  55 ++++++++++++++++----
 2 files changed, 154 insertions(+), 19 deletions(-)

diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 4a21b837..3fe7bc8c 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -136,9 +136,8 @@ vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a)
 #define ADD_U64(v,a) \
 	(v = vec_add_ctr_u64(v, a))
 
-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
-			    size_t nblks)
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks)
 {
   vector4x_u32 counter_1 = { 1, 0, 0, 0 };
   vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
@@ -283,9 +282,8 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
 	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
 	    ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
 
-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
-			    size_t nblks)
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks)
 {
   vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
   vector4x_u32 counter_4 = { 4, 0, 0, 0 };
@@ -470,10 +468,10 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
     MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \
   } while (0)
 
-unsigned int ASM_FUNC_ATTR
-_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
-				     size_t nblks, POLY1305_STATE *st,
-				     const byte *poly1305_src)
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src,
+			      size_t nblks, POLY1305_STATE *st,
+			      const byte *poly1305_src)
 {
   vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
   vector4x_u32 counter_4 = { 4, 0, 0, 0 };
@@ -641,6 +639,106 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
   return 0;
 }
 
+#else
+
+static unsigned int ASM_FUNC_ATTR_INLINE
+chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src,
+			      size_t nblks, POLY1305_STATE *st,
+			      const byte *poly1305_src)
+{
+}
+
 #endif /* SIZEOF_UNSIGNED_LONG == 8 */
 
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+#endif
+
+
+/* Functions targeting POWER8. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src,
+			    size_t nblks)
+{
+  return chacha20_ppc_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+			    size_t nblks)
+{
+  return chacha20_ppc_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
+				     size_t nblks, POLY1305_STATE *st,
+				     const byte *poly1305_src)
+{
+  return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st,
+				       poly1305_src);
+}
+
+#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET
+/* Functions targeting POWER9. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src,
+			    size_t nblks)
+{
+  return chacha20_ppc_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+			    size_t nblks)
+{
+  return chacha20_ppc_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+				     size_t nblks, POLY1305_STATE *st,
+				     const byte *poly1305_src)
+{
+  return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st,
+				       poly1305_src);
+}
+#else
+/* Compiler does not support the target attribute; use the same functions for
+ * POWER9 as for POWER8. */
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src,
+			    size_t nblks)
+{
+  return _gcry_chacha20_ppc8_blocks1(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+			    size_t nblks)
+{
+  return _gcry_chacha20_ppc8_blocks4(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2
+_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src,
+				     size_t nblks, POLY1305_STATE *st,
+				     const byte *poly1305_src)
+{
+  return _gcry_chacha20_poly1305_ppc8_blocks4(state, dst, src, nblks, st,
+					      poly1305_src);
+}
+#endif /* HAVE_GCC_ATTRIBUTE_PPC_TARGET */
+
 #endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index a7e0dd63..d979d263 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -134,6 +134,7 @@ typedef struct CHACHA20_context_s
   unsigned int use_avx512:1;
   unsigned int use_neon:1;
   unsigned int use_ppc:1;
+  unsigned int use_p9:1;
   unsigned int use_p10:1;
   unsigned int use_s390x:1;
 } CHACHA20_context_t;
@@ -195,12 +196,24 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
 					 const byte *src,
 					 size_t nblks);
 
+unsigned int _gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst,
+					 const byte *src,
+					 size_t nblks);
+
+unsigned int _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst,
+					 const byte *src,
+					 size_t nblks);
+
 #undef USE_PPC_VEC_POLY1305
 #if SIZEOF_UNSIGNED_LONG == 8
 #define USE_PPC_VEC_POLY1305 1
 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
 		u32 *state, byte *dst, const byte *src, size_t nblks,
 		POLY1305_STATE *st, const byte *poly1305_src);
+
+unsigned int _gcry_chacha20_poly1305_ppc9_blocks4(
+		u32 *state, byte *dst, const byte *src, size_t nblks,
+		POLY1305_STATE *st, const byte *poly1305_src);
 #endif /* SIZEOF_UNSIGNED_LONG == 8 */
 
 #endif /* USE_PPC_VEC */
@@ -369,7 +382,10 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
 #ifdef USE_PPC_VEC
   if (ctx->use_ppc)
     {
-      return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
+      if (ctx->use_p9)
+	return _gcry_chacha20_ppc9_blocks1(ctx->input, dst, src, nblks);
+      else
+	return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
     }
 #endif
 
@@ -509,6 +525,7 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #endif
 #ifdef USE_PPC_VEC
   ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
+  ctx->use_p9  = (features & HWF_PPC_ARCH_3_00) != 0;
 # ifndef WORDS_BIGENDIAN
   ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0;
 #  ifdef ENABLE_FORCE_SOFT_HWFEATURES
@@ -626,18 +643,25 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
+      if (0)
+        {}
 #ifndef WORDS_BIGENDIAN
       /*
        * A workaround to skip counter overflow. This is rare.
        */
-      if (ctx->use_p10 && nblocks >= 8
-          && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
+      else if (ctx->use_p10 && nblocks >= 8
+               && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU)
         {
           size_t len = nblocks * CHACHA20_BLOCK_SIZE;
           nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len);
         }
-      else
 #endif
+      else if (ctx->use_p9)
+        {
+          nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf,
+                                              nblocks);
+        }
+      else
         {
           nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf,
                                               nblocks);
@@ -844,7 +868,10 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
     }
   else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
     {
-      nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
+      if (ctx->use_p9)
+        nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, 4);
+      else
+	nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
       burn = nburn > burn ? nburn : burn;
 
       authptr = outbuf;
@@ -986,7 +1013,12 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
 	  size_t nblocks = length / CHACHA20_BLOCK_SIZE;
 	  nblocks -= nblocks % 4;
 
-	  nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+	  if (ctx->use_p9)
+	    nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
+		      ctx->input, outbuf, inbuf, nblocks,
+		      &c->u_mode.poly1305.ctx.state, authptr);
+	  else
+	    nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
 		      ctx->input, outbuf, inbuf, nblocks,
 		      &c->u_mode.poly1305.ctx.state, authptr);
 	  burn = nburn > burn ? nburn : burn;
@@ -1212,9 +1244,14 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
 
-      nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
-			ctx->input, outbuf, inbuf, nblocks,
-			&c->u_mode.poly1305.ctx.state, inbuf);
+      if (ctx->use_p9)
+	nburn = _gcry_chacha20_poly1305_ppc9_blocks4(
+			  ctx->input, outbuf, inbuf, nblocks,
+			  &c->u_mode.poly1305.ctx.state, inbuf);
+      else
+	nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+			  ctx->input, outbuf, inbuf, nblocks,
+			  &c->u_mode.poly1305.ctx.state, inbuf);
       burn = nburn > burn ? nburn : burn;
 
       length -= nblocks * CHACHA20_BLOCK_SIZE;
-- 
2.37.2




More information about the Gcrypt-devel mailing list