[PATCH] Small tweak for PowerPC Chacha20-Poly1305 round loop

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu Sep 19 21:36:29 CEST 2019


* cipher/chacha20-ppc.c (_gcry_chacha20_poly1305_ppc8_blocks4): Use
inner/outer round loop structure instead of two separate loops for
stitched and non-stitched parts.
--

Benchmark on POWER8 ~3.8GHz:

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.619 ns/B      1541 MiB/s      2.35 c/B
     STREAM dec |     0.619 ns/B      1541 MiB/s      2.35 c/B
   POLY1305 enc |     0.784 ns/B      1216 MiB/s      2.98 c/B
   POLY1305 dec |     0.770 ns/B      1239 MiB/s      2.93 c/B
  POLY1305 auth |     0.502 ns/B      1898 MiB/s      1.91 c/B

After (~2% faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
   POLY1305 enc |     0.765 ns/B      1247 MiB/s      2.91 c/B
   POLY1305 dec |     0.749 ns/B      1273 MiB/s      2.85 c/B

Benchmark on POWER9 ~3.8GHz:

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.687 ns/B      1389 MiB/s      2.61 c/B
     STREAM dec |     0.692 ns/B      1379 MiB/s      2.63 c/B
   POLY1305 enc |      1.08 ns/B     880.9 MiB/s      4.11 c/B
   POLY1305 dec |      1.07 ns/B     888.0 MiB/s      4.08 c/B
  POLY1305 auth |     0.459 ns/B      2078 MiB/s      1.74 c/B

After (~5% faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
   POLY1305 enc |      1.03 ns/B     929.2 MiB/s      3.90 c/B
   POLY1305 dec |      1.02 ns/B     936.6 MiB/s      3.87 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 17e2f0902..985f2fcd6 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -469,7 +469,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
   u64 m0, m1, m2;
   u64 x0_lo, x0_hi, x1_lo, x1_hi;
   u64 t0_lo, t0_hi, t1_lo, t1_hi;
-  int i;
+  unsigned int i, o;
 
   /* load poly1305 state */
   m2 = 1;
@@ -515,19 +515,21 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
       v12 += counters_0123;
       v13 -= vec_cmplt(v12, counters_0123);
 
-      for (i = 0; i < 16; i += 2)
-	{
-	  POLY1305_BLOCK_PART1((i + 0) * 16);
-	  QUARTERROUND2(v0, v4,  v8, v12,   v1, v5,  v9, v13)
-	  POLY1305_BLOCK_PART2();
-	  QUARTERROUND2(v2, v6, v10, v14,   v3, v7, v11, v15)
-	  POLY1305_BLOCK_PART1((i + 1) * 16);
-	  QUARTERROUND2(v0, v5, v10, v15,   v1, v6, v11, v12)
-	  POLY1305_BLOCK_PART2();
-	  QUARTERROUND2(v2, v7,  v8, v13,   v3, v4,  v9, v14)
-	}
-      for (; i < 20; i += 2)
+      for (o = 20; o; o -= 10)
 	{
+	  for (i = 8; i; i -= 2)
+	    {
+	      POLY1305_BLOCK_PART1(0 * 16);
+	      QUARTERROUND2(v0, v4,  v8, v12,   v1, v5,  v9, v13)
+	      POLY1305_BLOCK_PART2();
+	      QUARTERROUND2(v2, v6, v10, v14,   v3, v7, v11, v15)
+	      POLY1305_BLOCK_PART1(1 * 16);
+	      poly1305_src += 2 * 16;
+	      QUARTERROUND2(v0, v5, v10, v15,   v1, v6, v11, v12)
+	      POLY1305_BLOCK_PART2();
+	      QUARTERROUND2(v2, v7,  v8, v13,   v3, v4,  v9, v14)
+	    }
+
 	  QUARTERROUND2(v0, v4,  v8, v12,   v1, v5,  v9, v13)
 	  QUARTERROUND2(v2, v6, v10, v14,   v3, v7, v11, v15)
 	  QUARTERROUND2(v0, v5, v10, v15,   v1, v6, v11, v12)
@@ -601,7 +603,6 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
 
       src += 4*64;
       dst += 4*64;
-      poly1305_src += 16*16;
 
       nblks -= 4;
     }




More information about the Gcrypt-devel mailing list