[PATCH] Small tweak for PowerPC Chacha20-Poly1305 round loop
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu Sep 19 21:36:29 CEST 2019
* cipher/chacha20-ppc.c (_gcry_chacha20_poly1305_ppc8_blocks4): Use
inner/outer round loop structure instead of two separate loops for
stitched and non-stitched parts.
--
Benchmark on POWER8 ~3.8GHz:
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.619 ns/B 1541 MiB/s 2.35 c/B
STREAM dec | 0.619 ns/B 1541 MiB/s 2.35 c/B
POLY1305 enc | 0.784 ns/B 1216 MiB/s 2.98 c/B
POLY1305 dec | 0.770 ns/B 1239 MiB/s 2.93 c/B
POLY1305 auth | 0.502 ns/B 1898 MiB/s 1.91 c/B
After (~2% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.765 ns/B 1247 MiB/s 2.91 c/B
POLY1305 dec | 0.749 ns/B 1273 MiB/s 2.85 c/B
Benchmark on POWER9 ~3.8GHz:
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.687 ns/B 1389 MiB/s 2.61 c/B
STREAM dec | 0.692 ns/B 1379 MiB/s 2.63 c/B
POLY1305 enc | 1.08 ns/B 880.9 MiB/s 4.11 c/B
POLY1305 dec | 1.07 ns/B 888.0 MiB/s 4.08 c/B
POLY1305 auth | 0.459 ns/B 2078 MiB/s 1.74 c/B
After (~5% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 1.03 ns/B 929.2 MiB/s 3.90 c/B
POLY1305 dec | 1.02 ns/B 936.6 MiB/s 3.87 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
0 files changed
diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 17e2f0902..985f2fcd6 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -469,7 +469,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
u64 m0, m1, m2;
u64 x0_lo, x0_hi, x1_lo, x1_hi;
u64 t0_lo, t0_hi, t1_lo, t1_hi;
- int i;
+ unsigned int i, o;
/* load poly1305 state */
m2 = 1;
@@ -515,19 +515,21 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
v12 += counters_0123;
v13 -= vec_cmplt(v12, counters_0123);
- for (i = 0; i < 16; i += 2)
- {
- POLY1305_BLOCK_PART1((i + 0) * 16);
- QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
- POLY1305_BLOCK_PART2();
- QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
- POLY1305_BLOCK_PART1((i + 1) * 16);
- QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
- POLY1305_BLOCK_PART2();
- QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
- }
- for (; i < 20; i += 2)
+ for (o = 20; o; o -= 10)
{
+ for (i = 8; i; i -= 2)
+ {
+ POLY1305_BLOCK_PART1(0 * 16);
+ QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+ POLY1305_BLOCK_PART1(1 * 16);
+ poly1305_src += 2 * 16;
+ QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+ }
+
QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
@@ -601,7 +603,6 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
src += 4*64;
dst += 4*64;
- poly1305_src += 16*16;
nblks -= 4;
}
More information about the Gcrypt-devel
mailing list