[PATCH 2/6] chacha20: avoid AVX512/AVX2/SSSE3 for single block processing with Zen5

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Dec 29 19:01:35 CET 2025


* cipher/chacha20.c (CHACHA20_context_s): Add
'skip_one_block_hw_impl'.
(chacha20_blocks, do_chacha20_encrypt_stream_tail): Avoid single
block / non-parallel processing with AVX512/AVX2/SSSE3.
--

AMD Zen5 has slower integer vector performance than the general purpose
register implementation for ChaCha20. Generic C is approx 50% faster
for single block computation. This commit adjusts calls to AVX512/AVX2/SSSE3
code so that trailing single block computations are handled with generic
C on AMD Zen5.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/chacha20.c | 69 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 17b9b9f0..0bf31be7 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -36,6 +36,7 @@
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
+#include "hwf-common.h"
 #include "cipher-internal.h"
 #include "bufhelp.h"
 
@@ -135,6 +136,9 @@ typedef struct CHACHA20_context_s
   u32 input[16];
   unsigned char pad[CHACHA20_BLOCK_SIZE];
   unsigned int unused; /* bytes in the pad.  */
+#if defined(USE_SSSE3) || defined(USE_AVX512) || defined(USE_AVX2)
+  unsigned int skip_one_block_hw_impl:1;
+#endif
   unsigned int use_ssse3:1;
   unsigned int use_avx2:1;
   unsigned int use_avx512:1;
@@ -382,20 +386,49 @@ static unsigned int
 chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
 		 size_t nblks)
 {
+  unsigned int nburn, burn = 0;
+#if defined(USE_SSSE3) || defined(USE_AVX512)
+  size_t gen_nblks = 0;
+
+  if (ctx->skip_one_block_hw_impl)
+    {
+      gen_nblks = nblks % 2;
+      nblks = nblks - gen_nblks;
+    }
+#endif
+
 #ifdef USE_AVX512
-  if (ctx->use_avx512)
+  if (nblks && ctx->use_avx512)
     {
-      return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks);
+      if (gen_nblks == 0)
+	return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks);
+
+      burn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks);
+      dst += CHACHA20_BLOCK_SIZE * nblks;
+      src += CHACHA20_BLOCK_SIZE * nblks;
+      nblks = 0;
     }
 #endif
 
 #ifdef USE_SSSE3
-  if (ctx->use_ssse3)
+  if (nblks && ctx->use_ssse3)
     {
-      return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+      if (gen_nblks == 0)
+	return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+
+      burn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+      dst += CHACHA20_BLOCK_SIZE * nblks;
+      src += CHACHA20_BLOCK_SIZE * nblks;
+      nblks = 0;
     }
 #endif
 
+#if defined(USE_SSSE3) || defined(USE_AVX512)
+  nblks += gen_nblks;
+  if (nblks == 0)
+    return burn;
+#endif
+
 #ifdef USE_PPC_VEC
   if (ctx->use_ppc)
     {
@@ -420,7 +453,8 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
     }
 #endif
 
-  return do_chacha20_blocks (ctx->input, dst, src, nblks);
+  nburn = do_chacha20_blocks (ctx->input, dst, src, nblks);
+  return nburn > burn ? nburn : burn;
 }
 
 
@@ -541,6 +575,12 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #ifdef USE_AVX2
   ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
 #endif
+#if defined(USE_SSSE3) || defined(USE_AVX512) || defined(USE_AVX2)
+  /* If CPU prefers GPR over scalar integer vector implementation, use
+   * generic C chacha20 for single block non-parallel operations. */
+  ctx->skip_one_block_hw_impl =
+    !!_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector;
+#endif
 #ifdef USE_ARMV7_NEON
   ctx->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
@@ -603,12 +643,19 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
   if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
-      nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf,
-                                                 nblocks);
-      burn = nburn > burn ? nburn : burn;
-      length %= CHACHA20_BLOCK_SIZE;
-      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
-      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+
+      if (ctx->skip_one_block_hw_impl)
+	nblocks -= nblocks % 2;
+
+      if (nblocks)
+	{
+	  nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf,
+						     nblocks);
+	  burn = nburn > burn ? nburn : burn;
+	  length -= nblocks * CHACHA20_BLOCK_SIZE;
+	  outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+	  inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+	}
     }
 #endif
 
-- 
2.51.0




More information about the Gcrypt-devel mailing list