[PATCH 1/6][v2] blake2: avoid AVX/AVX2/AVX512 when CPU has high vector inst latency

Mon Dec 29 19:01:34 CET 2025

* cipher/blake2.c (blake2b_init_ctx, blake2s_init_ctx): Disable
AVX/AVX2/AVX512 implementations if x86 CPU prefers GPR implementation
over scalar integer vector implementation.
* src/hwf-common.h (hwf_x86_cpu_details)
(_gcry_hwf_x86_cpu_details): New.
* src/hwf-x86.c (x86_cpu_details, x86_hw_features)
(x86_detect_done, _gcry_hwf_x86_cpu_details): New.
(detect_x86_gnuc): Detect Zen5 and add 'cpu_details'.
(_gcry_hwf_detect_x86): Add 'x86_cpu_details' setup.
--

Blake2s/Blake2b AVX/AVX2/AVX512 implementations are slower than the
generic C implementation if the CPU has an integer vector latency higher
than 1 (for example, AMD Zen5 has an int-vector latency of 2) and powerful
GPR execution. Therefore, use the generic C implementation for Blake2
on Zen5.

Generic C with AMD Zen5:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 BLAKE2B_512    |     0.473 ns/B      2016 MiB/s      2.72 c/B      5750
 BLAKE2S_256    |     0.798 ns/B      1195 MiB/s      4.59 c/B      5750

AVX512 with AMD Zen5:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 BLAKE2B_512    |     0.923 ns/B      1033 MiB/s      5.31 c/B      5750
 BLAKE2S_256    |      1.42 ns/B     672.4 MiB/s      8.15 c/B      5749

[v2]: Define x86-specific _gcry_hwf_x86_cpu_details() instead of
generic _gcry_hwf_get_int_vec_latency().

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/blake2.c  | 13 ++++++----
 src/hwf-common.h |  8 +++++++
 src/hwf-x86.c    | 62 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 74 insertions(+), 9 deletions(-)

diff --git a/cipher/blake2.c b/cipher/blake2.c
index 1a04fbd8..fd28879b 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -28,6 +28,7 @@
 #include "bithelp.h"
 #include "bufhelp.h"
 #include "cipher.h"
+#include "hwf-common.h"
 #include "hash-common.h"
 
 /* USE_AVX indicates whether to compile with Intel AVX code. */
@@ -491,10 +492,12 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
   memset (c, 0, sizeof (*c));
 
 #ifdef USE_AVX2
-  c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+  c->use_avx2 = !!(features & HWF_INTEL_AVX2)
+    && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
 #endif
 #ifdef USE_AVX512
-  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512)
+    && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
 #endif
 
   c->outlen = dbits / 8;
@@ -828,10 +831,12 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
   memset (c, 0, sizeof (*c));
 
 #ifdef USE_AVX
-  c->use_avx = !!(features & HWF_INTEL_AVX);
+  c->use_avx = !!(features & HWF_INTEL_AVX)
+    && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
 #endif
 #ifdef USE_AVX512
-  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512)
+    && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
 #endif
 
   c->outlen = dbits / 8;
diff --git a/src/hwf-common.h b/src/hwf-common.h
index 749ff040..039c0e06 100644
--- a/src/hwf-common.h
+++ b/src/hwf-common.h
@@ -20,6 +20,14 @@
 #ifndef HWF_COMMON_H
 #define HWF_COMMON_H
 
+struct hwf_x86_cpu_details
+{
+  unsigned int int_vector_latency;
+  unsigned int prefer_gpr_over_scalar_int_vector;
+};
+
+const struct hwf_x86_cpu_details *_gcry_hwf_x86_cpu_details (void);
+
 unsigned int _gcry_hwf_detect_x86 (void);
 unsigned int _gcry_hwf_detect_arm (void);
 unsigned int _gcry_hwf_detect_ppc (void);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 54af1c83..e2c9af0c 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -36,6 +36,12 @@
    features.  */
 #undef HAS_X86_CPUID
 
+
+static struct hwf_x86_cpu_details x86_cpu_details;
+static unsigned int x86_hw_features;
+static int x86_detect_done;
+
+
 #if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__)
 # define HAS_X86_CPUID 1
 
@@ -49,6 +55,7 @@
 # define CFI_POP4
 #endif
 
+
 static int
 is_cpuid_available(void)
 {
@@ -184,7 +191,9 @@ get_xgetbv(void)
 
 #ifdef HAS_X86_CPUID
 static unsigned int
-detect_x86_gnuc (void)
+detect_x86_gnuc (
+  struct hwf_x86_cpu_details *cpu_details
+)
 {
   union
   {
@@ -198,10 +207,15 @@ detect_x86_gnuc (void)
   unsigned int fms, family, model;
   unsigned int result = 0;
   unsigned int is_amd_cpu = 0;
+  unsigned int has_avx512bmm = 0;
+  unsigned int has_sse3 = 0;
 
   (void)os_supports_avx_avx2_registers;
   (void)os_supports_avx512_registers;
 
+  /* Assume integer vector latency of 1 by default. */
+  cpu_details->int_vector_latency = 1;
+
   if (!is_cpuid_available())
     return 0;
 
@@ -320,7 +334,8 @@ detect_x86_gnuc (void)
    * too high max_cpuid_level, so don't check level 7 if processor does not
    * support SSE3 (as cpuid:7 contains only features for newer processors).
    * Source: http://www.sandpile.org/x86/cpuid.htm  */
-  if (max_cpuid_level >= 7 && (features & 0x00000001))
+  has_sse3 = !!(features & 0x00000001);
+  if (max_cpuid_level >= 7 && has_sse3)
     {
       /* Get CPUID:7 contains further Intel feature flags. */
       get_cpuid(7, NULL, &features, &features2, NULL);
@@ -385,6 +400,16 @@ detect_x86_gnuc (void)
         result |= HWF_INTEL_GFNI;
     }
 
+  /* Check additional feature flags. */
+  if (max_cpuid_level >= 0x21 && has_sse3)
+    {
+      get_cpuid(0x21, &features, NULL, NULL, NULL);
+      if (features & (1 << 23))
+	{
+	  has_avx512bmm = 1;
+	}
+    }
+
   if ((result & HWF_INTEL_CPU) && family == 6)
     {
       /* These Intel Core processor models have SHLD/SHRD instruction that
@@ -413,6 +438,14 @@ detect_x86_gnuc (void)
 	}
     }
 
+  if (is_amd_cpu && (family == 0x1a) && !has_avx512bmm)
+    {
+      /* Zen5 has integer vector instruction latency of 2 and powerful
+       * GPR integer performance. */
+      cpu_details->int_vector_latency = 2;
+      cpu_details->prefer_gpr_over_scalar_int_vector = 1;
+    }
+
 #ifdef ENABLE_FORCE_SOFT_HWFEATURES
   /* Soft HW features mark functionality that is available on all systems
    * but not feasible to use because of slow HW implementation. */
@@ -428,6 +461,11 @@ detect_x86_gnuc (void)
    * only for those Intel processors that benefit from the SHLD
    * instruction. Enabled here unconditionally as requested. */
   result |= HWF_INTEL_FAST_SHLD;
+
+  /* Assume that integer vector instructions have minimum latency and
+   * higher scalar performance than GPR. */
+  cpu_details->int_vector_latency = 0;
+  cpu_details->prefer_gpr_over_scalar_int_vector = 0;
 #endif
 
   return result;
@@ -438,9 +476,23 @@ detect_x86_gnuc (void)
 unsigned int
 _gcry_hwf_detect_x86 (void)
 {
+  if (x86_detect_done)
+    return x86_hw_features;
+
+  memset(&x86_cpu_details, 0, sizeof(x86_cpu_details));
+  x86_hw_features = 0;
+
 #if defined (HAS_X86_CPUID)
-  return detect_x86_gnuc ();
-#else
-  return 0;
+  x86_hw_features = detect_x86_gnuc (&x86_cpu_details);
 #endif
+
+  x86_detect_done = 1;
+  return x86_hw_features;
+}
+
+
+const struct hwf_x86_cpu_details *
+_gcry_hwf_x86_cpu_details (void)
+{
+  return &x86_cpu_details;
 }
-- 
2.51.0