[PATCH 1/6][v2] blake2: avoid AVX/AVX2/AVX512 when CPU has high vector inst latency
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Dec 29 19:01:34 CET 2025
* cipher/blake2.c (blake2b_init_ctx, blake2s_init_ctx): Disable
AVX/AVX2/AVX512 implementation if x86 CPU prefers GPR implementation
over scalar integer vector.
* src/hwf-common.h (hwf_x86_cpu_details)
(_gcry_hwf_x86_cpu_details): New.
* src/hwf-x86.c (x86_cpu_details, x86_hw_features)
(x86_detect_done, _gcry_hwf_x86_cpu_details): New.
(detect_x86_gnuc): Detect Zen5 and add 'cpu_details'.
(_gcry_hwf_detect_x86): Add 'x86_cpu_details' setup.
--
The Blake2s/Blake2b AVX/AVX2/AVX512 implementations are slower than the
generic C implementation if the CPU has an integer vector latency higher
than 1 (for example, AMD Zen5 has an int-vector latency of 2) and powerful
GPR execution. Therefore, use the generic C implementation for Blake2
on Zen5.
Generic C with AMD Zen5:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 BLAKE2B_512    |     0.473 ns/B      2016 MiB/s      2.72 c/B      5750
 BLAKE2S_256    |     0.798 ns/B      1195 MiB/s      4.59 c/B      5750
AVX512 with AMD Zen5:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 BLAKE2B_512    |     0.923 ns/B      1033 MiB/s      5.31 c/B      5750
 BLAKE2S_256    |      1.42 ns/B     672.4 MiB/s      8.15 c/B      5749
[v2]: Define the x86-specific _gcry_hwf_x86_cpu_details() instead of
the generic _gcry_hwf_get_int_vec_latency().
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/blake2.c | 13 ++++++----
src/hwf-common.h | 8 +++++++
src/hwf-x86.c | 62 ++++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 74 insertions(+), 9 deletions(-)
diff --git a/cipher/blake2.c b/cipher/blake2.c
index 1a04fbd8..fd28879b 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -28,6 +28,7 @@
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
+#include "hwf-common.h"
#include "hash-common.h"
/* USE_AVX indicates whether to compile with Intel AVX code. */
@@ -491,10 +492,12 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
memset (c, 0, sizeof (*c));
#ifdef USE_AVX2
- c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+ c->use_avx2 = !!(features & HWF_INTEL_AVX2)
+ && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
#endif
#ifdef USE_AVX512
- c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+ c->use_avx512 = !!(features & HWF_INTEL_AVX512)
+ && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
#endif
c->outlen = dbits / 8;
@@ -828,10 +831,12 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
memset (c, 0, sizeof (*c));
#ifdef USE_AVX
- c->use_avx = !!(features & HWF_INTEL_AVX);
+ c->use_avx = !!(features & HWF_INTEL_AVX)
+ && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
#endif
#ifdef USE_AVX512
- c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+ c->use_avx512 = !!(features & HWF_INTEL_AVX512)
+ && !(_gcry_hwf_x86_cpu_details()->prefer_gpr_over_scalar_int_vector);
#endif
c->outlen = dbits / 8;
diff --git a/src/hwf-common.h b/src/hwf-common.h
index 749ff040..039c0e06 100644
--- a/src/hwf-common.h
+++ b/src/hwf-common.h
@@ -20,6 +20,14 @@
#ifndef HWF_COMMON_H
#define HWF_COMMON_H
+struct hwf_x86_cpu_details
+{
+ unsigned int int_vector_latency;
+ unsigned int prefer_gpr_over_scalar_int_vector;
+};
+
+const struct hwf_x86_cpu_details *_gcry_hwf_x86_cpu_details (void);
+
unsigned int _gcry_hwf_detect_x86 (void);
unsigned int _gcry_hwf_detect_arm (void);
unsigned int _gcry_hwf_detect_ppc (void);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 54af1c83..e2c9af0c 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -36,6 +36,12 @@
features. */
#undef HAS_X86_CPUID
+
+static struct hwf_x86_cpu_details x86_cpu_details;
+static unsigned int x86_hw_features;
+static int x86_detect_done;
+
+
#if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__)
# define HAS_X86_CPUID 1
@@ -49,6 +55,7 @@
# define CFI_POP4
#endif
+
static int
is_cpuid_available(void)
{
@@ -184,7 +191,9 @@ get_xgetbv(void)
#ifdef HAS_X86_CPUID
static unsigned int
-detect_x86_gnuc (void)
+detect_x86_gnuc (
+ struct hwf_x86_cpu_details *cpu_details
+)
{
union
{
@@ -198,10 +207,15 @@ detect_x86_gnuc (void)
unsigned int fms, family, model;
unsigned int result = 0;
unsigned int is_amd_cpu = 0;
+ unsigned int has_avx512bmm = 0;
+ unsigned int has_sse3 = 0;
(void)os_supports_avx_avx2_registers;
(void)os_supports_avx512_registers;
+ /* Assume integer vector latency of 1 by default. */
+ cpu_details->int_vector_latency = 1;
+
if (!is_cpuid_available())
return 0;
@@ -320,7 +334,8 @@ detect_x86_gnuc (void)
* too high max_cpuid_level, so don't check level 7 if processor does not
* support SSE3 (as cpuid:7 contains only features for newer processors).
* Source: http://www.sandpile.org/x86/cpuid.htm */
- if (max_cpuid_level >= 7 && (features & 0x00000001))
+ has_sse3 = !!(features & 0x00000001);
+ if (max_cpuid_level >= 7 && has_sse3)
{
/* Get CPUID:7 contains further Intel feature flags. */
get_cpuid(7, NULL, &features, &features2, NULL);
@@ -385,6 +400,16 @@ detect_x86_gnuc (void)
result |= HWF_INTEL_GFNI;
}
+ /* Check additional feature flags. */
+ if (max_cpuid_level >= 0x21 && has_sse3)
+ {
+ get_cpuid(0x21, &features, NULL, NULL, NULL);
+ if (features & (1 << 23))
+ {
+ has_avx512bmm = 1;
+ }
+ }
+
if ((result & HWF_INTEL_CPU) && family == 6)
{
/* These Intel Core processor models have SHLD/SHRD instruction that
@@ -413,6 +438,14 @@ detect_x86_gnuc (void)
}
}
+ if (is_amd_cpu && (family == 0x1a) && !has_avx512bmm)
+ {
+ /* Zen5 has integer vector instruction latency of 2 and powerful
+ * GPR integer performance. */
+ cpu_details->int_vector_latency = 2;
+ cpu_details->prefer_gpr_over_scalar_int_vector = 1;
+ }
+
#ifdef ENABLE_FORCE_SOFT_HWFEATURES
/* Soft HW features mark functionality that is available on all systems
* but not feasible to use because of slow HW implementation. */
@@ -428,6 +461,11 @@ detect_x86_gnuc (void)
* only for those Intel processors that benefit from the SHLD
* instruction. Enabled here unconditionally as requested. */
result |= HWF_INTEL_FAST_SHLD;
+
+ /* Assume that integer vector instructions have minimum latency and
+ * higher scalar performance than GPR. */
+ cpu_details->int_vector_latency = 0;
+ cpu_details->prefer_gpr_over_scalar_int_vector = 0;
#endif
return result;
@@ -438,9 +476,23 @@ detect_x86_gnuc (void)
unsigned int
_gcry_hwf_detect_x86 (void)
{
+ if (x86_detect_done)
+ return x86_hw_features;
+
+ memset(&x86_cpu_details, 0, sizeof(x86_cpu_details));
+ x86_hw_features = 0;
+
#if defined (HAS_X86_CPUID)
- return detect_x86_gnuc ();
-#else
- return 0;
+ x86_hw_features = detect_x86_gnuc (&x86_cpu_details);
#endif
+
+ x86_detect_done = 1;
+ return x86_hw_features;
+}
+
+
+const struct hwf_x86_cpu_details *
+_gcry_hwf_x86_cpu_details (void)
+{
+ return &x86_cpu_details;
}
--
2.51.0
More information about the Gcrypt-devel
mailing list