[PATCH] avx512: tweak AVX512 spec stop, use common macro in assembly

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sun Dec 11 14:26:41 CET 2022


* cipher/cipher-gcm-intel-pclmul.c: Use xmm registers for AVX512
spec stop.
* cipher/asm-common-amd64.h (spec_stop_avx512): New.
* cipher/blake2b-amd64-avx512.S: Use spec_stop_avx512.
* cipher/blake2s-amd64-avx512.S: Likewise.
* cipher/camellia-gfni-avx512-amd64.S: Likewise.
* cipher/chacha20-amd64-avx512.S: Likewise.
* cipher/keccak-amd64-avx512.S: Likewise.
* cipher/poly1305-amd64-avx512.S: Likewise.
* cipher/sha512-avx512-amd64.S: Likewise.
* cipher/sm4-gfni-avx512-amd64.S: Likewise.
---

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
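Notes:

The new spec_stop_avx512 macro zeroes %xmm16 and then executes
vpopcntb on it. Since vpopcntb is implemented only by newer AVX512
CPUs, older CPUs cannot speculate past it into the 512-bit code, and
the 128-bit register form avoids touching 512-bit vectors. A minimal
usage sketch follows (the symbol name and body are hypothetical, for
illustration only; not part of this patch):

  /* Hypothetical example; symbol name and body are illustrative. */
  #include "asm-common-amd64.h"

  .text
  .globl _gcry_example_avx512
  ELF(.type _gcry_example_avx512,@function;)
  _gcry_example_avx512:
	CFI_STARTPROC();
	spec_stop_avx512;  /* vpxord %xmm16,%xmm16,%xmm16; vpopcntb %xmm16,%xmm16 */

	/* ... 512-bit (%zmm) processing would go here ... */

	xorl %eax, %eax;   /* return 0 */
	ret_spec_stop;     /* 'ret; int3' against straight-line speculation */
	CFI_ENDPROC();
  ELF(.size _gcry_example_avx512,.-_gcry_example_avx512;)
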
 cipher/asm-common-amd64.h           | 10 +++++++++-
 cipher/blake2b-amd64-avx512.S       |  2 ++
 cipher/blake2s-amd64-avx512.S       |  2 ++
 cipher/camellia-gfni-avx512-amd64.S | 14 +++++++-------
 cipher/chacha20-amd64-avx512.S      |  3 +--
 cipher/cipher-gcm-intel-pclmul.c    |  4 ++--
 cipher/keccak-amd64-avx512.S        |  4 ++++
 cipher/poly1305-amd64-avx512.S      |  3 +--
 cipher/sha512-avx512-amd64.S        |  2 ++
 cipher/sm4-gfni-avx512-amd64.S      | 20 ++++++++++++++------
 10 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h
index 97912b1b..dc2c4d2f 100644
--- a/cipher/asm-common-amd64.h
+++ b/cipher/asm-common-amd64.h
@@ -186,8 +186,16 @@
 # define EXIT_SYSV_FUNC
 #endif
 
-/* 'ret' instruction replacement for straight-line speculation mitigation */
+/* 'ret' instruction replacement for straight-line speculation mitigation. */
 #define ret_spec_stop \
 	ret; int3;
 
+/* This stops speculative execution into AVX512 code on old AVX512
+ * CPUs. The vpopcntb instruction is available only on newer CPUs,
+ * which do not suffer from significant frequency drop when 512-bit
+ * vectors are utilized. */
+#define spec_stop_avx512 \
+	vpxord %xmm16, %xmm16, %xmm16; \
+	vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */
+
 #endif /* GCRY_ASM_COMMON_AMD64_H */
diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S
index db53474d..18b0c3ad 100644
--- a/cipher/blake2b-amd64-avx512.S
+++ b/cipher/blake2b-amd64-avx512.S
@@ -221,6 +221,8 @@ _gcry_blake2b_transform_amd64_avx512:
          */
         CFI_STARTPROC();
 
+        spec_stop_avx512;
+
         movl $0xf, %eax;
         kmovw %eax, %k0;
         xorl %eax, %eax;
diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S
index 4457ca99..ddcdfd67 100644
--- a/cipher/blake2s-amd64-avx512.S
+++ b/cipher/blake2s-amd64-avx512.S
@@ -183,6 +183,8 @@ _gcry_blake2s_transform_amd64_avx512:
          */
         CFI_STARTPROC();
 
+        spec_stop_avx512;
+
         addq $64, (STATE_T + 0)(RSTATE);
 
         vmovdqa .Liv+(0 * 4) rRIP, ROW3;
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index 15b2dc90..bddad804 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -832,7 +832,7 @@ _gcry_camellia_gfni_avx512_ctr_enc:
 	 *	%rcx: iv (big endian, 128bit)
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19;
 	vmovdqa64 .Lcounter0123_lo rRIP, %zmm21;
@@ -985,7 +985,7 @@ _gcry_camellia_gfni_avx512_cbc_dec:
 	 *	%rcx: iv
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	movq %rcx, %r9;
 
@@ -1047,7 +1047,7 @@ _gcry_camellia_gfni_avx512_cfb_dec:
 	 *	%rcx: iv
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	cmpl $128, key_bitlength(CTX);
 	movl $32, %r8d;
@@ -1122,7 +1122,7 @@ _gcry_camellia_gfni_avx512_ocb_enc:
 	 *	%r9 : L pointers (void *L[64])
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	pushq %r12;
 	CFI_PUSH(%r12);
@@ -1285,7 +1285,7 @@ _gcry_camellia_gfni_avx512_ocb_dec:
 	 *	%r9 : L pointers (void *L[64])
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	pushq %r12;
 	CFI_PUSH(%r12);
@@ -1451,7 +1451,7 @@ _gcry_camellia_gfni_avx512_enc_blk64:
 	 *	%rdx: src (64 blocks)
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	cmpl $128, key_bitlength(CTX);
 	movl $32, %r8d;
@@ -1515,7 +1515,7 @@ _gcry_camellia_gfni_avx512_dec_blk64:
 	 *	%rdx: src (64 blocks)
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	cmpl $128, key_bitlength(CTX);
 	movl $32, %r8d;
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
index 682798fe..544e7cdc 100644
--- a/cipher/chacha20-amd64-avx512.S
+++ b/cipher/chacha20-amd64-avx512.S
@@ -298,8 +298,7 @@ _gcry_chacha20_amd64_avx512_blocks:
 	 */
 	CFI_STARTPROC();
 
-	vpxord %xmm16, %xmm16, %xmm16;
-	vpopcntb %xmm16, %xmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	cmpq $4, NBLKS;
 	jb .Lskip_vertical_handling;
diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index 78a9e338..ec00df09 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -1513,7 +1513,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
       if (nblocks >= 32
 	  && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))
 	{
-	  asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */
+	  asm volatile ("vpopcntb %%xmm7, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */
 			"vshufi64x2 $0, %%zmm7, %%zmm7, %%zmm15\n\t"
 			"vmovdqa %%xmm1, %%xmm8\n\t"
 			"vmovdqu64 %[swapperm], %%zmm14\n\t"
@@ -1792,7 +1792,7 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
       if (nblocks >= 32
 	  && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))
 	{
-	  asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */
+	  asm volatile ("vpopcntb %%xmm1, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */
 			"vmovdqa %%xmm1, %%xmm8\n\t"
 			"vmovdqu64 %[swapperm], %%zmm14\n\t"
 			:
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
index f44e0285..58b4150f 100644
--- a/cipher/keccak-amd64-avx512.S
+++ b/cipher/keccak-amd64-avx512.S
@@ -282,6 +282,8 @@ _gcry_keccak_f1600_state_permute64_avx512:
 	 */
 	CFI_STARTPROC()
 
+	spec_stop_avx512;
+
 	leaq		12*8(%rdi), %rax
 	leaq		(24-1)*8(%rsi), %r11
 
@@ -362,6 +364,8 @@ _gcry_keccak_absorb_blocks_avx512:
 	 */
 	CFI_STARTPROC()
 
+	spec_stop_avx512;
+
 	leaq		12*8(%rdi), %rax
 	leaq		(24-1)*8(%rsi), %r11
 
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
index 72303e1e..5c8f838f 100644
--- a/cipher/poly1305-amd64-avx512.S
+++ b/cipher/poly1305-amd64-avx512.S
@@ -1580,8 +1580,7 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts)
 ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;)
 _gcry_poly1305_amd64_avx512_blocks:
 	CFI_STARTPROC()
-	vpxord xmm16, xmm16, xmm16;
-	vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 	FUNC_ENTRY()
 
 #define _a0 gp3
diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S
index 0e3f44ab..145c8667 100644
--- a/cipher/sha512-avx512-amd64.S
+++ b/cipher/sha512-avx512-amd64.S
@@ -264,6 +264,8 @@ _gcry_sha512_transform_amd64_avx512:
 	cmp	rdx, 0
 	je	.Lnowork
 
+	spec_stop_avx512;
+
 	/* Setup mask register for DC:BA merging. */
 	mov	eax, 0b1100
 	kmovd	MASK_DC_00, eax
diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S
index 1d5e9a48..0f9899d4 100644
--- a/cipher/sm4-gfni-avx512-amd64.S
+++ b/cipher/sm4-gfni-avx512-amd64.S
@@ -158,6 +158,7 @@ _gcry_sm4_gfni_avx512_expand_key:
 	 *	%r8: ck array
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	vmovd 0*4(%rdi), RA0x;
 	vmovd 1*4(%rdi), RA1x;
@@ -553,6 +554,7 @@ _gcry_sm4_gfni_avx512_crypt_blk1_16:
 	 *	%rcx: num blocks (1..16)
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 #define LOAD_INPUT(offset, yreg) \
 	cmpq $(1 + 2 * (offset)), %rcx; \
@@ -621,6 +623,7 @@ _gcry_sm4_gfni_avx512_ctr_enc:
 	 *	%rcx: iv (big endian, 128bit)
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
 	vmovdqa .Lcounter0123_lo rRIP, RTMP1;
@@ -728,6 +731,7 @@ _gcry_sm4_gfni_avx512_cbc_dec:
 	 *	%rcx: iv
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	vmovdqu (0 * 32)(%rdx), RA0;
 	vmovdqu (1 * 32)(%rdx), RA1;
@@ -779,6 +783,7 @@ _gcry_sm4_gfni_avx512_cfb_dec:
 	 *	%rcx: iv
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	/* Load input */
 	vmovdqu (%rcx), RNOTx;
@@ -835,6 +840,7 @@ _gcry_sm4_gfni_avx512_ocb_enc:
 	 *	%r9 : L pointers (void *L[16])
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	subq $(4 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -950,6 +956,7 @@ _gcry_sm4_gfni_avx512_ocb_dec:
 	 *	%r9 : L pointers (void *L[16])
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	subq $(4 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -1066,6 +1073,7 @@ _gcry_sm4_gfni_avx512_ocb_auth:
 	 *	%r8 : L pointers (void *L[16])
 	 */
 	CFI_STARTPROC();
+	spec_stop_avx512;
 
 	subq $(4 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -1251,7 +1259,7 @@ _gcry_sm4_gfni_avx512_crypt_blk32:
 	 *	%rdx: src (32 blocks)
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	/* Load input */
 	vmovdqu32 (0 * 64)(%rdx), RA0z;
@@ -1292,7 +1300,7 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
 	 *	%rcx: iv (big endian, 128bit)
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z;
 	vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z;
@@ -1400,7 +1408,7 @@ _gcry_sm4_gfni_avx512_cbc_dec_blk32:
 	 *	%rcx: iv
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	vmovdqu32 (0 * 64)(%rdx), RA0z;
 	vmovdqu32 (1 * 64)(%rdx), RA1z;
@@ -1453,7 +1461,7 @@ _gcry_sm4_gfni_avx512_cfb_dec_blk32:
 	 *	%rcx: iv
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	/* Load input */
 	vmovdqu (%rcx), RA0x;
@@ -1510,7 +1518,7 @@ _gcry_sm4_gfni_avx512_ocb_enc_blk32:
 	 *	%r9 : L pointers (void *L[32])
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	subq $(5 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(5 * 8);
@@ -1634,7 +1642,7 @@ _gcry_sm4_gfni_avx512_ocb_dec_blk32:
 	 *	%r9 : L pointers (void *L[32])
 	 */
 	CFI_STARTPROC();
-	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+	spec_stop_avx512;
 
 	subq $(5 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(5 * 8);
-- 
2.37.2



