[PATCH] avx512: tweak AVX512 spec stop, use common macro in assembly
Jussi Kivilinna
jussi.kivilinna@iki.fi
Sun Dec 11 14:26:41 CET 2022
* cipher/cipher-gcm-intel-pclmul.c: Use xmm registers for AVX512
spec stop.
* cipher/asm-common-amd64.h (spec_stop_avx512): New.
* cipher/blake2b-amd64-avx512.S: Use spec_stop_avx512.
* cipher/blake2s-amd64-avx512.S: Likewise.
* cipher/camellia-gfni-avx512-amd64.S: Likewise.
* cipher/chacha20-amd64-avx512.S: Likewise.
* cipher/keccak-amd64-avx512.S: Likewise.
* cipher/poly1305-amd64-avx512.S: Likewise.
* cipher/sha512-avx512-amd64.S: Likewise.
* cipher/sm4-gfni-avx512-amd64.S: Likewise.
---
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/asm-common-amd64.h | 10 +++++++++-
cipher/blake2b-amd64-avx512.S | 2 ++
cipher/blake2s-amd64-avx512.S | 2 ++
cipher/camellia-gfni-avx512-amd64.S | 14 +++++++-------
cipher/chacha20-amd64-avx512.S | 3 +--
cipher/cipher-gcm-intel-pclmul.c | 4 ++--
cipher/keccak-amd64-avx512.S | 4 ++++
cipher/poly1305-amd64-avx512.S | 3 +--
cipher/sha512-avx512-amd64.S | 2 ++
cipher/sm4-gfni-avx512-amd64.S | 20 ++++++++++++++------
10 files changed, 44 insertions(+), 20 deletions(-)
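
Note (illustration only, not part of the patch): the new spec_stop_avx512
macro zeroes an xmm register and then runs vpopcntb on it. Since vpopcntb is
implemented only on newer AVX512 CPUs, which do not suffer a significant
frequency drop from 512-bit vectors, executing it at function entry keeps
speculation from running ahead into the AVX512 code on older CPUs. A rough C
inline-assembly equivalent, in the same spirit as the
cipher-gcm-intel-pclmul.c hunks below, could look like this (the function
name and the use of xmm0 are hypothetical choices for this sketch):

/* Sketch only -- not part of this patch. */
#if defined(__x86_64__)
static inline void
spec_stop_avx512_sketch (void)
{
  asm volatile ("vpxord %%xmm0, %%xmm0, %%xmm0\n\t" /* zero a 128-bit register */
                "vpopcntb %%xmm0, %%xmm0\n\t" /* available only on newer AVX512 CPUs */
                :
                :
                : "xmm0", "memory");
}
#endif

In the assembly files the same sequence is emitted through the shared macro,
as the hunks below show.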
diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h
index 97912b1b..dc2c4d2f 100644
--- a/cipher/asm-common-amd64.h
+++ b/cipher/asm-common-amd64.h
@@ -186,8 +186,16 @@
# define EXIT_SYSV_FUNC
#endif
-/* 'ret' instruction replacement for straight-line speculation mitigation */
+/* 'ret' instruction replacement for straight-line speculation mitigation. */
#define ret_spec_stop \
ret; int3;
+/* This stops speculative execution from running ahead into AVX512 code
+ * on old AVX512 CPUs. The vpopcntb instruction is available only on newer
+ * AVX512 CPUs, which do not suffer from a significant frequency drop when
+ * 512-bit vectors are utilized. */
+#define spec_stop_avx512 \
+ vpxord %xmm16, %xmm16, %xmm16; \
+ vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */
+
#endif /* GCRY_ASM_COMMON_AMD64_H */
diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S
index db53474d..18b0c3ad 100644
--- a/cipher/blake2b-amd64-avx512.S
+++ b/cipher/blake2b-amd64-avx512.S
@@ -221,6 +221,8 @@ _gcry_blake2b_transform_amd64_avx512:
*/
CFI_STARTPROC();
+ spec_stop_avx512;
+
movl $0xf, %eax;
kmovw %eax, %k0;
xorl %eax, %eax;
diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S
index 4457ca99..ddcdfd67 100644
--- a/cipher/blake2s-amd64-avx512.S
+++ b/cipher/blake2s-amd64-avx512.S
@@ -183,6 +183,8 @@ _gcry_blake2s_transform_amd64_avx512:
*/
CFI_STARTPROC();
+ spec_stop_avx512;
+
addq $64, (STATE_T + 0)(RSTATE);
vmovdqa .Liv+(0 * 4) rRIP, ROW3;
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index 15b2dc90..bddad804 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -832,7 +832,7 @@ _gcry_camellia_gfni_avx512_ctr_enc:
* %rcx: iv (big endian, 128bit)
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19;
vmovdqa64 .Lcounter0123_lo rRIP, %zmm21;
@@ -985,7 +985,7 @@ _gcry_camellia_gfni_avx512_cbc_dec:
* %rcx: iv
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
movq %rcx, %r9;
@@ -1047,7 +1047,7 @@ _gcry_camellia_gfni_avx512_cfb_dec:
* %rcx: iv
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
@@ -1122,7 +1122,7 @@ _gcry_camellia_gfni_avx512_ocb_enc:
* %r9 : L pointers (void *L[64])
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
pushq %r12;
CFI_PUSH(%r12);
@@ -1285,7 +1285,7 @@ _gcry_camellia_gfni_avx512_ocb_dec:
* %r9 : L pointers (void *L[64])
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
pushq %r12;
CFI_PUSH(%r12);
@@ -1451,7 +1451,7 @@ _gcry_camellia_gfni_avx512_enc_blk64:
* %rdx: src (64 blocks)
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
@@ -1515,7 +1515,7 @@ _gcry_camellia_gfni_avx512_dec_blk64:
* %rdx: src (64 blocks)
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
index 682798fe..544e7cdc 100644
--- a/cipher/chacha20-amd64-avx512.S
+++ b/cipher/chacha20-amd64-avx512.S
@@ -298,8 +298,7 @@ _gcry_chacha20_amd64_avx512_blocks:
*/
CFI_STARTPROC();
- vpxord %xmm16, %xmm16, %xmm16;
- vpopcntb %xmm16, %xmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
cmpq $4, NBLKS;
jb .Lskip_vertical_handling;
diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index 78a9e338..ec00df09 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -1513,7 +1513,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
if (nblocks >= 32
&& (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))
{
- asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */
+ asm volatile ("vpopcntb %%xmm7, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */
"vshufi64x2 $0, %%zmm7, %%zmm7, %%zmm15\n\t"
"vmovdqa %%xmm1, %%xmm8\n\t"
"vmovdqu64 %[swapperm], %%zmm14\n\t"
@@ -1792,7 +1792,7 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
if (nblocks >= 32
&& (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512))
{
- asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */
+ asm volatile ("vpopcntb %%xmm1, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */
"vmovdqa %%xmm1, %%xmm8\n\t"
"vmovdqu64 %[swapperm], %%zmm14\n\t"
:
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
index f44e0285..58b4150f 100644
--- a/cipher/keccak-amd64-avx512.S
+++ b/cipher/keccak-amd64-avx512.S
@@ -282,6 +282,8 @@ _gcry_keccak_f1600_state_permute64_avx512:
*/
CFI_STARTPROC()
+ spec_stop_avx512;
+
leaq 12*8(%rdi), %rax
leaq (24-1)*8(%rsi), %r11
@@ -362,6 +364,8 @@ _gcry_keccak_absorb_blocks_avx512:
*/
CFI_STARTPROC()
+ spec_stop_avx512;
+
leaq 12*8(%rdi), %rax
leaq (24-1)*8(%rsi), %r11
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
index 72303e1e..5c8f838f 100644
--- a/cipher/poly1305-amd64-avx512.S
+++ b/cipher/poly1305-amd64-avx512.S
@@ -1580,8 +1580,7 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts)
ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;)
_gcry_poly1305_amd64_avx512_blocks:
CFI_STARTPROC()
- vpxord xmm16, xmm16, xmm16;
- vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
FUNC_ENTRY()
#define _a0 gp3
diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S
index 0e3f44ab..145c8667 100644
--- a/cipher/sha512-avx512-amd64.S
+++ b/cipher/sha512-avx512-amd64.S
@@ -264,6 +264,8 @@ _gcry_sha512_transform_amd64_avx512:
cmp rdx, 0
je .Lnowork
+ spec_stop_avx512;
+
/* Setup mask register for DC:BA merging. */
mov eax, 0b1100
kmovd MASK_DC_00, eax
diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S
index 1d5e9a48..0f9899d4 100644
--- a/cipher/sm4-gfni-avx512-amd64.S
+++ b/cipher/sm4-gfni-avx512-amd64.S
@@ -158,6 +158,7 @@ _gcry_sm4_gfni_avx512_expand_key:
* %r8: ck array
*/
CFI_STARTPROC();
+ spec_stop_avx512;
vmovd 0*4(%rdi), RA0x;
vmovd 1*4(%rdi), RA1x;
@@ -553,6 +554,7 @@ _gcry_sm4_gfni_avx512_crypt_blk1_16:
* %rcx: num blocks (1..16)
*/
CFI_STARTPROC();
+ spec_stop_avx512;
#define LOAD_INPUT(offset, yreg) \
cmpq $(1 + 2 * (offset)), %rcx; \
@@ -621,6 +623,7 @@ _gcry_sm4_gfni_avx512_ctr_enc:
* %rcx: iv (big endian, 128bit)
*/
CFI_STARTPROC();
+ spec_stop_avx512;
vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
vmovdqa .Lcounter0123_lo rRIP, RTMP1;
@@ -728,6 +731,7 @@ _gcry_sm4_gfni_avx512_cbc_dec:
* %rcx: iv
*/
CFI_STARTPROC();
+ spec_stop_avx512;
vmovdqu (0 * 32)(%rdx), RA0;
vmovdqu (1 * 32)(%rdx), RA1;
@@ -779,6 +783,7 @@ _gcry_sm4_gfni_avx512_cfb_dec:
* %rcx: iv
*/
CFI_STARTPROC();
+ spec_stop_avx512;
/* Load input */
vmovdqu (%rcx), RNOTx;
@@ -835,6 +840,7 @@ _gcry_sm4_gfni_avx512_ocb_enc:
* %r9 : L pointers (void *L[16])
*/
CFI_STARTPROC();
+ spec_stop_avx512;
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -950,6 +956,7 @@ _gcry_sm4_gfni_avx512_ocb_dec:
* %r9 : L pointers (void *L[16])
*/
CFI_STARTPROC();
+ spec_stop_avx512;
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -1066,6 +1073,7 @@ _gcry_sm4_gfni_avx512_ocb_auth:
* %r8 : L pointers (void *L[16])
*/
CFI_STARTPROC();
+ spec_stop_avx512;
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -1251,7 +1259,7 @@ _gcry_sm4_gfni_avx512_crypt_blk32:
* %rdx: src (32 blocks)
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
/* Load input */
vmovdqu32 (0 * 64)(%rdx), RA0z;
@@ -1292,7 +1300,7 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
* %rcx: iv (big endian, 128bit)
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z;
vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z;
@@ -1400,7 +1408,7 @@ _gcry_sm4_gfni_avx512_cbc_dec_blk32:
* %rcx: iv
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
vmovdqu32 (0 * 64)(%rdx), RA0z;
vmovdqu32 (1 * 64)(%rdx), RA1z;
@@ -1453,7 +1461,7 @@ _gcry_sm4_gfni_avx512_cfb_dec_blk32:
* %rcx: iv
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
/* Load input */
vmovdqu (%rcx), RA0x;
@@ -1510,7 +1518,7 @@ _gcry_sm4_gfni_avx512_ocb_enc_blk32:
* %r9 : L pointers (void *L[32])
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
subq $(5 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(5 * 8);
@@ -1634,7 +1642,7 @@ _gcry_sm4_gfni_avx512_ocb_dec_blk32:
* %r9 : L pointers (void *L[32])
*/
CFI_STARTPROC();
- vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+ spec_stop_avx512;
subq $(5 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(5 * 8);
--
2.37.2