[PATCH] avx512: tweak zmm16-zmm31 register clearing

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Mon Jan 16 18:41:20 CET 2023


* cipher/asm-common-amd64.h (spec_stop_avx512)
(spec_stop_avx512_intel_syntax): Clear YMM16 before and after
vpopcntb.
* cipher/camellia-gfni-avx512-amd64.S (clear_zmm16_zmm31): Clear
YMM16-YMM31 registers instead of XMM16-XMM31.
* cipher/chacha20-amd64-avx512.S (clear_zmm16_zmm31): Likewise.
* cipher/keccak-amd64-avx512.S (clear_regs): Likewise.
(clear_avx512_4regs): Clear all 4 registers with XOR.
* cipher/cipher-gcm-intel-pclmul.c (_gcry_ghash_intel_pclmul)
(_gcry_polyval_intel_pclmul): Clear YMM16-YMM19 registers instead of
ZMM16-ZMM19.
* cipher/poly1305-amd64-avx512.S (POLY1305_BLOCKS): Clear YMM16-YMM31
registers after vector processing instead of XMM16-XMM31.
* cipher/sha512-avx512-amd64.S
(_gcry_sha512_transform_amd64_avx512): Likewise.
--

Clear the zmm16-zmm31 registers with 256-bit XOR instead of 128-bit
XOR, as 256-bit clearing performs better on AMD Zen4. Also clear
ymm16 after the vpopcntb in the AVX512 spec-stop so that we do not
leave behind any ZMM register state which might end up unnecessarily
using CPU resources.
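
For background: any EVEX-encoded instruction that writes a YMM
destination also zeroes bits 511:256 of the corresponding ZMM
register, which is why a single 256-bit vpxord is enough to clear the
full 512-bit register. Below is a minimal, hypothetical standalone
sketch (file name and build flags are illustrative, not from this
patch; it assumes GCC and a CPU with AVX512F+AVX512VL) demonstrating
the effect, in the same inline-asm style as cipher-gcm-intel-pclmul.c:

  /* zmm16-clear-sketch.c: fill zmm16 with ones, clear it via its
   * 256-bit ymm16 alias, and verify all 512 bits read back as zero.
   * Build: gcc -mavx512f zmm16-clear-sketch.c */
  #include <stdio.h>
  #include <string.h>

  int main (void)
  {
    unsigned char ones[64];
    unsigned char out[64];
    int i;

    memset (ones, 0xff, sizeof (ones));

    asm volatile ("vmovdqu64 %[ones], %%zmm16\n\t"       /* fill zmm16 with ones */
                  "vpxord %%ymm16, %%ymm16, %%ymm16\n\t" /* 256-bit EVEX XOR */
                  "vmovdqu64 %%zmm16, %[out]\n\t"        /* read back all 512 bits */
                  : [out] "=m" (out)
                  : [ones] "m" (ones)
                  : "memory", "xmm16");

    for (i = 0; i < 64; i++)
      if (out[i] != 0)
        {
          printf ("zmm16 byte %d not cleared\n", i);
          return 1;
        }
    printf ("zmm16 fully cleared by 256-bit vpxord\n");
    return 0;
  }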

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/asm-common-amd64.h           | 10 ++++++----
 cipher/camellia-gfni-avx512-amd64.S |  8 ++++----
 cipher/chacha20-amd64-avx512.S      |  8 ++++----
 cipher/cipher-gcm-intel-pclmul.c    | 18 +++++++++---------
 cipher/keccak-amd64-avx512.S        | 10 +++++-----
 cipher/poly1305-amd64-avx512.S      |  8 ++++----
 cipher/sha512-avx512-amd64.S        | 14 +++++++-------
 7 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h
index cd93abc3..d9bbc01b 100644
--- a/cipher/asm-common-amd64.h
+++ b/cipher/asm-common-amd64.h
@@ -195,11 +195,13 @@
  * available on newer CPUs that do not suffer from significant frequency
  * drop when 512-bit vectors are utilized. */
 #define spec_stop_avx512 \
-	vpxord %xmm16, %xmm16, %xmm16; \
-	vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */
+	vpxord %ymm16, %ymm16, %ymm16; \
+	vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ \
+	vpxord %ymm16, %ymm16, %ymm16;
 
 #define spec_stop_avx512_intel_syntax \
-	vpxord xmm16, xmm16, xmm16; \
-	vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. */
+	vpxord ymm16, ymm16, ymm16; \
+	vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. */ \
+	vpxord ymm16, ymm16, ymm16;
 
 #endif /* GCRY_ASM_COMMON_AMD64_H */
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
index bddad804..14725b4a 100644
--- a/cipher/camellia-gfni-avx512-amd64.S
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -99,10 +99,10 @@
 	vpxord v3, v3, v3
 
 #define clear_zmm16_zmm31() \
-	clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
-	clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
-	clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
-	clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31)
+	clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
+	clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
+	clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
+	clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31)
 
 #define clear_regs() \
 	kxorq %k1, %k1, %k1; \
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
index 544e7cdc..4b183528 100644
--- a/cipher/chacha20-amd64-avx512.S
+++ b/cipher/chacha20-amd64-avx512.S
@@ -205,10 +205,10 @@
 	vpxord v3, v3, v3;
 
 #define clear_zmm16_zmm31() \
-	clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
-	clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
-	clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
-	clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31);
+	clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
+	clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
+	clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
+	clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31);
 
 /**********************************************************************
   16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20
diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index ec00df09..391cbe6f 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -1560,10 +1560,10 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
 	    }
 
 	  asm volatile ("vmovdqa %%xmm15, %%xmm7\n\t"
-			"vpxorq %%zmm16, %%zmm16, %%zmm16\n\t"
-			"vpxorq %%zmm17, %%zmm17, %%zmm17\n\t"
-			"vpxorq %%zmm18, %%zmm18, %%zmm18\n\t"
-			"vpxorq %%zmm19, %%zmm19, %%zmm19\n\t"
+			"vpxorq %%ymm16, %%ymm16, %%ymm16\n\t"
+			"vpxorq %%ymm17, %%ymm17, %%ymm17\n\t"
+			"vpxorq %%ymm18, %%ymm18, %%ymm18\n\t"
+			"vpxorq %%ymm19, %%ymm19, %%ymm19\n\t"
 			:
 			:
 			: "memory" );
@@ -1838,15 +1838,15 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
 	    }
 
 	  asm volatile ("vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
-			"vpxorq %%zmm16, %%zmm16, %%zmm16\n\t"
-			"vpxorq %%zmm17, %%zmm17, %%zmm17\n\t"
-			"vpxorq %%zmm18, %%zmm18, %%zmm18\n\t"
-			"vpxorq %%zmm19, %%zmm19, %%zmm19\n\t"
+			"vpxorq %%ymm16, %%ymm16, %%ymm16\n\t"
+			"vpxorq %%ymm17, %%ymm17, %%ymm17\n\t"
+			"vpxorq %%ymm18, %%ymm18, %%ymm18\n\t"
+			"vpxorq %%ymm19, %%ymm19, %%ymm19\n\t"
 			:
 			:
 			: "memory" );
 	}
-#endif
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */
 
       if (nblocks >= 16)
 	{
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
index 58b4150f..b1fc7b64 100644
--- a/cipher/keccak-amd64-avx512.S
+++ b/cipher/keccak-amd64-avx512.S
@@ -160,14 +160,14 @@
 
 /* Misc helper macros. */
 #define clear_avx512_4regs(a, b, c, d) \
-	eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d;
+	eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d);
 
 #define clear_regs() \
 	vzeroall; /* xmm0-xmm15 */ \
-	clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \
-	clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \
-	clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \
-	clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31);
+	clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \
+	clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \
+	clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \
+	clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31);
 
 ELF(.type	KeccakF1600_ce,@function)
 .align	64, 0xcc
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
index 6622861f..9beed8ad 100644
--- a/cipher/poly1305-amd64-avx512.S
+++ b/cipher/poly1305-amd64-avx512.S
@@ -1425,10 +1425,10 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts)
 	vmovdqa64 [rsp + STACK_r_save + 64*5], zmm0; \
 	\
 	vzeroall; \
-	clear_zmm(xmm16); clear_zmm(xmm20); clear_zmm(xmm24); clear_zmm(xmm28); \
-	clear_zmm(xmm17); clear_zmm(xmm21); clear_zmm(xmm25); clear_zmm(xmm29); \
-	clear_zmm(xmm18); clear_zmm(xmm22); clear_zmm(xmm26); clear_zmm(xmm30); \
-	clear_zmm(xmm19); clear_zmm(xmm23); clear_zmm(xmm27); clear_zmm(xmm31); \
+	clear_zmm(ymm16); clear_zmm(ymm20); clear_zmm(ymm24); clear_zmm(ymm28); \
+	clear_zmm(ymm17); clear_zmm(ymm21); clear_zmm(ymm25); clear_zmm(ymm29); \
+	clear_zmm(ymm18); clear_zmm(ymm22); clear_zmm(ymm26); clear_zmm(ymm30); \
+	clear_zmm(ymm19); clear_zmm(ymm23); clear_zmm(ymm27); clear_zmm(ymm31); \
 	\
 .L_final_loop: \
 	cmp     LEN, POLY1305_BLOCK_SIZE; \
diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S
index 65475422..431fb3e9 100644
--- a/cipher/sha512-avx512-amd64.S
+++ b/cipher/sha512-avx512-amd64.S
@@ -384,13 +384,13 @@ _gcry_sha512_transform_amd64_avx512:
 	vmovdqa	[rsp + frame_XFER + 1*32], ymm0 /* burn stack */
 	vmovdqa	[rsp + frame_XFER + 2*32], ymm0 /* burn stack */
 	vmovdqa	[rsp + frame_XFER + 3*32], ymm0 /* burn stack */
-	clear_reg(xmm16);
-	clear_reg(xmm17);
-	clear_reg(xmm18);
-	clear_reg(xmm19);
-	clear_reg(xmm20);
-	clear_reg(xmm21);
-	clear_reg(xmm22);
+	clear_reg(ymm16);
+	clear_reg(ymm17);
+	clear_reg(ymm18);
+	clear_reg(ymm19);
+	clear_reg(ymm20);
+	clear_reg(ymm21);
+	clear_reg(ymm22);
 
 	/* Restore Stack Pointer */
 	mov	rsp, RSP_SAVE
-- 
2.37.2



