[PATCH 5/5] aria-avx2: add VAES accelerated implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Feb 19 09:49:10 CET 2023


* cipher/aria-aesni-avx2-amd64.S (CONFIG_AS_VAES): New.
[CONFIG_AS_VAES]: Add VAES accelerated assembly macros and functions.
* cipher/aria.c (USE_VAES_AVX2): New.
(ARIA_context): Add 'use_vaes_avx2'.
(_gcry_aria_vaes_avx2_ecb_crypt_blk32)
(_gcry_aria_vaes_avx2_ctr_crypt_blk32)
(aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): Add VAES/AVX2
code paths.
(aria_setkey): Enable VAES/AVX2 implementation based on HW features.
--

This patch adds a VAES/AVX2 accelerated ARIA block cipher implementation.

The VAES instruction set extends the AESNI instructions to operate on
all 128-bit lanes of the 256-bit YMM and 512-bit ZMM vector registers,
so AES operations can be executed directly on YMM registers without
having to manually split each YMM register into two XMM halves for the
AESNI instructions. This improves performance on CPUs that support
VAES but not GFNI, such as AMD Zen3.
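
For illustration, the difference is roughly equivalent to the
following C intrinsics sketch (not part of the patch, which uses
hand-written assembly; the helper names are made up for this example,
compile with e.g. -mavx2 -maes -mvaes):

  #include <immintrin.h>

  /* AESNI/AVX2 approach: split the YMM register into two XMM halves,
   * run AESENCLAST on each half and merge the results back into one
   * YMM register. */
  static __m256i
  aesenclast_ymm_aesni (__m256i x, __m128i key)
  {
    __m128i lo = _mm256_castsi256_si128 (x);
    __m128i hi = _mm256_extracti128_si256 (x, 1);
    lo = _mm_aesenclast_si128 (lo, key);
    hi = _mm_aesenclast_si128 (hi, key);
    return _mm256_inserti128_si256 (_mm256_castsi128_si256 (lo), hi, 1);
  }

  /* VAES/AVX2 approach: AESENCLAST operates on both 128-bit lanes of
   * the YMM register directly, with no extract/insert round trip. */
  static __m256i
  aesenclast_ymm_vaes (__m256i x, __m256i key)
  {
    return _mm256_aesenclast_epi128 (x, key);
  }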

Benchmark on Ryzen 7 5800X (zen3, turbo-freq off):

 Before (AESNI/AVX2):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.559 ns/B      1707 MiB/s      2.12 c/B      3800
        ECB dec |     0.560 ns/B      1703 MiB/s      2.13 c/B      3800
        CTR enc |     0.570 ns/B      1672 MiB/s      2.17 c/B      3800
        CTR dec |     0.568 ns/B      1679 MiB/s      2.16 c/B      3800

 After (VAES/AVX2, ~33% faster):
 ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     0.435 ns/B      2193 MiB/s      1.65 c/B      3800
        ECB dec |     0.434 ns/B      2197 MiB/s      1.65 c/B      3800
        CTR enc |     0.413 ns/B      2306 MiB/s      1.57 c/B      3800
        CTR dec |     0.411 ns/B      2318 MiB/s      1.56 c/B      3800

Cc: Taehee Yoo <ap420073 at gmail.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/aria-aesni-avx2-amd64.S | 368 ++++++++++++++++++++++++++++++++-
 cipher/aria.c                  |  50 ++++-
 2 files changed, 409 insertions(+), 9 deletions(-)

diff --git a/cipher/aria-aesni-avx2-amd64.S b/cipher/aria-aesni-avx2-amd64.S
index f09a9042..f1dcdb70 100644
--- a/cipher/aria-aesni-avx2-amd64.S
+++ b/cipher/aria-aesni-avx2-amd64.S
@@ -31,6 +31,9 @@
 #ifdef ENABLE_GFNI_SUPPORT
 #  define CONFIG_AS_GFNI 1
 #endif
+#ifdef HAVE_GCC_INLINE_ASM_VAES_VPCLMUL
+#  define CONFIG_AS_VAES 1
+#endif
 
 /* struct ARIA_context: */
 #define ARIA_BLOCK_SIZE  16
@@ -358,6 +361,53 @@
 	vgf2p8affineinvqb $0, t2, x7, x7
 #endif /* CONFIG_AS_GFNI */
 
+#ifdef CONFIG_AS_VAES
+#define aria_sbox_8way_vaes(x0, x1, x2, x3,		\
+			    x4, x5, x6, x7,		\
+			    t0, t1, t2, t3,		\
+			    t4, t5, t6, t7)		\
+	vpxor t7, t7, t7;				\
+	vpxor t6, t6, t6;				\
+	vbroadcasti128 .Linv_shift_row rRIP, t0;	\
+	vbroadcasti128 .Lshift_row rRIP, t1;		\
+	vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\
+	vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\
+	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\
+	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\
+							\
+	vaesenclast t7, x0, x0;				\
+	vaesenclast t7, x4, x4;				\
+	vaesenclast t7, x1, x1;				\
+	vaesenclast t7, x5, x5;				\
+	vaesdeclast t7, x2, x2;				\
+	vaesdeclast t7, x6, x6;				\
+							\
+	vpbroadcastd .L0f0f0f0f rRIP, t6;		\
+							\
+	/* AES inverse shift rows */			\
+	vpshufb t0, x0, x0;				\
+	vpshufb t0, x4, x4;				\
+	vpshufb t0, x1, x1;				\
+	vpshufb t0, x5, x5;				\
+	vpshufb t1, x3, x3;				\
+	vpshufb t1, x7, x7;				\
+	vpshufb t1, x2, x2;				\
+	vpshufb t1, x6, x6;				\
+							\
+	/* affine transformation for S2 */		\
+	filter_8bit(x1, t2, t3, t6, t0);		\
+	/* affine transformation for S2 */		\
+	filter_8bit(x5, t2, t3, t6, t0);		\
+							\
+	/* affine transformation for X2 */		\
+	filter_8bit(x3, t4, t5, t6, t0);		\
+	/* affine transformation for X2 */		\
+	filter_8bit(x7, t4, t5, t6, t0);		\
+							\
+	vaesdeclast t7, x3, x3;				\
+	vaesdeclast t7, x7, x7;
+#endif /* CONFIG_AS_VAES */
+
 #define aria_sbox_8way(x0, x1, x2, x3,			\
 		       x4, x5, x6, x7,			\
 		       t0, t1, t2, t3,			\
@@ -432,7 +482,7 @@
 	vextracti128 $1, x7, t6##_x;			\
 	vaesdeclast t7##_x, x7##_x, x7##_x;		\
 	vaesdeclast t7##_x, t6##_x, t6##_x;		\
-	vinserti128 $1, t6##_x, x7, x7;			\
+	vinserti128 $1, t6##_x, x7, x7;
 
 #define aria_diff_m(x0, x1, x2, x3,			\
 		    t0, t1, t2, t3)			\
@@ -630,6 +680,7 @@
 	aria_load_state_8way(y0, y1, y2, y3,		\
 			     y4, y5, y6, y7,		\
 			     mem_tmp, 8);
+
 #ifdef CONFIG_AS_GFNI
 #define aria_fe_gfni(x0, x1, x2, x3,			\
 		     x4, x5, x6, x7,			\
@@ -786,6 +837,155 @@
 			     mem_tmp, 8);
 #endif /* CONFIG_AS_GFNI */
 
+#ifdef CONFIG_AS_VAES
+#define aria_fe_vaes(x0, x1, x2, x3,			\
+		     x4, x5, x6, x7,			\
+		     y0, y1, y2, y3,			\
+		     y4, y5, y6, y7,			\
+		     mem_tmp, rk, round)		\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 8, round);		\
+							\
+	aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4,	\
+			    x5, y0, y1, y2, y3, y4, y5,	\
+			    y6, y7);			\
+							\
+	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
+	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
+	aria_store_state_8way(x0, x1, x2, x3,		\
+			      x4, x5, x6, x7,		\
+			      mem_tmp, 8);		\
+							\
+	aria_load_state_8way(x0, x1, x2, x3,		\
+			     x4, x5, x6, x7,		\
+			     mem_tmp, 0);		\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 0, round);		\
+							\
+	aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4,	\
+			    x5, y0, y1, y2, y3, y4, y5,	\
+			    y6, y7);			\
+							\
+	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
+	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
+	aria_store_state_8way(x0, x1, x2, x3,		\
+			      x4, x5, x6, x7,		\
+			      mem_tmp, 0);		\
+	aria_load_state_8way(y0, y1, y2, y3,		\
+			     y4, y5, y6, y7,		\
+			     mem_tmp, 8);		\
+	aria_diff_word(x0, x1, x2, x3,			\
+		       x4, x5, x6, x7,			\
+		       y0, y1, y2, y3,			\
+		       y4, y5, y6, y7);			\
+	/* aria_diff_byte()				\
+	 * T3 = ABCD -> BADC				\
+	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
+	 * T0 = ABCD -> CDAB				\
+	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
+	 * T1 = ABCD -> DCBA				\
+	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
+	 */						\
+	aria_diff_word(x2, x3, x0, x1,			\
+		       x7, x6, x5, x4,			\
+		       y0, y1, y2, y3,			\
+		       y5, y4, y7, y6);			\
+	aria_store_state_8way(x3, x2, x1, x0,		\
+			      x6, x7, x4, x5,		\
+			      mem_tmp, 0);
+
+#define aria_fo_vaes(x0, x1, x2, x3,			\
+		     x4, x5, x6, x7,			\
+		     y0, y1, y2, y3,			\
+		     y4, y5, y6, y7,			\
+		     mem_tmp, rk, round)		\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 8, round);		\
+							\
+	aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6,	\
+			    x7, y0, y1, y2, y3, y4, y5,	\
+			    y6, y7);			\
+							\
+	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
+	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
+	aria_store_state_8way(x0, x1, x2, x3,		\
+			      x4, x5, x6, x7,		\
+			      mem_tmp, 8);		\
+							\
+	aria_load_state_8way(x0, x1, x2, x3,		\
+			     x4, x5, x6, x7,		\
+			     mem_tmp, 0);		\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 0, round);		\
+							\
+	aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6,	\
+			    x7, y0, y1, y2, y3, y4, y5,	\
+			    y6, y7);	\
+							\
+	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
+	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
+	aria_store_state_8way(x0, x1, x2, x3,		\
+			      x4, x5, x6, x7,		\
+			      mem_tmp, 0);		\
+	aria_load_state_8way(y0, y1, y2, y3,		\
+			     y4, y5, y6, y7,		\
+			     mem_tmp, 8);		\
+	aria_diff_word(x0, x1, x2, x3,			\
+		       x4, x5, x6, x7,			\
+		       y0, y1, y2, y3,			\
+		       y4, y5, y6, y7);			\
+	/* aria_diff_byte()				\
+	 * T1 = ABCD -> BADC				\
+	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
+	 * T2 = ABCD -> CDAB				\
+	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
+	 * T3 = ABCD -> DCBA				\
+	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
+	 */						\
+	aria_diff_word(x0, x1, x2, x3,			\
+		       x5, x4, x7, x6,			\
+		       y2, y3, y0, y1,			\
+		       y7, y6, y5, y4);			\
+	aria_store_state_8way(x3, x2, x1, x0,		\
+			      x6, x7, x4, x5,		\
+			      mem_tmp, 0);
+
+#define aria_ff_vaes(x0, x1, x2, x3,			\
+		     x4, x5, x6, x7,			\
+		     y0, y1, y2, y3,			\
+		     y4, y5, y6, y7,			\
+		     mem_tmp, rk, round, last_round)	\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 8, round);		\
+							\
+	aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4,	\
+			    x5, y0, y1, y2, y3, y4, y5,	\
+			    y6, y7);			\
+							\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 8, last_round);		\
+							\
+	aria_store_state_8way(x0, x1, x2, x3,		\
+			      x4, x5, x6, x7,		\
+			      mem_tmp, 8);		\
+							\
+	aria_load_state_8way(x0, x1, x2, x3,		\
+			     x4, x5, x6, x7,		\
+			     mem_tmp, 0);		\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 0, round);		\
+							\
+	aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4,	\
+			    x5, y0, y1, y2, y3, y4, y5,	\
+			    y6, y7);	\
+							\
+	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
+		      y0, rk, 0, last_round);		\
+							\
+	aria_load_state_8way(y0, y1, y2, y3,		\
+			     y4, y5, y6, y7,		\
+			     mem_tmp, 8);
+#endif /* CONFIG_AS_VAES */
 
 SECTION_RODATA
 .align 32
@@ -1279,6 +1479,172 @@ _gcry_aria_aesni_avx2_ctr_crypt_blk32:
 ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32,
 	  .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;)
 
+#ifdef CONFIG_AS_VAES
+.align 16
+ELF(.type __aria_vaes_avx2_crypt_32way, at function;)
+__aria_vaes_avx2_crypt_32way:
+	/* input:
+	 *      %r9: rk
+	 *      %rsi: dst
+	 *      %rdx: src
+	 *      %ymm0..%ymm15: byte-sliced blocks
+	 */
+	CFI_STARTPROC();
+
+	movq %rsi, %rax;
+	leaq 8 * 32(%rax), %r8;
+
+	movl ARIA_CTX_rounds(CTX), %r10d;
+	subl $2, %r10d;
+
+	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %r8);
+	aria_fo_vaes(%ymm8, %ymm9, %ymm10, %ymm11,
+		     %ymm12, %ymm13, %ymm14, %ymm15,
+		     %ymm0, %ymm1, %ymm2, %ymm3,
+		     %ymm4, %ymm5, %ymm6, %ymm7,
+		     %rax, %r9, 0);
+	leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_vaes:
+	aria_fe_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
+		     %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11,
+		     %ymm12, %ymm13, %ymm14, %ymm15,
+		     %rax, %r9, 0);
+	aria_fo_vaes(%ymm9, %ymm8, %ymm11, %ymm10,
+		     %ymm12, %ymm13, %ymm14, %ymm15,
+		     %ymm0, %ymm1, %ymm2, %ymm3,
+		     %ymm4, %ymm5, %ymm6, %ymm7,
+		     %rax, %r9, 1);
+	leaq 2*16(%r9), %r9;
+	subl $2, %r10d;
+	jnz .Loop_vaes;
+
+	aria_ff_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
+		     %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11,
+		     %ymm12, %ymm13, %ymm14, %ymm15,
+		     %rax, %r9, 0, 1);
+
+	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+			   %ymm9, %ymm13, %ymm0, %ymm5,
+			   %ymm10, %ymm14, %ymm3, %ymm6,
+			   %ymm11, %ymm15, %ymm2, %ymm7,
+			   (%rax), (%r8));
+
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size __aria_vaes_avx2_crypt_32way,.-__aria_vaes_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_vaes_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_vaes_avx2_ecb_crypt_blk32, at function;)
+_gcry_aria_vaes_avx2_ecb_crypt_blk32:
+	/* input:
+	 *      %rdi: ctx, CTX
+	 *      %rsi: dst
+	 *      %rdx: src
+	 *      %rcx: round keys
+	 */
+	CFI_STARTPROC();
+
+	pushq %rbp;
+	CFI_PUSH(%rbp);
+	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
+
+	subq $(16 * 32), %rsp;
+	andq $~31, %rsp;
+
+	movq %rcx, %r9;
+	movq %rsi, %r11;
+	movq %rsp, %rsi; /* use stack for temporary store */
+
+	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx);
+
+	call __aria_vaes_avx2_crypt_32way;
+
+	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %r11);
+
+	movl $STACK_DEPTH, %eax;
+	leave;
+	CFI_LEAVE();
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_aria_vaes_avx2_ecb_crypt_blk32,
+	  .-_gcry_aria_vaes_avx2_ecb_crypt_blk32;)
+
+.align 16
+.globl _gcry_aria_vaes_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_vaes_avx2_ctr_crypt_blk32, at function;)
+_gcry_aria_vaes_avx2_ctr_crypt_blk32:
+	/* input:
+	 *      %rdi: ctx
+	 *      %rsi: dst
+	 *      %rdx: src
+	 *      %rcx: iv (big endian, 128bit)
+	 */
+	CFI_STARTPROC();
+
+	pushq %rbp;
+	CFI_PUSH(%rbp);
+	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
+
+	subq $(16 * 32), %rsp;
+	andq $~31, %rsp;
+
+	movq %rcx, %r8;  /* %r8: iv */
+	movq %rsp, %rcx; /* %rcx: keystream */
+	call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+	pushq %rsi;
+	movq %rdx, %r11;
+	movq %rcx, %rsi; /* use stack for temporary store */
+	movq %rcx, %rdx;
+	leaq ARIA_CTX_enc_key(CTX), %r9;
+
+	call __aria_vaes_avx2_crypt_32way;
+
+	popq %rsi;
+	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rsi);
+
+	movl $STACK_DEPTH, %eax;
+	leave;
+	CFI_LEAVE();
+	vzeroall;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_aria_vaes_avx2_ctr_crypt_blk32,
+	  .-_gcry_aria_vaes_avx2_ctr_crypt_blk32;)
+#endif /* CONFIG_AS_VAES */
+
 #ifdef CONFIG_AS_GFNI
 .align 16
 ELF(.type __aria_gfni_avx2_crypt_32way, at function;)
diff --git a/cipher/aria.c b/cipher/aria.c
index 9eb42a2d..bc2d4384 100644
--- a/cipher/aria.c
+++ b/cipher/aria.c
@@ -74,6 +74,12 @@
 # endif
 #endif
 
+/* USE_VAES_AVX2 indicates whether to compile with Intel VAES/AVX2 code. */
+#undef USE_VAES_AVX2
+#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define USE_VAES_AVX2 1
+#endif
+
 /* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. */
 #undef USE_GFNI_AVX2
 #if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
@@ -142,6 +148,7 @@ typedef struct
 #endif
 #ifdef USE_AESNI_AVX2
   unsigned int use_aesni_avx2:1;
+  unsigned int use_vaes_avx2:1;
   unsigned int use_gfni_avx2:1;
 #endif
 #ifdef USE_GFNI_AVX512
@@ -464,12 +471,13 @@ static inline unsigned int
 aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in,
 			   const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks)
 {
+  if (0) { }
 #ifdef USE_GFNI_AVX
-  if (ctx->use_gfni_avx)
+  else if (ctx->use_gfni_avx)
     return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
 		+ ASM_EXTRA_STACK;
-  else
 #endif /* USE_GFNI_AVX */
+  else
     return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
 		+ ASM_EXTRA_STACK;
 }
@@ -478,12 +486,13 @@ static inline unsigned int
 aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in,
 			 byte *iv)
 {
+  if (0) { }
 #ifdef USE_GFNI_AVX
-  if (ctx->use_gfni_avx)
+  else if (ctx->use_gfni_avx)
     return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv)
 		+ ASM_EXTRA_STACK;
-  else
 #endif /* USE_GFNI_AVX */
+  else
     return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv)
 		+ ASM_EXTRA_STACK;
 }
@@ -498,6 +507,16 @@ extern unsigned int
 _gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
 				      const byte *in, byte *iv) ASM_FUNC_ABI;
 
+#ifdef USE_VAES_AVX2
+extern unsigned int
+_gcry_aria_vaes_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+				     const byte *in,
+				     const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_vaes_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+				     const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_VAES_AVX2 */
+
 #ifdef USE_GFNI_AVX2
 extern unsigned int
 _gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
@@ -512,12 +531,18 @@ static inline unsigned int
 aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
 			  const u32 key[][ARIA_RD_KEY_WORDS])
 {
+  if (0) { }
 #ifdef USE_GFNI_AVX2
-  if (ctx->use_gfni_avx2)
+  else if (ctx->use_gfni_avx2)
     return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key)
 		+ ASM_EXTRA_STACK;
-  else
 #endif /* USE_GFNI_AVX2 */
+#ifdef USE_VAES_AVX2
+  else if (ctx->use_vaes_avx2)
+    return _gcry_aria_vaes_avx2_ecb_crypt_blk32(ctx, out, in, key)
+		+ ASM_EXTRA_STACK;
+#endif /* USE_VAES_AVX2 */
+  else
     return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key)
 		+ ASM_EXTRA_STACK;
 }
@@ -526,12 +551,18 @@ static inline unsigned int
 aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
 			  byte *iv)
 {
+  if (0) { }
 #ifdef USE_GFNI_AVX2
-  if (ctx->use_gfni_avx2)
+  else if (ctx->use_gfni_avx2)
     return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
 		+ ASM_EXTRA_STACK;
-  else
 #endif /* USE_GFNI_AVX2 */
+#ifdef USE_VAES_AVX2
+  else if (ctx->use_vaes_avx2)
+    return _gcry_aria_vaes_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+		+ ASM_EXTRA_STACK;
+#endif /* USE_VAES_AVX2 */
+  else
     return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
 		+ ASM_EXTRA_STACK;
 }
@@ -1614,6 +1645,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen,
 #ifdef USE_GFNI_AVX2
   ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
 #endif
+#ifdef USE_VAES_AVX2
+  ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
+#endif
 #ifdef USE_AESNI_AVX
   ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
 #endif
-- 
2.37.2



