[PATCH 5/5] aria-avx2: add VAES accelerated implementation
Taehee Yoo
ap420073 at gmail.com
Mon Feb 20 12:04:44 CET 2023
On 2/19/23 17:49, Jussi Kivilinna wrote:
Hi Jussi,
Thank you so much for this implementation!
I tested this in the kernel and it works really well.
On my machine (i3-12100), as you mentioned, it improves the performance
of the in-kernel aria-avx2 driver by about 30%.
This will be really helpful for that driver.
> * cipher/aria-aesni-avx2-amd64.S (CONFIG_AS_VAES): New.
> [CONFIG_AS_VAES]: Add VAES accelerated assembly macros and functions.
> * cipher/aria.c (USE_VAES_AVX2): New.
> (ARIA_context): Add 'use_vaes_avx2'.
> (_gcry_aria_vaes_avx2_ecb_crypt_blk32)
> (_gcry_aria_vaes_avx2_ctr_crypt_blk32)
> (aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): Add VAES/AVX2
> code paths.
> (aria_setkey): Enable VAES/AVX2 implementation based on HW features.
> --
>
> This patch adds VAES/AVX2 accelerated ARIA block cipher implementation.
>
> The VAES instruction set extends the AESNI instructions to work on all
> 128-bit lanes of 256-bit YMM and 512-bit ZMM vector registers, so AES
> operations can be executed directly on YMM registers without needing
> to manually split each YMM register into two XMM halves for AESNI
> instructions.
> This improves performance on CPUs that support VAES but not GFNI, like
> AMD Zen3.
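
For readers less familiar with VAES, here is a rough intrinsics sketch of
what this buys (my own illustration, not code from the patch; the function
names are made up). With plain AESNI the 256-bit value has to be split
into two XMM halves and re-joined, while VAES handles both 128-bit lanes
in one instruction:

#include <immintrin.h>

/* Illustration only: one aesenclast step over a 256-bit value.
 * Build with e.g. -mavx2 -maes (and -mvaes for the second variant). */
static __m256i aesenclast_2lanes_aesni(__m256i x, __m256i key)
{
  __m128i lo = _mm256_castsi256_si128(x);
  __m128i hi = _mm256_extracti128_si256(x, 1);

  lo = _mm_aesenclast_si128(lo, _mm256_castsi256_si128(key));
  hi = _mm_aesenclast_si128(hi, _mm256_extracti128_si256(key, 1));
  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}

static __m256i aesenclast_2lanes_vaes(__m256i x, __m256i key)
{
  return _mm256_aesenclast_epi128(x, key); /* both 128-bit lanes at once */
}
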
>
> Benchmark on Ryzen 7 5800X (zen3, turbo-freq off):
>
> Before (AESNI/AVX2):
> ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> ECB enc | 0.559 ns/B 1707 MiB/s 2.12 c/B 3800
> ECB dec | 0.560 ns/B 1703 MiB/s 2.13 c/B 3800
> CTR enc | 0.570 ns/B 1672 MiB/s 2.17 c/B 3800
> CTR dec | 0.568 ns/B 1679 MiB/s 2.16 c/B 3800
>
> After (VAES/AVX2, ~33% faster):
> ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
> ECB enc | 0.435 ns/B 2193 MiB/s 1.65 c/B 3800
> ECB dec | 0.434 ns/B 2197 MiB/s 1.65 c/B 3800
> CTR enc | 0.413 ns/B 2306 MiB/s 1.57 c/B 3800
> CTR dec | 0.411 ns/B 2318 MiB/s 1.56 c/B 3800
>
> Cc: Taehee Yoo <ap420073 at gmail.com>
> Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
> ---
> cipher/aria-aesni-avx2-amd64.S | 368 ++++++++++++++++++++++++++++++++-
> cipher/aria.c | 50 ++++-
> 2 files changed, 409 insertions(+), 9 deletions(-)
>
> diff --git a/cipher/aria-aesni-avx2-amd64.S b/cipher/aria-aesni-avx2-amd64.S
> index f09a9042..f1dcdb70 100644
> --- a/cipher/aria-aesni-avx2-amd64.S
> +++ b/cipher/aria-aesni-avx2-amd64.S
> @@ -31,6 +31,9 @@
> #ifdef ENABLE_GFNI_SUPPORT
> # define CONFIG_AS_GFNI 1
> #endif
> +#ifdef HAVE_GCC_INLINE_ASM_VAES_VPCLMUL
> +# define CONFIG_AS_VAES 1
> +#endif
>
> /* struct ARIA_context: */
> #define ARIA_BLOCK_SIZE 16
> @@ -358,6 +361,53 @@
> vgf2p8affineinvqb $0, t2, x7, x7
> #endif /* CONFIG_AS_GFNI */
>
> +#ifdef CONFIG_AS_VAES
> +#define aria_sbox_8way_vaes(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + t0, t1, t2, t3, \
> + t4, t5, t6, t7) \
> + vpxor t7, t7, t7; \
> + vpxor t6, t6, t6; \
> + vbroadcasti128 .Linv_shift_row rRIP, t0; \
> + vbroadcasti128 .Lshift_row rRIP, t1; \
> + vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\
> + vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\
> + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\
> + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\
> + \
> + vaesenclast t7, x0, x0; \
> + vaesenclast t7, x4, x4; \
> + vaesenclast t7, x1, x1; \
> + vaesenclast t7, x5, x5; \
> + vaesdeclast t7, x2, x2; \
> + vaesdeclast t7, x6, x6; \
> + \
> + vpbroadcastd .L0f0f0f0f rRIP, t6; \
> + \
> + /* AES inverse shift rows */ \
> + vpshufb t0, x0, x0; \
> + vpshufb t0, x4, x4; \
> + vpshufb t0, x1, x1; \
> + vpshufb t0, x5, x5; \
> + vpshufb t1, x3, x3; \
> + vpshufb t1, x7, x7; \
> + vpshufb t1, x2, x2; \
> + vpshufb t1, x6, x6; \
> + \
> + /* affine transformation for S2 */ \
> + filter_8bit(x1, t2, t3, t6, t0); \
> + /* affine transformation for S2 */ \
> + filter_8bit(x5, t2, t3, t6, t0); \
> + \
> + /* affine transformation for X2 */ \
> + filter_8bit(x3, t4, t5, t6, t0); \
> + /* affine transformation for X2 */ \
> + filter_8bit(x7, t4, t5, t6, t0); \
> + \
> + vaesdeclast t7, x3, x3; \
> + vaesdeclast t7, x7, x7;
> +#endif /* CONFIG_AS_VAES */
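
A note on the trick the S1 path of this macro relies on (my own sketch
below, not patch code; aes_subbytes_ymm is a made-up name): vaesenclast
performs ShiftRows, SubBytes and AddRoundKey, so with an all-zero round
key the only step left to undo is ShiftRows. Because SubBytes is
byte-wise it commutes with the byte permutation, so a pshufb with the
inverse-ShiftRows pattern leaves a pure SubBytes across the whole YMM
register:

#include <immintrin.h>

/* Illustration only: isolate AES SubBytes on all 32 bytes of a YMM
 * register using VAES (requires -mavx2 -mvaes). */
static __m256i aes_subbytes_ymm(__m256i x)
{
  const __m256i zero = _mm256_setzero_si256();
  /* inverse ShiftRows byte indices, repeated for both 128-bit lanes */
  const __m256i inv_shift_rows = _mm256_setr_epi8(
       0, 13, 10,  7,  4,  1, 14, 11,  8,  5,  2, 15, 12,  9,  6,  3,
       0, 13, 10,  7,  4,  1, 14, 11,  8,  5,  2, 15, 12,  9,  6,  3);

  x = _mm256_aesenclast_epi128(x, zero);         /* SubBytes(ShiftRows(x)) */
  return _mm256_shuffle_epi8(x, inv_shift_rows); /* undo ShiftRows */
}
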
> +
> #define aria_sbox_8way(x0, x1, x2, x3, \
> x4, x5, x6, x7, \
> t0, t1, t2, t3, \
> @@ -432,7 +482,7 @@
> vextracti128 $1, x7, t6##_x; \
> vaesdeclast t7##_x, x7##_x, x7##_x; \
> vaesdeclast t7##_x, t6##_x, t6##_x; \
> - vinserti128 $1, t6##_x, x7, x7; \
> + vinserti128 $1, t6##_x, x7, x7;
>
> #define aria_diff_m(x0, x1, x2, x3, \
> t0, t1, t2, t3) \
> @@ -630,6 +680,7 @@
> aria_load_state_8way(y0, y1, y2, y3, \
> y4, y5, y6, y7, \
> mem_tmp, 8);
> +
> #ifdef CONFIG_AS_GFNI
> #define aria_fe_gfni(x0, x1, x2, x3, \
> x4, x5, x6, x7, \
> @@ -786,6 +837,155 @@
> mem_tmp, 8);
> #endif /* CONFIG_AS_GFNI */
>
> +#ifdef CONFIG_AS_VAES
> +#define aria_fe_vaes(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + y0, y1, y2, y3, \
> + y4, y5, y6, y7, \
> + mem_tmp, rk, round) \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 8, round); \
> + \
> + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
> + x5, y0, y1, y2, y3, y4, y5, \
> + y6, y7); \
> + \
> + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
> + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
> + aria_store_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 8); \
> + \
> + aria_load_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 0); \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 0, round); \
> + \
> + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
> + x5, y0, y1, y2, y3, y4, y5, \
> + y6, y7); \
> + \
> + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
> + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
> + aria_store_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 0); \
> + aria_load_state_8way(y0, y1, y2, y3, \
> + y4, y5, y6, y7, \
> + mem_tmp, 8); \
> + aria_diff_word(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + y0, y1, y2, y3, \
> + y4, y5, y6, y7); \
> + /* aria_diff_byte() \
> + * T3 = ABCD -> BADC \
> + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
> + * T0 = ABCD -> CDAB \
> + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
> + * T1 = ABCD -> DCBA \
> + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
> + */ \
> + aria_diff_word(x2, x3, x0, x1, \
> + x7, x6, x5, x4, \
> + y0, y1, y2, y3, \
> + y5, y4, y7, y6); \
> + aria_store_state_8way(x3, x2, x1, x0, \
> + x6, x7, x4, x5, \
> + mem_tmp, 0);
> +
> +#define aria_fo_vaes(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + y0, y1, y2, y3, \
> + y4, y5, y6, y7, \
> + mem_tmp, rk, round) \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 8, round); \
> + \
> + aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \
> + x7, y0, y1, y2, y3, y4, y5, \
> + y6, y7); \
> + \
> + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
> + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
> + aria_store_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 8); \
> + \
> + aria_load_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 0); \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 0, round); \
> + \
> + aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \
> + x7, y0, y1, y2, y3, y4, y5, \
> + y6, y7); \
> + \
> + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
> + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
> + aria_store_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 0); \
> + aria_load_state_8way(y0, y1, y2, y3, \
> + y4, y5, y6, y7, \
> + mem_tmp, 8); \
> + aria_diff_word(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + y0, y1, y2, y3, \
> + y4, y5, y6, y7); \
> + /* aria_diff_byte() \
> + * T1 = ABCD -> BADC \
> + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
> + * T2 = ABCD -> CDAB \
> + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
> + * T3 = ABCD -> DCBA \
> + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
> + */ \
> + aria_diff_word(x0, x1, x2, x3, \
> + x5, x4, x7, x6, \
> + y2, y3, y0, y1, \
> + y7, y6, y5, y4); \
> + aria_store_state_8way(x3, x2, x1, x0, \
> + x6, x7, x4, x5, \
> + mem_tmp, 0);
> +
> +#define aria_ff_vaes(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + y0, y1, y2, y3, \
> + y4, y5, y6, y7, \
> + mem_tmp, rk, round, last_round) \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 8, round); \
> + \
> + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
> + x5, y0, y1, y2, y3, y4, y5, \
> + y6, y7); \
> + \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 8, last_round); \
> + \
> + aria_store_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 8); \
> + \
> + aria_load_state_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + mem_tmp, 0); \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 0, round); \
> + \
> + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
> + x5, y0, y1, y2, y3, y4, y5, \
> + y6, y7); \
> + \
> + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
> + y0, rk, 0, last_round); \
> + \
> + aria_load_state_8way(y0, y1, y2, y3, \
> + y4, y5, y6, y7, \
> + mem_tmp, 8);
> +#endif /* CONFIG_AS_VAES */
>
> SECTION_RODATA
> .align 32
> @@ -1279,6 +1479,172 @@ _gcry_aria_aesni_avx2_ctr_crypt_blk32:
> ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32,
> .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;)
>
> +#ifdef CONFIG_AS_VAES
> +.align 16
> +ELF(.type __aria_vaes_avx2_crypt_32way, at function;)
> +__aria_vaes_avx2_crypt_32way:
> + /* input:
> + * %r9: rk
> + * %rsi: dst
> + * %rdx: src
> + * %ymm0..%ymm15: byte-sliced blocks
> + */
> + CFI_STARTPROC();
> +
> + movq %rsi, %rax;
> + leaq 8 * 32(%rax), %r8;
> +
> + movl ARIA_CTX_rounds(CTX), %r10d;
> + subl $2, %r10d;
> +
> + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
> + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
> + %ymm15, %rax, %r8);
> + aria_fo_vaes(%ymm8, %ymm9, %ymm10, %ymm11,
> + %ymm12, %ymm13, %ymm14, %ymm15,
> + %ymm0, %ymm1, %ymm2, %ymm3,
> + %ymm4, %ymm5, %ymm6, %ymm7,
> + %rax, %r9, 0);
> + leaq 1*16(%r9), %r9;
> +
> +.align 16
> +.Loop_vaes:
> + aria_fe_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
> + %ymm4, %ymm5, %ymm6, %ymm7,
> + %ymm8, %ymm9, %ymm10, %ymm11,
> + %ymm12, %ymm13, %ymm14, %ymm15,
> + %rax, %r9, 0);
> + aria_fo_vaes(%ymm9, %ymm8, %ymm11, %ymm10,
> + %ymm12, %ymm13, %ymm14, %ymm15,
> + %ymm0, %ymm1, %ymm2, %ymm3,
> + %ymm4, %ymm5, %ymm6, %ymm7,
> + %rax, %r9, 1);
> + leaq 2*16(%r9), %r9;
> + subl $2, %r10d;
> + jnz .Loop_vaes;
> +
> + aria_ff_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
> + %ymm4, %ymm5, %ymm6, %ymm7,
> + %ymm8, %ymm9, %ymm10, %ymm11,
> + %ymm12, %ymm13, %ymm14, %ymm15,
> + %rax, %r9, 0, 1);
> +
> + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
> + %ymm9, %ymm13, %ymm0, %ymm5,
> + %ymm10, %ymm14, %ymm3, %ymm6,
> + %ymm11, %ymm15, %ymm2, %ymm7,
> + (%rax), (%r8));
> +
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size __aria_vaes_avx2_crypt_32way,.-__aria_vaes_avx2_crypt_32way;)
> +
> +.align 16
> +.globl _gcry_aria_vaes_avx2_ecb_crypt_blk32
> +ELF(.type _gcry_aria_vaes_avx2_ecb_crypt_blk32, at function;)
> +_gcry_aria_vaes_avx2_ecb_crypt_blk32:
> + /* input:
> + * %rdi: ctx, CTX
> + * %rsi: dst
> + * %rdx: src
> + * %rcx: round keys
> + */
> + CFI_STARTPROC();
> +
> + pushq %rbp;
> + CFI_PUSH(%rbp);
> + movq %rsp, %rbp;
> + CFI_DEF_CFA_REGISTER(%rbp);
> +
> + subq $(16 * 32), %rsp;
> + andq $~31, %rsp;
> +
> + movq %rcx, %r9;
> + movq %rsi, %r11;
> + movq %rsp, %rsi; /* use stack for temporary store */
> +
> + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
> + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
> + %ymm15, %rdx);
> +
> + call __aria_vaes_avx2_crypt_32way;
> +
> + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
> + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
> + %ymm15, %r11);
> +
> + movl $STACK_DEPTH, %eax;
> + leave;
> + CFI_LEAVE();
> + vzeroall;
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_aria_vaes_avx2_ecb_crypt_blk32,
> + .-_gcry_aria_vaes_avx2_ecb_crypt_blk32;)
> +
> +.align 16
> +.globl _gcry_aria_vaes_avx2_ctr_crypt_blk32
> +ELF(.type _gcry_aria_vaes_avx2_ctr_crypt_blk32, at function;)
> +_gcry_aria_vaes_avx2_ctr_crypt_blk32:
> + /* input:
> + * %rdi: ctx
> + * %rsi: dst
> + * %rdx: src
> + * %rcx: iv (big endian, 128bit)
> + */
> + CFI_STARTPROC();
> +
> + pushq %rbp;
> + CFI_PUSH(%rbp);
> + movq %rsp, %rbp;
> + CFI_DEF_CFA_REGISTER(%rbp);
> +
> + subq $(16 * 32), %rsp;
> + andq $~31, %rsp;
> +
> + movq %rcx, %r8; /* %r8: iv */
> + movq %rsp, %rcx; /* %rcx: keystream */
> + call __aria_aesni_avx2_ctr_gen_keystream_32way;
> +
> + pushq %rsi;
> + movq %rdx, %r11;
> + movq %rcx, %rsi; /* use stack for temporary store */
> + movq %rcx, %rdx;
> + leaq ARIA_CTX_enc_key(CTX), %r9;
> +
> + call __aria_vaes_avx2_crypt_32way;
> +
> + popq %rsi;
> + vpxor (0 * 32)(%r11), %ymm1, %ymm1;
> + vpxor (1 * 32)(%r11), %ymm0, %ymm0;
> + vpxor (2 * 32)(%r11), %ymm3, %ymm3;
> + vpxor (3 * 32)(%r11), %ymm2, %ymm2;
> + vpxor (4 * 32)(%r11), %ymm4, %ymm4;
> + vpxor (5 * 32)(%r11), %ymm5, %ymm5;
> + vpxor (6 * 32)(%r11), %ymm6, %ymm6;
> + vpxor (7 * 32)(%r11), %ymm7, %ymm7;
> + vpxor (8 * 32)(%r11), %ymm8, %ymm8;
> + vpxor (9 * 32)(%r11), %ymm9, %ymm9;
> + vpxor (10 * 32)(%r11), %ymm10, %ymm10;
> + vpxor (11 * 32)(%r11), %ymm11, %ymm11;
> + vpxor (12 * 32)(%r11), %ymm12, %ymm12;
> + vpxor (13 * 32)(%r11), %ymm13, %ymm13;
> + vpxor (14 * 32)(%r11), %ymm14, %ymm14;
> + vpxor (15 * 32)(%r11), %ymm15, %ymm15;
> + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
> + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
> + %ymm15, %rsi);
> +
> + movl $STACK_DEPTH, %eax;
> + leave;
> + CFI_LEAVE();
> + vzeroall;
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_aria_vaes_avx2_ctr_crypt_blk32,
> + .-_gcry_aria_vaes_avx2_ctr_crypt_blk32;)
> +#endif /* CONFIG_AS_VAES */
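
As a plain-C picture of what the CTR entry point above does (a sketch
under my own naming; blk32_ecb_fn and ctr_crypt_blk32_sketch are
hypothetical, and the real code keeps the keystream in an aligned stack
buffer and XORs it with vpxor on YMM registers): generate 32 big-endian
counter blocks, run the 32-way ECB primitive over them, then XOR the
keystream into the input.

#include <stdint.h>
#include <string.h>

typedef void (*blk32_ecb_fn)(void *ctx, uint8_t *dst, const uint8_t *src);

static void ctr_crypt_blk32_sketch(void *ctx, blk32_ecb_fn blk32_ecb,
                                   uint8_t *dst, const uint8_t *src,
                                   uint8_t iv[16])
{
  uint8_t keystream[32 * 16];

  /* 1. Expand the 128-bit big-endian counter into 32 counter blocks,
   *    leaving the updated counter in iv for the next call. */
  for (int i = 0; i < 32; i++)
    {
      memcpy (keystream + i * 16, iv, 16);
      for (int j = 15; j >= 0; j--)     /* big-endian increment */
        if (++iv[j])
          break;
    }

  /* 2. Encrypt the counter blocks in place (32-way ECB). */
  blk32_ecb (ctx, keystream, keystream);

  /* 3. XOR the keystream into the input. */
  for (size_t i = 0; i < sizeof (keystream); i++)
    dst[i] = src[i] ^ keystream[i];
}
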
> +
> #ifdef CONFIG_AS_GFNI
> .align 16
> ELF(.type __aria_gfni_avx2_crypt_32way, at function;)
> diff --git a/cipher/aria.c b/cipher/aria.c
> index 9eb42a2d..bc2d4384 100644
> --- a/cipher/aria.c
> +++ b/cipher/aria.c
> @@ -74,6 +74,12 @@
> # endif
> #endif
>
> +/* USE_VAES_AVX2 indicates whether to compile with Intel VAES/AVX2 code. */
> +#undef USE_VAES_AVX2
> +#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
> +# define USE_VAES_AVX2 1
> +#endif
> +
> /* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
> #undef USE_GFNI_AVX2
> #if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
> @@ -142,6 +148,7 @@ typedef struct
> #endif
> #ifdef USE_AESNI_AVX2
> unsigned int use_aesni_avx2:1;
> + unsigned int use_vaes_avx2:1;
> unsigned int use_gfni_avx2:1;
> #endif
> #ifdef USE_GFNI_AVX512
> @@ -464,12 +471,13 @@ static inline unsigned int
> aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in,
> const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks)
> {
> + if (0) { }
> #ifdef USE_GFNI_AVX
> - if (ctx->use_gfni_avx)
> + else if (ctx->use_gfni_avx)
> return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
> + ASM_EXTRA_STACK;
> - else
> #endif /* USE_GFNI_AVX */
> + else
> return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
> + ASM_EXTRA_STACK;
> }
> @@ -478,12 +486,13 @@ static inline unsigned int
> aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in,
> byte *iv)
> {
> + if (0) { }
> #ifdef USE_GFNI_AVX
> - if (ctx->use_gfni_avx)
> + else if (ctx->use_gfni_avx)
> return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv)
> + ASM_EXTRA_STACK;
> - else
> #endif /* USE_GFNI_AVX */
> + else
> return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv)
> + ASM_EXTRA_STACK;
> }
> @@ -498,6 +507,16 @@ extern unsigned int
> _gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
> const byte *in, byte *iv) ASM_FUNC_ABI;
>
> +#ifdef USE_VAES_AVX2
> +extern unsigned int
> +_gcry_aria_vaes_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
> + const byte *in,
> + const void *key) ASM_FUNC_ABI;
> +extern unsigned int
> +_gcry_aria_vaes_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
> + const byte *in, byte *iv) ASM_FUNC_ABI;
> +#endif /* USE_VAES_AVX2 */
> +
> #ifdef USE_GFNI_AVX2
> extern unsigned int
> _gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
> @@ -512,12 +531,18 @@ static inline unsigned int
> aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
> const u32 key[][ARIA_RD_KEY_WORDS])
> {
> + if (0) { }
> #ifdef USE_GFNI_AVX2
> - if (ctx->use_gfni_avx2)
> + else if (ctx->use_gfni_avx2)
> return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key)
> + ASM_EXTRA_STACK;
> - else
> #endif /* USE_GFNI_AVX2 */
> +#ifdef USE_VAES_AVX2
> + else if (ctx->use_vaes_avx2)
> + return _gcry_aria_vaes_avx2_ecb_crypt_blk32(ctx, out, in, key)
> + + ASM_EXTRA_STACK;
> +#endif /* USE_VAES_AVX2 */
> + else
> return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key)
> + ASM_EXTRA_STACK;
> }
> @@ -526,12 +551,18 @@ static inline unsigned int
> aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
> byte *iv)
> {
> + if (0) { }
> #ifdef USE_GFNI_AVX2
> - if (ctx->use_gfni_avx2)
> + else if (ctx->use_gfni_avx2)
> return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
> + ASM_EXTRA_STACK;
> - else
> #endif /* USE_GFNI_AVX2 */
> +#ifdef USE_VAES_AVX2
> + else if (ctx->use_vaes_avx2)
> + return _gcry_aria_vaes_avx2_ctr_crypt_blk32(ctx, out, in, iv)
> + + ASM_EXTRA_STACK;
> +#endif /* USE_VAES_AVX2 */
> + else
> return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
> + ASM_EXTRA_STACK;
> }
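
Small aside on the dispatch style used in these helpers (illustration
only; HAVE_FAST_IMPL and pick_impl are made-up names for this sketch):
starting the chain with an empty if (0) branch lets every real
implementation be an #ifdef-guarded "else if", so any subset of branches
can be compiled out without breaking the chain.

#include <stdio.h>

#define HAVE_FAST_IMPL 1   /* hypothetical configure result */

static const char *pick_impl (int use_fast)
{
  if (0) { }
#ifdef HAVE_FAST_IMPL
  else if (use_fast)
    return "fast";
#endif
  else
    return "generic";
  return "unreachable"; /* the if (0) arm never runs */
}

int main (void)
{
  printf ("%s\n", pick_impl (1));
  return 0;
}
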
> @@ -1614,6 +1645,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen,
> #ifdef USE_GFNI_AVX2
> ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
> #endif
> +#ifdef USE_VAES_AVX2
> + ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
> +#endif
> #ifdef USE_AESNI_AVX
> ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
> #endif
Thank you so much for this implementation!
Taehee Yoo