[PATCH 2/2] Add VAES/AVX2 AMD64 accelerated implementation of AES
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Jan 25 19:47:08 CET 2021
* cipher/Makefile.am: Add 'rijndael-vaes.c' and
'rijndael-vaes-avx2-amd64.S'.
* cipher/rijndael-internal.h (USE_VAES): New.
* cipher/rijndael-vaes-avx2-amd64.S: New.
* cipher/rijndael-vaes.c: New.
* cipher/rijndael.c (_gcry_aes_vaes_cfb_dec, _gcry_aes_vaes_cbc_dec)
(_gcry_aes_vaes_ctr_enc, _gcry_aes_vaes_ocb_crypt): New.
(do_setkey) [USE_AESNI] [USE_VAES]: Add detection for VAES.
* configure.ac: Add 'rijndael-vaes.lo' and
'rijndael-vaes-avx2-amd64.lo'.
--
Patch adds VAES/AVX2 accelerated implementation for CBC-decryption,
CFB-decryption, CTR-encryption, OCB-encryption and OCB-decryption.
Benchmarks on AMD Ryzen 5800X:
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.066 ns/B 14344 MiB/s 0.322 c/B 4850
CFB dec | 0.067 ns/B 14321 MiB/s 0.323 c/B 4850
CTR enc | 0.066 ns/B 14458 MiB/s 0.320 c/B 4850
OCB enc | 0.070 ns/B 13597 MiB/s 0.340 c/B 4850
OCB dec | 0.068 ns/B 14081 MiB/s 0.328 c/B 4850
After (~1.9x faster):
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.034 ns/B 28080 MiB/s 0.165 c/B 4850
CFB dec | 0.034 ns/B 27957 MiB/s 0.165 c/B 4850
CTR enc | 0.034 ns/B 28411 MiB/s 0.163 c/B 4850
OCB enc | 0.037 ns/B 25642 MiB/s 0.180 c/B 4850
OCB dec | 0.036 ns/B 26827 MiB/s 0.172 c/B 4850
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 1 +
cipher/rijndael-internal.h | 10 +
cipher/rijndael-vaes-avx2-amd64.S | 2180 +++++++++++++++++++++++++++++
cipher/rijndael-vaes.c | 146 ++
cipher/rijndael.c | 40 +
configure.ac | 4 +
6 files changed, 2381 insertions(+)
create mode 100644 cipher/rijndael-vaes-avx2-amd64.S
create mode 100644 cipher/rijndael-vaes.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 62f7ec3e..8fb0a8b5 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -100,6 +100,7 @@ EXTRA_libcipher_la_SOURCES = \
rijndael-aesni.c rijndael-padlock.c \
rijndael-amd64.S rijndael-arm.S \
rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
+ rijndael-vaes.c rijndael-vaes-avx2-amd64.S \
rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S \
rijndael-armv8-aarch64-ce.S rijndael-aarch64.S \
rijndael-ppc.c rijndael-ppc9le.c \
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 7e01f6b0..75ecb74c 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -89,6 +89,16 @@
# endif
#endif /* ENABLE_AESNI_SUPPORT */
+/* USE_VAES inidicates whether to compile with Intel VAES code. */
+#undef USE_VAES
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(__x86_64__) && defined(ENABLE_AVX2_SUPPORT) && \
+ defined(HAVE_GCC_INLINE_ASM_VAES) && \
+ defined(USE_AESNI)
+# define USE_VAES 1
+#endif
+
/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
* code. */
#undef USE_ARM_CE
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
new file mode 100644
index 00000000..80db87df
--- /dev/null
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -0,0 +1,2180 @@
+/* VAES/AVX2 AMD64 accelerated AES for Libgcrypt
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined(__x86_64__)
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) && \
+ defined(HAVE_GCC_INLINE_ASM_VAES)
+
+#include "asm-common-amd64.h"
+
+.text
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+ vpcmpeqq minus_one, x, tmp1; \
+ vpcmpeqq minus_two, x, tmp2; \
+ vpor tmp1, tmp2, tmp2; \
+ vpsubq minus_two, x, x; \
+ vpslldq $8, tmp2, tmp2; \
+ vpsubq tmp2, x, x;
+
+#define AES_OP8(op, key, b0, b1, b2, b3, b4, b5, b6, b7) \
+ op key, b0, b0; \
+ op key, b1, b1; \
+ op key, b2, b2; \
+ op key, b3, b3; \
+ op key, b4, b4; \
+ op key, b5, b5; \
+ op key, b6, b6; \
+ op key, b7, b7;
+
+#define VAESENC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \
+ AES_OP8(vaesenc, key, b0, b1, b2, b3, b4, b5, b6, b7)
+
+#define VAESDEC8(key, b0, b1, b2, b3, b4, b5, b6, b7) \
+ AES_OP8(vaesdec, key, b0, b1, b2, b3, b4, b5, b6, b7)
+
+#define XOR8(key, b0, b1, b2, b3, b4, b5, b6, b7) \
+ AES_OP8(vpxor, key, b0, b1, b2, b3, b4, b5, b6, b7)
+
+#define AES_OP4(op, key, b0, b1, b2, b3) \
+ op key, b0, b0; \
+ op key, b1, b1; \
+ op key, b2, b2; \
+ op key, b3, b3;
+
+#define VAESENC4(key, b0, b1, b2, b3) \
+ AES_OP4(vaesenc, key, b0, b1, b2, b3)
+
+#define VAESDEC4(key, b0, b1, b2, b3) \
+ AES_OP4(vaesdec, key, b0, b1, b2, b3)
+
+#define XOR4(key, b0, b1, b2, b3) \
+ AES_OP4(vpxor, key, b0, b1, b2, b3)
+
+/**********************************************************************
+ CBC-mode decryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_cbc_dec_amd64, at function)
+.globl _gcry_vaes_avx2_cbc_dec_amd64
+_gcry_vaes_avx2_cbc_dec_amd64:
+ /* input:
+ * %rdi: round keys
+ * %rsi: iv
+ * %rdx: dst
+ * %rcx: src
+ * %r8: nblocks
+ * %r9: nrounds
+ */
+ CFI_STARTPROC();
+
+ /* Load IV. */
+ vmovdqu (%rsi), %xmm15;
+
+ /* Process 16 blocks per loop. */
+.align 8
+.Lcbc_dec_blk16:
+ cmpq $16, %r8;
+ jb .Lcbc_dec_blk8;
+
+ leaq -16(%r8), %r8;
+
+ /* Load input and xor first key. Update IV. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ vmovdqu (8 * 16)(%rcx), %ymm4;
+ vmovdqu (10 * 16)(%rcx), %ymm5;
+ vmovdqu (12 * 16)(%rcx), %ymm6;
+ vmovdqu (14 * 16)(%rcx), %ymm7;
+ vpxor %ymm8, %ymm0, %ymm0;
+ vpxor %ymm8, %ymm1, %ymm1;
+ vpxor %ymm8, %ymm2, %ymm2;
+ vpxor %ymm8, %ymm3, %ymm3;
+ vpxor %ymm8, %ymm4, %ymm4;
+ vpxor %ymm8, %ymm5, %ymm5;
+ vpxor %ymm8, %ymm6, %ymm6;
+ vpxor %ymm8, %ymm7, %ymm7;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+ vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9;
+ vmovdqu (1 * 16)(%rcx), %ymm10;
+ vmovdqu (3 * 16)(%rcx), %ymm11;
+ vmovdqu (5 * 16)(%rcx), %ymm12;
+ vmovdqu (7 * 16)(%rcx), %ymm13;
+ vmovdqu (9 * 16)(%rcx), %ymm14;
+ vmovdqu (15 * 16)(%rcx), %xmm15;
+ leaq (16 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+ cmpl $12, %r9d;
+ jb .Lcbc_dec_blk16_last;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+ jz .Lcbc_dec_blk16_last;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+ VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+
+ /* Last round and output handling. */
+ .Lcbc_dec_blk16_last:
+ vpxor %ymm8, %ymm9, %ymm9;
+ vpxor %ymm8, %ymm10, %ymm10;
+ vpxor %ymm8, %ymm11, %ymm11;
+ vpxor %ymm8, %ymm12, %ymm12;
+ vpxor %ymm8, %ymm13, %ymm13;
+ vpxor %ymm8, %ymm14, %ymm14;
+ vaesdeclast %ymm9, %ymm0, %ymm0;
+ vaesdeclast %ymm10, %ymm1, %ymm1;
+ vpxor (-5 * 16)(%rcx), %ymm8, %ymm9;
+ vpxor (-3 * 16)(%rcx), %ymm8, %ymm10;
+ vaesdeclast %ymm11, %ymm2, %ymm2;
+ vaesdeclast %ymm12, %ymm3, %ymm3;
+ vaesdeclast %ymm13, %ymm4, %ymm4;
+ vaesdeclast %ymm14, %ymm5, %ymm5;
+ vaesdeclast %ymm9, %ymm6, %ymm6;
+ vaesdeclast %ymm10, %ymm7, %ymm7;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Lcbc_dec_blk16;
+
+ /* Handle trailing eight blocks. */
+.align 8
+.Lcbc_dec_blk8:
+ cmpq $8, %r8;
+ jb .Lcbc_dec_blk4;
+
+ leaq -8(%r8), %r8;
+
+ /* Load input and xor first key. Update IV. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm4, %ymm1, %ymm1;
+ vpxor %ymm4, %ymm2, %ymm2;
+ vpxor %ymm4, %ymm3, %ymm3;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
+ vmovdqu (1 * 16)(%rcx), %ymm11;
+ vmovdqu (3 * 16)(%rcx), %ymm12;
+ vmovdqu (5 * 16)(%rcx), %ymm13;
+ vmovdqu (7 * 16)(%rcx), %xmm15;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lcbc_dec_blk8_last;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lcbc_dec_blk8_last;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Lcbc_dec_blk8_last:
+ vpxor %ymm4, %ymm10, %ymm10;
+ vpxor %ymm4, %ymm11, %ymm11;
+ vpxor %ymm4, %ymm12, %ymm12;
+ vpxor %ymm4, %ymm13, %ymm13;
+ vaesdeclast %ymm10, %ymm0, %ymm0;
+ vaesdeclast %ymm11, %ymm1, %ymm1;
+ vaesdeclast %ymm12, %ymm2, %ymm2;
+ vaesdeclast %ymm13, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ /* Handle trailing four blocks. */
+.align 8
+.Lcbc_dec_blk4:
+ cmpq $4, %r8;
+ jb .Lcbc_dec_blk1;
+
+ leaq -4(%r8), %r8;
+
+ /* Load input and xor first key. */
+ vmovdqa (0 * 16)(%rdi), %xmm4;
+ vmovdqu (0 * 16)(%rcx), %xmm10;
+ vmovdqu (1 * 16)(%rcx), %xmm11;
+ vmovdqu (2 * 16)(%rcx), %xmm12;
+ vmovdqu (3 * 16)(%rcx), %xmm13;
+ leaq (4 * 16)(%rcx), %rcx;
+ vpxor %xmm10, %xmm4, %xmm0;
+ vpxor %xmm11, %xmm4, %xmm1;
+ vpxor %xmm12, %xmm4, %xmm2;
+ vpxor %xmm13, %xmm4, %xmm3;
+
+ /* AES rounds */
+ vmovdqa (1 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (2 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (3 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (4 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (5 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (6 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (7 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (8 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (9 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (10 * 16)(%rdi), %xmm4;
+ cmpl $12, %r9d;
+ jb .Lcbc_dec_blk4_last;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (11 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (12 * 16)(%rdi), %xmm4;
+ jz .Lcbc_dec_blk4_last;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (13 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (14 * 16)(%rdi), %xmm4;
+
+ /* Last round and output handling. */
+ .Lcbc_dec_blk4_last:
+ vpxor %xmm4, %xmm15, %xmm15;
+ vpxor %xmm4, %xmm10, %xmm10;
+ vpxor %xmm4, %xmm11, %xmm11;
+ vpxor %xmm4, %xmm12, %xmm12;
+ vaesdeclast %xmm15, %xmm0, %xmm0;
+ vmovdqa %xmm13, %xmm15;
+ vaesdeclast %xmm10, %xmm1, %xmm1;
+ vaesdeclast %xmm11, %xmm2, %xmm2;
+ vaesdeclast %xmm12, %xmm3, %xmm3;
+ vmovdqu %xmm0, (0 * 16)(%rdx);
+ vmovdqu %xmm1, (1 * 16)(%rdx);
+ vmovdqu %xmm2, (2 * 16)(%rdx);
+ vmovdqu %xmm3, (3 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lcbc_dec_blk1:
+ cmpq $1, %r8;
+ jb .Ldone_cbc_dec;
+
+ leaq -1(%r8), %r8;
+
+ /* Load input. */
+ vmovdqu (%rcx), %xmm2;
+ leaq 16(%rcx), %rcx;
+
+ /* Xor first key. */
+ vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
+
+ /* AES rounds. */
+ vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Lcbc_dec_blk1_last;
+ vaesdec %xmm1, %xmm0, %xmm0;
+ vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Lcbc_dec_blk1_last;
+ vaesdec %xmm1, %xmm0, %xmm0;
+ vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+
+ /* Last round and output handling. */
+ .Lcbc_dec_blk1_last:
+ vpxor %xmm1, %xmm15, %xmm15;
+ vaesdeclast %xmm15, %xmm0, %xmm0;
+ vmovdqa %xmm2, %xmm15;
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Lcbc_dec_blk1;
+
+.align 8
+.Ldone_cbc_dec:
+ /* Store IV. */
+ vmovdqu %xmm15, (%rsi);
+
+ vzeroall;
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64)
+
+/**********************************************************************
+ CFB-mode decryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_cfb_dec_amd64, at function)
+.globl _gcry_vaes_avx2_cfb_dec_amd64
+_gcry_vaes_avx2_cfb_dec_amd64:
+ /* input:
+ * %rdi: round keys
+ * %rsi: iv
+ * %rdx: dst
+ * %rcx: src
+ * %r8: nblocks
+ * %r9: nrounds
+ */
+ CFI_STARTPROC();
+
+ /* Load IV. */
+ vmovdqu (%rsi), %xmm15;
+
+ /* Process 16 blocks per loop. */
+.align 8
+.Lcfb_dec_blk16:
+ cmpq $16, %r8;
+ jb .Lcfb_dec_blk8;
+
+ leaq -16(%r8), %r8;
+
+ /* Load input and xor first key. Update IV. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+ vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+ vmovdqu (1 * 16)(%rcx), %ymm1;
+ vmovdqu (3 * 16)(%rcx), %ymm2;
+ vmovdqu (5 * 16)(%rcx), %ymm3;
+ vmovdqu (7 * 16)(%rcx), %ymm4;
+ vmovdqu (9 * 16)(%rcx), %ymm5;
+ vmovdqu (11 * 16)(%rcx), %ymm6;
+ vmovdqu (13 * 16)(%rcx), %ymm7;
+ vmovdqu (15 * 16)(%rcx), %xmm15;
+ vpxor %ymm8, %ymm0, %ymm0;
+ vpxor %ymm8, %ymm1, %ymm1;
+ vpxor %ymm8, %ymm2, %ymm2;
+ vpxor %ymm8, %ymm3, %ymm3;
+ vpxor %ymm8, %ymm4, %ymm4;
+ vpxor %ymm8, %ymm5, %ymm5;
+ vpxor %ymm8, %ymm6, %ymm6;
+ vpxor %ymm8, %ymm7, %ymm7;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+ vmovdqu (0 * 16)(%rcx), %ymm9;
+ vmovdqu (2 * 16)(%rcx), %ymm10;
+ vmovdqu (4 * 16)(%rcx), %ymm11;
+ vmovdqu (6 * 16)(%rcx), %ymm12;
+ vmovdqu (8 * 16)(%rcx), %ymm13;
+ vmovdqu (10 * 16)(%rcx), %ymm14;
+
+ leaq (16 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+ cmpl $12, %r9d;
+ jb .Lcfb_dec_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+ jz .Lcfb_dec_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+
+ /* Last round and output handling. */
+ .Lcfb_dec_blk16_last:
+ vpxor %ymm8, %ymm9, %ymm9;
+ vpxor %ymm8, %ymm10, %ymm10;
+ vpxor %ymm8, %ymm11, %ymm11;
+ vpxor %ymm8, %ymm12, %ymm12;
+ vpxor %ymm8, %ymm13, %ymm13;
+ vpxor %ymm8, %ymm14, %ymm14;
+ vaesenclast %ymm9, %ymm0, %ymm0;
+ vaesenclast %ymm10, %ymm1, %ymm1;
+ vpxor (-4 * 16)(%rcx), %ymm8, %ymm9;
+ vpxor (-2 * 16)(%rcx), %ymm8, %ymm10;
+ vaesenclast %ymm11, %ymm2, %ymm2;
+ vaesenclast %ymm12, %ymm3, %ymm3;
+ vaesenclast %ymm13, %ymm4, %ymm4;
+ vaesenclast %ymm14, %ymm5, %ymm5;
+ vaesenclast %ymm9, %ymm6, %ymm6;
+ vaesenclast %ymm10, %ymm7, %ymm7;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Lcfb_dec_blk16;
+
+ /* Handle trailing eight blocks. */
+.align 8
+.Lcfb_dec_blk8:
+ cmpq $8, %r8;
+ jb .Lcfb_dec_blk4;
+
+ leaq -8(%r8), %r8;
+
+ /* Load input and xor first key. Update IV. */
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+ vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+ vmovdqu (1 * 16)(%rcx), %ymm1;
+ vmovdqu (3 * 16)(%rcx), %ymm2;
+ vmovdqu (5 * 16)(%rcx), %ymm3;
+ vmovdqu (7 * 16)(%rcx), %xmm15;
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm4, %ymm1, %ymm1;
+ vpxor %ymm4, %ymm2, %ymm2;
+ vpxor %ymm4, %ymm3, %ymm3;
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ vmovdqu (0 * 16)(%rcx), %ymm10;
+ vmovdqu (2 * 16)(%rcx), %ymm11;
+ vmovdqu (4 * 16)(%rcx), %ymm12;
+ vmovdqu (6 * 16)(%rcx), %ymm13;
+
+ leaq (8 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lcfb_dec_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lcfb_dec_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Lcfb_dec_blk8_last:
+ vpxor %ymm4, %ymm10, %ymm10;
+ vpxor %ymm4, %ymm11, %ymm11;
+ vpxor %ymm4, %ymm12, %ymm12;
+ vpxor %ymm4, %ymm13, %ymm13;
+ vaesenclast %ymm10, %ymm0, %ymm0;
+ vaesenclast %ymm11, %ymm1, %ymm1;
+ vaesenclast %ymm12, %ymm2, %ymm2;
+ vaesenclast %ymm13, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ /* Handle trailing four blocks. */
+.align 8
+.Lcfb_dec_blk4:
+ cmpq $4, %r8;
+ jb .Lcfb_dec_blk1;
+
+ leaq -4(%r8), %r8;
+
+ /* Load input and xor first key. */
+ vmovdqa (0 * 16)(%rdi), %xmm4;
+ vpxor %xmm15, %xmm4, %xmm0;
+ vmovdqu (0 * 16)(%rcx), %xmm12;
+ vmovdqu (1 * 16)(%rcx), %xmm13;
+ vmovdqu (2 * 16)(%rcx), %xmm14;
+ vpxor %xmm12, %xmm4, %xmm1;
+ vpxor %xmm13, %xmm4, %xmm2;
+ vpxor %xmm14, %xmm4, %xmm3;
+ vmovdqa (1 * 16)(%rdi), %xmm4;
+
+ /* Load input as next IV. */
+ vmovdqu (3 * 16)(%rcx), %xmm15;
+ leaq (4 * 16)(%rcx), %rcx;
+
+ /* AES rounds */
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (2 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (3 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (4 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (5 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (6 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (7 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (8 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (9 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (10 * 16)(%rdi), %xmm4;
+ cmpl $12, %r9d;
+ jb .Lcfb_dec_blk4_last;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (11 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (12 * 16)(%rdi), %xmm4;
+ jz .Lcfb_dec_blk4_last;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (13 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (14 * 16)(%rdi), %xmm4;
+
+ /* Last round and output handling. */
+ .Lcfb_dec_blk4_last:
+ vpxor %xmm4, %xmm12, %xmm12;
+ vpxor %xmm4, %xmm13, %xmm13;
+ vpxor %xmm4, %xmm14, %xmm14;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vaesenclast %xmm12, %xmm0, %xmm0;
+ vaesenclast %xmm13, %xmm1, %xmm1;
+ vaesenclast %xmm14, %xmm2, %xmm2;
+ vaesenclast %xmm4, %xmm3, %xmm3;
+ vmovdqu %xmm0, (0 * 16)(%rdx);
+ vmovdqu %xmm1, (1 * 16)(%rdx);
+ vmovdqu %xmm2, (2 * 16)(%rdx);
+ vmovdqu %xmm3, (3 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lcfb_dec_blk1:
+ cmpq $1, %r8;
+ jb .Ldone_cfb_dec;
+
+ leaq -1(%r8), %r8;
+
+ /* Xor first key. */
+ vpxor (0 * 16)(%rdi), %xmm15, %xmm0;
+
+ /* Load input as next IV. */
+ vmovdqu (%rcx), %xmm15;
+ leaq 16(%rcx), %rcx;
+
+ /* AES rounds. */
+ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Lcfb_dec_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Lcfb_dec_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+
+ /* Last round and output handling. */
+ .Lcfb_dec_blk1_last:
+ vpxor %xmm15, %xmm1, %xmm1;
+ vaesenclast %xmm1, %xmm0, %xmm0;
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Lcfb_dec_blk1;
+
+.align 8
+.Ldone_cfb_dec:
+ /* Store IV. */
+ vmovdqu %xmm15, (%rsi);
+
+ vzeroall;
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64)
+
+/**********************************************************************
+ CTR-mode encryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_ctr_enc_amd64, at function)
+.globl _gcry_vaes_avx2_ctr_enc_amd64
+_gcry_vaes_avx2_ctr_enc_amd64:
+ /* input:
+ * %rdi: round keys
+ * %rsi: counter
+ * %rdx: dst
+ * %rcx: src
+ * %r8: nblocks
+ * %r9: nrounds
+ */
+ CFI_STARTPROC();
+
+ movq 8(%rsi), %r10;
+ movq 0(%rsi), %r11;
+ bswapq %r10;
+ bswapq %r11;
+
+ vpcmpeqd %ymm15, %ymm15, %ymm15;
+ vpsrldq $8, %ymm15, %ymm15; // 0:-1
+ vpaddq %ymm15, %ymm15, %ymm14; // 0:-2
+ vbroadcasti128 .Lbswap128_mask rRIP, %ymm13;
+
+ /* Process 16 blocks per loop. */
+.align 8
+.Lctr_enc_blk16:
+ cmpq $16, %r8;
+ jb .Lctr_enc_blk8;
+
+ leaq -16(%r8), %r8;
+
+ vbroadcasti128 (%rsi), %ymm7;
+ vbroadcasti128 (0 * 16)(%rdi), %ymm8;
+
+ /* detect if carry handling is needed */
+ addb $16, 15(%rsi);
+ jc .Lctr_enc_blk16_handle_carry;
+
+ /* Increment counters. */
+ vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0;
+ vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1;
+ vpaddb .Lbige_addb_4 rRIP, %ymm7, %ymm2;
+ vpaddb .Lbige_addb_6 rRIP, %ymm7, %ymm3;
+ vpaddb .Lbige_addb_8 rRIP, %ymm7, %ymm4;
+ vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5;
+ vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6;
+ vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7;
+ leaq 16(%r10), %r10;
+
+ .Lctr_enc_blk16_rounds:
+ /* AES rounds */
+ XOR8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (1 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm8;
+ cmpl $12, %r9d;
+ jb .Lctr_enc_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm8;
+ jz .Lctr_enc_blk16_last;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm8;
+ VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm8;
+
+ /* Last round and output handling. */
+ .Lctr_enc_blk16_last:
+ vpxor (0 * 16)(%rcx), %ymm8, %ymm9; /* Xor src to last round key. */
+ vpxor (2 * 16)(%rcx), %ymm8, %ymm10;
+ vpxor (4 * 16)(%rcx), %ymm8, %ymm11;
+ vpxor (6 * 16)(%rcx), %ymm8, %ymm12;
+ vaesenclast %ymm9, %ymm0, %ymm0;
+ vaesenclast %ymm10, %ymm1, %ymm1;
+ vaesenclast %ymm11, %ymm2, %ymm2;
+ vaesenclast %ymm12, %ymm3, %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm8, %ymm9;
+ vpxor (10 * 16)(%rcx), %ymm8, %ymm10;
+ vpxor (12 * 16)(%rcx), %ymm8, %ymm11;
+ vpxor (14 * 16)(%rcx), %ymm8, %ymm8;
+ leaq (16 * 16)(%rcx), %rcx;
+ vaesenclast %ymm9, %ymm4, %ymm4;
+ vaesenclast %ymm10, %ymm5, %ymm5;
+ vaesenclast %ymm11, %ymm6, %ymm6;
+ vaesenclast %ymm8, %ymm7, %ymm7;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Lctr_enc_blk16;
+
+ .align 8
+ .Lctr_enc_blk16_handle_carry:
+ /* Increment counters (handle carry). */
+ vpshufb %xmm13, %xmm7, %xmm1; /* be => le */
+ vmovdqa %xmm1, %xmm0;
+ inc_le128(%xmm1, %xmm15, %xmm5);
+ vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */
+ vpshufb %ymm13, %ymm7, %ymm0;
+ addq $16, %r10;
+ adcq $0, %r11;
+ bswapq %r10;
+ bswapq %r11;
+ movq %r10, 8(%rsi);
+ movq %r11, 0(%rsi);
+ bswapq %r10;
+ bswapq %r11;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */
+ vpshufb %ymm13, %ymm7, %ymm1;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */
+ vpshufb %ymm13, %ymm7, %ymm2;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +7:+6 */
+ vpshufb %ymm13, %ymm7, %ymm3;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +9:+8 */
+ vpshufb %ymm13, %ymm7, %ymm4;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +11:+10 */
+ vpshufb %ymm13, %ymm7, %ymm5;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +13:+12 */
+ vpshufb %ymm13, %ymm7, %ymm6;
+ add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +15:+14 */
+ vpshufb %ymm13, %ymm7, %ymm7;
+
+ jmp .Lctr_enc_blk16_rounds;
+
+ /* Handle trailing eight blocks. */
+.align 8
+.Lctr_enc_blk8:
+ cmpq $8, %r8;
+ jb .Lctr_enc_blk4;
+
+ leaq -8(%r8), %r8;
+
+ vbroadcasti128 (%rsi), %ymm3;
+ vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+
+ /* detect if carry handling is needed */
+ addb $8, 15(%rsi);
+ jc .Lctr_enc_blk8_handle_carry;
+
+ /* Increment counters. */
+ vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
+ vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
+ vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2;
+ vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3;
+ leaq 8(%r10), %r10;
+
+ .Lctr_enc_blk8_rounds:
+ /* AES rounds */
+ XOR4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ cmpl $12, %r9d;
+ jb .Lctr_enc_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ jz .Lctr_enc_blk8_last;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Lctr_enc_blk8_last:
+ vpxor (0 * 16)(%rcx), %ymm4, %ymm5; /* Xor src to last round key. */
+ vpxor (2 * 16)(%rcx), %ymm4, %ymm6;
+ vpxor (4 * 16)(%rcx), %ymm4, %ymm7;
+ vpxor (6 * 16)(%rcx), %ymm4, %ymm4;
+ leaq (8 * 16)(%rcx), %rcx;
+ vaesenclast %ymm5, %ymm0, %ymm0;
+ vaesenclast %ymm6, %ymm1, %ymm1;
+ vaesenclast %ymm7, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ jmp .Lctr_enc_blk4;
+
+ .align 8
+ .Lctr_enc_blk8_handle_carry:
+ /* Increment counters (handle carry). */
+ vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
+ vmovdqa %xmm1, %xmm0;
+ inc_le128(%xmm1, %xmm15, %xmm5);
+ vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
+ vpshufb %ymm13, %ymm3, %ymm0;
+ addq $8, %r10;
+ adcq $0, %r11;
+ bswapq %r10;
+ bswapq %r11;
+ movq %r10, 8(%rsi);
+ movq %r11, 0(%rsi);
+ bswapq %r10;
+ bswapq %r11;
+ add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
+ vpshufb %ymm13, %ymm3, %ymm1;
+ add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */
+ vpshufb %ymm13, %ymm3, %ymm2;
+ add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +7:+6 */
+ vpshufb %ymm13, %ymm3, %ymm3;
+
+ jmp .Lctr_enc_blk8_rounds;
+
+ /* Handle trailing four blocks. */
+.align 8
+.Lctr_enc_blk4:
+ cmpq $4, %r8;
+ jb .Lctr_enc_blk1;
+
+ leaq -4(%r8), %r8;
+
+ vmovdqu (%rsi), %xmm0;
+ vmovdqa (0 * 16)(%rdi), %xmm4;
+
+ /* detect if carry handling is needed */
+ addb $4, 15(%rsi);
+ jc .Lctr_enc_blk4_handle_carry;
+
+ /* Increment counters. */
+ vpaddb .Lbige_addb_1 rRIP, %xmm0, %xmm1;
+ vpaddb .Lbige_addb_2 rRIP, %xmm0, %xmm2;
+ vpaddb .Lbige_addb_3 rRIP, %xmm0, %xmm3;
+ leaq 4(%r10), %r10;
+
+ .Lctr_enc_blk4_rounds:
+ /* AES rounds */
+ XOR4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (1 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (2 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (3 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (4 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (5 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (6 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (7 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (8 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (9 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (10 * 16)(%rdi), %xmm4;
+ cmpl $12, %r9d;
+ jb .Lctr_enc_blk4_last;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (11 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (12 * 16)(%rdi), %xmm4;
+ jz .Lctr_enc_blk4_last;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (13 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (14 * 16)(%rdi), %xmm4;
+
+ /* Last round and output handling. */
+ .Lctr_enc_blk4_last:
+ vpxor (0 * 16)(%rcx), %xmm4, %xmm5; /* Xor src to last round key. */
+ vpxor (1 * 16)(%rcx), %xmm4, %xmm6;
+ vpxor (2 * 16)(%rcx), %xmm4, %xmm7;
+ vpxor (3 * 16)(%rcx), %xmm4, %xmm4;
+ leaq (4 * 16)(%rcx), %rcx;
+ vaesenclast %xmm5, %xmm0, %xmm0;
+ vaesenclast %xmm6, %xmm1, %xmm1;
+ vaesenclast %xmm7, %xmm2, %xmm2;
+ vaesenclast %xmm4, %xmm3, %xmm3;
+ vmovdqu %xmm0, (0 * 16)(%rdx);
+ vmovdqu %xmm1, (1 * 16)(%rdx);
+ vmovdqu %xmm2, (2 * 16)(%rdx);
+ vmovdqu %xmm3, (3 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ jmp .Lctr_enc_blk1;
+
+ .align 8
+ .Lctr_enc_blk4_handle_carry:
+ /* Increment counters (handle carry). */
+ vpshufb %xmm13, %xmm0, %xmm3; /* be => le */
+ addq $4, %r10;
+ adcq $0, %r11;
+ bswapq %r10;
+ bswapq %r11;
+ movq %r10, 8(%rsi);
+ movq %r11, 0(%rsi);
+ bswapq %r10;
+ bswapq %r11;
+ inc_le128(%xmm3, %xmm15, %xmm5);
+ vpshufb %xmm13, %xmm3, %xmm1;
+ inc_le128(%xmm3, %xmm15, %xmm5);
+ vpshufb %xmm13, %xmm3, %xmm2;
+ inc_le128(%xmm3, %xmm15, %xmm5);
+ vpshufb %xmm13, %xmm3, %xmm3;
+
+ jmp .Lctr_enc_blk4_rounds;
+
+ /* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lctr_enc_blk1:
+ cmpq $1, %r8;
+ jb .Ldone_ctr_enc;
+
+ leaq -1(%r8), %r8;
+
+ /* Load and increament counter. */
+ vmovdqu (%rsi), %xmm0;
+ addq $1, %r10;
+ adcq $0, %r11;
+ bswapq %r10;
+ bswapq %r11;
+ movq %r10, 8(%rsi);
+ movq %r11, 0(%rsi);
+ bswapq %r10;
+ bswapq %r11;
+
+ /* AES rounds. */
+ vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (10 * 16)(%rdi), %xmm1;
+ cmpl $12, %r9d;
+ jb .Lctr_enc_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (12 * 16)(%rdi), %xmm1;
+ jz .Lctr_enc_blk1_last;
+ vaesenc %xmm1, %xmm0, %xmm0;
+ vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+ vmovdqa (14 * 16)(%rdi), %xmm1;
+
+ /* Last round and output handling. */
+ .Lctr_enc_blk1_last:
+ vpxor (%rcx), %xmm1, %xmm1; /* Xor src to last round key. */
+ leaq 16(%rcx), %rcx;
+ vaesenclast %xmm1, %xmm0, %xmm0; /* Last round and xor with xmm1. */
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Lctr_enc_blk1;
+
+.align 8
+.Ldone_ctr_enc:
+ vzeroall;
+ xorl %r10d, %r10d;
+ xorl %r11d, %r11d;
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64)
+
+/**********************************************************************
+ OCB-mode encryption/decryption
+ **********************************************************************/
+ELF(.type _gcry_vaes_avx2_ocb_checksum, at function)
+_gcry_vaes_avx2_ocb_checksum:
+ /* input:
+ * %rax: offset pointer
+ * %r10: plaintext pointer
+ * %r11: nblocks
+ */
+ CFI_STARTPROC();
+
+ vpxor %xmm0, %xmm0, %xmm0;
+ cmpq $4, %r11;
+ jb .Locb_checksum_blk1;
+ vpxor %xmm1, %xmm1, %xmm1;
+ vpxor %xmm2, %xmm2, %xmm2;
+ vpxor %xmm3, %xmm3, %xmm3;
+ cmpq $16, %r11;
+ jb .Locb_checksum_blk4;
+ vpxor %xmm4, %xmm4, %xmm4;
+ vpxor %xmm5, %xmm5, %xmm5;
+ vpxor %xmm6, %xmm6, %xmm6;
+ vpxor %xmm7, %xmm7, %xmm7;
+ cmpq $32, %r11;
+ jb .Locb_checksum_blk16;
+ vpxor %xmm8, %xmm8, %xmm8;
+ vpxor %xmm9, %xmm9, %xmm9;
+ vpxor %xmm10, %xmm10, %xmm10;
+ vpxor %xmm11, %xmm11, %xmm11;
+ vpxor %xmm12, %xmm12, %xmm12;
+ vpxor %xmm13, %xmm13, %xmm13;
+ vpxor %xmm14, %xmm14, %xmm14;
+ vpxor %xmm15, %xmm15, %xmm15;
+
+.align 8
+.Locb_checksum_blk32:
+ cmpq $32, %r11;
+ jb .Locb_checksum_blk32_done;
+
+ leaq -32(%r11), %r11;
+
+ vpxor (0 * 16)(%r10), %ymm0, %ymm0;
+ vpxor (2 * 16)(%r10), %ymm1, %ymm1;
+ vpxor (4 * 16)(%r10), %ymm2, %ymm2;
+ vpxor (6 * 16)(%r10), %ymm3, %ymm3;
+ vpxor (8 * 16)(%r10), %ymm4, %ymm4;
+ vpxor (10 * 16)(%r10), %ymm5, %ymm5;
+ vpxor (12 * 16)(%r10), %ymm6, %ymm6;
+ vpxor (14 * 16)(%r10), %ymm7, %ymm7;
+ vpxor (16 * 16)(%r10), %ymm8, %ymm8;
+ vpxor (18 * 16)(%r10), %ymm9, %ymm9;
+ vpxor (20 * 16)(%r10), %ymm10, %ymm10;
+ vpxor (22 * 16)(%r10), %ymm11, %ymm11;
+ vpxor (24 * 16)(%r10), %ymm12, %ymm12;
+ vpxor (26 * 16)(%r10), %ymm13, %ymm13;
+ vpxor (28 * 16)(%r10), %ymm14, %ymm14;
+ vpxor (30 * 16)(%r10), %ymm15, %ymm15;
+ leaq (32 * 16)(%r10), %r10;
+
+ jmp .Locb_checksum_blk32;
+
+.align 8
+.Locb_checksum_blk32_done:
+ vpxor %ymm8, %ymm0, %ymm0;
+ vpxor %ymm9, %ymm1, %ymm1;
+ vpxor %ymm10, %ymm2, %ymm2;
+ vpxor %ymm11, %ymm3, %ymm3;
+ vpxor %ymm12, %ymm4, %ymm4;
+ vpxor %ymm13, %ymm5, %ymm5;
+ vpxor %ymm14, %ymm6, %ymm6;
+ vpxor %ymm15, %ymm7, %ymm7;
+
+.align 8
+.Locb_checksum_blk16:
+ cmpq $16, %r11;
+ jb .Locb_checksum_blk16_done;
+
+ leaq -16(%r11), %r11;
+
+ vpxor (0 * 16)(%r10), %ymm0, %ymm0;
+ vpxor (2 * 16)(%r10), %ymm1, %ymm1;
+ vpxor (4 * 16)(%r10), %ymm2, %ymm2;
+ vpxor (6 * 16)(%r10), %ymm3, %ymm3;
+ vpxor (8 * 16)(%r10), %ymm4, %ymm4;
+ vpxor (10 * 16)(%r10), %ymm5, %ymm5;
+ vpxor (12 * 16)(%r10), %ymm6, %ymm6;
+ vpxor (14 * 16)(%r10), %ymm7, %ymm7;
+ leaq (16 * 16)(%r10), %r10;
+
+ jmp .Locb_checksum_blk16;
+
+.align 8
+.Locb_checksum_blk16_done:
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm5, %ymm1, %ymm1;
+ vpxor %ymm6, %ymm2, %ymm2;
+ vpxor %ymm7, %ymm3, %ymm3;
+ vextracti128 $1, %ymm0, %xmm4;
+ vextracti128 $1, %ymm1, %xmm5;
+ vextracti128 $1, %ymm2, %xmm6;
+ vextracti128 $1, %ymm3, %xmm7;
+ vpxor %xmm4, %xmm0, %xmm0;
+ vpxor %xmm5, %xmm1, %xmm1;
+ vpxor %xmm6, %xmm2, %xmm2;
+ vpxor %xmm7, %xmm3, %xmm3;
+
+.align 8
+.Locb_checksum_blk4:
+ cmpq $4, %r11;
+ jb .Locb_checksum_blk4_done;
+
+ leaq -4(%r11), %r11;
+
+ vpxor (0 * 16)(%r10), %xmm0, %xmm0;
+ vpxor (1 * 16)(%r10), %xmm1, %xmm1;
+ vpxor (2 * 16)(%r10), %xmm2, %xmm2;
+ vpxor (3 * 16)(%r10), %xmm3, %xmm3;
+ leaq (4 * 16)(%r10), %r10;
+
+ jmp .Locb_checksum_blk4;
+
+.align 8
+.Locb_checksum_blk4_done:
+ vpxor %xmm1, %xmm0, %xmm0;
+ vpxor %xmm3, %xmm2, %xmm2;
+ vpxor %xmm2, %xmm0, %xmm0;
+
+.align 8
+.Locb_checksum_blk1:
+ cmpq $1, %r11;
+ jb .Locb_checksum_done;
+
+ leaq -1(%r11), %r11;
+
+ vpxor (%r10), %xmm0, %xmm0;
+ leaq 16(%r10), %r10;
+
+ jmp .Locb_checksum_blk1;
+
+.align 8
+.Locb_checksum_done:
+ vpxor (%rax), %xmm0, %xmm0;
+ vmovdqu %xmm0, (%rax);
+ ret;
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum)
+
+#define STACK_REGS_POS (16 * 16 + 4 * 16)
+#define STACK_ALLOC (STACK_REGS_POS + 6 * 8)
+
+ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64, at function)
+.globl _gcry_vaes_avx2_ocb_crypt_amd64
+_gcry_vaes_avx2_ocb_crypt_amd64:
+ /* input:
+ * %rdi: round keys
+ * %esi: nblk
+ * %rdx: dst
+ * %rcx: src
+ * %r8: nblocks
+ * %r9: nrounds
+ * 16(%rbp): offset
+ * 24(%rbp): checksum
+ * 32(%rbp): L-array
+ * 40(%rbp): encrypt (%r15d)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ subq $STACK_ALLOC, %rsp;
+ andq $~63, %rsp;
+
+ movq %r12, (STACK_REGS_POS + 0 * 8)(%rsp);
+ CFI_REG_ON_STACK(r12, STACK_REGS_POS + 0 * 8);
+ movq %r13, (STACK_REGS_POS + 1 * 8)(%rsp);
+ CFI_REG_ON_STACK(r13, STACK_REGS_POS + 1 * 8);
+ movq %r14, (STACK_REGS_POS + 2 * 8)(%rsp);
+ CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8);
+ movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp);
+ CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8);
+
+ movl 40(%rbp), %r15d; /* encrypt-flag. */
+ movq 16(%rbp), %r14; /* offset ptr. */
+
+ /* Handle encryption checksumming. */
+ testl %r15d, %r15d;
+ jz .Locb_dec_checksum_prepare;
+ movq 24(%rbp), %rax; /* checksum ptr. */
+ movq %rcx, %r10;
+ movq %r8, %r11;
+ call _gcry_vaes_avx2_ocb_checksum;
+ jmp .Locb_enc_checksum_done;
+.Locb_dec_checksum_prepare:
+ /* Store plaintext address and number of blocks for decryption
+ * checksumming. */
+ movq %rdx, (STACK_REGS_POS + 4 * 8)(%rsp);
+ movq %r8, (STACK_REGS_POS + 5 * 8)(%rsp);
+.Locb_enc_checksum_done:
+
+ vmovdqu (%r14), %xmm15; /* Load offset. */
+ movq 32(%rbp), %r14; /* L-array ptr. */
+ vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */
+ movl $(10 * 16), %eax;
+ cmpl $12, %r9d;
+ jb .Llast_key_ptr;
+ movl $(12 * 16), %eax;
+ je .Llast_key_ptr;
+ movl $(14 * 16), %eax;
+ .align 8
+ .Llast_key_ptr:
+ vpxor (%rdi, %rax), %xmm0, %xmm0; /* first key ^ last key */
+ vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */
+ vmovdqa %xmm0, (14 * 16)(%rsp);
+ vmovdqa %xmm0, (15 * 16)(%rsp);
+
+.align 8
+.Lhandle_unaligned_ocb:
+ /* Get number of blocks to align nblk to 16 (and L-array optimization). */
+ movl %esi, %r10d;
+ negl %r10d;
+ andl $15, %r10d;
+ cmpq %r8, %r10;
+ cmovaq %r8, %r10;
+ cmpq $1, %r10;
+ jb .Lunaligned_ocb_done;
+
+ /* Unaligned: Process eight blocks per loop. */
+.align 8
+.Locb_unaligned_blk8:
+ cmpq $8, %r10;
+ jb .Locb_unaligned_blk4;
+
+ leaq -8(%r8), %r8;
+ leaq -8(%r10), %r10;
+
+ leal 1(%esi), %r11d;
+ leal 2(%esi), %r12d;
+ leal 3(%esi), %r13d;
+ leal 4(%esi), %eax;
+ tzcntl %r11d, %r11d;
+ tzcntl %r12d, %r12d;
+ tzcntl %r13d, %r13d;
+ tzcntl %eax, %eax;
+ shll $4, %r11d;
+ shll $4, %r12d;
+ shll $4, %r13d;
+ shll $4, %eax;
+ vpxor (%r14, %r11), %xmm15, %xmm5;
+ vpxor (%r14, %r12), %xmm5, %xmm6;
+ vpxor (%r14, %r13), %xmm6, %xmm7;
+ vpxor (%r14, %rax), %xmm7, %xmm8;
+
+ leal 5(%esi), %r11d;
+ leal 6(%esi), %r12d;
+ leal 7(%esi), %r13d;
+ leal 8(%esi), %esi;
+ tzcntl %r11d, %r11d;
+ tzcntl %r12d, %r12d;
+ tzcntl %r13d, %r13d;
+ tzcntl %esi, %eax;
+ shll $4, %r11d;
+ shll $4, %r12d;
+ shll $4, %r13d;
+ shll $4, %eax;
+ vpxor (%r14, %r11), %xmm8, %xmm9;
+ vpxor (%r14, %r12), %xmm9, %xmm10;
+ vpxor (%r14, %r13), %xmm10, %xmm11;
+ vpxor (%r14, %rax), %xmm11, %xmm15;
+
+ vinserti128 $1, %xmm6, %ymm5, %ymm5;
+ vinserti128 $1, %xmm8, %ymm7, %ymm6;
+ vinserti128 $1, %xmm10, %ymm9, %ymm7;
+ vinserti128 $1, %xmm15, %ymm11, %ymm8;
+
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ vmovdqa (14 * 16)(%rsp), %ymm9;
+
+ testl %r15d, %r15d;
+ jz .Locb_unaligned_blk8_dec;
+
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk8_enc_last;
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ jz .Locb_unaligned_blk8_enc_last;
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk8_enc_last:
+ vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. */
+ vpxor %ymm6, %ymm9, %ymm6;
+ vpxor %ymm7, %ymm9, %ymm7;
+ vpxor %ymm8, %ymm9, %ymm4;
+ vaesenclast %ymm5, %ymm0, %ymm0;
+ vaesenclast %ymm6, %ymm1, %ymm1;
+ vaesenclast %ymm7, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ jmp .Locb_unaligned_blk8;
+
+ .align 8
+ .Locb_unaligned_blk8_dec:
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk8_dec_last;
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ jz .Locb_unaligned_blk8_dec_last;
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk8_dec_last:
+ vpxor %ymm5, %ymm9, %ymm5; /* Xor src to last round key. */
+ vpxor %ymm6, %ymm9, %ymm6;
+ vpxor %ymm7, %ymm9, %ymm7;
+ vpxor %ymm8, %ymm9, %ymm4;
+ vaesdeclast %ymm5, %ymm0, %ymm0;
+ vaesdeclast %ymm6, %ymm1, %ymm1;
+ vaesdeclast %ymm7, %ymm2, %ymm2;
+ vaesdeclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ jmp .Locb_unaligned_blk8;
+
+ /* Unaligned: Process four blocks per loop. */
+.align 8
+.Locb_unaligned_blk4:
+ cmpq $4, %r10;
+ jb .Locb_unaligned_blk1;
+
+ leaq -4(%r8), %r8;
+ leaq -4(%r10), %r10;
+
+ leal 1(%esi), %r11d;
+ leal 2(%esi), %r12d;
+ leal 3(%esi), %r13d;
+ leal 4(%esi), %esi;
+ tzcntl %r11d, %r11d;
+ tzcntl %r12d, %r12d;
+ tzcntl %r13d, %r13d;
+ tzcntl %esi, %eax;
+ shll $4, %r11d;
+ shll $4, %r12d;
+ shll $4, %r13d;
+ shll $4, %eax;
+
+ vpxor (%r14, %r11), %xmm15, %xmm5;
+ vpxor (%r14, %r12), %xmm5, %xmm6;
+ vpxor (%r14, %r13), %xmm6, %xmm7;
+ vpxor (%r14, %rax), %xmm7, %xmm15;
+
+ vpxor (0 * 16)(%rcx), %xmm5, %xmm0;
+ vpxor (1 * 16)(%rcx), %xmm6, %xmm1;
+ vpxor (2 * 16)(%rcx), %xmm7, %xmm2;
+ vpxor (3 * 16)(%rcx), %xmm15, %xmm3;
+ leaq (4 * 16)(%rcx), %rcx;
+
+ vmovdqa (14 * 16)(%rsp), %xmm8;
+
+ testl %r15d, %r15d;
+ jz .Locb_unaligned_blk4_dec;
+
+ /* AES rounds */
+ vmovdqa (1 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (2 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (3 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (4 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (5 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (6 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (7 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (8 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (9 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk4_enc_last;
+ vmovdqa (10 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (11 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ jz .Locb_unaligned_blk4_enc_last;
+ vmovdqa (12 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (13 * 16)(%rdi), %xmm4;
+ VAESENC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk4_enc_last:
+ vpxor %xmm5, %xmm8, %xmm5; /* Xor src to last round key. */
+ vpxor %xmm6, %xmm8, %xmm6;
+ vpxor %xmm7, %xmm8, %xmm7;
+ vpxor %xmm15, %xmm8, %xmm4;
+ vaesenclast %xmm5, %xmm0, %xmm0;
+ vaesenclast %xmm6, %xmm1, %xmm1;
+ vaesenclast %xmm7, %xmm2, %xmm2;
+ vaesenclast %xmm4, %xmm3, %xmm3;
+ vmovdqu %xmm0, (0 * 16)(%rdx);
+ vmovdqu %xmm1, (1 * 16)(%rdx);
+ vmovdqu %xmm2, (2 * 16)(%rdx);
+ vmovdqu %xmm3, (3 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ jmp .Locb_unaligned_blk4;
+
+ .align 8
+ .Locb_unaligned_blk4_dec:
+ /* AES rounds */
+ vmovdqa (1 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (2 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (3 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (4 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (5 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (6 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (7 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (8 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (9 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk4_dec_last;
+ vmovdqa (10 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (11 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ jz .Locb_unaligned_blk4_dec_last;
+ vmovdqa (12 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+ vmovdqa (13 * 16)(%rdi), %xmm4;
+ VAESDEC4(%xmm4, %xmm0, %xmm1, %xmm2, %xmm3);
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk4_dec_last:
+ vpxor %xmm5, %xmm8, %xmm5; /* Xor src to last round key. */
+ vpxor %xmm6, %xmm8, %xmm6;
+ vpxor %xmm7, %xmm8, %xmm7;
+ vpxor %xmm15, %xmm8, %xmm4;
+ vaesdeclast %xmm5, %xmm0, %xmm0;
+ vaesdeclast %xmm6, %xmm1, %xmm1;
+ vaesdeclast %xmm7, %xmm2, %xmm2;
+ vaesdeclast %xmm4, %xmm3, %xmm3;
+ vmovdqu %xmm0, (0 * 16)(%rdx);
+ vmovdqu %xmm1, (1 * 16)(%rdx);
+ vmovdqu %xmm2, (2 * 16)(%rdx);
+ vmovdqu %xmm3, (3 * 16)(%rdx);
+ leaq (4 * 16)(%rdx), %rdx;
+
+ jmp .Locb_unaligned_blk4;
+
+ /* Unaligned: Process one block per loop. */
+.align 8
+.Locb_unaligned_blk1:
+ cmpq $1, %r10;
+ jb .Lunaligned_ocb_done;
+
+ leaq -1(%r8), %r8;
+ leaq -1(%r10), %r10;
+
+ leal 1(%esi), %esi;
+ tzcntl %esi, %r11d;
+ shll $4, %r11d;
+ vpxor (%r14, %r11), %xmm15, %xmm15;
+ vpxor (%rcx), %xmm15, %xmm0;
+ leaq 16(%rcx), %rcx;
+
+ testl %r15d, %r15d;
+ jz .Locb_unaligned_blk1_dec;
+ /* AES rounds. */
+ vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk1_enc_last;
+ vaesenc (10 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+ jz .Locb_unaligned_blk1_enc_last;
+ vaesenc (12 * 16)(%rdi), %xmm0, %xmm0;
+ vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk1_enc_last:
+ vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
+ vaesenclast %xmm1, %xmm0, %xmm0;
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Locb_unaligned_blk1;
+
+ .align 8
+ .Locb_unaligned_blk1_dec:
+ /* AES rounds. */
+ vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+ cmpl $12, %r9d;
+ jb .Locb_unaligned_blk1_dec_last;
+ vaesdec (10 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+ jz .Locb_unaligned_blk1_dec_last;
+ vaesdec (12 * 16)(%rdi), %xmm0, %xmm0;
+ vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+
+ /* Last round and output handling. */
+ .Locb_unaligned_blk1_dec_last:
+ vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
+ vaesdeclast %xmm1, %xmm0, %xmm0;
+ vmovdqu %xmm0, (%rdx);
+ leaq 16(%rdx), %rdx;
+
+ jmp .Locb_unaligned_blk1;
+
+.align 8
+.Lunaligned_ocb_done:
+ cmpq $1, %r8;
+ jb .Ldone_ocb;
+
+ /* Short buffers do not benefit from L-array optimization. */
+ movq %r8, %r10;
+ cmpq $16, %r8;
+ jb .Locb_unaligned_blk8;
+
+ vinserti128 $1, %xmm15, %ymm15, %ymm15;
+
+ /* Prepare L-array optimization.
+ * Since nblk is aligned to 16, offsets will have following
+ * construction:
+ * - block1 = ntz{0} = offset ^ L[0]
+ * - block2 = ntz{1} = offset ^ L[0] ^ L[1]
+ * - block3 = ntz{0} = offset ^ L[1]
+ * - block4 = ntz{2} = offset ^ L[1] ^ L[2]
+ * - block5 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[2]
+ * - block6 = ntz{1} = offset ^ L[0] ^ L[2]
+ * - block7 = ntz{0} = offset ^ L[2]
+ * - block8 = ntz{3} = offset ^ L[2] ^ L[3]
+ * - block9 = ntz{0} = offset ^ L[0] ^ L[2] ^ L[3]
+ * - block10 = ntz{1} = offset ^ L[0] ^ L[1] ^ L[2] ^ L[3]
+ * - block11 = ntz{0} = offset ^ L[1] ^ L[2] ^ L[3]
+ * - block12 = ntz{2} = offset ^ L[1] ^ L[3]
+ * - block13 = ntz{0} = offset ^ L[0] ^ L[1] ^ L[3]
+ * - block14 = ntz{1} = offset ^ L[0] ^ L[3]
+ * - block15 = ntz{0} = offset ^ L[3]
+ * - block16 = ntz{x} = offset ^ L[3] ^ L[ntz{x}]
+ */
+ vmovdqu (0 * 16)(%r14), %xmm0;
+ vmovdqu (1 * 16)(%r14), %xmm1;
+ vmovdqu (2 * 16)(%r14), %xmm2;
+ vmovdqu (3 * 16)(%r14), %xmm3;
+ vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */
+ vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */
+ vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */
+ vpxor %xmm1, %xmm2, %xmm7; /* L[1] ^ L[2] */
+ vpxor %xmm1, %xmm3, %xmm8; /* L[1] ^ L[3] */
+ vpxor %xmm2, %xmm3, %xmm9; /* L[2] ^ L[3] */
+ vpxor %xmm4, %xmm2, %xmm10; /* L[0] ^ L[1] ^ L[2] */
+ vpxor %xmm5, %xmm3, %xmm11; /* L[0] ^ L[2] ^ L[3] */
+ vpxor %xmm7, %xmm3, %xmm12; /* L[1] ^ L[2] ^ L[3] */
+ vpxor %xmm0, %xmm8, %xmm13; /* L[0] ^ L[1] ^ L[3] */
+ vpxor %xmm4, %xmm9, %xmm14; /* L[0] ^ L[1] ^ L[2] ^ L[3] */
+ vinserti128 $1, %xmm4, %ymm0, %ymm0;
+ vinserti128 $1, %xmm7, %ymm1, %ymm1;
+ vinserti128 $1, %xmm5, %ymm10, %ymm10;
+ vinserti128 $1, %xmm9, %ymm2, %ymm2;
+ vinserti128 $1, %xmm14, %ymm11, %ymm11;
+ vinserti128 $1, %xmm8, %ymm12, %ymm12;
+ vinserti128 $1, %xmm6, %ymm13, %ymm13;
+ vmovdqa %ymm0, (0 * 16)(%rsp);
+ vmovdqa %ymm1, (2 * 16)(%rsp);
+ vmovdqa %ymm10, (4 * 16)(%rsp);
+ vmovdqa %ymm2, (6 * 16)(%rsp);
+ vmovdqa %ymm11, (8 * 16)(%rsp);
+ vmovdqa %ymm12, (10 * 16)(%rsp);
+ vmovdqa %ymm13, (12 * 16)(%rsp);
+
+ /* Aligned: Process 16 blocks per loop. */
+.align 8
+.Locb_aligned_blk16:
+ cmpq $16, %r8;
+ jb .Locb_aligned_blk8;
+
+ leaq -16(%r8), %r8;
+
+ leal 16(%esi), %esi;
+ tzcntl %esi, %eax;
+ shll $4, %eax;
+
+ vpxor (0 * 16)(%rsp), %ymm15, %ymm8;
+ vpxor (2 * 16)(%rsp), %ymm15, %ymm9;
+ vpxor (4 * 16)(%rsp), %ymm15, %ymm10;
+ vpxor (6 * 16)(%rsp), %ymm15, %ymm11;
+ vpxor (8 * 16)(%rsp), %ymm15, %ymm12;
+
+ vpxor (3 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[3] */
+ vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
+ vinserti128 $1, %xmm14, %ymm13, %ymm14;
+
+ vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+ vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+ vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+ vmovdqa %ymm13, (16 * 16)(%rsp);
+ vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+ vmovdqa %ymm13, (18 * 16)(%rsp);
+
+ leaq (16 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+ testl %r15d, %r15d;
+ jz .Locb_aligned_blk16_dec;
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ cmpl $12, %r9d;
+ jb .Locb_aligned_blk16_enc_last;
+ vbroadcasti128 (10 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ jz .Locb_aligned_blk16_enc_last;
+ vbroadcasti128 (12 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm13;
+ VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+
+ /* Last round and output handling. */
+ .Locb_aligned_blk16_enc_last:
+ vmovdqa (14 * 16)(%rsp), %ymm13;
+ vpxor %ymm8, %ymm13, %ymm8;
+ vpxor %ymm9, %ymm13, %ymm9;
+ vpxor %ymm10, %ymm13, %ymm10;
+ vpxor %ymm11, %ymm13, %ymm11;
+ vaesenclast %ymm8, %ymm0, %ymm0;
+ vaesenclast %ymm9, %ymm1, %ymm1;
+ vaesenclast %ymm10, %ymm2, %ymm2;
+ vaesenclast %ymm11, %ymm3, %ymm3;
+ vpxor %ymm12, %ymm13, %ymm12;
+ vpxor (16 * 16)(%rsp), %ymm13, %ymm8;
+ vpxor (18 * 16)(%rsp), %ymm13, %ymm9;
+ vpxor %ymm14, %ymm13, %ymm13;
+ vaesenclast %ymm12, %ymm4, %ymm4;
+ vaesenclast %ymm8, %ymm5, %ymm5;
+ vaesenclast %ymm9, %ymm6, %ymm6;
+ vaesenclast %ymm13, %ymm7, %ymm7;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Locb_aligned_blk16;
+
+ .align 8
+ .Locb_aligned_blk16_dec:
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ cmpl $12, %r9d;
+ jb .Locb_aligned_blk16_dec_last;
+ vbroadcasti128 (10 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ jz .Locb_aligned_blk16_dec_last;
+ vbroadcasti128 (12 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm13;
+ VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+
+ /* Last round and output handling. */
+ .Locb_aligned_blk16_dec_last:
+ vmovdqa (14 * 16)(%rsp), %ymm13;
+ vpxor %ymm8, %ymm13, %ymm8;
+ vpxor %ymm9, %ymm13, %ymm9;
+ vpxor %ymm10, %ymm13, %ymm10;
+ vpxor %ymm11, %ymm13, %ymm11;
+ vaesdeclast %ymm8, %ymm0, %ymm0;
+ vaesdeclast %ymm9, %ymm1, %ymm1;
+ vaesdeclast %ymm10, %ymm2, %ymm2;
+ vaesdeclast %ymm11, %ymm3, %ymm3;
+ vpxor %ymm12, %ymm13, %ymm12;
+ vpxor (16 * 16)(%rsp), %ymm13, %ymm8;
+ vpxor (18 * 16)(%rsp), %ymm13, %ymm9;
+ vpxor %ymm14, %ymm13, %ymm13;
+ vaesdeclast %ymm12, %ymm4, %ymm4;
+ vaesdeclast %ymm8, %ymm5, %ymm5;
+ vaesdeclast %ymm9, %ymm6, %ymm6;
+ vaesdeclast %ymm13, %ymm7, %ymm7;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ vmovdqu %ymm4, (8 * 16)(%rdx);
+ vmovdqu %ymm5, (10 * 16)(%rdx);
+ vmovdqu %ymm6, (12 * 16)(%rdx);
+ vmovdqu %ymm7, (14 * 16)(%rdx);
+ leaq (16 * 16)(%rdx), %rdx;
+
+ jmp .Locb_aligned_blk16;
+
+ /* Aligned: Process trailing eight blocks. */
+.align 8
+.Locb_aligned_blk8:
+ cmpq $8, %r8;
+ jb .Locb_aligned_done;
+
+ leaq -8(%r8), %r8;
+
+ leal 8(%esi), %esi;
+ tzcntl %esi, %eax;
+ shll $4, %eax;
+
+ vpxor (0 * 16)(%rsp), %ymm15, %ymm5;
+ vpxor (2 * 16)(%rsp), %ymm15, %ymm6;
+ vpxor (4 * 16)(%rsp), %ymm15, %ymm7;
+
+ vpxor (2 * 16)(%r14), %xmm15, %xmm13; /* offset ^ first key ^ L[2] */
+ vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
+ vinserti128 $1, %xmm14, %ymm13, %ymm14;
+
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+ vmovdqa (14 * 16)(%rsp), %ymm8;
+
+ testl %r15d, %r15d;
+ jz .Locb_aligned_blk8_dec;
+
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ cmpl $12, %r9d;
+ jb .Locb_aligned_blk8_enc_last;
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ jz .Locb_aligned_blk8_enc_last;
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+
+ /* Last round and output handling. */
+ .Locb_aligned_blk8_enc_last:
+ vpxor %ymm5, %ymm8, %ymm5;
+ vpxor %ymm6, %ymm8, %ymm6;
+ vpxor %ymm7, %ymm8, %ymm7;
+ vpxor %ymm14, %ymm8, %ymm4;
+ vaesenclast %ymm5, %ymm0, %ymm0;
+ vaesenclast %ymm6, %ymm1, %ymm1;
+ vaesenclast %ymm7, %ymm2, %ymm2;
+ vaesenclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ jmp .Locb_aligned_done;
+
+ .align 8
+ .Locb_aligned_blk8_dec:
+ /* AES rounds */
+ vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ cmpl $12, %r9d;
+ jb .Locb_aligned_blk8_dec_last;
+ vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ jz .Locb_aligned_blk8_dec_last;
+ vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+ VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+ vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+ /* Last round and output handling. */
+ .Locb_aligned_blk8_dec_last:
+ vpxor %ymm5, %ymm8, %ymm5;
+ vpxor %ymm6, %ymm8, %ymm6;
+ vpxor %ymm7, %ymm8, %ymm7;
+ vpxor %ymm14, %ymm8, %ymm4;
+ vaesdeclast %ymm5, %ymm0, %ymm0;
+ vaesdeclast %ymm6, %ymm1, %ymm1;
+ vaesdeclast %ymm7, %ymm2, %ymm2;
+ vaesdeclast %ymm4, %ymm3, %ymm3;
+ vmovdqu %ymm0, (0 * 16)(%rdx);
+ vmovdqu %ymm1, (2 * 16)(%rdx);
+ vmovdqu %ymm2, (4 * 16)(%rdx);
+ vmovdqu %ymm3, (6 * 16)(%rdx);
+ leaq (8 * 16)(%rdx), %rdx;
+
+ /*jmp .Locb_aligned_done;*/
+
+.align 8
+.Locb_aligned_done:
+ /* Burn stack. */
+ vpxor %ymm0, %ymm0, %ymm0;
+ vmovdqa %ymm0, (0 * 16)(%rsp);
+ vmovdqa %ymm0, (2 * 16)(%rsp);
+ vmovdqa %ymm0, (4 * 16)(%rsp);
+ vmovdqa %ymm0, (6 * 16)(%rsp);
+ vmovdqa %ymm0, (8 * 16)(%rsp);
+ vmovdqa %ymm0, (10 * 16)(%rsp);
+ vmovdqa %ymm0, (12 * 16)(%rsp);
+ vmovdqa %ymm0, (16 * 16)(%rsp);
+ vmovdqa %ymm0, (18 * 16)(%rsp);
+
+ /* Handle tailing 1…7 blocks in nblk-unaligned loop. */
+ movq %r8, %r10;
+ cmpq $1, %r8;
+ jnb .Locb_unaligned_blk8;
+
+.align 8
+.Ldone_ocb:
+ movq 16(%rbp), %r14; /* offset ptr. */
+ vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */
+ vmovdqu %xmm15, (%r14); /* Store offset. */
+
+ /* Handle decryption checksumming. */
+
+ testl %r15d, %r15d;
+ jnz .Locb_dec_checksum_done;
+ movq 24(%rbp), %rax; /* checksum ptr. */
+ movq (STACK_REGS_POS + 4 * 8)(%rsp), %r10;
+ movq (STACK_REGS_POS + 5 * 8)(%rsp), %r11;
+ call _gcry_vaes_avx2_ocb_checksum;
+.Locb_dec_checksum_done:
+
+ /* Burn stack. */
+ vpxor %ymm0, %ymm0, %ymm0;
+ vmovdqa %ymm0, (0 * 16)(%rsp);
+
+ vzeroall;
+
+ movq (STACK_REGS_POS + 0 * 8)(%rsp), %r12;
+ CFI_RESTORE(%r12);
+ movq (STACK_REGS_POS + 1 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r13);
+ movq (STACK_REGS_POS + 2 * 8)(%rsp), %r14;
+ CFI_RESTORE(%r14);
+ movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15;
+ CFI_RESTORE(%r15);
+
+ leave;
+ CFI_LEAVE();
+ ret
+ CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64)
+
+/**********************************************************************
+ constants
+ **********************************************************************/
+ELF(.type _gcry_vaes_consts, at object)
+_gcry_vaes_consts:
+.align 32
+.Lbige_addb_0:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+.Lbige_addb_1:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+#endif /* HAVE_GCC_INLINE_ASM_VAES */
+#endif /* __x86_64__ */
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
new file mode 100644
index 00000000..636e3e95
--- /dev/null
+++ b/cipher/rijndael-vaes.c
@@ -0,0 +1,146 @@
+/* VAES/AVX2 accelerated AES for Libgcrypt
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_VAES
+
+
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+# define ASM_FUNC_ABI
+# endif
+
+
+extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+
+
+extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *keysched,
+ unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ unsigned int nrounds) ASM_FUNC_ABI;
+
+extern void _gcry_vaes_avx2_cfb_dec_amd64 (const void *keysched,
+ unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ unsigned int nrounds) ASM_FUNC_ABI;
+
+extern void _gcry_vaes_avx2_ctr_enc_amd64 (const void *keysched,
+ unsigned char *ctr,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ unsigned int nrounds) ASM_FUNC_ABI;
+
+extern void _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
+ unsigned int blkn,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ unsigned int nrounds,
+ unsigned char *offset,
+ unsigned char *checksum,
+ unsigned char *L_table,
+ int encrypt) ASM_FUNC_ABI;
+
+
+void
+_gcry_aes_vaes_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschdec32;
+ unsigned int nrounds = ctx->rounds;
+
+ if (!ctx->decryption_prepared)
+ {
+ _gcry_aes_aesni_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ _gcry_vaes_avx2_cbc_dec_amd64 (keysched, iv, outbuf, inbuf, nblocks, nrounds);
+}
+
+void
+_gcry_aes_vaes_cfb_dec (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_vaes_avx2_cfb_dec_amd64 (keysched, iv, outbuf, inbuf, nblocks, nrounds);
+}
+
+void
+_gcry_aes_vaes_ctr_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+ unsigned char *outbuf, const unsigned char *inbuf,
+ size_t nblocks)
+{
+ const void *keysched = ctx->keyschenc32;
+ unsigned int nrounds = ctx->rounds;
+
+ _gcry_vaes_avx2_ctr_enc_amd64 (keysched, iv, outbuf, inbuf, nblocks, nrounds);
+}
+
+size_t
+_gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned int nrounds = ctx->rounds;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ _gcry_aes_aesni_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ c->u_mode.ocb.data_nblocks = blkn + nblocks;
+
+ _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf, inbuf,
+ nblocks, nrounds, c->u_iv.iv, c->u_ctr.ctr,
+ c->u_mode.ocb.L[0], encrypt);
+
+ return 0;
+}
+
+#endif /* USE_VAES */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index fe137327..dd03bd6a 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -102,6 +102,23 @@ extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
size_t nblocks, int encrypt);
#endif
+#ifdef USE_VAES
+/* VAES (AMD64) accelerated implementation of AES */
+
+extern void _gcry_aes_vaes_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vaes_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+#endif
+
#ifdef USE_SSSE3
/* SSSE3 (AMD64) vector permutation implementation of AES */
extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key);
@@ -480,6 +497,17 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+
+#ifdef USE_VAES
+ if ((hwfeatures & HWF_INTEL_VAES) && (hwfeatures & HWF_INTEL_AVX2))
+ {
+ /* Setup VAES bulk encryption routines. */
+ bulk_ops->cfb_dec = _gcry_aes_vaes_cfb_dec;
+ bulk_ops->cbc_dec = _gcry_aes_vaes_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
+ }
+#endif
}
#endif
#ifdef USE_PADLOCK
@@ -1644,7 +1672,11 @@ selftest_basic_256 (void)
static const char*
selftest_ctr_128 (void)
{
+#ifdef USE_VAES
+ const int nblocks = 16+1;
+#else
const int nblocks = 8+1;
+#endif
const int blocksize = BLOCKSIZE;
const int context_size = sizeof(RIJNDAEL_context);
@@ -1658,7 +1690,11 @@ selftest_ctr_128 (void)
static const char*
selftest_cbc_128 (void)
{
+#ifdef USE_VAES
+ const int nblocks = 16+2;
+#else
const int nblocks = 8+2;
+#endif
const int blocksize = BLOCKSIZE;
const int context_size = sizeof(RIJNDAEL_context);
@@ -1672,7 +1708,11 @@ selftest_cbc_128 (void)
static const char*
selftest_cfb_128 (void)
{
+#ifdef USE_VAES
+ const int nblocks = 16+2;
+#else
const int nblocks = 8+2;
+#endif
const int blocksize = BLOCKSIZE;
const int context_size = sizeof(RIJNDAEL_context);
diff --git a/configure.ac b/configure.ac
index f74056f2..6e38c27c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2563,6 +2563,10 @@ if test "$found" = "1" ; then
# Build with the SSSE3 implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
+
+ # Build with the VAES/AVX2 implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-vaes.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-vaes-avx2-amd64.lo"
;;
arm*-*-*)
# Build with the assembly implementation
--
2.27.0
More information about the Gcrypt-devel
mailing list