[PATCH 1/4] rijndael-vaes-avx2-amd64: acceleration for OCB auth

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Jul 10 20:07:00 CEST 2023


* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ocb_crypt_amd64): Add authentication mode support.
* cipher/rijndael-vaes.c (_gcry_vaes_avx2_ocb_crypt_amd64): Change
to return 'size_t' value.
(_gcry_aes_vaes_ocb_auth): New.
* cipher/rijndael.c (_gcry_aes_vaes_ocb_auth): New.
(do_setkey) [USE_VAES]: Add setup for 'bulk_ops->ocb_auth'.
--

Benchmark on AMD Ryzen 9 7900X (zen4):

Before:

 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       OCB auth |     0.071 ns/B     13470 MiB/s     0.333 c/B      4700

After (~2.0x faster):

 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       OCB auth |     0.034 ns/B     27946 MiB/s     0.160 c/B      4700
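
The speedup matches the cycle counts: 0.333 c/B / 0.160 c/B ≈ 2.08, and the
cycle figures follow from the timings at the reported clock,
e.g. 0.034 ns/B * 4.7 GHz ≈ 0.160 c/B.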

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/rijndael-vaes-avx2-amd64.S | 300 ++++++++++++++++++++++++++++--
 cipher/rijndael-vaes.c            |  48 +++--
 cipher/rijndael.c                 |   4 +
 3 files changed, 323 insertions(+), 29 deletions(-)
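
The former boolean 'encrypt' argument of _gcry_vaes_avx2_ocb_crypt_amd64 is
now an operation selector, dispatched in the assembly with
"cmpl $1, %r15d; jb <decrypt>; ja <auth>", and the routine now returns a
size_t (currently always 0, hence the added "xorl %eax, %eax") which the C
wrappers pass through.  A minimal C sketch of that convention; the constant
names here are illustrative only and do not appear in the tree:

  /* Value passed as the last argument of _gcry_vaes_avx2_ocb_crypt_amd64. */
  enum ocb_oper_mode
    {
      OCB_OPER_DECRYPT = 0,  /* former encrypt=0 path */
      OCB_OPER_ENCRYPT = 1,  /* former encrypt=1 path */
      OCB_OPER_AUTH    = 2   /* new: auth-only pass; outbuf_arg is unused */
    };

Accordingly, _gcry_aes_vaes_ocb_auth below calls the assembly routine with
mode 2 and a NULL output buffer.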

diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index 843ad9cf..fd012982 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -1402,7 +1402,7 @@ _gcry_vaes_avx2_ctr32le_enc_amd64:
 ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)
 
 /**********************************************************************
-  OCB-mode encryption/decryption
+  OCB-mode encryption/decryption/authentication
  **********************************************************************/
 ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function)
 .globl _gcry_vaes_avx2_ocb_crypt_amd64
@@ -1418,7 +1418,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	 *	16(%rbp): offset
 	 *	24(%rbp): checksum
 	 *	32(%rbp): L-array
-	 *	40(%rbp): encrypt (%r15d)
+	 *	40(%rbp): decrypt/encrypt/auth (%r15d)
 	 */
 	CFI_STARTPROC();
 
@@ -1427,7 +1427,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 #define OFFSET_PTR_Q   16(%rbp)
 #define CHECKSUM_PTR_Q 24(%rbp)
 #define L_ARRAY_PTR_L  32(%rbp)
-#define ENCRYPT_FLAG_L 40(%rbp)
+#define OPER_MODE_L    40(%rbp)
 
 	pushq %rbp;
 	CFI_PUSH(%rbp);
@@ -1448,7 +1448,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	movq %rbx, (STACK_REGS_POS + 4 * 8)(%rsp);
 	CFI_REG_ON_STACK(rbx, STACK_REGS_POS + 4 * 8);
 
-	movl ENCRYPT_FLAG_L, %r15d; /* encrypt-flag. */
+	movl OPER_MODE_L, %r15d; /* decrypt/encrypt/auth-mode. */
 	movq OFFSET_PTR_Q, %r14; /* offset ptr. */
 	movq CHECKSUM_PTR_Q, %rbx; /* checksum ptr. */
 
@@ -1531,8 +1531,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	vinserti128 $1, %xmm10, %ymm9, %ymm7;
 	vinserti128 $1, %xmm15, %ymm11, %ymm8;
 
-	testl %r15d, %r15d;
-	jz .Locb_unaligned_blk8_dec;
+	cmpl $1, %r15d;
+	jb .Locb_unaligned_blk8_dec;
+	ja .Locb_unaligned_blk8_auth;
 		vmovdqu (0 * 16)(%rcx), %ymm0;
 		vmovdqu (2 * 16)(%rcx), %ymm1;
 		vmovdqu (4 * 16)(%rcx), %ymm2;
@@ -1598,6 +1599,59 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 		jmp .Locb_unaligned_blk8;
 
+	.align 8
+	.Locb_unaligned_blk8_auth:
+		vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+		vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+		vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+		vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+		leaq (8 * 16)(%rcx), %rcx;
+
+		/* AES rounds */
+		vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Locb_unaligned_blk8_auth_last;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Locb_unaligned_blk8_auth_last;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+		/* Last round and output handling. */
+	.Locb_unaligned_blk8_auth_last:
+		vaesenclast %ymm4, %ymm0, %ymm0;
+		vaesenclast %ymm4, %ymm1, %ymm1;
+		vaesenclast %ymm4, %ymm2, %ymm2;
+		vaesenclast %ymm4, %ymm3, %ymm3;
+		vpxor %ymm0, %ymm14, %ymm14;
+		vpxor %ymm1, %ymm13, %ymm13;
+		vpxor %ymm2, %ymm14, %ymm14;
+		vpxor %ymm3, %ymm13, %ymm13;
+
+		jmp .Locb_unaligned_blk8;
+
 	.align 8
 	.Locb_unaligned_blk8_dec:
 		vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
@@ -1690,8 +1744,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	vpxor (%r14, %rax), %xmm7, %xmm15;
 	vinserti128 $1, %xmm15, %ymm7, %ymm6;
 
-	testl %r15d, %r15d;
-	jz .Locb_unaligned_blk4_dec;
+	cmpl $1, %r15d;
+	jb .Locb_unaligned_blk4_dec;
+	ja .Locb_unaligned_blk4_auth;
 		vmovdqu (0 * 16)(%rcx), %ymm0;
 		vmovdqu (2 * 16)(%rcx), %ymm1;
 		leaq (4 * 16)(%rcx), %rcx;
@@ -1744,6 +1799,53 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 		jmp .Locb_unaligned_blk1;
 
+	.align 8
+	.Locb_unaligned_blk4_auth:
+		vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+		vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+		leaq (4 * 16)(%rcx), %rcx;
+
+		/* AES rounds */
+		vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Locb_unaligned_blk4_auth_last;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Locb_unaligned_blk4_auth_last;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESENC2(%ymm4, %ymm0, %ymm1);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+		/* Last round and output handling. */
+	.Locb_unaligned_blk4_auth_last:
+		vaesenclast %ymm4, %ymm0, %ymm0;
+		vaesenclast %ymm4, %ymm1, %ymm1;
+		vpxor %ymm0, %ymm14, %ymm14;
+		vpxor %ymm1, %ymm13, %ymm13;
+
+		jmp .Locb_unaligned_blk1;
+
 	.align 8
 	.Locb_unaligned_blk4_dec:
 		vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
@@ -1808,8 +1910,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	shll $4, %r11d;
 	vpxor (%r14, %r11), %xmm15, %xmm15;
 
-	testl %r15d, %r15d;
-	jz .Locb_unaligned_blk1_dec;
+	cmpl $1, %r15d;
+	jb .Locb_unaligned_blk1_dec;
+	ja .Locb_unaligned_blk1_auth;
 		vmovdqu (%rcx), %xmm0;
 		vpxor %ymm0, %ymm14, %ymm14;
 		vpxor %xmm15, %xmm0, %xmm0;
@@ -1842,6 +1945,39 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 		jmp .Locb_unaligned_blk1;
 
+	.align 8
+	.Locb_unaligned_blk1_auth:
+		vpxor (%rcx), %xmm15, %xmm0;
+		leaq 16(%rcx), %rcx;
+
+		/* AES rounds. */
+		vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+		vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (10 * 16)(%rdi), %xmm1;
+		cmpl $12, %r9d;
+		jb .Locb_unaligned_blk1_auth_last;
+		vaesenc %xmm1, %xmm0, %xmm0;
+		vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (12 * 16)(%rdi), %xmm1;
+		jz .Locb_unaligned_blk1_auth_last;
+		vaesenc %xmm1, %xmm0, %xmm0;
+		vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+		vmovdqa (14 * 16)(%rdi), %xmm1;
+
+		/* Last round and output handling. */
+	.Locb_unaligned_blk1_auth_last:
+		vaesenclast %xmm1, %xmm0, %xmm0;
+		vpxor %ymm0, %ymm14, %ymm14;
+
+		jmp .Locb_unaligned_blk1;
+
 	.align 8
 	.Locb_unaligned_blk1_dec:
 		vpxor (%rcx), %xmm15, %xmm0;
@@ -1961,8 +2097,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
 	vinserti128 $1, %xmm14, %ymm13, %ymm14;
 
-	testl %r15d, %r15d;
-	jz .Locb_aligned_blk16_dec;
+	cmpl $1, %r15d;
+	jb .Locb_aligned_blk16_dec;
+	ja .Locb_aligned_blk16_auth;
 		vmovdqu (0 * 16)(%rcx), %ymm0;
 		vmovdqu (2 * 16)(%rcx), %ymm1;
 		vmovdqu (4 * 16)(%rcx), %ymm2;
@@ -2057,6 +2194,81 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 		jmp .Locb_aligned_blk16;
 
+	.align 8
+	.Locb_aligned_blk16_auth:
+		vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+		vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+		vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+		vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+		vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+		vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+		vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+		vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+		vmovdqa %ymm13, (16 * 16)(%rsp);
+		vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+		vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+		vmovdqa %ymm13, (18 * 16)(%rsp);
+
+		leaq (16 * 16)(%rcx), %rcx;
+
+		vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+		/* AES rounds */
+		vbroadcasti128 (1 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm13;
+		cmpl $12, %r9d;
+		jb .Locb_aligned_blk16_auth_last;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm13;
+		jz .Locb_aligned_blk16_auth_last;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm13;
+		VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm13;
+
+		/* Last round and output handling. */
+	.Locb_aligned_blk16_auth_last:
+		vaesenclast %ymm13, %ymm0, %ymm0;
+		vaesenclast %ymm13, %ymm1, %ymm1;
+		vaesenclast %ymm13, %ymm2, %ymm2;
+		vaesenclast %ymm13, %ymm3, %ymm3;
+		vaesenclast %ymm13, %ymm4, %ymm4;
+		vaesenclast %ymm13, %ymm5, %ymm5;
+		vaesenclast %ymm13, %ymm6, %ymm6;
+		vaesenclast %ymm13, %ymm7, %ymm7;
+
+		vpxor %ymm1, %ymm0, %ymm0;
+		vpxor %ymm3, %ymm2, %ymm2;
+		vpxor %ymm5, %ymm4, %ymm4;
+		vpxor %ymm7, %ymm6, %ymm6;
+		vpxor %ymm2, %ymm0, %ymm0;
+		vpxor %ymm6, %ymm4, %ymm4;
+		vpxor %ymm4, %ymm0, %ymm0;
+		vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+		vmovdqa %ymm0, (20 * 16)(%rsp);
+
+		jmp .Locb_aligned_blk16;
+
 	.align 8
 	.Locb_aligned_blk16_dec:
 		vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
@@ -2169,8 +2381,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
 	vinserti128 $1, %xmm14, %ymm13, %ymm14;
 
-	testl %r15d, %r15d;
-	jz .Locb_aligned_blk8_dec;
+	cmpl $1, %r15d;
+	jb .Locb_aligned_blk8_dec;
+	ja .Locb_aligned_blk8_auth;
 		vmovdqu (0 * 16)(%rcx), %ymm0;
 		vmovdqu (2 * 16)(%rcx), %ymm1;
 		vmovdqu (4 * 16)(%rcx), %ymm2;
@@ -2240,6 +2453,63 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 		jmp .Locb_aligned_done;
 
+	.align 8
+	.Locb_aligned_blk8_auth:
+		vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+		vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+		vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+		vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+		leaq (8 * 16)(%rcx), %rcx;
+
+		vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+		/* AES rounds */
+		vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+		cmpl $12, %r9d;
+		jb .Locb_aligned_blk8_auth_last;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+		jz .Locb_aligned_blk8_auth_last;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+		VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+		vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+
+		/* Last round and output handling. */
+	.Locb_aligned_blk8_auth_last:
+		vaesenclast %ymm4, %ymm0, %ymm0;
+		vaesenclast %ymm4, %ymm1, %ymm1;
+		vaesenclast %ymm4, %ymm2, %ymm2;
+		vaesenclast %ymm4, %ymm3, %ymm3;
+
+		vpxor %ymm1, %ymm0, %ymm0;
+		vpxor %ymm3, %ymm2, %ymm2;
+		vpxor %ymm2, %ymm0, %ymm0;
+		vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+		vmovdqa %ymm0, (20 * 16)(%rsp);
+
+		jmp .Locb_aligned_done;
+
 	.align 8
 	.Locb_aligned_blk8_dec:
 		vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
@@ -2357,6 +2627,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 	movq (STACK_REGS_POS + 4 * 8)(%rsp), %rbx;
 	CFI_RESTORE(%rbx);
 
+	xorl %eax, %eax;
+
 	leave;
 	CFI_LEAVE();
 	ret_spec_stop
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index 978c86da..ce9e18e7 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -40,7 +40,7 @@
 # endif
 
 
-extern void _gcry_aes_aesni_prepare_decryption(RIJNDAEL_context *ctx);
+extern void _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx);
 
 
 extern void _gcry_vaes_avx2_cbc_dec_amd64 (const void *keysched,
@@ -72,16 +72,16 @@ extern void _gcry_vaes_avx2_ctr32le_enc_amd64 (const void *keysched,
 					       unsigned int nrounds)
 						ASM_FUNC_ABI;
 
-extern void _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
-					     unsigned int blkn,
-					     void *outbuf_arg,
-					     const void *inbuf_arg,
-					     size_t nblocks,
-					     unsigned int nrounds,
-					     unsigned char *offset,
-					     unsigned char *checksum,
-					     unsigned char *L_table,
-					     int encrypt) ASM_FUNC_ABI;
+extern size_t _gcry_vaes_avx2_ocb_crypt_amd64 (const void *keysched,
+					       unsigned int blkn,
+					       void *outbuf_arg,
+					       const void *inbuf_arg,
+					       size_t nblocks,
+					       unsigned int nrounds,
+					       unsigned char *offset,
+					       unsigned char *checksum,
+					       unsigned char *L_table,
+					       int encrypt) ASM_FUNC_ABI;
 
 extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
 					     unsigned char *tweak,
@@ -193,11 +193,29 @@ _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
   c->u_mode.ocb.data_nblocks = blkn + nblocks;
 
-  _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf, inbuf,
-				   nblocks, nrounds, c->u_iv.iv, c->u_ctr.ctr,
-				   c->u_mode.ocb.L[0], encrypt);
+  return _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, outbuf,
+					  inbuf, nblocks, nrounds, c->u_iv.iv,
+					  c->u_ctr.ctr, c->u_mode.ocb.L[0],
+					  encrypt);
+}
+
+size_t
+_gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c, const void *inbuf_arg,
+			 size_t nblocks)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const void *keysched = ctx->keyschenc32;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int nrounds = ctx->rounds;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+  c->u_mode.ocb.aad_nblocks = blkn + nblocks;
 
-  return 0;
+  return _gcry_vaes_avx2_ocb_crypt_amd64 (keysched, (unsigned int)blkn, NULL,
+					  inbuf, nblocks, nrounds,
+					  c->u_mode.ocb.aad_offset,
+					  c->u_mode.ocb.aad_sum,
+					  c->u_mode.ocb.L[0], 2);
 }
 
 void
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index b49a0642..56acb199 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -125,6 +125,9 @@ extern void _gcry_aes_vaes_ctr32le_enc (void *context, unsigned char *ctr,
 extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 					const void *inbuf_arg, size_t nblocks,
 					int encrypt);
+extern size_t _gcry_aes_vaes_ocb_auth (gcry_cipher_hd_t c,
+				       const void *inbuf_arg,
+				       size_t nblocks);
 extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
 				      void *outbuf_arg, const void *inbuf_arg,
 				      size_t nblocks, int encrypt);
@@ -562,6 +565,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
 	  bulk_ops->ctr_enc = _gcry_aes_vaes_ctr_enc;
 	  bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
 	  bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
+	  bulk_ops->ocb_auth = _gcry_aes_vaes_ocb_auth;
 	  bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
 	  bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
 	}
-- 
2.39.2