From jussi.kivilinna at iki.fi Sun Feb 19 09:49:08 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 19 Feb 2023 10:49:08 +0200 Subject: [PATCH 3/5] aria-avx: small optimization for aria_ark_8way In-Reply-To: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> Message-ID: <20230219084910.1302701-3-jussi.kivilinna@iki.fi> * cipher/aria-aesni-avx-amd64.S (aria_ark_8way): Use 'vmovd' for loading key material and 'vpshufb' for broadcasting from byte locations 3, 2, 1 and 0. -- Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): Before (GFNI/AVX): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.516 ns/B 1847 MiB/s 2.43 c/B 4700 ECB dec | 0.519 ns/B 1839 MiB/s 2.44 c/B 4700 CTR enc | 0.517 ns/B 1846 MiB/s 2.43 c/B 4700 CTR dec | 0.518 ns/B 1843 MiB/s 2.43 c/B 4700 After (GFNI/AVX, ~5% faster): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.490 ns/B 1947 MiB/s 2.30 c/B 4700 ECB dec | 0.490 ns/B 1946 MiB/s 2.30 c/B 4700 CTR enc | 0.493 ns/B 1935 MiB/s 2.32 c/B 4700 CTR dec | 0.493 ns/B 1934 MiB/s 2.32 c/B 4700 === Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off): Before (GFNI/AVX): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.967 ns/B 986.6 MiB/s 2.89 c/B 2992 ECB dec | 0.966 ns/B 987.1 MiB/s 2.89 c/B 2992 CTR enc | 0.972 ns/B 980.8 MiB/s 2.91 c/B 2993 CTR dec | 0.971 ns/B 982.5 MiB/s 2.90 c/B 2993 After (GFNI/AVX, ~6% faster): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.908 ns/B 1050 MiB/s 2.72 c/B 2992 ECB dec | 0.903 ns/B 1056 MiB/s 2.70 c/B 2992 CTR enc | 0.913 ns/B 1045 MiB/s 2.73 c/B 2992 CTR dec | 0.910 ns/B 1048 MiB/s 2.72 c/B 2992 === Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off): Before (AESNI/AVX): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.921 ns/B 1035 MiB/s 3.50 c/B 3800 ECB dec | 0.922 ns/B 1034 MiB/s 3.50 c/B 3800 CTR enc | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 CTR dec | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 After (AESNI/AVX, ~6% faster) ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.862 ns/B 1106 MiB/s 3.28 c/B 3800 ECB dec | 0.862 ns/B 1106 MiB/s 3.28 c/B 3800 CTR enc | 0.865 ns/B 1102 MiB/s 3.29 c/B 3800 CTR dec | 0.865 ns/B 1103 MiB/s 3.29 c/B 3800 === Benchmark on AMD EPYC 7642 (zen2): Before (AESNI/AVX): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.22 ns/B 784.5 MiB/s 4.01 c/B 3298 ECB dec | 1.22 ns/B 784.8 MiB/s 4.00 c/B 3292 CTR enc | 1.22 ns/B 780.1 MiB/s 4.03 c/B 3299 CTR dec | 1.22 ns/B 779.1 MiB/s 4.04 c/B 3299 After (AESNI/AVX, ~13% faster): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.07 ns/B 888.3 MiB/s 3.54 c/B 3299 ECB dec | 1.08 ns/B 885.3 MiB/s 3.55 c/B 3299 CTR enc | 1.07 ns/B 888.7 MiB/s 3.54 c/B 3298 CTR dec | 1.07 ns/B 887.4 MiB/s 3.55 c/B 3299 === Benchmark on Intel Core i5-6500 (skylake): Before (AESNI/AVX): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.24 ns/B 766.6 MiB/s 4.48 c/B 3598 ECB dec | 1.25 ns/B 764.9 MiB/s 4.49 c/B 3598 CTR enc | 1.25 ns/B 761.7 MiB/s 4.50 c/B 3598 CTR dec | 1.25 ns/B 761.6 MiB/s 4.51 c/B 3598 After (AESNI/AVX, ~1% faster): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.22 ns/B 780.0 MiB/s 4.40 c/B 3598 ECB dec | 1.22 ns/B 779.6 MiB/s 4.40 c/B 3598 CTR enc | 1.23 ns/B 776.6 MiB/s 4.42 c/B 3598 CTR dec | 1.23 ns/B 776.6 MiB/s 4.42 c/B 3598 === Benchmark on Intel Core i5-2450M 
(sandy-bridge, turbo-freq off): Before (AESNI/AVX): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.11 ns/B 452.7 MiB/s 5.25 c/B 2494 ECB dec | 2.10 ns/B 454.5 MiB/s 5.23 c/B 2494 CTR enc | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 CTR dec | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 After (AESNI/AVX, ~4% faster) ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.00 ns/B 475.8 MiB/s 5.00 c/B 2494 ECB dec | 2.00 ns/B 476.4 MiB/s 4.99 c/B 2494 CTR enc | 2.01 ns/B 474.7 MiB/s 5.01 c/B 2494 CTR dec | 2.01 ns/B 473.9 MiB/s 5.02 c/B 2494 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/aria-aesni-avx-amd64.S | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/cipher/aria-aesni-avx-amd64.S b/cipher/aria-aesni-avx-amd64.S index 7274b80e..f0c72225 100644 --- a/cipher/aria-aesni-avx-amd64.S +++ b/cipher/aria-aesni-avx-amd64.S @@ -357,27 +357,21 @@ t0, t1, t2, rk, \ idx, round) \ /* AddRoundKey */ \ - vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ - vpsrld $24, t0, t2; \ - vpshufb t1, t2, t2; \ + vmovd ((round * 16) + idx + 0)(rk), t0; \ + vpshufb .Lthree_x16 rRIP, t0, t2; \ vpxor t2, x0, x0; \ - vpsrld $16, t0, t2; \ - vpshufb t1, t2, t2; \ + vpshufb .Ltwo_x16 rRIP, t0, t2; \ vpxor t2, x1, x1; \ - vpsrld $8, t0, t2; \ - vpshufb t1, t2, t2; \ + vpshufb .Lone_x16 rRIP, t0, t2; \ vpxor t2, x2, x2; \ vpshufb t1, t0, t2; \ vpxor t2, x3, x3; \ - vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ - vpsrld $24, t0, t2; \ - vpshufb t1, t2, t2; \ + vmovd ((round * 16) + idx + 4)(rk), t0; \ + vpshufb .Lthree_x16 rRIP, t0, t2; \ vpxor t2, x4, x4; \ - vpsrld $16, t0, t2; \ - vpshufb t1, t2, t2; \ + vpshufb .Ltwo_x16 rRIP, t0, t2; \ vpxor t2, x5, x5; \ - vpsrld $8, t0, t2; \ - vpshufb t1, t2, t2; \ + vpshufb .Lone_x16 rRIP, t0, t2; \ vpxor t2, x6, x6; \ vpshufb t1, t0, t2; \ vpxor t2, x7, x7; @@ -858,6 +852,13 @@ SECTION_RODATA .Ltf_hi__x2__and__fwd_aff: .octa 0x3F893781E95FE1576CDA64D2BA0CB204 +.Lthree_x16: + .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +.Ltwo_x16: + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +.Lone_x16: + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .Lbige_addb_1: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 .Lbige_addb_2: -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 19 09:49:07 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 19 Feb 2023 10:49:07 +0200 Subject: [PATCH 2/5] aria: add x86_64 GFNI/AVX512 accelerated implementation In-Reply-To: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> Message-ID: <20230219084910.1302701-2-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'aria-gfni-avx512-amd64.S'. * cipher/aria-gfni-avx512-amd64.S: New. * cipher/aria.c (USE_GFNI_AVX512): New. [USE_GFNI_AVX512] (MAX_PARALLEL_BLKS): New. (ARIA_context): Add 'use_gfni_avx512'. (_gcry_aria_gfni_avx512_ecb_crypt_blk64) (_gcry_aria_gfni_avx512_ctr_crypt_blk64) (aria_gfni_avx512_ecb_crypt_blk64) (aria_gfni_avx512_ctr_crypt_blk64): New. (aria_crypt_blocks) [USE_GFNI_AVX512]: Add 64 parallel block AVX512/GFNI processing. (_gcry_aria_ctr_enc) [USE_GFNI_AVX512]: Add 64 parallel block AVX512/GFNI processing. (aria_setkey): Enable GFNI/AVX512 based on HW features. * configure.ac: Add 'aria-gfni-avx512-amd64.lo'. -- This patch adds AVX512/GFNI accelerated ARIA block cipher implementation for libgcrypt. 
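The speed-up comes from letting GFNI evaluate the substitution layer directly: ARIA's S1 box is the AES S-box, i.e. an affine transform applied to the GF(2^8) multiplicative inverse, and a single vgf2p8affineinvqb computes exactly that for all 64 bytes of a ZMM register (S2 and the inverse boxes only swap in the other bit-matrices defined in the new file). As a rough, standalone C-intrinsics illustration of the S1 step only - not the patch's code, which is pure assembly - using the same matrix and constant values that the file builds with BM8X8()/BV8() as .Ltf_aff_bitmatrix and tf_aff_const (the function name is illustrative):

#include <immintrin.h>

/* S1 applied to 64 bytes at once: per byte, S1(x) = A * inv(x) ^ 0x63.
 * Requires GFNI + AVX512F (compile with e.g. -mgfni -mavx512f). */
static inline __m512i
aria_s1_64bytes (__m512i x)
{
  const __m512i aes_affine =
    _mm512_set1_epi64 ((long long)0xF1E3C78F1F3E7CF8ULL);

  return _mm512_gf2p8affineinv_epi64_epi8 (x, aes_affine, 0x63);
}

The 16-way S-box macro in the patch applies this same pattern across sixteen registers per call, with the other bit-matrices selecting S2 and the inverse boxes.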
This implementation is based on work by Taehee Yoo, with following notable changes: - Integration to libgcrypt, use of 'aes-common-amd64.h'. - Use round loop instead of unrolling for smaller code size and increased performance. - Use stack for temporary storage instead of external buffers. === Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): GFNI/AVX512: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.204 ns/B 4682 MiB/s 0.957 c/B 4700 ECB dec | 0.204 ns/B 4668 MiB/s 0.960 c/B 4700 CTR enc | 0.212 ns/B 4509 MiB/s 0.994 c/B 4700 CTR dec | 0.212 ns/B 4490 MiB/s 0.998 c/B 4700 === Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off): GFNI/AVX512: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.361 ns/B 2639 MiB/s 1.08 c/B 2992 ECB dec | 0.362 ns/B 2632 MiB/s 1.08 c/B 2992 CTR enc | 0.368 ns/B 2592 MiB/s 1.10 c/B 2992 CTR dec | 0.368 ns/B 2591 MiB/s 1.10 c/B 2992 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 + cipher/aria-gfni-avx512-amd64.S | 936 ++++++++++++++++++++++++++++++++ cipher/aria.c | 86 ++- configure.ac | 1 + 4 files changed, 1022 insertions(+), 2 deletions(-) create mode 100644 cipher/aria-gfni-avx512-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index a13e52e9..163c1f0f 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,6 +78,7 @@ EXTRA_libcipher_la_SOURCES = \ asm-poly1305-amd64.h \ asm-poly1305-s390x.h \ aria.c aria-aesni-avx-amd64.S aria-aesni-avx2-amd64.S \ + aria-gfni-avx512-amd64.S \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S new file mode 100644 index 00000000..849c744b --- /dev/null +++ b/cipher/aria-gfni-avx512-amd64.S @@ -0,0 +1,936 @@ +/* aria-gfni-avx512-amd64.S - GFNI/AVX512 implementation of ARIA cipher + * + * Copyright (C) 2022-2023 Taehee Yoo + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#ifdef __x86_64 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_AVX512_SUPPORT) && defined(ENABLE_GFNI_SUPPORT) + +#include "asm-common-amd64.h" + +/* struct ARIA_context: */ +#define ARIA_BLOCK_SIZE 16 +#define ARIA_MAX_RD_KEYS 17 +#define ARIA_CTX_enc_key 0 +#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS)) +#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS)) + +/* register macros */ +#define CTX %rdi + +/* helper macros */ +#define STACK_DEPTH (2 * 8 + 16 * 64 + 63) + +#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +/* asm macros */ +#define clear_vec4(v0,v1,v2,v3) \ + vpxord v0, v0, v0; \ + vpxord v1, v1, v1; \ + vpxord v2, v2, v2; \ + vpxord v3, v3, v3 + +#define clear_zmm16_zmm31() \ + clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \ + clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \ + clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \ + clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31) + +#define clear_regs() \ + kxorq %k1, %k1, %k1; \ + vzeroall; \ + clear_zmm16_zmm31() + +#define add_le128(out, in, lo_counter, hi_counter1) \ + vpaddq lo_counter, in, out; \ + vpcmpuq $1, lo_counter, out, %k1; \ + kaddb %k1, %k1, %k1; \ + vpaddq hi_counter1, out, out{%k1}; + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpandq x, mask4bit, tmp0; \ + vpandqn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxorq tmp0, x, x; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu64 d2, st0; \ + vmovdqu64 d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 a0, st0; \ + vmovdqu64 a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti64x2 .Lshufb_16x16b rRIP, a0; \ + vmovdqu64 st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu64 d3, st1; \ + vmovdqu64 st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu64 d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 b0, st0; \ + vmovdqu64 b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu64 st0, b0; \ + vmovdqu64 st1, b1; \ + /* does not 
adjust output bytes inside vectors */ + +#define debyteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu64 d2, st0; \ + vmovdqu64 d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 a0, st0; \ + vmovdqu64 a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti64x2 .Lshufb_16x16b rRIP, a0; \ + vmovdqu64 st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu64 d3, st1; \ + vmovdqu64 st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu64 d2, st0; \ + \ + transpose_4x4(c0, d0, a0, b0, d2, d3); \ + transpose_4x4(c1, d1, a1, b1, d2, d3); \ + vmovdqu64 st0, d2; \ + vmovdqu64 st1, d3; \ + \ + vmovdqu64 b0, st0; \ + vmovdqu64 b1, st1; \ + transpose_4x4(c2, d2, a2, b2, b0, b1); \ + transpose_4x4(c3, d3, a3, b3, b0, b1); \ + vmovdqu64 st0, b0; \ + vmovdqu64 st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio) \ + vmovdqu64 (0 * 64)(rio), x0; \ + vmovdqu64 (1 * 64)(rio), x1; \ + vmovdqu64 (2 * 64)(rio), x2; \ + vmovdqu64 (3 * 64)(rio), x3; \ + vmovdqu64 (4 * 64)(rio), x4; \ + vmovdqu64 (5 * 64)(rio), x5; \ + vmovdqu64 (6 * 64)(rio), x6; \ + vmovdqu64 (7 * 64)(rio), x7; \ + vmovdqu64 (8 * 64)(rio), y0; \ + vmovdqu64 (9 * 64)(rio), y1; \ + vmovdqu64 (10 * 64)(rio), y2; \ + vmovdqu64 (11 * 64)(rio), y3; \ + vmovdqu64 (12 * 64)(rio), y4; \ + vmovdqu64 (13 * 64)(rio), y5; \ + vmovdqu64 (14 * 64)(rio), y6; \ + vmovdqu64 (15 * 64)(rio), y7; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + (mem_ab), (mem_cd)); \ + \ + vmovdqu64 x0, 0 * 64(mem_ab); \ + vmovdqu64 x1, 1 * 64(mem_ab); \ + vmovdqu64 x2, 2 * 64(mem_ab); \ + vmovdqu64 x3, 3 * 64(mem_ab); \ + vmovdqu64 x4, 4 * 64(mem_ab); \ + vmovdqu64 x5, 5 * 64(mem_ab); \ + vmovdqu64 x6, 6 * 64(mem_ab); \ + vmovdqu64 x7, 7 * 64(mem_ab); \ + vmovdqu64 y0, 0 * 64(mem_cd); \ + vmovdqu64 y1, 1 * 64(mem_cd); \ + vmovdqu64 y2, 2 * 64(mem_cd); \ + vmovdqu64 y3, 3 * 64(mem_cd); \ + vmovdqu64 y4, 4 * 64(mem_cd); \ + vmovdqu64 y5, 5 * 64(mem_cd); \ + vmovdqu64 y6, 6 * 64(mem_cd); \ + vmovdqu64 y7, 7 * 64(mem_cd); + +#define write_output(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem) \ + vmovdqu64 x0, 0 * 64(mem); \ + vmovdqu64 x1, 1 * 64(mem); \ + vmovdqu64 x2, 2 * 64(mem); \ + vmovdqu64 x3, 3 * 64(mem); \ + vmovdqu64 x4, 4 * 64(mem); \ + vmovdqu64 x5, 5 * 64(mem); \ + vmovdqu64 x6, 6 * 64(mem); \ + vmovdqu64 x7, 7 * 64(mem); \ + vmovdqu64 y0, 8 * 64(mem); \ + vmovdqu64 y1, 9 * 64(mem); \ + vmovdqu64 y2, 10 * 64(mem); \ + vmovdqu64 y3, 11 * 64(mem); \ + vmovdqu64 y4, 12 * 64(mem); \ + vmovdqu64 y5, 13 * 64(mem); \ + vmovdqu64 y6, 14 * 64(mem); \ + vmovdqu64 y7, 15 * 64(mem); \ + +#define 
aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \ + vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \ + vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \ + vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \ + vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \ + vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \ + vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \ + vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp); + +#define aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \ + vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \ + vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \ + vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \ + vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \ + vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \ + vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \ + vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7; + +#define aria_ark_16way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + t0, rk, round) \ + /* AddRoundKey */ \ + vpbroadcastb ((round * 16) + 3)(rk), t0; \ + vpxorq t0, x0, x0; \ + vpbroadcastb ((round * 16) + 2)(rk), t0; \ + vpxorq t0, x1, x1; \ + vpbroadcastb ((round * 16) + 1)(rk), t0; \ + vpxorq t0, x2, x2; \ + vpbroadcastb ((round * 16) + 0)(rk), t0; \ + vpxorq t0, x3, x3; \ + vpbroadcastb ((round * 16) + 7)(rk), t0; \ + vpxorq t0, x4, x4; \ + vpbroadcastb ((round * 16) + 6)(rk), t0; \ + vpxorq t0, x5, x5; \ + vpbroadcastb ((round * 16) + 5)(rk), t0; \ + vpxorq t0, x6, x6; \ + vpbroadcastb ((round * 16) + 4)(rk), t0; \ + vpxorq t0, x7, x7; \ + vpbroadcastb ((round * 16) + 11)(rk), t0; \ + vpxorq t0, y0, y0; \ + vpbroadcastb ((round * 16) + 10)(rk), t0; \ + vpxorq t0, y1, y1; \ + vpbroadcastb ((round * 16) + 9)(rk), t0; \ + vpxorq t0, y2, y2; \ + vpbroadcastb ((round * 16) + 8)(rk), t0; \ + vpxorq t0, y3, y3; \ + vpbroadcastb ((round * 16) + 15)(rk), t0; \ + vpxorq t0, y4, y4; \ + vpbroadcastb ((round * 16) + 14)(rk), t0; \ + vpxorq t0, y5, y5; \ + vpbroadcastb ((round * 16) + 13)(rk), t0; \ + vpxorq t0, y6, y6; \ + vpbroadcastb ((round * 16) + 12)(rk), t0; \ + vpxorq t0, y7, y7; + +#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1; \ + vpbroadcastq .Ltf_id_bitmatrix rRIP, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7; + +#define aria_sbox_16way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1; \ + vpbroadcastq .Ltf_id_bitmatrix rRIP, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb 
$(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7; \ + vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \ + vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \ + vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \ + vgf2p8affineinvqb $0, t2, y2, y2; \ + vgf2p8affineinvqb $0, t2, y6, y6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \ + vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \ + vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \ + vgf2p8affineinvqb $0, t2, y3, y3; \ + vgf2p8affineinvqb $0, t2, y7, y7; + + +#define aria_diff_m(x0, x1, x2, x3, \ + t0, t1, t2, t3) \ + /* T = rotr32(X, 8); */ \ + /* X ^= T */ \ + vpxorq x0, x3, t0; \ + vpxorq x1, x0, t1; \ + vpxorq x2, x1, t2; \ + vpxorq x3, x2, t3; \ + /* X = T ^ rotr(X, 16); */ \ + vpxorq t2, x0, x0; \ + vpxorq x1, t3, t3; \ + vpxorq t0, x2, x2; \ + vpxorq t1, x3, x1; \ + vmovdqu64 t3, x3; + +#define aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7) \ + /* t1 ^= t2; */ \ + vpxorq y0, x4, x4; \ + vpxorq y1, x5, x5; \ + vpxorq y2, x6, x6; \ + vpxorq y3, x7, x7; \ + \ + /* t2 ^= t3; */ \ + vpxorq y4, y0, y0; \ + vpxorq y5, y1, y1; \ + vpxorq y6, y2, y2; \ + vpxorq y7, y3, y3; \ + \ + /* t0 ^= t1; */ \ + vpxorq x4, x0, x0; \ + vpxorq x5, x1, x1; \ + vpxorq x6, x2, x2; \ + vpxorq x7, x3, x3; \ + \ + /* t3 ^= t1; */ \ + vpxorq x4, y4, y4; \ + vpxorq x5, y5, y5; \ + vpxorq x6, y6, y6; \ + vpxorq x7, y7, y7; \ + \ + /* t2 ^= t0; */ \ + vpxorq x0, y0, y0; \ + vpxorq x1, y1, y1; \ + vpxorq x2, y2, y2; \ + vpxorq x3, y3, y3; \ + \ + /* t1 ^= t2; */ \ + vpxorq y0, x4, x4; \ + vpxorq y1, x5, x5; \ + vpxorq y2, x6, x6; \ + vpxorq y3, x7, x7; + +#define aria_fe_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7, \ + mem_tmp, rk, round) \ + aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7, \ + z0, rk, round); \ + \ + aria_sbox_16way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y2, y3, y0, y1, \ + y6, y7, y4, y5, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7); \ + \ + aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \ + aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \ + aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \ + aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + + +#define aria_fo_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7, \ + mem_tmp, rk, round) \ + aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7, \ + z0, rk, round); \ + \ + aria_sbox_16way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7); \ + \ + aria_diff_m(x0, x1, x2, 
x3, z0, z1, z2, z3); \ + aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \ + aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \ + aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); + +#define aria_ff_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_16way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, rk, round); \ + aria_sbox_16way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y2, y3, y0, y1, \ + y6, y7, y4, y5, \ + z0, z1, z2, z3, \ + z4, z5, z6, z7); \ + aria_ark_16way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + z0, rk, last_round); + +SECTION_RODATA +.align 64 +.Lcounter0123_lo: + .quad 0, 0 + .quad 1, 0 + .quad 2, 0 + .quad 3, 0 + +.align 32 +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + +.align 16 +.Lcounter4444_lo: + .quad 4, 0 +.Lcounter8888_lo: + .quad 8, 0 +.Lcounter16161616_lo: + .quad 16, 0 +.Lcounter1111_hi: + .quad 0, 1 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +.align 8 +/* AES affine: */ +#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) +.Ltf_aff_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) + +/* AES inverse affine: */ +#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) +.Ltf_inv_bitmatrix: + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) + +/* S2: */ +#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) +.Ltf_s2_bitmatrix: + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 1, 1, 0)) + +/* X2: */ +#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) +.Ltf_x2_bitmatrix: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) + +/* Identity matrix: */ +.Ltf_id_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) + +.text + +.align 16 +ELF(.type 
__aria_gfni_avx512_crypt_64way, at function;) +__aria_gfni_avx512_crypt_64way: + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %zmm0..%zmm15: byte-sliced blocks + */ + CFI_STARTPROC(); + + movq %rsi, %rax; + leaq 8 * 64(%rax), %r8; + + movl ARIA_CTX_rounds(CTX), %r10d; + subl $2, %r10d; + + inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, + %zmm15, %rax, %r8); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 0); + leaq 1*16(%r9), %r9; + +.align 16 +.Loop_gfni: + aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 0); + aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3, + %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 1); + leaq 2*16(%r9), %r9; + subl $2, %r10d; + jnz .Loop_gfni; + + aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0, + %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, + %zmm12, %zmm13, %zmm14, %zmm15, + %zmm24, %zmm25, %zmm26, %zmm27, + %zmm28, %zmm29, %zmm30, %zmm31, + %rax, %r9, 0, 1); + + debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6, + %zmm8, %zmm13, %zmm2, %zmm7, + %zmm11, %zmm14, %zmm1, %zmm4, + %zmm10, %zmm15, %zmm0, %zmm5, + (%rax), (%r8)); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_gfni_avx512_crypt_64way,.-__aria_gfni_avx512_crypt_64way;) + +.align 16 +.globl _gcry_aria_gfni_avx512_ecb_crypt_blk64 +ELF(.type _gcry_aria_gfni_avx512_ecb_crypt_blk64, at function;) +_gcry_aria_gfni_avx512_ecb_crypt_blk64: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: round keys + */ + CFI_STARTPROC(); + spec_stop_avx512; + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 64), %rsp; + andq $~63, %rsp; + + movq %rcx, %r9; + movq %rsi, %r11; + movq %rsp, %rsi; /* use stack for temporary store */ + + inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7, + %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14, + %zmm15, %rdx); + + call __aria_gfni_avx512_crypt_64way; + + write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14, + %zmm15, %r11); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + clear_regs(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_gfni_avx512_ecb_crypt_blk64, + .-_gcry_aria_gfni_avx512_ecb_crypt_blk64;) + +.align 16 +ELF(.type __aria_gfni_avx512_ctr_gen_keystream_64way, at function;) +__aria_gfni_avx512_ctr_gen_keystream_64way: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19; + vmovdqa64 .Lcounter0123_lo rRIP, %zmm21; + vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22; + vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23; + vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24; + vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25; + + /* load IV and byteswap */ + movq 8(%r8), %r11; + movq (%r8), %r10; + bswapq %r11; + bswapq %r10; + vbroadcasti64x2 (%r8), %zmm20; + vpshufb %zmm19, %zmm20, %zmm20; + + /* check need for handling 64-bit overflow and 
carry */ + cmpq $(0xffffffffffffffff - 64), %r11; + ja .Lload_ctr_carry; + + /* construct IVs */ + vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */ + vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */ + vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */ + vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */ + vpaddq %zmm24, %zmm0, %zmm4; /* +16... */ + vpaddq %zmm24, %zmm1, %zmm5; /* +20... */ + vpaddq %zmm24, %zmm2, %zmm6; /* +24... */ + vpaddq %zmm24, %zmm3, %zmm7; /* +28... */ + vpaddq %zmm24, %zmm4, %zmm8; /* +32... */ + vpaddq %zmm24, %zmm5, %zmm9; /* +36... */ + vpaddq %zmm24, %zmm6, %zmm10; /* +40... */ + vpaddq %zmm24, %zmm7, %zmm11; /* +44... */ + vpaddq %zmm24, %zmm8, %zmm12; /* +48... */ + vpaddq %zmm24, %zmm9, %zmm13; /* +52... */ + vpaddq %zmm24, %zmm10, %zmm14; /* +56... */ + vpaddq %zmm24, %zmm11, %zmm15; /* +60... */ + jmp .Lload_ctr_done; + +.Lload_ctr_carry: + /* construct IVs */ + add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */ + add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */ + add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */ + add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */ + add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */ + add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */ + add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */ + add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */ + add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */ + add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */ + add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */ + add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */ + add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */ + add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */ + add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */ + add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */ + +.Lload_ctr_done: + /* Byte-swap IVs and update counter. 
*/ + addq $64, %r11; + adcq $0, %r10; + vpshufb %zmm19, %zmm15, %zmm15; + vpshufb %zmm19, %zmm14, %zmm14; + vpshufb %zmm19, %zmm13, %zmm13; + vpshufb %zmm19, %zmm12, %zmm12; + vpshufb %zmm19, %zmm11, %zmm11; + vpshufb %zmm19, %zmm10, %zmm10; + vpshufb %zmm19, %zmm9, %zmm9; + vpshufb %zmm19, %zmm8, %zmm8; + bswapq %r11; + bswapq %r10; + vpshufb %zmm19, %zmm7, %zmm7; + vpshufb %zmm19, %zmm6, %zmm6; + vpshufb %zmm19, %zmm5, %zmm5; + vpshufb %zmm19, %zmm4, %zmm4; + vpshufb %zmm19, %zmm3, %zmm3; + vpshufb %zmm19, %zmm2, %zmm2; + vpshufb %zmm19, %zmm1, %zmm1; + vpshufb %zmm19, %zmm0, %zmm0; + movq %r11, 8(%r8); + movq %r10, (%r8); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_gfni_avx512_ctr_gen_keystream_64way, + .-__aria_gfni_avx512_ctr_gen_keystream_64way;) + +.align 16 +.globl _gcry_aria_gfni_avx512_ctr_crypt_blk64 +ELF(.type _gcry_aria_gfni_avx512_ctr_crypt_blk64, at function;) +_gcry_aria_gfni_avx512_ctr_crypt_blk64: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + spec_stop_avx512; + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 64), %rsp; + andq $~63, %rsp; + + movq %rcx, %r8; /* %r8: iv */ + movq %rsp, %rcx; /* %rcx: keystream */ + call __aria_gfni_avx512_ctr_gen_keystream_64way + + pushq %rsi; + movq %rdx, %r11; + movq %rcx, %rsi; + movq %rcx, %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_gfni_avx512_crypt_64way; + + popq %rsi; + vpxorq (0 * 64)(%r11), %zmm3, %zmm3; + vpxorq (1 * 64)(%r11), %zmm2, %zmm2; + vpxorq (2 * 64)(%r11), %zmm1, %zmm1; + vpxorq (3 * 64)(%r11), %zmm0, %zmm0; + vpxorq (4 * 64)(%r11), %zmm6, %zmm6; + vpxorq (5 * 64)(%r11), %zmm7, %zmm7; + vpxorq (6 * 64)(%r11), %zmm4, %zmm4; + vpxorq (7 * 64)(%r11), %zmm5, %zmm5; + vpxorq (8 * 64)(%r11), %zmm9, %zmm9; + vpxorq (9 * 64)(%r11), %zmm8, %zmm8; + vpxorq (10 * 64)(%r11), %zmm11, %zmm11; + vpxorq (11 * 64)(%r11), %zmm10, %zmm10; + vpxorq (12 * 64)(%r11), %zmm12, %zmm12; + vpxorq (13 * 64)(%r11), %zmm13, %zmm13; + vpxorq (14 * 64)(%r11), %zmm14, %zmm14; + vpxorq (15 * 64)(%r11), %zmm15, %zmm15; + write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5, + %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14, + %zmm15, %rsi); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + clear_regs(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_gfni_avx512_ctr_crypt_blk64, + .-_gcry_aria_gfni_avx512_ctr_crypt_blk64;) + +#endif /* ENABLE_AVX512_SUPPORT && ENABLE_GFNI_SUPPORT */ +#endif /* __x86_64 */ diff --git a/cipher/aria.c b/cipher/aria.c index 18952d04..9eb42a2d 100644 --- a/cipher/aria.c +++ b/cipher/aria.c @@ -80,8 +80,19 @@ # define USE_GFNI_AVX2 1 #endif +/* USE_GFNI_AVX512 inidicates whether to compile with Intel GFNI/AVX512 code. */ +#undef USE_GFNI_AVX512 +#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_GFNI_AVX512 1 +# endif +#endif + /* How many parallel blocks to handle in bulk processing functions. */ -#if defined(USE_AESNI_AVX2) +#if defined(USE_GFNI_AVX512) +# define MAX_PARALLEL_BLKS 64 +#elif defined(USE_AESNI_AVX2) # define MAX_PARALLEL_BLKS 32 #elif defined(USE_AESNI_AVX) # define MAX_PARALLEL_BLKS 16 @@ -93,7 +104,8 @@ * stack to store XMM6-XMM15 needed on Win64. 
*/ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \ + defined(USE_GFNI_AVX512) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16) @@ -132,6 +144,9 @@ typedef struct unsigned int use_aesni_avx2:1; unsigned int use_gfni_avx2:1; #endif +#ifdef USE_GFNI_AVX512 + unsigned int use_gfni_avx512:1; +#endif } ARIA_context; @@ -522,6 +537,33 @@ aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, } #endif /* USE_AESNI_AVX2 */ +#ifdef USE_GFNI_AVX512 +extern unsigned int +_gcry_aria_gfni_avx512_ecb_crypt_blk64(const void *ctx, byte *out, + const byte *in, + const void *key) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_gfni_avx512_ctr_crypt_blk64(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; + +static inline unsigned int +aria_gfni_avx512_ecb_crypt_blk64(const ARIA_context *ctx, byte *out, + const byte *in, + const u32 key[][ARIA_RD_KEY_WORDS]) +{ + return _gcry_aria_gfni_avx512_ecb_crypt_blk64(ctx, out, in, key) + + ASM_EXTRA_STACK; +} + +static inline unsigned int +aria_gfni_avx512_ctr_crypt_blk64(const ARIA_context *ctx, byte *out, + const byte *in, byte *iv) +{ + return _gcry_aria_gfni_avx512_ctr_crypt_blk64(ctx, out, in, iv) + + ASM_EXTRA_STACK; +} +#endif /* USE_AESNI_AVX2 */ + /* Prefetching for sbox tables. */ static inline void prefetch_table(const volatile byte *tab, size_t len) @@ -1024,6 +1066,26 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in, { unsigned int burn_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + unsigned int nburn = 0; + + while (num_blks >= 64) + { + nburn = aria_gfni_avx512_ecb_crypt_blk64 (ctx, out, in, key); + in += 64 * ARIA_BLOCK_SIZE; + out += 64 * ARIA_BLOCK_SIZE; + num_blks -= 64; + } + + burn_depth = nburn > burn_depth ? nburn : burn_depth; + + if (num_blks == 0) + return burn_depth; + } +#endif /* USE_AESNI_AVX2 */ + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2) { @@ -1124,6 +1186,23 @@ _gcry_aria_ctr_enc(void *context, unsigned char *ctr, const byte *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + size_t nburn = 0; + + while (nblocks >= 64) + { + nburn = aria_gfni_avx512_ctr_crypt_blk64 (ctx, outbuf, inbuf, ctr); + inbuf += 64 * ARIA_BLOCK_SIZE; + outbuf += 64 * ARIA_BLOCK_SIZE; + nblocks -= 64; + } + + burn_stack_depth = nburn > burn_stack_depth ? 
nburn : burn_stack_depth; + } +#endif /* USE_AESNI_AVX */ + #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2) { @@ -1526,6 +1605,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen, if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#ifdef USE_GFNI_AVX512 + ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512); +#endif #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); #endif diff --git a/configure.ac b/configure.ac index 4f983a58..75622e50 100644 --- a/configure.ac +++ b/configure.ac @@ -3040,6 +3040,7 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx2-amd64.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-gfni-avx512-amd64.lo" ;; esac fi -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 19 09:49:09 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 19 Feb 2023 10:49:09 +0200 Subject: [PATCH 4/5] aria-avx512: small optimization for aria_diff_m In-Reply-To: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> Message-ID: <20230219084910.1302701-4-jussi.kivilinna@iki.fi> * cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for 3-way XOR operation. --- Using vpternlogq gives small performance improvement on AMD Zen4. With Intel tiger-lake speed is the same as before. Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): Before: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.204 ns/B 4682 MiB/s 0.957 c/B 4700 ECB dec | 0.204 ns/B 4668 MiB/s 0.960 c/B 4700 CTR enc | 0.212 ns/B 4509 MiB/s 0.994 c/B 4700 CTR dec | 0.212 ns/B 4490 MiB/s 0.998 c/B 4700 After (~3% faster): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.198 ns/B 4812 MiB/s 0.932 c/B 4700 ECB dec | 0.198 ns/B 4824 MiB/s 0.929 c/B 4700 CTR enc | 0.204 ns/B 4665 MiB/s 0.961 c/B 4700 CTR dec | 0.206 ns/B 4631 MiB/s 0.968 c/B 4700 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/aria-gfni-avx512-amd64.S | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S index 849c744b..24a49a89 100644 --- a/cipher/aria-gfni-avx512-amd64.S +++ b/cipher/aria-gfni-avx512-amd64.S @@ -406,21 +406,17 @@ vgf2p8affineinvqb $0, t2, y3, y3; \ vgf2p8affineinvqb $0, t2, y7, y7; - #define aria_diff_m(x0, x1, x2, x3, \ t0, t1, t2, t3) \ /* T = rotr32(X, 8); */ \ /* X ^= T */ \ - vpxorq x0, x3, t0; \ - vpxorq x1, x0, t1; \ - vpxorq x2, x1, t2; \ - vpxorq x3, x2, t3; \ /* X = T ^ rotr(X, 16); */ \ - vpxorq t2, x0, x0; \ - vpxorq x1, t3, t3; \ - vpxorq t0, x2, x2; \ - vpxorq t1, x3, x1; \ - vmovdqu64 t3, x3; + vmovdqa64 x0, t0; \ + vmovdqa64 x3, t3; \ + vpternlogq $0x96, x2, x1, x0; \ + vpternlogq $0x96, x2, x1, x3; \ + vpternlogq $0x96, t0, t3, x2; \ + vpternlogq $0x96, t0, t3, x1; #define aria_diff_word(x0, x1, x2, x3, \ x4, x5, x6, x7, \ -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 19 09:49:10 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 19 Feb 2023 10:49:10 +0200 Subject: [PATCH 5/5] aria-avx2: add VAES accelerated implementation In-Reply-To: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> Message-ID: <20230219084910.1302701-5-jussi.kivilinna@iki.fi> * cipher/aria-aesni-avx2-amd64.S (CONFIG_AS_VAES): New. 
[CONFIG_AS_VAES]: Add VAES accelerated assembly macros and functions. * cipher/aria.c (USE_VAES_AVX2): New. (ARIA_context): Add 'use_vaes_avx2'. (_gcry_aria_vaes_avx2_ecb_crypt_blk32) (_gcry_aria_vaes_avx2_ctr_crypt_blk32) (aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): Add VAES/AVX2 code paths. (aria_setkey): Enable VAES/AVX2 implementation based on HW features. -- This patch adds VAES/AVX2 accelerated ARIA block cipher implementation. VAES instruction set extends AESNI instructions to work on all 128-bit lanes of 256-bit YMM and 512-bit ZMM vector registers, thus AES operations can be executed directly on YMM registers without needing to manually split YMM to two XMM halfs for AESNI instructions. This improves performance on CPUs that support VAES but not GFNI, like AMD Zen3. Benchmark on Ryzen 7 5800X (zen3, turbo-freq off): Before (AESNI/AVX2): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.559 ns/B 1707 MiB/s 2.12 c/B 3800 ECB dec | 0.560 ns/B 1703 MiB/s 2.13 c/B 3800 CTR enc | 0.570 ns/B 1672 MiB/s 2.17 c/B 3800 CTR dec | 0.568 ns/B 1679 MiB/s 2.16 c/B 3800 After (VAES/AVX2, ~33% faster): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.435 ns/B 2193 MiB/s 1.65 c/B 3800 ECB dec | 0.434 ns/B 2197 MiB/s 1.65 c/B 3800 CTR enc | 0.413 ns/B 2306 MiB/s 1.57 c/B 3800 CTR dec | 0.411 ns/B 2318 MiB/s 1.56 c/B 3800 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/aria-aesni-avx2-amd64.S | 368 ++++++++++++++++++++++++++++++++- cipher/aria.c | 50 ++++- 2 files changed, 409 insertions(+), 9 deletions(-) diff --git a/cipher/aria-aesni-avx2-amd64.S b/cipher/aria-aesni-avx2-amd64.S index f09a9042..f1dcdb70 100644 --- a/cipher/aria-aesni-avx2-amd64.S +++ b/cipher/aria-aesni-avx2-amd64.S @@ -31,6 +31,9 @@ #ifdef ENABLE_GFNI_SUPPORT # define CONFIG_AS_GFNI 1 #endif +#ifdef HAVE_GCC_INLINE_ASM_VAES_VPCLMUL +# define CONFIG_AS_VAES 1 +#endif /* struct ARIA_context: */ #define ARIA_BLOCK_SIZE 16 @@ -358,6 +361,53 @@ vgf2p8affineinvqb $0, t2, x7, x7 #endif /* CONFIG_AS_GFNI */ +#ifdef CONFIG_AS_VAES +#define aria_sbox_8way_vaes(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpxor t7, t7, t7; \ + vpxor t6, t6, t6; \ + vbroadcasti128 .Linv_shift_row rRIP, t0; \ + vbroadcasti128 .Lshift_row rRIP, t1; \ + vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\ + vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\ + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\ + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\ + \ + vaesenclast t7, x0, x0; \ + vaesenclast t7, x4, x4; \ + vaesenclast t7, x1, x1; \ + vaesenclast t7, x5, x5; \ + vaesdeclast t7, x2, x2; \ + vaesdeclast t7, x6, x6; \ + \ + vpbroadcastd .L0f0f0f0f rRIP, t6; \ + \ + /* AES inverse shift rows */ \ + vpshufb t0, x0, x0; \ + vpshufb t0, x4, x4; \ + vpshufb t0, x1, x1; \ + vpshufb t0, x5, x5; \ + vpshufb t1, x3, x3; \ + vpshufb t1, x7, x7; \ + vpshufb t1, x2, x2; \ + vpshufb t1, x6, x6; \ + \ + /* affine transformation for S2 */ \ + filter_8bit(x1, t2, t3, t6, t0); \ + /* affine transformation for S2 */ \ + filter_8bit(x5, t2, t3, t6, t0); \ + \ + /* affine transformation for X2 */ \ + filter_8bit(x3, t4, t5, t6, t0); \ + /* affine transformation for X2 */ \ + filter_8bit(x7, t4, t5, t6, t0); \ + \ + vaesdeclast t7, x3, x3; \ + vaesdeclast t7, x7, x7; +#endif /* CONFIG_AS_VAES */ + #define aria_sbox_8way(x0, x1, x2, x3, \ x4, x5, x6, x7, \ t0, t1, t2, t3, \ @@ -432,7 +482,7 @@ vextracti128 $1, x7, t6##_x; \ vaesdeclast t7##_x, x7##_x, x7##_x; \ vaesdeclast 
t7##_x, t6##_x, t6##_x; \ - vinserti128 $1, t6##_x, x7, x7; \ + vinserti128 $1, t6##_x, x7, x7; #define aria_diff_m(x0, x1, x2, x3, \ t0, t1, t2, t3) \ @@ -630,6 +680,7 @@ aria_load_state_8way(y0, y1, y2, y3, \ y4, y5, y6, y7, \ mem_tmp, 8); + #ifdef CONFIG_AS_GFNI #define aria_fe_gfni(x0, x1, x2, x3, \ x4, x5, x6, x7, \ @@ -786,6 +837,155 @@ mem_tmp, 8); #endif /* CONFIG_AS_GFNI */ +#ifdef CONFIG_AS_VAES +#define aria_fe_vaes(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ + x5, y0, y1, y2, y3, y4, y5, \ + y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ + x5, y0, y1, y2, y3, y4, y5, \ + y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo_vaes(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \ + x7, y0, y1, y2, y3, y4, y5, \ + y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \ + x7, y0, y1, y2, y3, y4, y5, \ + y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff_vaes(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + 
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ + x5, y0, y1, y2, y3, y4, y5, \ + y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ + x5, y0, y1, y2, y3, y4, y5, \ + y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); +#endif /* CONFIG_AS_VAES */ SECTION_RODATA .align 32 @@ -1279,6 +1479,172 @@ _gcry_aria_aesni_avx2_ctr_crypt_blk32: ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32, .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;) +#ifdef CONFIG_AS_VAES +.align 16 +ELF(.type __aria_vaes_avx2_crypt_32way, at function;) +__aria_vaes_avx2_crypt_32way: + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %ymm0..%ymm15: byte-sliced blocks + */ + CFI_STARTPROC(); + + movq %rsi, %rax; + leaq 8 * 32(%rax), %r8; + + movl ARIA_CTX_rounds(CTX), %r10d; + subl $2, %r10d; + + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r8); + aria_fo_vaes(%ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 0); + leaq 1*16(%r9), %r9; + +.align 16 +.Loop_vaes: + aria_fe_vaes(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, %ymm15, + %rax, %r9, 0); + aria_fo_vaes(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 1); + leaq 2*16(%r9), %r9; + subl $2, %r10d; + jnz .Loop_vaes; + + aria_ff_vaes(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, %ymm15, + %rax, %r9, 0, 1); + + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4, + %ymm9, %ymm13, %ymm0, %ymm5, + %ymm10, %ymm14, %ymm3, %ymm6, + %ymm11, %ymm15, %ymm2, %ymm7, + (%rax), (%r8)); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_vaes_avx2_crypt_32way,.-__aria_vaes_avx2_crypt_32way;) + +.align 16 +.globl _gcry_aria_vaes_avx2_ecb_crypt_blk32 +ELF(.type _gcry_aria_vaes_avx2_ecb_crypt_blk32, at function;) +_gcry_aria_vaes_avx2_ecb_crypt_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: round keys + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 32), %rsp; + andq $~31, %rsp; + + movq %rcx, %r9; + movq %rsi, %r11; + movq %rsp, %rsi; /* use stack for temporary store */ + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_vaes_avx2_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %r11); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_vaes_avx2_ecb_crypt_blk32, + .-_gcry_aria_vaes_avx2_ecb_crypt_blk32;) + +.align 16 +.globl _gcry_aria_vaes_avx2_ctr_crypt_blk32 +ELF(.type 
_gcry_aria_vaes_avx2_ctr_crypt_blk32, at function;) +_gcry_aria_vaes_avx2_ctr_crypt_blk32: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 32), %rsp; + andq $~31, %rsp; + + movq %rcx, %r8; /* %r8: iv */ + movq %rsp, %rcx; /* %rcx: keystream */ + call __aria_aesni_avx2_ctr_gen_keystream_32way; + + pushq %rsi; + movq %rdx, %r11; + movq %rcx, %rsi; /* use stack for temporary store */ + movq %rcx, %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_vaes_avx2_crypt_32way; + + popq %rsi; + vpxor (0 * 32)(%r11), %ymm1, %ymm1; + vpxor (1 * 32)(%r11), %ymm0, %ymm0; + vpxor (2 * 32)(%r11), %ymm3, %ymm3; + vpxor (3 * 32)(%r11), %ymm2, %ymm2; + vpxor (4 * 32)(%r11), %ymm4, %ymm4; + vpxor (5 * 32)(%r11), %ymm5, %ymm5; + vpxor (6 * 32)(%r11), %ymm6, %ymm6; + vpxor (7 * 32)(%r11), %ymm7, %ymm7; + vpxor (8 * 32)(%r11), %ymm8, %ymm8; + vpxor (9 * 32)(%r11), %ymm9, %ymm9; + vpxor (10 * 32)(%r11), %ymm10, %ymm10; + vpxor (11 * 32)(%r11), %ymm11, %ymm11; + vpxor (12 * 32)(%r11), %ymm12, %ymm12; + vpxor (13 * 32)(%r11), %ymm13, %ymm13; + vpxor (14 * 32)(%r11), %ymm14, %ymm14; + vpxor (15 * 32)(%r11), %ymm15, %ymm15; + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rsi); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_vaes_avx2_ctr_crypt_blk32, + .-_gcry_aria_vaes_avx2_ctr_crypt_blk32;) +#endif /* CONFIG_AS_VAES */ + #ifdef CONFIG_AS_GFNI .align 16 ELF(.type __aria_gfni_avx2_crypt_32way, at function;) diff --git a/cipher/aria.c b/cipher/aria.c index 9eb42a2d..bc2d4384 100644 --- a/cipher/aria.c +++ b/cipher/aria.c @@ -74,6 +74,12 @@ # endif #endif +/* USE_VAES_AVX2 inidicates whether to compile with Intel VAES/AVX2 code. */ +#undef USE_VAES_AVX2 +#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) +# define USE_VAES_AVX2 1 +#endif + /* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. 
*/ #undef USE_GFNI_AVX2 #if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) @@ -142,6 +148,7 @@ typedef struct #endif #ifdef USE_AESNI_AVX2 unsigned int use_aesni_avx2:1; + unsigned int use_vaes_avx2:1; unsigned int use_gfni_avx2:1; #endif #ifdef USE_GFNI_AVX512 @@ -464,12 +471,13 @@ static inline unsigned int aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in, const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks) { + if (0) { } #ifdef USE_GFNI_AVX - if (ctx->use_gfni_avx) + else if (ctx->use_gfni_avx) return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) + ASM_EXTRA_STACK; - else #endif /* USE_GFNI_AVX */ + else return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) + ASM_EXTRA_STACK; } @@ -478,12 +486,13 @@ static inline unsigned int aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in, byte *iv) { + if (0) { } #ifdef USE_GFNI_AVX - if (ctx->use_gfni_avx) + else if (ctx->use_gfni_avx) return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv) + ASM_EXTRA_STACK; - else #endif /* USE_GFNI_AVX */ + else return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv) + ASM_EXTRA_STACK; } @@ -498,6 +507,16 @@ extern unsigned int _gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out, const byte *in, byte *iv) ASM_FUNC_ABI; +#ifdef USE_VAES_AVX2 +extern unsigned int +_gcry_aria_vaes_avx2_ecb_crypt_blk32(const void *ctx, byte *out, + const byte *in, + const void *key) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_vaes_avx2_ctr_crypt_blk32(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; +#endif /* USE_VAES_AVX2 */ + #ifdef USE_GFNI_AVX2 extern unsigned int _gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out, @@ -512,12 +531,18 @@ static inline unsigned int aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, const u32 key[][ARIA_RD_KEY_WORDS]) { + if (0) { } #ifdef USE_GFNI_AVX2 - if (ctx->use_gfni_avx2) + else if (ctx->use_gfni_avx2) return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key) + ASM_EXTRA_STACK; - else #endif /* USE_GFNI_AVX2 */ +#ifdef USE_VAES_AVX2 + else if (ctx->use_vaes_avx2) + return _gcry_aria_vaes_avx2_ecb_crypt_blk32(ctx, out, in, key) + + ASM_EXTRA_STACK; +#endif /* USE_VAES_AVX2 */ + else return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key) + ASM_EXTRA_STACK; } @@ -526,12 +551,18 @@ static inline unsigned int aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, byte *iv) { + if (0) { } #ifdef USE_GFNI_AVX2 - if (ctx->use_gfni_avx2) + else if (ctx->use_gfni_avx2) return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv) + ASM_EXTRA_STACK; - else #endif /* USE_GFNI_AVX2 */ +#ifdef USE_VAES_AVX2 + else if (ctx->use_vaes_avx2) + return _gcry_aria_vaes_avx2_ctr_crypt_blk32(ctx, out, in, iv) + + ASM_EXTRA_STACK; +#endif /* USE_VAES_AVX2 */ + else return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv) + ASM_EXTRA_STACK; } @@ -1614,6 +1645,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen, #ifdef USE_GFNI_AVX2 ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); #endif +#ifdef USE_VAES_AVX2 + ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); +#endif #ifdef USE_AESNI_AVX ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX); #endif -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 19 09:49:06 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 19 Feb 2023 10:49:06 +0200 Subject: [PATCH 
1/5] aria: add x86_64 AESNI/GFNI/AVX/AVX2 accelerated implementations
Message-ID: <20230219084910.1302701-1-jussi.kivilinna@iki.fi>

* cipher/Makefile.am: Add 'aria-aesni-avx-amd64.S' and
'aria-aesni-avx2-amd64.S'.
* cipher/aria-aesni-avx-amd64.S: New.
* cipher/aria-aesni-avx2-amd64.S: New.
* cipher/aria.c (USE_AESNI_AVX, USE_GFNI_AVX, USE_AESNI_AVX2)
(USE_GFNI_AVX2, MAX_PARALLEL_BLKS, ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(ARIA_context): Add 'use_aesni_avx', 'use_gfni_avx', 'use_aesni_avx2'
and 'use_gfni_avx2'.
(_gcry_aria_aesni_avx_ecb_crypt_blk1_16)
(_gcry_aria_aesni_avx_ctr_crypt_blk16)
(_gcry_aria_gfni_avx_ecb_crypt_blk1_16)
(_gcry_aria_gfni_avx_ctr_crypt_blk16)
(aria_avx_ecb_crypt_blk1_16, aria_avx_ctr_crypt_blk16)
(_gcry_aria_aesni_avx2_ecb_crypt_blk32)
(_gcry_aria_aesni_avx2_ctr_crypt_blk32)
(_gcry_aria_gfni_avx2_ecb_crypt_blk32)
(_gcry_aria_gfni_avx2_ctr_crypt_blk32)
(aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): New.
(aria_crypt_blocks) [USE_AESNI_AVX2]: Add 32 parallel block
AVX2/AESNI/GFNI processing.
(aria_crypt_blocks) [USE_AESNI_AVX]: Add 3 to 16 parallel block
AVX/AESNI/GFNI processing.
(_gcry_aria_ctr_enc) [USE_AESNI_AVX2]: Add 32 parallel block
AVX2/AESNI/GFNI processing.
(_gcry_aria_ctr_enc) [USE_AESNI_AVX]: Add 16 parallel block
AVX/AESNI/GFNI processing.
(_gcry_aria_ctr_enc, _gcry_aria_cbc_dec, _gcry_aria_cfb_enc)
(_gcry_aria_ecb_crypt, _gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc)
(_gcry_aria_ocb_crypt, _gcry_aria_ocb_auth): Use MAX_PARALLEL_BLKS for
parallel processing width.
(aria_setkey): Enable AESNI/AVX, GFNI/AVX, AESNI/AVX2, GFNI/AVX2 based
on HW features.
* configure.ac: Add 'aria-aesni-avx-amd64.lo' and
'aria-aesni-avx2-amd64.lo'.
---
This patch adds AVX/AVX2/AESNI/GFNI accelerated ARIA block cipher
implementations for libgcrypt. This implementation is based on work by
Taehee Yoo, with the following notable changes:
 - Integration into libgcrypt, use of 'aes-common-amd64.h'.
 - Use 'vmovddup' for loading GFNI constants.
 - Use round loop instead of unrolling for smaller code size and
   increased performance.
 - Use stack for temporary storage instead of external buffers.
 - Merge ECB encryption/decryption into a single function.
 - Add 1 to 15 block support for AVX ECB functions.
 - Add byte-addition fast path for CTR mode (see the C sketch below).
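
For reference, the byte-addition fast path corresponds to the '.Lctr_byteadd'
branch in the assembly below ('cmpb $(0x100 - 16), 15(%r8); jb .Lctr_byteadd'):
when the last byte of the big-endian counter cannot wrap within one 16-block
batch, every counter block differs from the IV only in its final byte, so the
byteswap and full 128-bit carry propagation can be skipped. The following is
only a rough scalar C sketch of that idea; the function name is hypothetical
and this helper is not part of the patch, which implements the equivalent
logic in assembly.

#include <string.h>

/* Illustrative only: derive 16 CTR counter blocks from a 128-bit
 * big-endian IV, using byte addition on the last byte when it cannot
 * overflow within the batch. */
static void
ctr_gen_counters_16 (unsigned char ks[16][16], unsigned char iv[16])
{
  int i, j;

  if (iv[15] < 0x100 - 16)
    {
      /* Fast path: only the last byte changes for all 16 blocks. */
      for (i = 0; i < 16; i++)
        {
          memcpy (ks[i], iv, 16);
          ks[i][15] = iv[15] + i;
        }
      iv[15] += 16;
    }
  else
    {
      /* Generic path: full big-endian increment with carry per block. */
      for (i = 0; i < 16; i++)
        {
          memcpy (ks[i], iv, 16);
          for (j = 15; j >= 0; j--)
            if (++iv[j])
              break;
        }
    }
}
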
=== Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.715 ns/B 1333 MiB/s 3.36 c/B 4700 ECB dec | 0.712 ns/B 1339 MiB/s 3.35 c/B 4700 CTR enc | 0.714 ns/B 1336 MiB/s 3.36 c/B 4700 CTR dec | 0.714 ns/B 1335 MiB/s 3.36 c/B 4700 GFNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.516 ns/B 1847 MiB/s 2.43 c/B 4700 ECB dec | 0.519 ns/B 1839 MiB/s 2.44 c/B 4700 CTR enc | 0.517 ns/B 1846 MiB/s 2.43 c/B 4700 CTR dec | 0.518 ns/B 1843 MiB/s 2.43 c/B 4700 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.416 ns/B 2292 MiB/s 1.96 c/B 4700 ECB dec | 0.421 ns/B 2266 MiB/s 1.98 c/B 4700 CTR enc | 0.415 ns/B 2298 MiB/s 1.95 c/B 4700 CTR dec | 0.415 ns/B 2300 MiB/s 1.95 c/B 4700 GFNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.235 ns/B 4056 MiB/s 1.11 c/B 4700 ECB dec | 0.234 ns/B 4079 MiB/s 1.10 c/B 4700 CTR enc | 0.232 ns/B 4104 MiB/s 1.09 c/B 4700 CTR dec | 0.233 ns/B 4094 MiB/s 1.10 c/B 4700 === Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.26 ns/B 757.6 MiB/s 3.77 c/B 2993 ECB dec | 1.27 ns/B 753.1 MiB/s 3.79 c/B 2992 CTR enc | 1.25 ns/B 760.3 MiB/s 3.75 c/B 2992 CTR dec | 1.26 ns/B 759.1 MiB/s 3.76 c/B 2992 GFNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.967 ns/B 986.6 MiB/s 2.89 c/B 2992 ECB dec | 0.966 ns/B 987.1 MiB/s 2.89 c/B 2992 CTR enc | 0.972 ns/B 980.8 MiB/s 2.91 c/B 2993 CTR dec | 0.971 ns/B 982.5 MiB/s 2.90 c/B 2993 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.817 ns/B 1167 MiB/s 2.44 c/B 2992 ECB dec | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992 CTR enc | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992 CTR dec | 0.819 ns/B 1164 MiB/s 2.45 c/B 2992 GFNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.506 ns/B 1886 MiB/s 1.51 c/B 2992 ECB dec | 0.505 ns/B 1887 MiB/s 1.51 c/B 2992 CTR enc | 0.564 ns/B 1691 MiB/s 1.69 c/B 2992 CTR dec | 0.565 ns/B 1689 MiB/s 1.69 c/B 2992 === Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.921 ns/B 1035 MiB/s 3.50 c/B 3800 ECB dec | 0.922 ns/B 1034 MiB/s 3.50 c/B 3800 CTR enc | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 CTR dec | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.559 ns/B 1707 MiB/s 2.12 c/B 3800 ECB dec | 0.560 ns/B 1703 MiB/s 2.13 c/B 3800 CTR enc | 0.570 ns/B 1672 MiB/s 2.17 c/B 3800 CTR dec | 0.568 ns/B 1679 MiB/s 2.16 c/B 3800 === Benchmark on AMD EPYC 7642 (zen2): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.22 ns/B 784.5 MiB/s 4.01 c/B 3298 ECB dec | 1.22 ns/B 784.8 MiB/s 4.00 c/B 3292 CTR enc | 1.22 ns/B 780.1 MiB/s 4.03 c/B 3299 CTR dec | 1.22 ns/B 779.1 MiB/s 4.04 c/B 3299 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.735 ns/B 1298 MiB/s 2.42 c/B 3299 ECB dec | 0.738 ns/B 1292 MiB/s 2.44 c/B 3299 CTR enc | 0.732 ns/B 1303 MiB/s 2.41 c/B 3299 CTR dec | 0.732 ns/B 1303 MiB/s 2.41 c/B 3299 === Benchmark on Intel Core i5-6500 (skylake): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.24 ns/B 766.6 MiB/s 4.48 c/B 3598 ECB dec | 1.25 ns/B 764.9 MiB/s 4.49 c/B 3598 CTR enc | 1.25 ns/B 761.7 MiB/s 4.50 c/B 3598 CTR dec | 1.25 ns/B 
761.6 MiB/s 4.51 c/B 3598 AESNI/AVX2: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.829 ns/B 1150 MiB/s 2.98 c/B 3599 ECB dec | 0.831 ns/B 1147 MiB/s 2.99 c/B 3598 CTR enc | 0.829 ns/B 1150 MiB/s 2.98 c/B 3598 CTR dec | 0.828 ns/B 1152 MiB/s 2.98 c/B 3598 === Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off): AESNI/AVX: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.11 ns/B 452.7 MiB/s 5.25 c/B 2494 ECB dec | 2.10 ns/B 454.5 MiB/s 5.23 c/B 2494 CTR enc | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 CTR dec | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 +- cipher/aria-aesni-avx-amd64.S | 1424 +++++++++++++++++++++++++++++++ cipher/aria-aesni-avx2-amd64.S | 1449 ++++++++++++++++++++++++++++++++ cipher/aria.c | 299 ++++++- configure.ac | 8 + 5 files changed, 3156 insertions(+), 26 deletions(-) create mode 100644 cipher/aria-aesni-avx-amd64.S create mode 100644 cipher/aria-aesni-avx2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7ebcd179..a13e52e9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -77,7 +77,7 @@ EXTRA_libcipher_la_SOURCES = \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ asm-poly1305-s390x.h \ - aria.c \ + aria.c aria-aesni-avx-amd64.S aria-aesni-avx2-amd64.S \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/aria-aesni-avx-amd64.S b/cipher/aria-aesni-avx-amd64.S new file mode 100644 index 00000000..7274b80e --- /dev/null +++ b/cipher/aria-aesni-avx-amd64.S @@ -0,0 +1,1424 @@ +/* aria-aesni-avx-amd64.S - AESNI/GFNI/AVX implementation of ARIA cipher + * + * Copyright (C) 2022-2023 Taehee Yoo + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#ifdef __x86_64 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_AVX_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) + +#include "asm-common-amd64.h" + +#ifdef ENABLE_GFNI_SUPPORT +# define CONFIG_AS_GFNI 1 +#endif + +/* struct ARIA_context: */ +#define ARIA_BLOCK_SIZE 16 +#define ARIA_MAX_RD_KEYS 17 +#define ARIA_CTX_enc_key 0 +#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS)) +#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS)) + +/* register macros */ +#define CTX %rdi + +/* helper macros */ +#define STACK_DEPTH (2 * 8 + 16 * 16 + 15) + +#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +/* asm macros */ +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vmovdqu .Lshufb_16x16b rRIP, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +#define debyteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); 
\ + \ + vmovdqu .Lshufb_16x16b rRIP, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(c0, d0, a0, b0, d2, d3); \ + transpose_4x4(c1, d1, a1, b1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(c2, d2, a2, b2, b0, b1); \ + transpose_4x4(c3, d3, a3, b3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers */ +#define inpack16_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio) \ + vmovdqu (0 * 16)(rio), x0; \ + vmovdqu (1 * 16)(rio), x1; \ + vmovdqu (2 * 16)(rio), x2; \ + vmovdqu (3 * 16)(rio), x3; \ + vmovdqu (4 * 16)(rio), x4; \ + vmovdqu (5 * 16)(rio), x5; \ + vmovdqu (6 * 16)(rio), x6; \ + vmovdqu (7 * 16)(rio), x7; \ + vmovdqu (8 * 16)(rio), y0; \ + vmovdqu (9 * 16)(rio), y1; \ + vmovdqu (10 * 16)(rio), y2; \ + vmovdqu (11 * 16)(rio), y3; \ + vmovdqu (12 * 16)(rio), y4; \ + vmovdqu (13 * 16)(rio), y5; \ + vmovdqu (14 * 16)(rio), y6; \ + vmovdqu (15 * 16)(rio), y7; + +/* byteslice blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 16(mem_ab); \ + vmovdqu x1, 1 * 16(mem_ab); \ + vmovdqu x2, 2 * 16(mem_ab); \ + vmovdqu x3, 3 * 16(mem_ab); \ + vmovdqu x4, 4 * 16(mem_ab); \ + vmovdqu x5, 5 * 16(mem_ab); \ + vmovdqu x6, 6 * 16(mem_ab); \ + vmovdqu x7, 7 * 16(mem_ab); \ + vmovdqu y0, 0 * 16(mem_cd); \ + vmovdqu y1, 1 * 16(mem_cd); \ + vmovdqu y2, 2 * 16(mem_cd); \ + vmovdqu y3, 3 * 16(mem_cd); \ + vmovdqu y4, 4 * 16(mem_cd); \ + vmovdqu y5, 5 * 16(mem_cd); \ + vmovdqu y6, 6 * 16(mem_cd); \ + vmovdqu y7, 7 * 16(mem_cd); + +#define write_output(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem) \ + vmovdqu x0, 0 * 16(mem); \ + vmovdqu x1, 1 * 16(mem); \ + vmovdqu x2, 2 * 16(mem); \ + vmovdqu x3, 3 * 16(mem); \ + vmovdqu x4, 4 * 16(mem); \ + vmovdqu x5, 5 * 16(mem); \ + vmovdqu x6, 6 * 16(mem); \ + vmovdqu x7, 7 * 16(mem); \ + vmovdqu y0, 8 * 16(mem); \ + vmovdqu y1, 9 * 16(mem); \ + vmovdqu y2, 10 * 16(mem); \ + vmovdqu y3, 11 * 16(mem); \ + vmovdqu y4, 12 * 16(mem); \ + vmovdqu y5, 13 * 16(mem); \ + vmovdqu y6, 14 * 16(mem); \ + vmovdqu y7, 15 * 16(mem); + +#define vload_if_enough_nblks(blk_offs, rnblks, rio, v) \ + vpxor v, v, v; \ + cmp $(blk_offs), rnblks; \ + jbe 1f; \ + vmovdqu (blk_offs * 16)(rio), v; \ + 1:; + +#define vstore_if_enough_nblks(blk_offs, rnblks, mem, v)\ + cmp $(blk_offs), rnblks; \ + jbe 1f; \ + vmovdqu v, (blk_offs * 16)(mem); \ + 1:; + +#define inpack_1_15_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio, rnblks) \ + vmovdqu (0 * 16)(rio), x0; \ + vload_if_enough_nblks(1, rnblks, rio, x1); \ + vload_if_enough_nblks(2, rnblks, rio, x2); \ + vload_if_enough_nblks(3, rnblks, rio, x3); \ + vload_if_enough_nblks(4, rnblks, rio, x4); \ 
+ vload_if_enough_nblks(5, rnblks, rio, x5); \ + vload_if_enough_nblks(6, rnblks, rio, x6); \ + vload_if_enough_nblks(7, rnblks, rio, x7); \ + vload_if_enough_nblks(8, rnblks, rio, y0); \ + vload_if_enough_nblks(9, rnblks, rio, y1); \ + vload_if_enough_nblks(10, rnblks, rio, y2); \ + vload_if_enough_nblks(11, rnblks, rio, y3); \ + vload_if_enough_nblks(12, rnblks, rio, y4); \ + vload_if_enough_nblks(13, rnblks, rio, y5); \ + vload_if_enough_nblks(14, rnblks, rio, y6); \ + vpxor y7, y7, y7; + +#define write_output_1_15(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem, rnblks) \ + vmovdqu x0, (0 * 16)(mem); \ + vstore_if_enough_nblks(1, rnblks, mem, x1); \ + vstore_if_enough_nblks(2, rnblks, mem, x2); \ + vstore_if_enough_nblks(3, rnblks, mem, x3); \ + vstore_if_enough_nblks(4, rnblks, mem, x4); \ + vstore_if_enough_nblks(5, rnblks, mem, x5); \ + vstore_if_enough_nblks(6, rnblks, mem, x6); \ + vstore_if_enough_nblks(7, rnblks, mem, x7); \ + vstore_if_enough_nblks(8, rnblks, mem, y0); \ + vstore_if_enough_nblks(9, rnblks, mem, y1); \ + vstore_if_enough_nblks(10, rnblks, mem, y2); \ + vstore_if_enough_nblks(11, rnblks, mem, y3); \ + vstore_if_enough_nblks(12, rnblks, mem, y4); \ + vstore_if_enough_nblks(13, rnblks, mem, y5); \ + vstore_if_enough_nblks(14, rnblks, mem, y6); + +#define aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \ + vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \ + vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \ + vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \ + vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \ + vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \ + vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \ + vmovdqu x7, ((idx + 7) * 16)(mem_tmp); + +#define aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \ + vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \ + vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \ + vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \ + vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \ + vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \ + vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \ + vmovdqu ((idx + 7) * 16)(mem_tmp), x7; + +#define aria_ark_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, rk, \ + idx, round) \ + /* AddRoundKey */ \ + vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ + vpsrld $24, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x0, x0; \ + vpsrld $16, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x1, x1; \ + vpsrld $8, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x2, x2; \ + vpshufb t1, t0, t2; \ + vpxor t2, x3, x3; \ + vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ + vpsrld $24, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x4, x4; \ + vpsrld $16, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x5, x5; \ + vpsrld $8, t0, t2; \ + vpshufb t1, t2, t2; \ + vpxor t2, x6, x6; \ + vpshufb t1, t0, t2; \ + vpxor t2, x7, x7; + +#ifdef CONFIG_AS_GFNI +#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vmovddup .Ltf_s2_bitmatrix rRIP, t0; \ + vmovddup .Ltf_inv_bitmatrix rRIP, t1; \ + vmovddup .Ltf_id_bitmatrix rRIP, t2; \ + vmovddup .Ltf_aff_bitmatrix rRIP, t3; \ + vmovddup .Ltf_x2_bitmatrix rRIP, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb 
$(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7 +#endif /* CONFIG_AS_GFNI */ + +#define aria_sbox_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vmovdqa .Linv_shift_row rRIP, t0; \ + vmovdqa .Lshift_row rRIP, t1; \ + vbroadcastss .L0f0f0f0f rRIP, t6; \ + vmovdqa .Ltf_lo__inv_aff__and__s2 rRIP, t2; \ + vmovdqa .Ltf_hi__inv_aff__and__s2 rRIP, t3; \ + vmovdqa .Ltf_lo__x2__and__fwd_aff rRIP, t4; \ + vmovdqa .Ltf_hi__x2__and__fwd_aff rRIP, t5; \ + \ + vaesenclast t7, x0, x0; \ + vaesenclast t7, x4, x4; \ + vaesenclast t7, x1, x1; \ + vaesenclast t7, x5, x5; \ + vaesdeclast t7, x2, x2; \ + vaesdeclast t7, x6, x6; \ + \ + /* AES inverse shift rows */ \ + vpshufb t0, x0, x0; \ + vpshufb t0, x4, x4; \ + vpshufb t0, x1, x1; \ + vpshufb t0, x5, x5; \ + vpshufb t1, x3, x3; \ + vpshufb t1, x7, x7; \ + vpshufb t1, x2, x2; \ + vpshufb t1, x6, x6; \ + \ + /* affine transformation for S2 */ \ + filter_8bit(x1, t2, t3, t6, t0); \ + /* affine transformation for S2 */ \ + filter_8bit(x5, t2, t3, t6, t0); \ + \ + /* affine transformation for X2 */ \ + filter_8bit(x3, t4, t5, t6, t0); \ + /* affine transformation for X2 */ \ + filter_8bit(x7, t4, t5, t6, t0); \ + vaesdeclast t7, x3, x3; \ + vaesdeclast t7, x7, x7; + +#define aria_diff_m(x0, x1, x2, x3, \ + t0, t1, t2, t3) \ + /* T = rotr32(X, 8); */ \ + /* X ^= T */ \ + vpxor x0, x3, t0; \ + vpxor x1, x0, t1; \ + vpxor x2, x1, t2; \ + vpxor x3, x2, t3; \ + /* X = T ^ rotr(X, 16); */ \ + vpxor t2, x0, x0; \ + vpxor x1, t3, t3; \ + vpxor t0, x2, x2; \ + vpxor t1, x3, x1; \ + vmovdqu t3, x3; + +#define aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7) \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; \ + \ + /* t2 ^= t3; */ \ + vpxor y4, y0, y0; \ + vpxor y5, y1, y1; \ + vpxor y6, y2, y2; \ + vpxor y7, y3, y3; \ + \ + /* t0 ^= t1; */ \ + vpxor x4, x0, x0; \ + vpxor x5, x1, x1; \ + vpxor x6, x2, x2; \ + vpxor x7, x3, x3; \ + \ + /* t3 ^= t1; */ \ + vpxor x4, y4, y4; \ + vpxor x5, y5, y5; \ + vpxor x6, y6, y6; \ + vpxor x7, y7, y7; \ + \ + /* t2 ^= t0; */ \ + vpxor x0, y0, y0; \ + vpxor x1, y1, y1; \ + vpxor x2, y2, y2; \ + vpxor x3, y3, y3; \ + \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; + +#define aria_fe(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + 
mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + vpxor y7, y7, y7; \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); + +#ifdef CONFIG_AS_GFNI +#define aria_fe_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, 
x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + vpxor y7, y7, y7; \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + vpxor y7, y7, y7; \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y7, y2, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); + +#endif /* CONFIG_AS_GFNI */ + + +SECTION_RODATA +.align 16 + +#define 
SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.Lshift_row: + .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03 + .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +/* AES inverse affine and S2 combined: + * 1 1 0 0 0 0 0 1 x0 0 + * 0 1 0 0 1 0 0 0 x1 0 + * 1 1 0 0 1 1 1 1 x2 0 + * 0 1 1 0 1 0 0 1 x3 1 + * 0 1 0 0 1 1 0 0 * x4 + 0 + * 0 1 0 1 1 0 0 0 x5 0 + * 0 0 0 0 0 1 0 1 x6 0 + * 1 1 1 0 0 1 1 1 x7 1 + */ +.Ltf_lo__inv_aff__and__s2: + .octa 0x92172DA81A9FA520B2370D883ABF8500 +.Ltf_hi__inv_aff__and__s2: + .octa 0x2B15FFC1AF917B45E6D8320C625CB688 + +/* X2 and AES forward affine combined: + * 1 0 1 1 0 0 0 1 x0 0 + * 0 1 1 1 1 0 1 1 x1 0 + * 0 0 0 1 1 0 1 0 x2 1 + * 0 1 0 0 0 1 0 0 x3 0 + * 0 0 1 1 1 0 1 1 * x4 + 0 + * 0 1 0 0 1 0 0 0 x5 0 + * 1 1 0 1 0 0 1 1 x6 0 + * 0 1 0 0 1 0 1 0 x7 0 + */ +.Ltf_lo__x2__and__fwd_aff: + .octa 0xEFAE0544FCBD1657B8F95213ABEA4100 +.Ltf_hi__x2__and__fwd_aff: + .octa 0x3F893781E95FE1576CDA64D2BA0CB204 + +.Lbige_addb_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 +.Lbige_addb_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 +.Lbige_addb_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 +.Lbige_addb_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 +.Lbige_addb_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 +.Lbige_addb_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 +.Lbige_addb_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 +.Lbige_addb_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + +#ifdef CONFIG_AS_GFNI +.align 8 +/* AES affine: */ +#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) +.Ltf_aff_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) + +/* AES inverse affine: */ +#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) +.Ltf_inv_bitmatrix: + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) + +/* S2: */ +#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) +.Ltf_s2_bitmatrix: + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), 
+ BV8(1, 1, 1, 1, 0, 1, 1, 0)) + +/* X2: */ +#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) +.Ltf_x2_bitmatrix: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) + +/* Identity matrix: */ +.Ltf_id_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) +#endif /* CONFIG_AS_GFNI */ + +/* 4-bit mask */ +.align 4 +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +.align 16 +ELF(.type __aria_aesni_avx_crypt_16way, at function;) +__aria_aesni_avx_crypt_16way: + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %xmm0..%xmm15: 16 byte-sliced blocks + */ + CFI_STARTPROC(); + + movq %rsi, %rax; + leaq 8 * 16(%rax), %r8; + + movl ARIA_CTX_rounds(CTX), %r10d; + subl $2, %r10d; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r8); + aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 0); + leaq 1*16(%r9), %r9; + +.align 16 +.Loop_aesni: + aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 0); + aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 1); + leaq 2*16(%r9), %r9; + subl $2, %r10d; + jnz .Loop_aesni; + + aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 0, 1); + + debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, + %xmm9, %xmm13, %xmm0, %xmm5, + %xmm10, %xmm14, %xmm3, %xmm6, + %xmm11, %xmm15, %xmm2, %xmm7, + (%rax), (%r8)); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_aesni_avx_crypt_16way,.-__aria_aesni_avx_crypt_16way;) + +.align 16 +.globl _gcry_aria_aesni_avx_ecb_crypt_blk1_16 +ELF(.type _gcry_aria_aesni_avx_ecb_crypt_blk1_16, at function;) +_gcry_aria_aesni_avx_ecb_crypt_blk1_16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: round keys + * %r8: num blocks + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 16), %rsp; + andq $~15, %rsp; + + movq %rcx, %r9; + movq %rsi, %r11; + movq %rsp, %rsi; /* use stack for temporary store */ + + cmpq $16, %r8; + jb .Lecb_less_than_16; + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx); + + call __aria_aesni_avx_crypt_16way; + + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %r11); + +.Lecb_end: + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + +.Lecb_less_than_16: + pushq %r8; + inpack_1_15_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, %r8d); + + call __aria_aesni_avx_crypt_16way; + + popq %rax; + write_output_1_15(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, + %xmm7, %xmm8, %xmm9, %xmm10, 
%xmm11, %xmm12, %xmm13, + %xmm14, %xmm15, %r11, %eax); + + jmp .Lecb_end; + CFI_ENDPROC(); +ELF(.size _gcry_aria_aesni_avx_ecb_crypt_blk1_16, + .-_gcry_aria_aesni_avx_ecb_crypt_blk1_16;) + +.align 16 +ELF(.type __aria_aesni_avx_ctr_gen_keystream_16way, at function;) +__aria_aesni_avx_ctr_gen_keystream_16way: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + /* load IV */ + vmovdqu (%r8), %xmm8; + cmpb $(0x100 - 16), 15(%r8); + jb .Lctr_byteadd; + + /* byteswap */ + vmovdqa .Lbswap128_mask rRIP, %xmm1; + vpshufb %xmm1, %xmm8, %xmm3; /* be => le */ + + vpcmpeqd %xmm0, %xmm0, %xmm0; + vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */ + + /* construct IVs */ + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm9; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm10; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm11; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm12; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm13; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm14; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm15; + vmovdqu %xmm8, (0 * 16)(%rcx); + vmovdqu %xmm9, (1 * 16)(%rcx); + vmovdqu %xmm10, (2 * 16)(%rcx); + vmovdqu %xmm11, (3 * 16)(%rcx); + vmovdqu %xmm12, (4 * 16)(%rcx); + vmovdqu %xmm13, (5 * 16)(%rcx); + vmovdqu %xmm14, (6 * 16)(%rcx); + vmovdqu %xmm15, (7 * 16)(%rcx); + + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm8; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm9; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm10; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm11; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm12; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm13; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm14; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm15; + inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ + vpshufb %xmm1, %xmm3, %xmm4; + vmovdqu %xmm4, (%r8); + + vmovdqu (0 * 16)(%rcx), %xmm0; + vmovdqu (1 * 16)(%rcx), %xmm1; + vmovdqu (2 * 16)(%rcx), %xmm2; + vmovdqu (3 * 16)(%rcx), %xmm3; + vmovdqu (4 * 16)(%rcx), %xmm4; + vmovdqu (5 * 16)(%rcx), %xmm5; + vmovdqu (6 * 16)(%rcx), %xmm6; + vmovdqu (7 * 16)(%rcx), %xmm7; + + ret_spec_stop; + +.Lctr_byteadd: + addb $16, 15(%r8); + vmovdqa %xmm8, %xmm0; + vpaddb .Lbige_addb_1 rRIP, %xmm8, %xmm1; + vpaddb .Lbige_addb_2 rRIP, %xmm8, %xmm2; + vpaddb .Lbige_addb_3 rRIP, %xmm8, %xmm3; + vpaddb .Lbige_addb_4 rRIP, %xmm8, %xmm4; + vpaddb .Lbige_addb_5 rRIP, %xmm8, %xmm5; + vpaddb .Lbige_addb_6 rRIP, %xmm8, %xmm6; + vpaddb .Lbige_addb_7 rRIP, %xmm8, %xmm7; + vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm8; + vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm9; + vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm10; + vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm11; + vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm12; + vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm13; + vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm14; + vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm15; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_aesni_avx_ctr_gen_keystream_16way,.-__aria_aesni_avx_ctr_gen_keystream_16way;) + +.align 16 +.globl _gcry_aria_aesni_avx_ctr_crypt_blk16 +ELF(.type _gcry_aria_aesni_avx_ctr_crypt_blk16, at function;) +_gcry_aria_aesni_avx_ctr_crypt_blk16: + /* input: + * 
%rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 16), %rsp; + andq $~15, %rsp; + + movq %rcx, %r8; /* %r8: iv */ + movq %rsp, %rcx; /* %rcx: keystream */ + call __aria_aesni_avx_ctr_gen_keystream_16way; + + pushq %rsi; + movq %rdx, %r11; + movq %rcx, %rsi; /* use stack for temporary store */ + movq %rcx, %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_aesni_avx_crypt_16way; + + popq %rsi; + vpxor (0 * 16)(%r11), %xmm1, %xmm1; + vpxor (1 * 16)(%r11), %xmm0, %xmm0; + vpxor (2 * 16)(%r11), %xmm3, %xmm3; + vpxor (3 * 16)(%r11), %xmm2, %xmm2; + vpxor (4 * 16)(%r11), %xmm4, %xmm4; + vpxor (5 * 16)(%r11), %xmm5, %xmm5; + vpxor (6 * 16)(%r11), %xmm6, %xmm6; + vpxor (7 * 16)(%r11), %xmm7, %xmm7; + vpxor (8 * 16)(%r11), %xmm8, %xmm8; + vpxor (9 * 16)(%r11), %xmm9, %xmm9; + vpxor (10 * 16)(%r11), %xmm10, %xmm10; + vpxor (11 * 16)(%r11), %xmm11, %xmm11; + vpxor (12 * 16)(%r11), %xmm12, %xmm12; + vpxor (13 * 16)(%r11), %xmm13, %xmm13; + vpxor (14 * 16)(%r11), %xmm14, %xmm14; + vpxor (15 * 16)(%r11), %xmm15, %xmm15; + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rsi); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_aesni_avx_ctr_crypt_blk16,.-_gcry_aria_aesni_avx_ctr_crypt_blk16;) + +#ifdef CONFIG_AS_GFNI +.align 16 +ELF(.type __aria_gfni_avx_crypt_16way, at function;) +__aria_gfni_avx_crypt_16way: + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %xmm0..%xmm15: 16 byte-sliced blocks + */ + CFI_STARTPROC(); + + movq %rsi, %rax; + leaq 8 * 16(%rax), %r8; + + movl ARIA_CTX_rounds(CTX), %r10d; + subl $2, %r10d; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r8); + aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 0); + leaq 1*16(%r9), %r9; + +.align 16 +.Loop_gfni: + aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, + %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, + %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 0); + aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, + %xmm12, %xmm13, %xmm14, %xmm15, + %xmm0, %xmm1, %xmm2, %xmm3, + %xmm4, %xmm5, %xmm6, %xmm7, + %rax, %r9, 1); + leaq 2*16(%r9), %r9; + subl $2, %r10d; + jnz .Loop_gfni; + + aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %r9, 0, 1); + + debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, + %xmm9, %xmm13, %xmm0, %xmm5, + %xmm10, %xmm14, %xmm3, %xmm6, + %xmm11, %xmm15, %xmm2, %xmm7, + (%rax), (%r8)); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_gfni_avx_crypt_16way,.-__aria_gfni_avx_crypt_16way;) + +.align 16 +.globl _gcry_aria_gfni_avx_ecb_crypt_blk1_16 +ELF(.type _gcry_aria_gfni_avx_ecb_crypt_blk1_16, at function;) +_gcry_aria_gfni_avx_ecb_crypt_blk1_16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: round keys + * %r8: num blocks + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 16), %rsp; + andq $~15, %rsp; + + movq %rcx, %r9; + movq %rsi, %r11; + movq %rsp, %rsi; /* use stack for temporary store */ + + cmpq $16, %r8; + jb 
.Lecb_less_than_16_gfni; + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx); + + call __aria_gfni_avx_crypt_16way; + + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %r11); + +.Lecb_end_gfni: + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + +.Lecb_less_than_16_gfni: + pushq %r8; + inpack_1_15_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, %r8d); + + call __aria_gfni_avx_crypt_16way; + + popq %rax; + write_output_1_15(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, + %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, + %xmm14, %xmm15, %r11, %eax); + + jmp .Lecb_end_gfni; + CFI_ENDPROC(); +ELF(.size _gcry_aria_gfni_avx_ecb_crypt_blk1_16, + .-_gcry_aria_gfni_avx_ecb_crypt_blk1_16;) + +.align 16 +.globl _gcry_aria_gfni_avx_ctr_crypt_blk16 +ELF(.type _gcry_aria_gfni_avx_ctr_crypt_blk16, at function;) +_gcry_aria_gfni_avx_ctr_crypt_blk16: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 16), %rsp; + andq $~15, %rsp; + + movq %rcx, %r8; /* %r8: iv */ + movq %rsp, %rcx; /* %rcx: keystream */ + call __aria_aesni_avx_ctr_gen_keystream_16way + + pushq %rsi; + movq %rdx, %r11; + movq %rcx, %rsi; /* use stack for temporary store */ + movq %rcx, %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_gfni_avx_crypt_16way; + + popq %rsi; + vpxor (0 * 16)(%r11), %xmm1, %xmm1; + vpxor (1 * 16)(%r11), %xmm0, %xmm0; + vpxor (2 * 16)(%r11), %xmm3, %xmm3; + vpxor (3 * 16)(%r11), %xmm2, %xmm2; + vpxor (4 * 16)(%r11), %xmm4, %xmm4; + vpxor (5 * 16)(%r11), %xmm5, %xmm5; + vpxor (6 * 16)(%r11), %xmm6, %xmm6; + vpxor (7 * 16)(%r11), %xmm7, %xmm7; + vpxor (8 * 16)(%r11), %xmm8, %xmm8; + vpxor (9 * 16)(%r11), %xmm9, %xmm9; + vpxor (10 * 16)(%r11), %xmm10, %xmm10; + vpxor (11 * 16)(%r11), %xmm11, %xmm11; + vpxor (12 * 16)(%r11), %xmm12, %xmm12; + vpxor (13 * 16)(%r11), %xmm13, %xmm13; + vpxor (14 * 16)(%r11), %xmm14, %xmm14; + vpxor (15 * 16)(%r11), %xmm15, %xmm15; + write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rsi); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_gfni_avx_ctr_crypt_blk16,.-_gcry_aria_gfni_avx_ctr_crypt_blk16;) +#endif /* CONFIG_AS_GFNI */ + +#endif /* ENABLE_AVX_SUPPORT && ENABLE_AESNI_SUPPORT */ +#endif /* __x86_64 */ diff --git a/cipher/aria-aesni-avx2-amd64.S b/cipher/aria-aesni-avx2-amd64.S new file mode 100644 index 00000000..f09a9042 --- /dev/null +++ b/cipher/aria-aesni-avx2-amd64.S @@ -0,0 +1,1449 @@ +/* aria-aesni-avx2-amd64.S - AESNI/GFNI/AVX2 implementation of ARIA cipher + * + * Copyright (C) 2022-2023 Taehee Yoo + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#ifdef __x86_64 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) + +#include "asm-common-amd64.h" + +#ifdef ENABLE_GFNI_SUPPORT +# define CONFIG_AS_GFNI 1 +#endif + +/* struct ARIA_context: */ +#define ARIA_BLOCK_SIZE 16 +#define ARIA_MAX_RD_KEYS 17 +#define ARIA_CTX_enc_key 0 +#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS)) +#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS)) + +/* register macros */ +#define CTX %rdi + +#define ymm0_x xmm0 +#define ymm1_x xmm1 +#define ymm2_x xmm2 +#define ymm3_x xmm3 +#define ymm4_x xmm4 +#define ymm5_x xmm5 +#define ymm6_x xmm6 +#define ymm7_x xmm7 +#define ymm8_x xmm8 +#define ymm9_x xmm9 +#define ymm10_x xmm10 +#define ymm11_x xmm11 +#define ymm12_x xmm12 +#define ymm13_x xmm13 +#define ymm14_x xmm14 +#define ymm15_x xmm15 + +/* helper macros */ +#define STACK_DEPTH (2 * 8 + 16 * 32 + 31) + +#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +/* asm macros */ +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, 
d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +#define debyteslice_16x16b(a0, b0, c0, d0, \ + a1, b1, c1, d1, \ + a2, b2, c2, d2, \ + a3, b3, c3, d3, \ + st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(c0, d0, a0, b0, d2, d3); \ + transpose_4x4(c1, d1, a1, b1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(c2, d2, a2, b2, b0, b1); \ + transpose_4x4(c3, d3, a3, b3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + rio) \ + vmovdqu (0 * 32)(rio), x0; \ + vmovdqu (1 * 32)(rio), x1; \ + vmovdqu (2 * 32)(rio), x2; \ + vmovdqu (3 * 32)(rio), x3; \ + vmovdqu (4 * 32)(rio), x4; \ + vmovdqu (5 * 32)(rio), x5; \ + vmovdqu (6 * 32)(rio), x6; \ + vmovdqu (7 * 32)(rio), x7; \ + vmovdqu (8 * 32)(rio), y0; \ + vmovdqu (9 * 32)(rio), y1; \ + vmovdqu (10 * 32)(rio), y2; \ + vmovdqu (11 * 32)(rio), y3; \ + vmovdqu (12 * 32)(rio), y4; \ + vmovdqu (13 * 32)(rio), y5; \ + vmovdqu (14 * 32)(rio), y6; \ + vmovdqu (15 * 32)(rio), y7; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_ab, mem_cd) \ + byteslice_16x16b(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 32(mem_ab); \ + vmovdqu x1, 1 * 32(mem_ab); \ + vmovdqu x2, 2 * 32(mem_ab); \ + vmovdqu x3, 3 * 32(mem_ab); \ + vmovdqu x4, 4 * 32(mem_ab); \ + vmovdqu x5, 5 * 32(mem_ab); \ + vmovdqu x6, 6 * 32(mem_ab); \ + vmovdqu x7, 7 * 32(mem_ab); \ + vmovdqu y0, 0 * 32(mem_cd); \ + vmovdqu y1, 1 * 32(mem_cd); \ + vmovdqu y2, 2 * 32(mem_cd); \ + vmovdqu y3, 3 * 32(mem_cd); \ + vmovdqu y4, 4 * 32(mem_cd); \ + vmovdqu y5, 5 * 32(mem_cd); \ + vmovdqu y6, 6 * 32(mem_cd); \ + vmovdqu y7, 7 * 32(mem_cd); + +#define write_output(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem) \ + vmovdqu x0, 0 * 32(mem); \ + vmovdqu x1, 1 * 32(mem); \ + vmovdqu x2, 2 * 32(mem); \ + vmovdqu x3, 3 * 32(mem); \ + vmovdqu x4, 4 * 32(mem); \ + vmovdqu x5, 5 * 32(mem); \ + vmovdqu x6, 6 * 32(mem); \ + vmovdqu x7, 7 * 32(mem); \ + vmovdqu y0, 8 * 32(mem); \ + vmovdqu y1, 9 * 32(mem); \ + vmovdqu y2, 10 * 32(mem); \ + vmovdqu y3, 
11 * 32(mem); \ + vmovdqu y4, 12 * 32(mem); \ + vmovdqu y5, 13 * 32(mem); \ + vmovdqu y6, 14 * 32(mem); \ + vmovdqu y7, 15 * 32(mem); \ + +#define aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \ + vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \ + vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \ + vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \ + vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \ + vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \ + vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \ + vmovdqu x7, ((idx + 7) * 32)(mem_tmp); + +#define aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, idx) \ + vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \ + vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \ + vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \ + vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \ + vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \ + vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \ + vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \ + vmovdqu ((idx + 7) * 32)(mem_tmp), x7; + +#define aria_ark_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, rk, idx, round) \ + /* AddRoundKey */ \ + vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \ + vpxor t0, x0, x0; \ + vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \ + vpxor t0, x1, x1; \ + vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \ + vpxor t0, x2, x2; \ + vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \ + vpxor t0, x3, x3; \ + vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \ + vpxor t0, x4, x4; \ + vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \ + vpxor t0, x5, x5; \ + vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \ + vpxor t0, x6, x6; \ + vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \ + vpxor t0, x7, x7; + +#ifdef CONFIG_AS_GFNI +#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0; \ + vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1; \ + vpbroadcastq .Ltf_id_bitmatrix rRIP, t2; \ + vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3; \ + vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ + vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ + vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ + vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ + vgf2p8affineinvqb $0, t2, x2, x2; \ + vgf2p8affineinvqb $0, t2, x6, x6; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ + vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ + vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ + vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ + vgf2p8affineinvqb $0, t2, x3, x3; \ + vgf2p8affineinvqb $0, t2, x7, x7 +#endif /* CONFIG_AS_GFNI */ + +#define aria_sbox_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + t0, t1, t2, t3, \ + t4, t5, t6, t7) \ + vpxor t7, t7, t7; \ + vpxor t6, t6, t6; \ + vbroadcasti128 .Linv_shift_row rRIP, t0; \ + vbroadcasti128 .Lshift_row rRIP, t1; \ + vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\ + vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\ + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\ + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\ + \ + vextracti128 $1, x0, t6##_x; \ + vaesenclast t7##_x, x0##_x, x0##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x0, x0; \ + \ + vextracti128 $1, x4, t6##_x; \ + vaesenclast t7##_x, x4##_x, x4##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x4, x4; \ + \ + vextracti128 $1, x1, t6##_x; \ + vaesenclast t7##_x, x1##_x, x1##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x1, x1; \ + \ + vextracti128 $1, x5, 
t6##_x; \ + vaesenclast t7##_x, x5##_x, x5##_x; \ + vaesenclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x5, x5; \ + \ + vextracti128 $1, x2, t6##_x; \ + vaesdeclast t7##_x, x2##_x, x2##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x2, x2; \ + \ + vextracti128 $1, x6, t6##_x; \ + vaesdeclast t7##_x, x6##_x, x6##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x6, x6; \ + \ + vpbroadcastd .L0f0f0f0f rRIP, t6; \ + \ + /* AES inverse shift rows */ \ + vpshufb t0, x0, x0; \ + vpshufb t0, x4, x4; \ + vpshufb t0, x1, x1; \ + vpshufb t0, x5, x5; \ + vpshufb t1, x3, x3; \ + vpshufb t1, x7, x7; \ + vpshufb t1, x2, x2; \ + vpshufb t1, x6, x6; \ + \ + /* affine transformation for S2 */ \ + filter_8bit(x1, t2, t3, t6, t0); \ + /* affine transformation for S2 */ \ + filter_8bit(x5, t2, t3, t6, t0); \ + \ + /* affine transformation for X2 */ \ + filter_8bit(x3, t4, t5, t6, t0); \ + /* affine transformation for X2 */ \ + filter_8bit(x7, t4, t5, t6, t0); \ + \ + vpxor t6, t6, t6; \ + vextracti128 $1, x3, t6##_x; \ + vaesdeclast t7##_x, x3##_x, x3##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x3, x3; \ + \ + vextracti128 $1, x7, t6##_x; \ + vaesdeclast t7##_x, x7##_x, x7##_x; \ + vaesdeclast t7##_x, t6##_x, t6##_x; \ + vinserti128 $1, t6##_x, x7, x7; \ + +#define aria_diff_m(x0, x1, x2, x3, \ + t0, t1, t2, t3) \ + /* T = rotr32(X, 8); */ \ + /* X ^= T */ \ + vpxor x0, x3, t0; \ + vpxor x1, x0, t1; \ + vpxor x2, x1, t2; \ + vpxor x3, x2, t3; \ + /* X = T ^ rotr(X, 16); */ \ + vpxor t2, x0, x0; \ + vpxor x1, t3, t3; \ + vpxor t0, x2, x2; \ + vpxor t1, x3, x1; \ + vmovdqu t3, x3; + +#define aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7) \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; \ + \ + /* t2 ^= t3; */ \ + vpxor y4, y0, y0; \ + vpxor y5, y1, y1; \ + vpxor y6, y2, y2; \ + vpxor y7, y3, y3; \ + \ + /* t0 ^= t1; */ \ + vpxor x4, x0, x0; \ + vpxor x5, x1, x1; \ + vpxor x6, x2, x2; \ + vpxor x7, x3, x3; \ + \ + /* t3 ^= t1; */ \ + vpxor x4, y4, y4; \ + vpxor x5, y5, y5; \ + vpxor x6, y6, y6; \ + vpxor x7, y7, y7; \ + \ + /* t2 ^= t0; */ \ + vpxor x0, y0, y0; \ + vpxor x1, y1, y1; \ + vpxor x2, y2, y2; \ + vpxor x3, y3, y3; \ + \ + /* t1 ^= t2; */ \ + vpxor y0, x4, x4; \ + vpxor y1, x5, x5; \ + vpxor y2, x6, x6; \ + vpxor y3, x7, x7; + +#define aria_fe(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> 
BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ + y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); +#ifdef CONFIG_AS_GFNI +#define aria_fe_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, 
x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T3 = ABCD -> BADC \ + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ + * T0 = ABCD -> CDAB \ + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ + * T1 = ABCD -> DCBA \ + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ + */ \ + aria_diff_word(x2, x3, x0, x1, \ + x7, x6, x5, x4, \ + y0, y1, y2, y3, \ + y5, y4, y7, y6); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_fo_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); \ + aria_diff_word(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + /* aria_diff_byte() \ + * T1 = ABCD -> BADC \ + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ + * T2 = ABCD -> CDAB \ + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ + * T3 = ABCD -> DCBA \ + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ + */ \ + aria_diff_word(x0, x1, x2, x3, \ + x5, x4, x7, x6, \ + y2, y3, y0, y1, \ + y7, y6, y5, y4); \ + aria_store_state_8way(x3, x2, x1, x0, \ + x6, x7, x4, x5, \ + mem_tmp, 0); + +#define aria_ff_gfni(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, rk, round, last_round) \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 8, last_round); \ + \ + aria_store_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 8); \ + \ + aria_load_state_8way(x0, x1, x2, x3, \ + x4, x5, x6, x7, \ + mem_tmp, 0); \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, round); \ + \ + aria_sbox_8way_gfni(x2, x3, x0, x1, \ + x6, x7, x4, x5, \ + y0, y1, y2, y3, \ + y4, y5, y6, y7); \ + \ + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ + y0, rk, 0, last_round); \ + \ + aria_load_state_8way(y0, y1, y2, y3, \ + y4, y5, y6, y7, \ + mem_tmp, 8); +#endif /* CONFIG_AS_GFNI */ + + +SECTION_RODATA +.align 32 +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) + +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 
+.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lbige_addb_16_16: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + +.align 16 +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.Lshift_row: + .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03 + .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +/* AES inverse affine and S2 combined: + * 1 1 0 0 0 0 0 1 x0 0 + * 0 1 0 0 1 0 0 0 x1 0 + * 1 1 0 0 1 1 1 1 x2 0 + * 0 1 1 0 1 0 0 1 x3 1 + * 0 1 0 0 1 1 0 0 * x4 + 0 + * 0 1 0 1 1 0 0 0 x5 0 + * 0 0 0 0 0 1 0 1 x6 0 + * 1 1 1 0 0 1 1 1 x7 1 + */ +.Ltf_lo__inv_aff__and__s2: + .octa 0x92172DA81A9FA520B2370D883ABF8500 +.Ltf_hi__inv_aff__and__s2: + .octa 0x2B15FFC1AF917B45E6D8320C625CB688 + +/* X2 and AES forward affine combined: + * 1 0 1 1 0 0 0 1 x0 0 + * 0 1 1 1 1 0 1 1 x1 0 + * 0 0 0 1 1 0 1 0 x2 1 + * 0 1 0 0 0 1 0 0 x3 0 + * 0 0 1 1 1 0 1 1 * x4 + 0 + * 0 1 0 0 1 0 0 0 x5 0 + * 1 1 0 1 0 0 1 1 x6 0 + * 0 1 0 0 1 0 1 0 x7 0 + */ +.Ltf_lo__x2__and__fwd_aff: + .octa 0xEFAE0544FCBD1657B8F95213ABEA4100 +.Ltf_hi__x2__and__fwd_aff: + .octa 0x3F893781E95FE1576CDA64D2BA0CB204 + +#ifdef CONFIG_AS_GFNI +.align 8 +/* AES affine: */ +#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) +.Ltf_aff_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), + BV8(1, 1, 0, 0, 0, 1, 1, 1), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 0, 0, 1), + BV8(1, 1, 1, 1, 1, 0, 0, 0), + BV8(0, 1, 1, 1, 1, 1, 0, 0), + BV8(0, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 1, 1, 1)) + +/* AES inverse affine: */ +#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) +.Ltf_inv_bitmatrix: + .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 1, 0, 1, 0, 0, 1, 0), + BV8(0, 0, 1, 0, 1, 0, 0, 1), + BV8(1, 0, 0, 1, 0, 1, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 1, 0)) + +/* S2: */ +#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) +.Ltf_s2_bitmatrix: + .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), + BV8(0, 0, 1, 1, 1, 1, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(1, 1, 0, 0, 0, 0, 1, 1), + BV8(0, 1, 0, 0, 0, 0, 1, 1), + BV8(1, 1, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 1, 0, 1, 1, 0)) + +/* X2: */ +#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) +.Ltf_x2_bitmatrix: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 1, 1, 
0), + BV8(0, 0, 0, 0, 1, 0, 1, 0), + BV8(1, 1, 1, 0, 0, 0, 1, 1), + BV8(1, 1, 1, 0, 1, 1, 0, 0), + BV8(0, 1, 1, 0, 1, 0, 1, 1), + BV8(1, 0, 1, 1, 1, 1, 0, 1), + BV8(1, 0, 0, 1, 0, 0, 1, 1)) + +/* Identity matrix: */ +.Ltf_id_bitmatrix: + .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 1, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 1, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 1, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 1, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) + +#endif /* CONFIG_AS_GFNI */ + +/* 4-bit mask */ +.align 4 +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +.align 16 +ELF(.type __aria_aesni_avx2_crypt_32way, at function;) +__aria_aesni_avx2_crypt_32way: + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %ymm0..%ymm15: byte-sliced blocks + */ + CFI_STARTPROC(); + + movq %rsi, %rax; + leaq 8 * 32(%rax), %r8; + + movl ARIA_CTX_rounds(CTX), %r10d; + subl $2, %r10d; + + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r8); + aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 0); + leaq 1*16(%r9), %r9; + +.align 16 +.Loop_aesni: + aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 0); + aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 1); + leaq 2*16(%r9), %r9; + subl $2, %r10d; + jnz .Loop_aesni; + + aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 0, 1); + + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4, + %ymm9, %ymm13, %ymm0, %ymm5, + %ymm10, %ymm14, %ymm3, %ymm6, + %ymm11, %ymm15, %ymm2, %ymm7, + (%rax), (%r8)); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_aesni_avx2_crypt_32way,.-__aria_aesni_avx2_crypt_32way;) + +.align 16 +.globl _gcry_aria_aesni_avx2_ecb_crypt_blk32 +ELF(.type _gcry_aria_aesni_avx2_ecb_crypt_blk32, at function;) +_gcry_aria_aesni_avx2_ecb_crypt_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: round keys + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 32), %rsp; + andq $~31, %rsp; + + movq %rcx, %r9; + movq %rsi, %r11; + movq %rsp, %rsi; /* use stack for temporary store */ + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_aesni_avx2_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %r11); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_aesni_avx2_ecb_crypt_blk32, + .-_gcry_aria_aesni_avx2_ecb_crypt_blk32;) + +.align 16 +ELF(.type __aria_aesni_avx2_ctr_gen_keystream_32way, at function;) +__aria_aesni_avx2_ctr_gen_keystream_32way: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: keystream + * %r8: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + cmpb $(0x100 - 32), 15(%r8); + jb .Lctr_byteadd; + + movq 8(%r8), %r11; + bswapq %r11; + + vbroadcasti128 .Lbswap128_mask rRIP, %ymm6; + vpcmpeqd %ymm0, %ymm0, %ymm0; + vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */ + vpaddq 
%ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */ + + /* load IV and byteswap */ + vmovdqu (%r8), %xmm7; + vpshufb %xmm6, %xmm7, %xmm7; + vmovdqa %xmm7, %xmm3; + inc_le128(%xmm7, %xmm0, %xmm4); + vinserti128 $1, %xmm7, %ymm3, %ymm3; + vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */ + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 32), %r11; + ja .Lhandle_ctr_carry; + + /* construct IVs */ + vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */ + vpshufb %ymm6, %ymm3, %ymm9; + vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */ + vpshufb %ymm6, %ymm3, %ymm10; + vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */ + vpshufb %ymm6, %ymm3, %ymm11; + vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */ + vpshufb %ymm6, %ymm3, %ymm12; + vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */ + vpshufb %ymm6, %ymm3, %ymm13; + vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */ + vpshufb %ymm6, %ymm3, %ymm14; + vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */ + vpshufb %ymm6, %ymm3, %ymm15; + vmovdqu %ymm8, (0 * 32)(%rcx); + vmovdqu %ymm9, (1 * 32)(%rcx); + vmovdqu %ymm10, (2 * 32)(%rcx); + vmovdqu %ymm11, (3 * 32)(%rcx); + vmovdqu %ymm12, (4 * 32)(%rcx); + vmovdqu %ymm13, (5 * 32)(%rcx); + vmovdqu %ymm14, (6 * 32)(%rcx); + vmovdqu %ymm15, (7 * 32)(%rcx); + + vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */ + vpshufb %ymm6, %ymm3, %ymm8; + vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */ + vpshufb %ymm6, %ymm3, %ymm9; + vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */ + vpshufb %ymm6, %ymm3, %ymm10; + vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */ + vpshufb %ymm6, %ymm3, %ymm11; + vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */ + vpshufb %ymm6, %ymm3, %ymm12; + vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */ + vpshufb %ymm6, %ymm3, %ymm13; + vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */ + vpshufb %ymm6, %ymm3, %ymm14; + vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */ + vpshufb %ymm6, %ymm3, %ymm15; + vpsubq %ymm5, %ymm3, %ymm3; /* +32 */ + vpshufb %xmm6, %xmm3, %xmm3; + vmovdqu %xmm3, (%r8); + vmovdqu (0 * 32)(%rcx), %ymm0; + vmovdqu (1 * 32)(%rcx), %ymm1; + vmovdqu (2 * 32)(%rcx), %ymm2; + vmovdqu (3 * 32)(%rcx), %ymm3; + vmovdqu (4 * 32)(%rcx), %ymm4; + vmovdqu (5 * 32)(%rcx), %ymm5; + vmovdqu (6 * 32)(%rcx), %ymm6; + vmovdqu (7 * 32)(%rcx), %ymm7; + jmp .Lctr_carry_done; + + .Lhandle_ctr_carry: + /* construct IVs */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */ + vmovdqu %ymm8, (0 * 32)(%rcx); + vmovdqu %ymm9, (1 * 32)(%rcx); + vmovdqu %ymm10, (2 * 32)(%rcx); + vmovdqu %ymm11, (3 * 32)(%rcx); + vmovdqu %ymm12, (4 * 32)(%rcx); + vmovdqu %ymm13, (5 * 32)(%rcx); + vmovdqu %ymm14, (6 * 32)(%rcx); + vmovdqu %ymm15, (7 * 32)(%rcx); + + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb 
%ymm6, %ymm3, %ymm9; /* +19 ; +18 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */ + inc_le128(%ymm3, %ymm0, %ymm4); + inc_le128(%ymm3, %ymm0, %ymm4); + vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */ + inc_le128(%ymm3, %ymm0, %ymm4); + vextracti128 $1, %ymm3, %xmm3; + vpshufb %xmm6, %xmm3, %xmm3; /* +32 */ + vmovdqu %xmm3, (%r8); + vmovdqu (0 * 32)(%rcx), %ymm0; + vmovdqu (1 * 32)(%rcx), %ymm1; + vmovdqu (2 * 32)(%rcx), %ymm2; + vmovdqu (3 * 32)(%rcx), %ymm3; + vmovdqu (4 * 32)(%rcx), %ymm4; + vmovdqu (5 * 32)(%rcx), %ymm5; + vmovdqu (6 * 32)(%rcx), %ymm6; + vmovdqu (7 * 32)(%rcx), %ymm7; + +.Lctr_carry_done: + ret_spec_stop; + +.Lctr_byteadd: + vbroadcasti128 (%r8), %ymm8; + addb $32, 15(%r8); + vpaddb .Lbige_addb_16_16 rRIP, %ymm8, %ymm15; + vpaddb .Lbige_addb_0_1 rRIP, %ymm8, %ymm0; + vpaddb .Lbige_addb_2_3 rRIP, %ymm8, %ymm1; + vpaddb .Lbige_addb_4_5 rRIP, %ymm8, %ymm2; + vpaddb .Lbige_addb_6_7 rRIP, %ymm8, %ymm3; + vpaddb .Lbige_addb_8_9 rRIP, %ymm8, %ymm4; + vpaddb .Lbige_addb_10_11 rRIP, %ymm8, %ymm5; + vpaddb .Lbige_addb_12_13 rRIP, %ymm8, %ymm6; + vpaddb .Lbige_addb_14_15 rRIP, %ymm8, %ymm7; + vpaddb .Lbige_addb_0_1 rRIP, %ymm15, %ymm8; + vpaddb .Lbige_addb_2_3 rRIP, %ymm15, %ymm9; + vpaddb .Lbige_addb_4_5 rRIP, %ymm15, %ymm10; + vpaddb .Lbige_addb_6_7 rRIP, %ymm15, %ymm11; + vpaddb .Lbige_addb_8_9 rRIP, %ymm15, %ymm12; + vpaddb .Lbige_addb_10_11 rRIP, %ymm15, %ymm13; + vpaddb .Lbige_addb_12_13 rRIP, %ymm15, %ymm14; + vpaddb .Lbige_addb_14_15 rRIP, %ymm15, %ymm15; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_aesni_avx2_ctr_gen_keystream_32way, + .-__aria_aesni_avx2_ctr_gen_keystream_32way;) + +.align 16 +.globl _gcry_aria_aesni_avx2_ctr_crypt_blk32 +ELF(.type _gcry_aria_aesni_avx2_ctr_crypt_blk32, at function;) +_gcry_aria_aesni_avx2_ctr_crypt_blk32: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 32), %rsp; + andq $~31, %rsp; + + movq %rcx, %r8; /* %r8: iv */ + movq %rsp, %rcx; /* %rcx: keystream */ + call __aria_aesni_avx2_ctr_gen_keystream_32way; + + pushq %rsi; + movq %rdx, %r11; + movq %rcx, %rsi; /* use stack for temporary store */ + movq %rcx, %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_aesni_avx2_crypt_32way; + + popq %rsi; + vpxor (0 * 32)(%r11), %ymm1, %ymm1; + vpxor (1 * 32)(%r11), %ymm0, %ymm0; + vpxor (2 * 32)(%r11), %ymm3, %ymm3; + vpxor (3 * 32)(%r11), %ymm2, %ymm2; + vpxor (4 * 32)(%r11), %ymm4, %ymm4; + vpxor (5 * 32)(%r11), %ymm5, %ymm5; + vpxor (6 * 32)(%r11), %ymm6, %ymm6; + vpxor (7 * 32)(%r11), %ymm7, %ymm7; + vpxor (8 * 32)(%r11), %ymm8, %ymm8; + vpxor (9 * 32)(%r11), %ymm9, %ymm9; + vpxor (10 * 32)(%r11), %ymm10, %ymm10; + vpxor (11 * 32)(%r11), %ymm11, %ymm11; + vpxor (12 * 32)(%r11), %ymm12, %ymm12; + vpxor (13 * 32)(%r11), %ymm13, %ymm13; + vpxor (14 * 32)(%r11), %ymm14, %ymm14; + vpxor (15 * 32)(%r11), %ymm15, %ymm15; + write_output(%ymm1, %ymm0, 
%ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rsi); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32, + .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;) + +#ifdef CONFIG_AS_GFNI +.align 16 +ELF(.type __aria_gfni_avx2_crypt_32way, at function;) +__aria_gfni_avx2_crypt_32way: + /* input: + * %r9: rk + * %rsi: dst + * %rdx: src + * %ymm0..%ymm15: byte-sliced blocks + */ + CFI_STARTPROC(); + + movq %rsi, %rax; + leaq 8 * 32(%rax), %r8; + + movl ARIA_CTX_rounds(CTX), %r10d; + subl $2, %r10d; + + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r8); + aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 0); + leaq 1*16(%r9), %r9; + +.align 16 +.Loop_gfni: + aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2, + %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, + %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 0); + aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10, + %ymm12, %ymm13, %ymm14, %ymm15, + %ymm0, %ymm1, %ymm2, %ymm3, + %ymm4, %ymm5, %ymm6, %ymm7, + %rax, %r9, 1); + leaq 2*16(%r9), %r9; + subl $2, %r10d; + jnz .Loop_gfni; + + aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rax, %r9, 0, 1); + + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4, + %ymm9, %ymm13, %ymm0, %ymm5, + %ymm10, %ymm14, %ymm3, %ymm6, + %ymm11, %ymm15, %ymm2, %ymm7, + (%rax), (%r8)); + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __aria_gfni_avx2_crypt_32way,.-__aria_gfni_avx2_crypt_32way;) + +.align 16 +.globl _gcry_aria_gfni_avx2_ecb_crypt_blk32 +ELF(.type _gcry_aria_gfni_avx2_ecb_crypt_blk32, at function;) +_gcry_aria_gfni_avx2_ecb_crypt_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: round keys + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 32), %rsp; + andq $~31, %rsp; + + movq %rcx, %r9; + movq %rsi, %r11; + movq %rsp, %rsi; /* use stack for temporary store */ + + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx); + + call __aria_gfni_avx2_crypt_32way; + + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %r11); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_gfni_avx2_ecb_crypt_blk32, + .-_gcry_aria_gfni_avx2_ecb_crypt_blk32;) + +.align 16 +.globl _gcry_aria_gfni_avx2_ctr_crypt_blk32 +ELF(.type _gcry_aria_gfni_avx2_ctr_crypt_blk32, at function;) +_gcry_aria_gfni_avx2_ctr_crypt_blk32: + /* input: + * %rdi: ctx + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + subq $(16 * 32), %rsp; + andq $~31, %rsp; + + movq %rcx, %r8; /* %r8: iv */ + movq %rsp, %rcx; /* %rcx: keystream */ + call __aria_aesni_avx2_ctr_gen_keystream_32way; + + pushq %rsi; + movq %rdx, %r11; + movq %rcx, %rsi; /* use stack for temporary store */ + movq %rcx, %rdx; + leaq ARIA_CTX_enc_key(CTX), %r9; + + call __aria_gfni_avx2_crypt_32way; + + popq %rsi; + vpxor (0 
* 32)(%r11), %ymm1, %ymm1; + vpxor (1 * 32)(%r11), %ymm0, %ymm0; + vpxor (2 * 32)(%r11), %ymm3, %ymm3; + vpxor (3 * 32)(%r11), %ymm2, %ymm2; + vpxor (4 * 32)(%r11), %ymm4, %ymm4; + vpxor (5 * 32)(%r11), %ymm5, %ymm5; + vpxor (6 * 32)(%r11), %ymm6, %ymm6; + vpxor (7 * 32)(%r11), %ymm7, %ymm7; + vpxor (8 * 32)(%r11), %ymm8, %ymm8; + vpxor (9 * 32)(%r11), %ymm9, %ymm9; + vpxor (10 * 32)(%r11), %ymm10, %ymm10; + vpxor (11 * 32)(%r11), %ymm11, %ymm11; + vpxor (12 * 32)(%r11), %ymm12, %ymm12; + vpxor (13 * 32)(%r11), %ymm13, %ymm13; + vpxor (14 * 32)(%r11), %ymm14, %ymm14; + vpxor (15 * 32)(%r11), %ymm15, %ymm15; + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rsi); + + movl $STACK_DEPTH, %eax; + leave; + CFI_LEAVE(); + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_aria_gfni_avx2_ctr_crypt_blk32, + .-_gcry_aria_gfni_avx2_ctr_crypt_blk32;) +#endif /* CONFIG_AS_GFNI */ + +#endif /* ENABLE_AVX2_SUPPORT && ENABLE_AESNI_SUPPORT */ +#endif /* __x86_64 */ diff --git a/cipher/aria.c b/cipher/aria.c index 700ea409..18952d04 100644 --- a/cipher/aria.c +++ b/cipher/aria.c @@ -50,6 +50,60 @@ #endif +/* USE_AESNI_AVX inidicates whether to compile with Intel AES-NI/AVX code. */ +#undef USE_AESNI_AVX +#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_AESNI_AVX 1 +# endif +#endif + +/* USE_GFNI_AVX inidicates whether to compile with Intel GFNI/AVX code. */ +#undef USE_GFNI_AVX +#if defined(USE_AESNI_AVX) && defined(ENABLE_GFNI_SUPPORT) +# define USE_GFNI_AVX 1 +#endif + +/* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */ +#undef USE_AESNI_AVX2 +#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_AESNI_AVX2 1 +# endif +#endif + +/* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. */ +#undef USE_GFNI_AVX2 +#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) +# define USE_GFNI_AVX2 1 +#endif + +/* How many parallel blocks to handle in bulk processing functions. */ +#if defined(USE_AESNI_AVX2) +# define MAX_PARALLEL_BLKS 32 +#elif defined(USE_AESNI_AVX) +# define MAX_PARALLEL_BLKS 16 +#else +# define MAX_PARALLEL_BLKS 8 +#endif + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + + static const char *aria_selftest (void); @@ -69,6 +123,15 @@ typedef struct unsigned int decryption_prepared:1; /* The decryption key is set up. */ unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for * current bulk operation. 
*/ + +#ifdef USE_AESNI_AVX + unsigned int use_aesni_avx:1; + unsigned int use_gfni_avx:1; +#endif +#ifdef USE_AESNI_AVX2 + unsigned int use_aesni_avx2:1; + unsigned int use_gfni_avx2:1; +#endif } ARIA_context; @@ -363,6 +426,102 @@ static struct 0 }; +#ifdef USE_AESNI_AVX +extern unsigned int +_gcry_aria_aesni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out, + const byte *in, const void *key, + u64 nblks) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_aesni_avx_ctr_crypt_blk16(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; + +#ifdef USE_GFNI_AVX +extern unsigned int +_gcry_aria_gfni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out, + const byte *in, const void *key, + u64 nblks) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_gfni_avx_ctr_crypt_blk16(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; +#endif /* USE_GFNI_AVX */ + +static inline unsigned int +aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in, + const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks) +{ +#ifdef USE_GFNI_AVX + if (ctx->use_gfni_avx) + return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX */ + return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) + + ASM_EXTRA_STACK; +} + +static inline unsigned int +aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in, + byte *iv) +{ +#ifdef USE_GFNI_AVX + if (ctx->use_gfni_avx) + return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX */ + return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv) + + ASM_EXTRA_STACK; +} +#endif /* USE_AESNI_AVX */ + +#ifdef USE_AESNI_AVX2 +extern unsigned int +_gcry_aria_aesni_avx2_ecb_crypt_blk32(const void *ctx, byte *out, + const byte *in, + const void *key) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; + +#ifdef USE_GFNI_AVX2 +extern unsigned int +_gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out, + const byte *in, + const void *key) ASM_FUNC_ABI; +extern unsigned int +_gcry_aria_gfni_avx2_ctr_crypt_blk32(const void *ctx, byte *out, + const byte *in, byte *iv) ASM_FUNC_ABI; +#endif /* USE_GFNI_AVX2 */ + +static inline unsigned int +aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, + const u32 key[][ARIA_RD_KEY_WORDS]) +{ +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX2 */ + return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key) + + ASM_EXTRA_STACK; +} + +static inline unsigned int +aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, + byte *iv) +{ +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv) + + ASM_EXTRA_STACK; + else +#endif /* USE_GFNI_AVX2 */ + return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv) + + ASM_EXTRA_STACK; +} +#endif /* USE_AESNI_AVX2 */ + /* Prefetching for sbox tables. 
*/ static inline void prefetch_table(const volatile byte *tab, size_t len) @@ -864,7 +1023,47 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in, size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS]) { unsigned int burn_depth = 0; - unsigned int nburn; + +#ifdef USE_AESNI_AVX2 + if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2) + { + unsigned int nburn = 0; + + while (num_blks >= 32) + { + nburn = aria_avx2_ecb_crypt_blk32 (ctx, out, in, key); + in += 32 * ARIA_BLOCK_SIZE; + out += 32 * ARIA_BLOCK_SIZE; + num_blks -= 32; + } + + burn_depth = nburn > burn_depth ? nburn : burn_depth; + + if (num_blks == 0) + return burn_depth; + } +#endif /* USE_AESNI_AVX2 */ + +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx || ctx->use_gfni_avx) + { + unsigned int nburn = 0; + + while (num_blks >= 3) + { + size_t curr_blks = num_blks < 16 ? num_blks : 16; + nburn = aria_avx_ecb_crypt_blk1_16 (ctx, out, in, key, curr_blks); + in += curr_blks * ARIA_BLOCK_SIZE; + out += curr_blks * ARIA_BLOCK_SIZE; + num_blks -= curr_blks; + } + + burn_depth = nburn > burn_depth ? nburn : burn_depth; + + if (num_blks == 0) + return burn_depth; + } +#endif /* USE_AESNI_AVX */ if (!ctx->bulk_prefetch_ready) { @@ -874,19 +1073,19 @@ aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in, while (num_blks >= 2) { - nburn = aria_crypt_2blks (ctx, out, in, key); + unsigned int nburn = aria_crypt_2blks (ctx, out, in, key); burn_depth = nburn > burn_depth ? nburn : burn_depth; - out += 2 * 16; - in += 2 * 16; + out += 2 * ARIA_BLOCK_SIZE; + in += 2 * ARIA_BLOCK_SIZE; num_blks -= 2; } while (num_blks) { - nburn = aria_crypt (ctx, out, in, key); + unsigned int nburn = aria_crypt (ctx, out, in, key); burn_depth = nburn > burn_depth ? nburn : burn_depth; - out += 16; - in += 16; + out += ARIA_BLOCK_SIZE; + in += ARIA_BLOCK_SIZE; num_blks--; } @@ -925,12 +1124,46 @@ _gcry_aria_ctr_enc(void *context, unsigned char *ctr, const byte *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_AESNI_AVX2 + if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2) + { + size_t nburn = 0; + + while (nblocks >= 32) + { + nburn = aria_avx2_ctr_crypt_blk32 (ctx, outbuf, inbuf, ctr); + inbuf += 32 * ARIA_BLOCK_SIZE; + outbuf += 32 * ARIA_BLOCK_SIZE; + nblocks -= 32; + } + + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } +#endif /* USE_AESNI_AVX */ + +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx || ctx->use_gfni_avx) + { + size_t nburn = 0; + + while (nblocks >= 16) + { + nburn = aria_avx_ctr_crypt_blk16 (ctx, outbuf, inbuf, ctr); + inbuf += 16 * ARIA_BLOCK_SIZE; + outbuf += 16 * ARIA_BLOCK_SIZE; + nblocks -= 16; + } + + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } +#endif /* USE_AESNI_AVX */ + /* Process remaining blocks. */ if (nblocks) { - byte tmpbuf[16 * ARIA_BLOCK_SIZE]; + byte tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; - size_t nburn; + size_t nburn = 0; ctx->bulk_prefetch_ready = 0; @@ -1002,7 +1235,7 @@ _gcry_aria_cbc_dec(void *context, unsigned char *iv, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1062,7 +1295,7 @@ _gcry_aria_cfb_dec(void *context, unsigned char *iv, /* Process remaining blocks. 
*/ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1099,14 +1332,14 @@ _gcry_aria_ecb_crypt (void *context, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - bulk_crypt_fn_t crypt_blk1_16; + bulk_crypt_fn_t crypt_blk1_n; size_t nburn; ctx->bulk_prefetch_ready = 0; - crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks; - nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_16, - outbuf, inbuf, nblocks, 16); + nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_n, + outbuf, inbuf, nblocks, MAX_PARALLEL_BLKS); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; } @@ -1133,15 +1366,15 @@ _gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; - bulk_crypt_fn_t crypt_blk1_16; + bulk_crypt_fn_t crypt_blk1_n; size_t nburn; ctx->bulk_prefetch_ready = 0; - crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks; - nburn = bulk_xts_crypt_128(ctx, crypt_blk1_16, + nburn = bulk_xts_crypt_128(ctx, crypt_blk1_n, outbuf, inbuf, nblocks, tweak, tmpbuf, sizeof(tmpbuf) / ARIA_BLOCK_SIZE, @@ -1169,7 +1402,7 @@ _gcry_aria_ctr32le_enc(void *context, unsigned char *ctr, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1208,15 +1441,15 @@ _gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; - bulk_crypt_fn_t crypt_blk1_16; + bulk_crypt_fn_t crypt_blk1_n; size_t nburn; ctx->bulk_prefetch_ready = 0; - crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks; - nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_16, outbuf, inbuf, nblocks, + nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_n, outbuf, inbuf, nblocks, &blkn, encrypt, tmpbuf, sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used); @@ -1245,7 +1478,7 @@ _gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) /* Process remaining blocks. 
*/ if (nblocks) { - unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE]; unsigned int tmp_used = ARIA_BLOCK_SIZE; size_t nburn; @@ -1275,6 +1508,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen, ARIA_context *ctx = c; static int initialized = 0; static const char *selftest_failed = NULL; + unsigned int hwf = _gcry_get_hw_features (); + + (void)hwf; if (keylen != 16 && keylen != 24 && keylen != 32) return GPG_ERR_INV_KEYLEN; @@ -1290,6 +1526,19 @@ aria_setkey(void *c, const byte *key, unsigned keylen, if (selftest_failed) return GPG_ERR_SELFTEST_FAILED; +#ifdef USE_AESNI_AVX2 + ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); +#endif +#ifdef USE_GFNI_AVX2 + ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); +#endif +#ifdef USE_AESNI_AVX + ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX); +#endif +#ifdef USE_GFNI_AVX + ctx->use_gfni_avx = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX); +#endif + /* Setup bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cbc_enc = _gcry_aria_cbc_enc; diff --git a/configure.ac b/configure.ac index 9163b2ed..4f983a58 100644 --- a/configure.ac +++ b/configure.ac @@ -3034,6 +3034,14 @@ LIST_MEMBER(aria, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS aria.lo" AC_DEFINE(USE_ARIA, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx-amd64.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS aria-aesni-avx2-amd64.lo" + ;; + esac fi LIST_MEMBER(dsa, $enabled_pubkey_ciphers) -- 2.37.2 From jussi.kivilinna at iki.fi Mon Feb 20 18:38:49 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 20 Feb 2023 19:38:49 +0200 Subject: [PATCH 4/5] aria-avx512: small optimization for aria_diff_m In-Reply-To: <0b0367e5-d5f2-3ed5-58c9-be59a7d770d3@gmail.com> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> <20230219084910.1302701-4-jussi.kivilinna@iki.fi> <0b0367e5-d5f2-3ed5-58c9-be59a7d770d3@gmail.com> Message-ID: <821ad653-4b27-036c-4938-f3c2904eea02@iki.fi> Hello, On 20.2.2023 12.54, Taehee Yoo wrote: > On 2/19/23 17:49, Jussi Kivilinna wrote: > > Hi Jussi, > Thank you so much for this optimization! > > I tested this optimization in the kernel. > It works very well. > In my machine(i3-12100), it improves performance ~9%, awesome! Interesting.. I'd expect alderlake to behave similarly to tigerlake. Did you test with version that has unrolled round functions? In libgcrypt, I changed from round unrolling to using loops in order to reduce code size and to allow code to fit into uop-cache. Maybe speed increase happens since vpternlogq reduces code-size for unrolled version enough and algorithm fits into i3-12100's uop-cache, giving the extra performance. -Jussi > It will be really helpful to the kernel side aria-avx512 driver for improving performance. > > > * cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for > > 3-way XOR operation. > > --- > > > > Using vpternlogq gives small performance improvement on AMD Zen4. With > > Intel tiger-lake speed is the same as before. > > > > Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): > > > > Before: > > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > > ECB enc | 0.204 ns/B 4682 MiB/s 0.957 c/B 4700 > > ECB dec | 
0.204 ns/B 4668 MiB/s 0.960 c/B 4700 > > CTR enc | 0.212 ns/B 4509 MiB/s 0.994 c/B 4700 > > CTR dec | 0.212 ns/B 4490 MiB/s 0.998 c/B 4700 > > > > After (~3% faster): > > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > > ECB enc | 0.198 ns/B 4812 MiB/s 0.932 c/B 4700 > > ECB dec | 0.198 ns/B 4824 MiB/s 0.929 c/B 4700 > > CTR enc | 0.204 ns/B 4665 MiB/s 0.961 c/B 4700 > > CTR dec | 0.206 ns/B 4631 MiB/s 0.968 c/B 4700 > > > > Cc: Taehee Yoo > > Signed-off-by: Jussi Kivilinna > > --- > > cipher/aria-gfni-avx512-amd64.S | 16 ++++++---------- > > 1 file changed, 6 insertions(+), 10 deletions(-) > > > > diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S > > index 849c744b..24a49a89 100644 > > --- a/cipher/aria-gfni-avx512-amd64.S > > +++ b/cipher/aria-gfni-avx512-amd64.S > > @@ -406,21 +406,17 @@ > > vgf2p8affineinvqb $0, t2, y3, y3; \ > > vgf2p8affineinvqb $0, t2, y7, y7; > > > > - > > #define aria_diff_m(x0, x1, x2, x3, \ > > t0, t1, t2, t3) \ > > /* T = rotr32(X, 8); */ \ > > /* X ^= T */ \ > > - vpxorq x0, x3, t0; \ > > - vpxorq x1, x0, t1; \ > > - vpxorq x2, x1, t2; \ > > - vpxorq x3, x2, t3; \ > > /* X = T ^ rotr(X, 16); */ \ > > - vpxorq t2, x0, x0; \ > > - vpxorq x1, t3, t3; \ > > - vpxorq t0, x2, x2; \ > > - vpxorq t1, x3, x1; \ > > - vmovdqu64 t3, x3; > > + vmovdqa64 x0, t0; \ > > + vmovdqa64 x3, t3; \ > > + vpternlogq $0x96, x2, x1, x0; \ > > + vpternlogq $0x96, x2, x1, x3; \ > > + vpternlogq $0x96, t0, t3, x2; \ > > + vpternlogq $0x96, t0, t3, x1; > > > > #define aria_diff_word(x0, x1, x2, x3, \ > > x4, x5, x6, x7, \ > > Thank you so much! > Taehee Yoo > From ap420073 at gmail.com Mon Feb 20 12:04:44 2023 From: ap420073 at gmail.com (Taehee Yoo) Date: Mon, 20 Feb 2023 20:04:44 +0900 Subject: [PATCH 5/5] aria-avx2: add VAES accelerated implementation In-Reply-To: <20230219084910.1302701-5-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> <20230219084910.1302701-5-jussi.kivilinna@iki.fi> Message-ID: <6bebe676-c338-0dc6-6166-9f1398f6d377@gmail.com> On 2/19/23 17:49, Jussi Kivilinna wrote: Hi Jussi, Thank you so much for this implementation! I tested this in the kernel and it works really well. In my machine(i3-12100), as you mentioned, it improves 30% performance for the in-kernel aria-avx2 driver. It will be really helpful to the in-kernel aria-avx2 driver. > * cipher/aria-aesni-avx2-amd64.S (CONFIG_AS_VAES): New. > [CONFIG_AS_VAES]: Add VAES accelerated assembly macros and functions. > * cipher/aria.c (USE_VAES_AVX2): New. > (ARIA_context): Add 'use_vaes_avx2'. > (_gcry_aria_vaes_avx2_ecb_crypt_blk32) > (_gcry_aria_vaes_avx2_ctr_crypt_blk32) > (aria_avx2_ecb_crypt_blk32, aria_avx2_ctr_crypt_blk32): Add VAES/AVX2 > code paths. > (aria_setkey): Enable VAES/AVX2 implementation based on HW features. > -- > > This patch adds VAES/AVX2 accelerated ARIA block cipher implementation. 
> > VAES instruction set extends AESNI instructions to work on all 128-bit > lanes of 256-bit YMM and 512-bit ZMM vector registers, thus AES > operations can be executed directly on YMM registers without needing > to manually split YMM to two XMM halfs for AESNI instructions. > This improves performance on CPUs that support VAES but not GFNI, like > AMD Zen3. > > Benchmark on Ryzen 7 5800X (zen3, turbo-freq off): > > Before (AESNI/AVX2): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.559 ns/B 1707 MiB/s 2.12 c/B 3800 > ECB dec | 0.560 ns/B 1703 MiB/s 2.13 c/B 3800 > CTR enc | 0.570 ns/B 1672 MiB/s 2.17 c/B 3800 > CTR dec | 0.568 ns/B 1679 MiB/s 2.16 c/B 3800 > > After (VAES/AVX2, ~33% faster): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.435 ns/B 2193 MiB/s 1.65 c/B 3800 > ECB dec | 0.434 ns/B 2197 MiB/s 1.65 c/B 3800 > CTR enc | 0.413 ns/B 2306 MiB/s 1.57 c/B 3800 > CTR dec | 0.411 ns/B 2318 MiB/s 1.56 c/B 3800 > > Cc: Taehee Yoo > Signed-off-by: Jussi Kivilinna > --- > cipher/aria-aesni-avx2-amd64.S | 368 ++++++++++++++++++++++++++++++++- > cipher/aria.c | 50 ++++- > 2 files changed, 409 insertions(+), 9 deletions(-) > > diff --git a/cipher/aria-aesni-avx2-amd64.S b/cipher/aria-aesni-avx2-amd64.S > index f09a9042..f1dcdb70 100644 > --- a/cipher/aria-aesni-avx2-amd64.S > +++ b/cipher/aria-aesni-avx2-amd64.S > @@ -31,6 +31,9 @@ > #ifdef ENABLE_GFNI_SUPPORT > # define CONFIG_AS_GFNI 1 > #endif > +#ifdef HAVE_GCC_INLINE_ASM_VAES_VPCLMUL > +# define CONFIG_AS_VAES 1 > +#endif > > /* struct ARIA_context: */ > #define ARIA_BLOCK_SIZE 16 > @@ -358,6 +361,53 @@ > vgf2p8affineinvqb $0, t2, x7, x7 > #endif /* CONFIG_AS_GFNI */ > > +#ifdef CONFIG_AS_VAES > +#define aria_sbox_8way_vaes(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + t0, t1, t2, t3, \ > + t4, t5, t6, t7) \ > + vpxor t7, t7, t7; \ > + vpxor t6, t6, t6; \ > + vbroadcasti128 .Linv_shift_row rRIP, t0; \ > + vbroadcasti128 .Lshift_row rRIP, t1; \ > + vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\ > + vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\ > + vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\ > + vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\ > + \ > + vaesenclast t7, x0, x0; \ > + vaesenclast t7, x4, x4; \ > + vaesenclast t7, x1, x1; \ > + vaesenclast t7, x5, x5; \ > + vaesdeclast t7, x2, x2; \ > + vaesdeclast t7, x6, x6; \ > + \ > + vpbroadcastd .L0f0f0f0f rRIP, t6; \ > + \ > + /* AES inverse shift rows */ \ > + vpshufb t0, x0, x0; \ > + vpshufb t0, x4, x4; \ > + vpshufb t0, x1, x1; \ > + vpshufb t0, x5, x5; \ > + vpshufb t1, x3, x3; \ > + vpshufb t1, x7, x7; \ > + vpshufb t1, x2, x2; \ > + vpshufb t1, x6, x6; \ > + \ > + /* affine transformation for S2 */ \ > + filter_8bit(x1, t2, t3, t6, t0); \ > + /* affine transformation for S2 */ \ > + filter_8bit(x5, t2, t3, t6, t0); \ > + \ > + /* affine transformation for X2 */ \ > + filter_8bit(x3, t4, t5, t6, t0); \ > + /* affine transformation for X2 */ \ > + filter_8bit(x7, t4, t5, t6, t0); \ > + \ > + vaesdeclast t7, x3, x3; \ > + vaesdeclast t7, x7, x7; > +#endif /* CONFIG_AS_VAES */ > + > #define aria_sbox_8way(x0, x1, x2, x3, \ > x4, x5, x6, x7, \ > t0, t1, t2, t3, \ > @@ -432,7 +482,7 @@ > vextracti128 $1, x7, t6##_x; \ > vaesdeclast t7##_x, x7##_x, x7##_x; \ > vaesdeclast t7##_x, t6##_x, t6##_x; \ > - vinserti128 $1, t6##_x, x7, x7; \ > + vinserti128 $1, t6##_x, x7, x7; > > #define aria_diff_m(x0, x1, x2, x3, \ > t0, t1, t2, t3) \ > @@ -630,6 +680,7 @@ > aria_load_state_8way(y0, y1, y2, y3, \ > y4, y5, y6, y7, 
\ > mem_tmp, 8); > + > #ifdef CONFIG_AS_GFNI > #define aria_fe_gfni(x0, x1, x2, x3, \ > x4, x5, x6, x7, \ > @@ -786,6 +837,155 @@ > mem_tmp, 8); > #endif /* CONFIG_AS_GFNI */ > > +#ifdef CONFIG_AS_VAES > +#define aria_fe_vaes(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + y0, y1, y2, y3, \ > + y4, y5, y6, y7, \ > + mem_tmp, rk, round) \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 8, round); \ > + \ > + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ > + x5, y0, y1, y2, y3, y4, y5, \ > + y6, y7); \ > + \ > + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ > + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ > + aria_store_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 8); \ > + \ > + aria_load_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 0); \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 0, round); \ > + \ > + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ > + x5, y0, y1, y2, y3, y4, y5, \ > + y6, y7); \ > + \ > + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ > + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ > + aria_store_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 0); \ > + aria_load_state_8way(y0, y1, y2, y3, \ > + y4, y5, y6, y7, \ > + mem_tmp, 8); \ > + aria_diff_word(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + y0, y1, y2, y3, \ > + y4, y5, y6, y7); \ > + /* aria_diff_byte() \ > + * T3 = ABCD -> BADC \ > + * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ > + * T0 = ABCD -> CDAB \ > + * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ > + * T1 = ABCD -> DCBA \ > + * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ > + */ \ > + aria_diff_word(x2, x3, x0, x1, \ > + x7, x6, x5, x4, \ > + y0, y1, y2, y3, \ > + y5, y4, y7, y6); \ > + aria_store_state_8way(x3, x2, x1, x0, \ > + x6, x7, x4, x5, \ > + mem_tmp, 0); > + > +#define aria_fo_vaes(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + y0, y1, y2, y3, \ > + y4, y5, y6, y7, \ > + mem_tmp, rk, round) \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 8, round); \ > + \ > + aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \ > + x7, y0, y1, y2, y3, y4, y5, \ > + y6, y7); \ > + \ > + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ > + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ > + aria_store_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 8); \ > + \ > + aria_load_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 0); \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 0, round); \ > + \ > + aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \ > + x7, y0, y1, y2, y3, y4, y5, \ > + y6, y7); \ > + \ > + aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ > + aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ > + aria_store_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 0); \ > + aria_load_state_8way(y0, y1, y2, y3, \ > + y4, y5, y6, y7, \ > + mem_tmp, 8); \ > + aria_diff_word(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + y0, y1, y2, y3, \ > + y4, y5, y6, y7); \ > + /* aria_diff_byte() \ > + * T1 = ABCD -> BADC \ > + * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ > + * T2 = ABCD -> CDAB \ > + * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ > + * T3 = ABCD -> DCBA \ > + * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ > + */ \ > + aria_diff_word(x0, x1, x2, x3, \ > + x5, x4, x7, x6, \ > + y2, y3, y0, y1, \ > + y7, y6, y5, y4); \ > + aria_store_state_8way(x3, x2, x1, x0, \ > + x6, x7, x4, x5, \ > + mem_tmp, 0); > + > +#define aria_ff_vaes(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + y0, y1, y2, y3, \ > + y4, y5, y6, y7, \ > + mem_tmp, rk, 
round, last_round) \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 8, round); \ > + \ > + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ > + x5, y0, y1, y2, y3, y4, y5, \ > + y6, y7); \ > + \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 8, last_round); \ > + \ > + aria_store_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 8); \ > + \ > + aria_load_state_8way(x0, x1, x2, x3, \ > + x4, x5, x6, x7, \ > + mem_tmp, 0); \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 0, round); \ > + \ > + aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \ > + x5, y0, y1, y2, y3, y4, y5, \ > + y6, y7); \ > + \ > + aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ > + y0, rk, 0, last_round); \ > + \ > + aria_load_state_8way(y0, y1, y2, y3, \ > + y4, y5, y6, y7, \ > + mem_tmp, 8); > +#endif /* CONFIG_AS_VAES */ > > SECTION_RODATA > .align 32 > @@ -1279,6 +1479,172 @@ _gcry_aria_aesni_avx2_ctr_crypt_blk32: > ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32, > .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;) > > +#ifdef CONFIG_AS_VAES > +.align 16 > +ELF(.type __aria_vaes_avx2_crypt_32way, at function;) > +__aria_vaes_avx2_crypt_32way: > + /* input: > + * %r9: rk > + * %rsi: dst > + * %rdx: src > + * %ymm0..%ymm15: byte-sliced blocks > + */ > + CFI_STARTPROC(); > + > + movq %rsi, %rax; > + leaq 8 * 32(%rax), %r8; > + > + movl ARIA_CTX_rounds(CTX), %r10d; > + subl $2, %r10d; > + > + inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, > + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, > + %ymm15, %rax, %r8); > + aria_fo_vaes(%ymm8, %ymm9, %ymm10, %ymm11, > + %ymm12, %ymm13, %ymm14, %ymm15, > + %ymm0, %ymm1, %ymm2, %ymm3, > + %ymm4, %ymm5, %ymm6, %ymm7, > + %rax, %r9, 0); > + leaq 1*16(%r9), %r9; > + > +.align 16 > +.Loop_vaes: > + aria_fe_vaes(%ymm1, %ymm0, %ymm3, %ymm2, > + %ymm4, %ymm5, %ymm6, %ymm7, > + %ymm8, %ymm9, %ymm10, %ymm11, > + %ymm12, %ymm13, %ymm14, %ymm15, > + %rax, %r9, 0); > + aria_fo_vaes(%ymm9, %ymm8, %ymm11, %ymm10, > + %ymm12, %ymm13, %ymm14, %ymm15, > + %ymm0, %ymm1, %ymm2, %ymm3, > + %ymm4, %ymm5, %ymm6, %ymm7, > + %rax, %r9, 1); > + leaq 2*16(%r9), %r9; > + subl $2, %r10d; > + jnz .Loop_vaes; > + > + aria_ff_vaes(%ymm1, %ymm0, %ymm3, %ymm2, > + %ymm4, %ymm5, %ymm6, %ymm7, > + %ymm8, %ymm9, %ymm10, %ymm11, > + %ymm12, %ymm13, %ymm14, %ymm15, > + %rax, %r9, 0, 1); > + > + debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4, > + %ymm9, %ymm13, %ymm0, %ymm5, > + %ymm10, %ymm14, %ymm3, %ymm6, > + %ymm11, %ymm15, %ymm2, %ymm7, > + (%rax), (%r8)); > + > + ret_spec_stop; > + CFI_ENDPROC(); > +ELF(.size __aria_vaes_avx2_crypt_32way,.-__aria_vaes_avx2_crypt_32way;) > + > +.align 16 > +.globl _gcry_aria_vaes_avx2_ecb_crypt_blk32 > +ELF(.type _gcry_aria_vaes_avx2_ecb_crypt_blk32, at function;) > +_gcry_aria_vaes_avx2_ecb_crypt_blk32: > + /* input: > + * %rdi: ctx, CTX > + * %rsi: dst > + * %rdx: src > + * %rcx: round keys > + */ > + CFI_STARTPROC(); > + > + pushq %rbp; > + CFI_PUSH(%rbp); > + movq %rsp, %rbp; > + CFI_DEF_CFA_REGISTER(%rbp); > + > + subq $(16 * 32), %rsp; > + andq $~31, %rsp; > + > + movq %rcx, %r9; > + movq %rsi, %r11; > + movq %rsp, %rsi; /* use stack for temporary store */ > + > + inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, > + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, > + %ymm15, %rdx); > + > + call __aria_vaes_avx2_crypt_32way; > + > + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, > + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, > + 
%ymm15, %r11); > + > + movl $STACK_DEPTH, %eax; > + leave; > + CFI_LEAVE(); > + vzeroall; > + ret_spec_stop; > + CFI_ENDPROC(); > +ELF(.size _gcry_aria_vaes_avx2_ecb_crypt_blk32, > + .-_gcry_aria_vaes_avx2_ecb_crypt_blk32;) > + > +.align 16 > +.globl _gcry_aria_vaes_avx2_ctr_crypt_blk32 > +ELF(.type _gcry_aria_vaes_avx2_ctr_crypt_blk32, at function;) > +_gcry_aria_vaes_avx2_ctr_crypt_blk32: > + /* input: > + * %rdi: ctx > + * %rsi: dst > + * %rdx: src > + * %rcx: iv (big endian, 128bit) > + */ > + CFI_STARTPROC(); > + > + pushq %rbp; > + CFI_PUSH(%rbp); > + movq %rsp, %rbp; > + CFI_DEF_CFA_REGISTER(%rbp); > + > + subq $(16 * 32), %rsp; > + andq $~31, %rsp; > + > + movq %rcx, %r8; /* %r8: iv */ > + movq %rsp, %rcx; /* %rcx: keystream */ > + call __aria_aesni_avx2_ctr_gen_keystream_32way; > + > + pushq %rsi; > + movq %rdx, %r11; > + movq %rcx, %rsi; /* use stack for temporary store */ > + movq %rcx, %rdx; > + leaq ARIA_CTX_enc_key(CTX), %r9; > + > + call __aria_vaes_avx2_crypt_32way; > + > + popq %rsi; > + vpxor (0 * 32)(%r11), %ymm1, %ymm1; > + vpxor (1 * 32)(%r11), %ymm0, %ymm0; > + vpxor (2 * 32)(%r11), %ymm3, %ymm3; > + vpxor (3 * 32)(%r11), %ymm2, %ymm2; > + vpxor (4 * 32)(%r11), %ymm4, %ymm4; > + vpxor (5 * 32)(%r11), %ymm5, %ymm5; > + vpxor (6 * 32)(%r11), %ymm6, %ymm6; > + vpxor (7 * 32)(%r11), %ymm7, %ymm7; > + vpxor (8 * 32)(%r11), %ymm8, %ymm8; > + vpxor (9 * 32)(%r11), %ymm9, %ymm9; > + vpxor (10 * 32)(%r11), %ymm10, %ymm10; > + vpxor (11 * 32)(%r11), %ymm11, %ymm11; > + vpxor (12 * 32)(%r11), %ymm12, %ymm12; > + vpxor (13 * 32)(%r11), %ymm13, %ymm13; > + vpxor (14 * 32)(%r11), %ymm14, %ymm14; > + vpxor (15 * 32)(%r11), %ymm15, %ymm15; > + write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7, > + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, > + %ymm15, %rsi); > + > + movl $STACK_DEPTH, %eax; > + leave; > + CFI_LEAVE(); > + vzeroall; > + ret_spec_stop; > + CFI_ENDPROC(); > +ELF(.size _gcry_aria_vaes_avx2_ctr_crypt_blk32, > + .-_gcry_aria_vaes_avx2_ctr_crypt_blk32;) > +#endif /* CONFIG_AS_VAES */ > + > #ifdef CONFIG_AS_GFNI > .align 16 > ELF(.type __aria_gfni_avx2_crypt_32way, at function;) > diff --git a/cipher/aria.c b/cipher/aria.c > index 9eb42a2d..bc2d4384 100644 > --- a/cipher/aria.c > +++ b/cipher/aria.c > @@ -74,6 +74,12 @@ > # endif > #endif > > +/* USE_VAES_AVX2 inidicates whether to compile with Intel VAES/AVX2 code. */ > +#undef USE_VAES_AVX2 > +#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL) > +# define USE_VAES_AVX2 1 > +#endif > + > /* USE_GFNI_AVX2 inidicates whether to compile with Intel GFNI/AVX2 code. 
*/ > #undef USE_GFNI_AVX2 > #if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) > @@ -142,6 +148,7 @@ typedef struct > #endif > #ifdef USE_AESNI_AVX2 > unsigned int use_aesni_avx2:1; > + unsigned int use_vaes_avx2:1; > unsigned int use_gfni_avx2:1; > #endif > #ifdef USE_GFNI_AVX512 > @@ -464,12 +471,13 @@ static inline unsigned int > aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in, > const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks) > { > + if (0) { } > #ifdef USE_GFNI_AVX > - if (ctx->use_gfni_avx) > + else if (ctx->use_gfni_avx) > return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) > + ASM_EXTRA_STACK; > - else > #endif /* USE_GFNI_AVX */ > + else > return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks) > + ASM_EXTRA_STACK; > } > @@ -478,12 +486,13 @@ static inline unsigned int > aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in, > byte *iv) > { > + if (0) { } > #ifdef USE_GFNI_AVX > - if (ctx->use_gfni_avx) > + else if (ctx->use_gfni_avx) > return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv) > + ASM_EXTRA_STACK; > - else > #endif /* USE_GFNI_AVX */ > + else > return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv) > + ASM_EXTRA_STACK; > } > @@ -498,6 +507,16 @@ extern unsigned int > _gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out, > const byte *in, byte *iv) ASM_FUNC_ABI; > > +#ifdef USE_VAES_AVX2 > +extern unsigned int > +_gcry_aria_vaes_avx2_ecb_crypt_blk32(const void *ctx, byte *out, > + const byte *in, > + const void *key) ASM_FUNC_ABI; > +extern unsigned int > +_gcry_aria_vaes_avx2_ctr_crypt_blk32(const void *ctx, byte *out, > + const byte *in, byte *iv) ASM_FUNC_ABI; > +#endif /* USE_VAES_AVX2 */ > + > #ifdef USE_GFNI_AVX2 > extern unsigned int > _gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out, > @@ -512,12 +531,18 @@ static inline unsigned int > aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, > const u32 key[][ARIA_RD_KEY_WORDS]) > { > + if (0) { } > #ifdef USE_GFNI_AVX2 > - if (ctx->use_gfni_avx2) > + else if (ctx->use_gfni_avx2) > return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key) > + ASM_EXTRA_STACK; > - else > #endif /* USE_GFNI_AVX2 */ > +#ifdef USE_VAES_AVX2 > + else if (ctx->use_vaes_avx2) > + return _gcry_aria_vaes_avx2_ecb_crypt_blk32(ctx, out, in, key) > + + ASM_EXTRA_STACK; > +#endif /* USE_VAES_AVX2 */ > + else > return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key) > + ASM_EXTRA_STACK; > } > @@ -526,12 +551,18 @@ static inline unsigned int > aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in, > byte *iv) > { > + if (0) { } > #ifdef USE_GFNI_AVX2 > - if (ctx->use_gfni_avx2) > + else if (ctx->use_gfni_avx2) > return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv) > + ASM_EXTRA_STACK; > - else > #endif /* USE_GFNI_AVX2 */ > +#ifdef USE_VAES_AVX2 > + else if (ctx->use_vaes_avx2) > + return _gcry_aria_vaes_avx2_ctr_crypt_blk32(ctx, out, in, iv) > + + ASM_EXTRA_STACK; > +#endif /* USE_VAES_AVX2 */ > + else > return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv) > + ASM_EXTRA_STACK; > } > @@ -1614,6 +1645,9 @@ aria_setkey(void *c, const byte *key, unsigned keylen, > #ifdef USE_GFNI_AVX2 > ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); > #endif > +#ifdef USE_VAES_AVX2 > + ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); > +#endif > #ifdef USE_AESNI_AVX > ctx->use_aesni_avx = (hwf & 
HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX); > #endif Thank you so much for this implementation! Taehee Yoo From ap420073 at gmail.com Mon Feb 20 11:49:19 2023 From: ap420073 at gmail.com (Taehee Yoo) Date: Mon, 20 Feb 2023 19:49:19 +0900 Subject: [PATCH 3/5] aria-avx: small optimization for aria_ark_8way In-Reply-To: <20230219084910.1302701-3-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> <20230219084910.1302701-3-jussi.kivilinna@iki.fi> Message-ID: On 2/19/23 17:49, Jussi Kivilinna wrote: Hi Jussi, Thank you so much for this optimization! > * cipher/aria-aesni-avx-amd64.S (aria_ark_8way): Use 'vmovd' for > loading key material and 'vpshufb' for broadcasting from byte > locations 3, 2, 1 and 0. I tested this optimization in the kernel, it works well :) It will be helpful to the in-kernel aria-avx too! > -- > > Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): > > Before (GFNI/AVX): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.516 ns/B 1847 MiB/s 2.43 c/B 4700 > ECB dec | 0.519 ns/B 1839 MiB/s 2.44 c/B 4700 > CTR enc | 0.517 ns/B 1846 MiB/s 2.43 c/B 4700 > CTR dec | 0.518 ns/B 1843 MiB/s 2.43 c/B 4700 > > After (GFNI/AVX, ~5% faster): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.490 ns/B 1947 MiB/s 2.30 c/B 4700 > ECB dec | 0.490 ns/B 1946 MiB/s 2.30 c/B 4700 > CTR enc | 0.493 ns/B 1935 MiB/s 2.32 c/B 4700 > CTR dec | 0.493 ns/B 1934 MiB/s 2.32 c/B 4700 > > === > > Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off): > > Before (GFNI/AVX): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.967 ns/B 986.6 MiB/s 2.89 c/B 2992 > ECB dec | 0.966 ns/B 987.1 MiB/s 2.89 c/B 2992 > CTR enc | 0.972 ns/B 980.8 MiB/s 2.91 c/B 2993 > CTR dec | 0.971 ns/B 982.5 MiB/s 2.90 c/B 2993 > > After (GFNI/AVX, ~6% faster): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.908 ns/B 1050 MiB/s 2.72 c/B 2992 > ECB dec | 0.903 ns/B 1056 MiB/s 2.70 c/B 2992 > CTR enc | 0.913 ns/B 1045 MiB/s 2.73 c/B 2992 > CTR dec | 0.910 ns/B 1048 MiB/s 2.72 c/B 2992 > > === > > Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off): > > Before (AESNI/AVX): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.921 ns/B 1035 MiB/s 3.50 c/B 3800 > ECB dec | 0.922 ns/B 1034 MiB/s 3.50 c/B 3800 > CTR enc | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 > CTR dec | 0.923 ns/B 1033 MiB/s 3.51 c/B 3800 > > After (AESNI/AVX, ~6% faster) > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.862 ns/B 1106 MiB/s 3.28 c/B 3800 > ECB dec | 0.862 ns/B 1106 MiB/s 3.28 c/B 3800 > CTR enc | 0.865 ns/B 1102 MiB/s 3.29 c/B 3800 > CTR dec | 0.865 ns/B 1103 MiB/s 3.29 c/B 3800 > > === > > Benchmark on AMD EPYC 7642 (zen2): > > Before (AESNI/AVX): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 1.22 ns/B 784.5 MiB/s 4.01 c/B 3298 > ECB dec | 1.22 ns/B 784.8 MiB/s 4.00 c/B 3292 > CTR enc | 1.22 ns/B 780.1 MiB/s 4.03 c/B 3299 > CTR dec | 1.22 ns/B 779.1 MiB/s 4.04 c/B 3299 > > After (AESNI/AVX, ~13% faster): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 1.07 ns/B 888.3 MiB/s 3.54 c/B 3299 > ECB dec | 1.08 ns/B 885.3 MiB/s 3.55 c/B 3299 > CTR enc | 1.07 ns/B 888.7 MiB/s 3.54 c/B 3298 > CTR dec | 1.07 ns/B 887.4 MiB/s 3.55 c/B 3299 > > === > > Benchmark on Intel Core i5-6500 (skylake): > > Before (AESNI/AVX): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 1.24 ns/B 766.6 MiB/s 4.48 c/B 3598 > 
ECB dec | 1.25 ns/B 764.9 MiB/s 4.49 c/B 3598 > CTR enc | 1.25 ns/B 761.7 MiB/s 4.50 c/B 3598 > CTR dec | 1.25 ns/B 761.6 MiB/s 4.51 c/B 3598 > > After (AESNI/AVX, ~1% faster): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 1.22 ns/B 780.0 MiB/s 4.40 c/B 3598 > ECB dec | 1.22 ns/B 779.6 MiB/s 4.40 c/B 3598 > CTR enc | 1.23 ns/B 776.6 MiB/s 4.42 c/B 3598 > CTR dec | 1.23 ns/B 776.6 MiB/s 4.42 c/B 3598 > > === > > Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off): > > Before (AESNI/AVX): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 2.11 ns/B 452.7 MiB/s 5.25 c/B 2494 > ECB dec | 2.10 ns/B 454.5 MiB/s 5.23 c/B 2494 > CTR enc | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 > CTR dec | 2.10 ns/B 453.2 MiB/s 5.25 c/B 2494 > > After (AESNI/AVX, ~4% faster) > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 2.00 ns/B 475.8 MiB/s 5.00 c/B 2494 > ECB dec | 2.00 ns/B 476.4 MiB/s 4.99 c/B 2494 > CTR enc | 2.01 ns/B 474.7 MiB/s 5.01 c/B 2494 > CTR dec | 2.01 ns/B 473.9 MiB/s 5.02 c/B 2494 > > Cc: Taehee Yoo > Signed-off-by: Jussi Kivilinna > --- > cipher/aria-aesni-avx-amd64.S | 29 +++++++++++++++-------------- > 1 file changed, 15 insertions(+), 14 deletions(-) > > diff --git a/cipher/aria-aesni-avx-amd64.S b/cipher/aria-aesni-avx-amd64.S > index 7274b80e..f0c72225 100644 > --- a/cipher/aria-aesni-avx-amd64.S > +++ b/cipher/aria-aesni-avx-amd64.S > @@ -357,27 +357,21 @@ > t0, t1, t2, rk, \ > idx, round) \ > /* AddRoundKey */ \ > - vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ > - vpsrld $24, t0, t2; \ > - vpshufb t1, t2, t2; \ > + vmovd ((round * 16) + idx + 0)(rk), t0; \ > + vpshufb .Lthree_x16 rRIP, t0, t2; \ > vpxor t2, x0, x0; \ > - vpsrld $16, t0, t2; \ > - vpshufb t1, t2, t2; \ > + vpshufb .Ltwo_x16 rRIP, t0, t2; \ > vpxor t2, x1, x1; \ > - vpsrld $8, t0, t2; \ > - vpshufb t1, t2, t2; \ > + vpshufb .Lone_x16 rRIP, t0, t2; \ > vpxor t2, x2, x2; \ > vpshufb t1, t0, t2; \ > vpxor t2, x3, x3; \ > - vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ > - vpsrld $24, t0, t2; \ > - vpshufb t1, t2, t2; \ > + vmovd ((round * 16) + idx + 4)(rk), t0; \ > + vpshufb .Lthree_x16 rRIP, t0, t2; \ > vpxor t2, x4, x4; \ > - vpsrld $16, t0, t2; \ > - vpshufb t1, t2, t2; \ > + vpshufb .Ltwo_x16 rRIP, t0, t2; \ > vpxor t2, x5, x5; \ > - vpsrld $8, t0, t2; \ > - vpshufb t1, t2, t2; \ > + vpshufb .Lone_x16 rRIP, t0, t2; \ > vpxor t2, x6, x6; \ > vpshufb t1, t0, t2; \ > vpxor t2, x7, x7; > @@ -858,6 +852,13 @@ SECTION_RODATA > .Ltf_hi__x2__and__fwd_aff: > .octa 0x3F893781E95FE1576CDA64D2BA0CB204 > > +.Lthree_x16: > + .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 > +.Ltwo_x16: > + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 > +.Lone_x16: > + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 > + > .Lbige_addb_1: > .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 > .Lbige_addb_2: Thanks a lot! Taehee Yoo From ap420073 at gmail.com Mon Feb 20 11:54:30 2023 From: ap420073 at gmail.com (Taehee Yoo) Date: Mon, 20 Feb 2023 19:54:30 +0900 Subject: [PATCH 4/5] aria-avx512: small optimization for aria_diff_m In-Reply-To: <20230219084910.1302701-4-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> <20230219084910.1302701-4-jussi.kivilinna@iki.fi> Message-ID: <0b0367e5-d5f2-3ed5-58c9-be59a7d770d3@gmail.com> On 2/19/23 17:49, Jussi Kivilinna wrote: Hi Jussi, Thank you so much for this optimization! I tested this optimization in the kernel. It works very well. 
In my machine(i3-12100), it improves performance ~9%, awesome! It will be really helpful to the kernel side aria-avx512 driver for improving performance. > * cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for > 3-way XOR operation. > --- > > Using vpternlogq gives small performance improvement on AMD Zen4. With > Intel tiger-lake speed is the same as before. > > Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): > > Before: > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.204 ns/B 4682 MiB/s 0.957 c/B 4700 > ECB dec | 0.204 ns/B 4668 MiB/s 0.960 c/B 4700 > CTR enc | 0.212 ns/B 4509 MiB/s 0.994 c/B 4700 > CTR dec | 0.212 ns/B 4490 MiB/s 0.998 c/B 4700 > > After (~3% faster): > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz > ECB enc | 0.198 ns/B 4812 MiB/s 0.932 c/B 4700 > ECB dec | 0.198 ns/B 4824 MiB/s 0.929 c/B 4700 > CTR enc | 0.204 ns/B 4665 MiB/s 0.961 c/B 4700 > CTR dec | 0.206 ns/B 4631 MiB/s 0.968 c/B 4700 > > Cc: Taehee Yoo > Signed-off-by: Jussi Kivilinna > --- > cipher/aria-gfni-avx512-amd64.S | 16 ++++++---------- > 1 file changed, 6 insertions(+), 10 deletions(-) > > diff --git a/cipher/aria-gfni-avx512-amd64.S b/cipher/aria-gfni-avx512-amd64.S > index 849c744b..24a49a89 100644 > --- a/cipher/aria-gfni-avx512-amd64.S > +++ b/cipher/aria-gfni-avx512-amd64.S > @@ -406,21 +406,17 @@ > vgf2p8affineinvqb $0, t2, y3, y3; \ > vgf2p8affineinvqb $0, t2, y7, y7; > > - > #define aria_diff_m(x0, x1, x2, x3, \ > t0, t1, t2, t3) \ > /* T = rotr32(X, 8); */ \ > /* X ^= T */ \ > - vpxorq x0, x3, t0; \ > - vpxorq x1, x0, t1; \ > - vpxorq x2, x1, t2; \ > - vpxorq x3, x2, t3; \ > /* X = T ^ rotr(X, 16); */ \ > - vpxorq t2, x0, x0; \ > - vpxorq x1, t3, t3; \ > - vpxorq t0, x2, x2; \ > - vpxorq t1, x3, x1; \ > - vmovdqu64 t3, x3; > + vmovdqa64 x0, t0; \ > + vmovdqa64 x3, t3; \ > + vpternlogq $0x96, x2, x1, x0; \ > + vpternlogq $0x96, x2, x1, x3; \ > + vpternlogq $0x96, t0, t3, x2; \ > + vpternlogq $0x96, t0, t3, x1; > > #define aria_diff_word(x0, x1, x2, x3, \ > x4, x5, x6, x7, \ Thank you so much! Taehee Yoo From jussi.kivilinna at iki.fi Wed Feb 22 19:39:14 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 20:39:14 +0200 Subject: [PATCH 1/5] aria: add x86_64 AESNI/GFNI/AVX/AVX2 accelerated implementations In-Reply-To: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> Message-ID: <0c2ec933-797a-a04b-8967-6a9e7ab09985@iki.fi> On 19.2.2023 10.49, Jussi Kivilinna wrote: > * cipher/Makefile.am: Add 'aria-aesni-avx-amd64.S' and > 'aria-aesni-avx2-amd64.S'. > * cipher/aria-aesni-avx-amd64.S: New. > * cipher/aria-aesni-avx2-amd64.S: New.... Pushed series to master with minor changes to CTR-mode handling: - Small optimization for CTR byte-addition code-path in AVX/AVX2 - Added CTR byte-addition code-path for AVX512 -Jussi From jussi.kivilinna at iki.fi Wed Feb 22 20:29:17 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:17 +0200 Subject: [PATCH 1/8] aes-vaes-avx2: improve case when only CTR needs carry handling Message-ID: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> * cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ctr_enc_amd64): Add handling for the case when only main counter needs carry handling but generated vector counters do not. 
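The distinction is easiest to see in scalar C. Below is a rough model (not libgcrypt code; all names are illustrative) of the three counter-increment paths in a CTR bulk function: plain byte addition, byte addition where only the stored counter needs a 128-bit carry -- the case added here -- and the full per-block carry path. In the patch itself the new case is picked out by testing the zero flag left by the byte addition on the counter's low byte (jz .L*_handle_only_ctr_carry), so the extra cost on the common path is a single predictable branch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Full big-endian 128-bit add (the expensive path). */
static void be128_add (uint8_t ctr[16], unsigned int add)
{
  unsigned int carry = add;
  int i;

  for (i = 15; i >= 0 && carry; i--)
    {
      carry += ctr[i];
      ctr[i] = (uint8_t)carry;
      carry >>= 8;
    }
}

/* Derive 'nblks' per-block counters from 'ctr' and advance 'ctr'. */
static void gen_ctr_blocks (uint8_t blk[][16], uint8_t ctr[16],
                            unsigned int nblks)
{
  unsigned int low = ctr[15];
  unsigned int i;

  if (low + nblks <= 0xff)
    {
      /* No carry at all: byte addition for the blocks and the stored
       * counter alike. */
      for (i = 0; i < nblks; i++)
        {
          memcpy (blk[i], ctr, 16);
          blk[i][15] = (uint8_t)(low + i);
        }
      ctr[15] = (uint8_t)(low + nblks);
    }
  else if (low + nblks == 0x100)
    {
      /* The case this patch adds: the stored counter crosses a 256-block
       * boundary, but low + i never exceeds 0xff for i < nblks, so the
       * per-block counters can still be produced by cheap byte addition. */
      for (i = 0; i < nblks; i++)
        {
          memcpy (blk[i], ctr, 16);
          blk[i][15] = (uint8_t)(low + i);
        }
      be128_add (ctr, nblks);   /* only the stored counter carries */
    }
  else
    {
      /* Some per-block counter wraps mid-batch: full 128-bit handling. */
      for (i = 0; i < nblks; i++)
        {
          memcpy (blk[i], ctr, 16);
          be128_add (blk[i], i);
        }
      be128_add (ctr, nblks);
    }
}

int main (void)
{
  uint8_t ctr[16] = { 0 };
  uint8_t blk[16][16];

  ctr[15] = 0xf0;                 /* low byte 0xf0 + 16 blocks == 0x100 */
  gen_ctr_blocks (blk, ctr, 16);  /* takes the "only stored counter carries" path */
  printf ("ctr[14]=%02x ctr[15]=%02x last block low byte=%02x\n",
          ctr[14], ctr[15], blk[15][15]);
  return 0;
}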
-- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-vaes-avx2-amd64.S | 76 +++++++++++++++++-------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index aceccb96..10213bfb 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -738,6 +738,16 @@ _gcry_vaes_avx2_ctr_enc_amd64: vpslldq $8, tmp2, tmp2; \ vpsubq tmp2, x, x; +#define handle_ctr_128bit_add(nblks) \ + addq $(nblks), %r10; \ + adcq $0, %r11; \ + bswapq %r10; \ + bswapq %r11; \ + movq %r10, 8(%rsi); \ + movq %r11, 0(%rsi); \ + bswapq %r10; \ + bswapq %r11; + /* Process 16 blocks per loop. */ .align 8 .Lctr_enc_blk16: @@ -753,6 +763,9 @@ _gcry_vaes_avx2_ctr_enc_amd64: addb $16, 15(%rsi); jc .Lctr_enc_blk16_handle_carry; + leaq 16(%r10), %r10; + + .Lctr_enc_blk16_byte_bige_add: /* Increment counters. */ vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0; vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1; @@ -762,7 +775,6 @@ _gcry_vaes_avx2_ctr_enc_amd64: vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5; vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6; vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7; - leaq 16(%r10), %r10; .Lctr_enc_blk16_rounds: /* AES rounds */ @@ -829,22 +841,21 @@ _gcry_vaes_avx2_ctr_enc_amd64: jmp .Lctr_enc_blk16; + .align 8 + .Lctr_enc_blk16_handle_only_ctr_carry: + handle_ctr_128bit_add(16); + jmp .Lctr_enc_blk16_byte_bige_add; + .align 8 .Lctr_enc_blk16_handle_carry: + jz .Lctr_enc_blk16_handle_only_ctr_carry; /* Increment counters (handle carry). */ vpshufb %xmm13, %xmm7, %xmm1; /* be => le */ vmovdqa %xmm1, %xmm0; inc_le128(%xmm1, %xmm15, %xmm5); vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */ vpshufb %ymm13, %ymm7, %ymm0; - addq $16, %r10; - adcq $0, %r11; - bswapq %r10; - bswapq %r11; - movq %r10, 8(%rsi); - movq %r11, 0(%rsi); - bswapq %r10; - bswapq %r11; + handle_ctr_128bit_add(16); add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */ vpshufb %ymm13, %ymm7, %ymm1; add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */ @@ -877,12 +888,14 @@ _gcry_vaes_avx2_ctr_enc_amd64: addb $8, 15(%rsi); jc .Lctr_enc_blk8_handle_carry; + leaq 8(%r10), %r10; + + .Lctr_enc_blk8_byte_bige_add: /* Increment counters. */ vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0; vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1; vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2; vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3; - leaq 8(%r10), %r10; .Lctr_enc_blk8_rounds: /* AES rounds */ @@ -937,22 +950,21 @@ _gcry_vaes_avx2_ctr_enc_amd64: jmp .Lctr_enc_blk4; + .align 8 + .Lctr_enc_blk8_handle_only_ctr_carry: + handle_ctr_128bit_add(8); + jmp .Lctr_enc_blk8_byte_bige_add; + .align 8 .Lctr_enc_blk8_handle_carry: + jz .Lctr_enc_blk8_handle_only_ctr_carry; /* Increment counters (handle carry). */ vpshufb %xmm13, %xmm3, %xmm1; /* be => le */ vmovdqa %xmm1, %xmm0; inc_le128(%xmm1, %xmm15, %xmm5); vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */ vpshufb %ymm13, %ymm3, %ymm0; - addq $8, %r10; - adcq $0, %r11; - bswapq %r10; - bswapq %r11; - movq %r10, 8(%rsi); - movq %r11, 0(%rsi); - bswapq %r10; - bswapq %r11; + handle_ctr_128bit_add(8); add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */ vpshufb %ymm13, %ymm3, %ymm1; add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */ @@ -977,10 +989,12 @@ _gcry_vaes_avx2_ctr_enc_amd64: addb $4, 15(%rsi); jc .Lctr_enc_blk4_handle_carry; + leaq 4(%r10), %r10; + + .Lctr_enc_blk4_byte_bige_add: /* Increment counters. 
*/ vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0; vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1; - leaq 4(%r10), %r10; .Lctr_enc_blk4_rounds: /* AES rounds */ @@ -1029,22 +1043,21 @@ _gcry_vaes_avx2_ctr_enc_amd64: jmp .Lctr_enc_blk1; + .align 8 + .Lctr_enc_blk4_handle_only_ctr_carry: + handle_ctr_128bit_add(4); + jmp .Lctr_enc_blk4_byte_bige_add; + .align 8 .Lctr_enc_blk4_handle_carry: + jz .Lctr_enc_blk4_handle_only_ctr_carry; /* Increment counters (handle carry). */ vpshufb %xmm13, %xmm3, %xmm1; /* be => le */ vmovdqa %xmm1, %xmm0; inc_le128(%xmm1, %xmm15, %xmm5); vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */ vpshufb %ymm13, %ymm3, %ymm0; - addq $4, %r10; - adcq $0, %r11; - bswapq %r10; - bswapq %r11; - movq %r10, 8(%rsi); - movq %r11, 0(%rsi); - bswapq %r10; - bswapq %r11; + handle_ctr_128bit_add(4); add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */ vpshufb %ymm13, %ymm3, %ymm1; @@ -1060,14 +1073,7 @@ _gcry_vaes_avx2_ctr_enc_amd64: /* Load and increament counter. */ vmovdqu (%rsi), %xmm0; - addq $1, %r10; - adcq $0, %r11; - bswapq %r10; - bswapq %r11; - movq %r10, 8(%rsi); - movq %r11, 0(%rsi); - bswapq %r10; - bswapq %r11; + handle_ctr_128bit_add(1); /* AES rounds. */ vpxor (0 * 16)(%rdi), %xmm0, %xmm0; -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:21 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:21 +0200 Subject: [PATCH 5/8] camellia-avx2: add fast path for full 32 block ECB input In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-5-jussi.kivilinna@iki.fi> * cipher/camellia-aesni-avx2-amd64.h (enc_blk1_32, dec_blk1_32): Add fast path for 32 block input. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx2-amd64.h | 41 ++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 7d451c09..92f0ce5f 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -2127,12 +2127,9 @@ FUNC_NAME(enc_blk1_32): cmpl $31, %ecx; vpxor %xmm0, %xmm0, %xmm0; - ja 1f; + ja .Lenc_blk32; jb 2f; vmovdqu 15 * 32(%rdx), %xmm0; - jmp 2f; - 1: - vmovdqu 15 * 32(%rdx), %ymm0; 2: vmovdqu %ymm0, (%rax); @@ -2195,13 +2192,29 @@ FUNC_NAME(enc_blk1_32): STORE_OUTPUT(ymm9, 14); STORE_OUTPUT(ymm8, 15); +.align 8 2: +.Lenc_blk32_done: vzeroall; leave; CFI_LEAVE(); ret_spec_stop; CFI_ENDPROC(); + +.align 8 +.Lenc_blk32: + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, + %ymm15, %rdx, (key_table)(CTX)); + + call FUNC_NAME(enc_blk32); + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + jmp .Lenc_blk32_done; + CFI_ENDPROC(); ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);) .align 16 @@ -2235,12 +2248,9 @@ FUNC_NAME(dec_blk1_32): cmpl $31, %ecx; vpxor %xmm0, %xmm0, %xmm0; - ja 1f; + ja .Ldec_blk32; jb 2f; vmovdqu 15 * 32(%rdx), %xmm0; - jmp 2f; - 1: - vmovdqu 15 * 32(%rdx), %ymm0; 2: vmovdqu %ymm0, (%rax); @@ -2284,12 +2294,27 @@ FUNC_NAME(dec_blk1_32): STORE_OUTPUT(ymm9, 14); STORE_OUTPUT(ymm8, 15); +.align 8 2: +.Ldec_blk32_done: vzeroall; leave; CFI_LEAVE(); ret_spec_stop; + +.align 8 +.Ldec_blk32: + inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, + %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, 
%ymm14, + %ymm15, %rdx, (key_table)(CTX, %r8, 8)); + + call FUNC_NAME(dec_blk32); + + write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, + %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, + %ymm8, %rsi); + jmp .Ldec_blk32_done; CFI_ENDPROC(); ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);) -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:22 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:22 +0200 Subject: [PATCH 6/8] camellia-gfni-avx512: speed up for round key broadcasting In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-6-jussi.kivilinna@iki.fi> * cipher/camellia-gfni-avx512-amd64.S (roundsm64, fls64): Use 'vpbroadcastb' for loading round key. -- Benchmark on AMD Ryzen 9 7900X (turbo-freq off): Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.173 ns/B 5514 MiB/s 0.813 c/B 4700 ECB dec | 0.176 ns/B 5432 MiB/s 0.825 c/B 4700 After (~13% faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.152 ns/B 6267 MiB/s 0.715 c/B 4700 ECB dec | 0.155 ns/B 6170 MiB/s 0.726 c/B 4700 Signed-off-by: Jussi Kivilinna --- cipher/camellia-gfni-avx512-amd64.S | 88 ++++++++++------------------- 1 file changed, 31 insertions(+), 57 deletions(-) diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index c62b7848..b676379f 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -1,6 +1,6 @@ /* camellia-gfni-avx512-amd64.S - GFNI/AVX512 implementation of Camellia * - * Copyright (C) 2022 Jussi Kivilinna + * Copyright (C) 2022-2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -175,8 +175,6 @@ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ - vpxor t7##_x, t7##_x, t7##_x; \ - vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* prefilter sboxes */ \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ @@ -202,10 +200,8 @@ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ \ - vpsrldq $1, t0, t1; \ - vpsrldq $2, t0, t2; \ - vpshufb t7, t1, t1; \ - vpsrldq $3, t0, t3; \ + vpbroadcastb 7+key, t7; \ + vpbroadcastb 6+key, t6; \ \ /* P-function */ \ vpxorq x5, x0, x0; \ @@ -213,26 +209,25 @@ vpxorq x7, x2, x2; \ vpxorq x4, x3, x3; \ \ - vpshufb t7, t2, t2; \ - vpsrldq $4, t0, t4; \ - vpshufb t7, t3, t3; \ - vpsrldq $5, t0, t5; \ - vpshufb t7, t4, t4; \ + vpbroadcastb 5+key, t5; \ + vpbroadcastb 4+key, t4; \ \ vpxorq x2, x4, x4; \ vpxorq x3, x5, x5; \ vpxorq x0, x6, x6; \ vpxorq x1, x7, x7; \ \ - vpsrldq $6, t0, t6; \ - vpshufb t7, t5, t5; \ - vpshufb t7, t6, t6; \ + vpbroadcastb 3+key, t3; \ + vpbroadcastb 2+key, t2; \ \ vpxorq x7, x0, x0; \ vpxorq x4, x1, x1; \ vpxorq x5, x2, x2; \ vpxorq x6, x3, x3; \ \ + vpbroadcastb 1+key, t1; \ + vpbroadcastb 0+key, t0; \ + \ vpxorq x3, x4, x4; \ vpxorq x0, x5, x5; \ vpxorq x1, x6, x6; \ @@ -240,13 +235,8 @@ \ /* Add key material and result to CD (x becomes new CD) */ \ \ - vpternlogq $0x96, mem_cd##_5, t6, x1; \ - \ - vpsrldq $7, t0, t6; \ - vpshufb t7, t0, t0; \ - vpshufb t7, t6, t7; \ - \ vpternlogq $0x96, mem_cd##_4, t7, x0; \ + vpternlogq $0x96, mem_cd##_5, t6, x1; \ vpternlogq $0x96, mem_cd##_6, t5, x2; \ vpternlogq $0x96, mem_cd##_7, t4, x3; \ vpternlogq $0x96, mem_cd##_0, t3, x4; \ @@ -348,16 +338,12 @@ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ - vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpbroadcastq .Lbyte_ones rRIP, tmp; \ - vpxor tt3##_x, tt3##_x, tt3##_x; \ - vpshufb tt3, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t0; \ + vpxor tt3##_y, tt3##_y, tt3##_y; \ + vpbroadcastb 0+kll, t3; \ + vpbroadcastb 1+kll, t2; \ + vpbroadcastb 2+kll, t1; \ + vpbroadcastb 3+kll, t0; \ \ vpandq l0, t0, t0; \ vpandq l1, t1, t1; \ @@ -367,7 +353,6 @@ rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \ \ vpternlogq $0x96, tt2, t0, l4; \ - vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu64 l4, l##_4; \ vpternlogq $0x96, tt1, t1, l5; \ vmovdqu64 l5, l##_5; \ @@ -375,7 +360,7 @@ vmovdqu64 l6, l##_6; \ vpternlogq $0x96, tt3, t3, l7; \ vmovdqu64 l7, l##_7; \ - vpxor tt3##_x, tt3##_x, tt3##_x; \ + vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t2 = krr; \ @@ -383,16 +368,12 @@ * rl ^= t2; \ */ \ \ - vpshufb tt3, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t0; \ + vpbroadcastb 0+krr, t3; \ + vpbroadcastb 1+krr, t2; \ + vpbroadcastb 2+krr, t1; \ + vpbroadcastb 3+krr, t0; \ \ vpternlogq $0x1e, r##_4, t0, r##_0; \ - vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vpternlogq $0x1e, r##_5, t1, r##_1; \ vpternlogq $0x1e, r##_6, t2, r##_2; \ vpternlogq $0x1e, r##_7, t3, r##_3; \ @@ -402,13 +383,10 @@ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ - vpshufb tt3, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t0; \ + 
vpbroadcastb 0+krl, t3; \ + vpbroadcastb 1+krl, t2; \ + vpbroadcastb 2+krl, t1; \ + vpbroadcastb 3+krl, t0; \ \ vpandq r##_0, t0, t0; \ vpandq r##_1, t1, t1; \ @@ -418,11 +396,10 @@ rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \ \ vpternlogq $0x96, tt2, t0, r##_4; \ - vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vpternlogq $0x96, tt1, t1, r##_5; \ vpternlogq $0x96, tt0, t2, r##_6; \ vpternlogq $0x96, tt3, t3, r##_7; \ - vpxor tt3##_x, tt3##_x, tt3##_x; \ + vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t0 = klr; \ @@ -430,13 +407,10 @@ * ll ^= t0; \ */ \ \ - vpshufb tt3, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt3, t0, t0; \ + vpbroadcastb 0+klr, t3; \ + vpbroadcastb 1+klr, t2; \ + vpbroadcastb 2+klr, t1; \ + vpbroadcastb 3+klr, t0; \ \ vpternlogq $0x1e, l4, t0, l0; \ vmovdqu64 l0, l##_0; \ @@ -623,7 +597,7 @@ ELF(.type _gcry_camellia_gfni_avx512__constants, at object;) .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 .Lbyte_ones: - .byte 1, 1, 1, 1, 1, 1, 1, 1 + .quad 0x0101010101010101 /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:18 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:18 +0200 Subject: [PATCH 2/8] sm4: add CTR-mode byte addition for AVX/AVX2/AVX512 implementations In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-2-jussi.kivilinna@iki.fi> * cipher/sm4-aesni-avx-amd64.S (_gcry_sm4_aesni_avx_ctr_enc): Add byte addition fast-path. * cipher/sm4-aesni-avx2-amd64.S (_gcry_sm4_aesni_avx2_ctr_enc): Likewise. * cipher/sm4-gfni-avx2-amd64.S (_gcry_sm4_gfni_avx2_ctr_enc): Likewise. * cipher/sm4-gfni-avx512-amd64.S (_gcry_sm4_gfni_avx512_ctr_enc) (_gcry_sm4_gfni_avx512_ctr_enc_blk32): Likewise. -- Signed-off-by: Jussi Kivilinna --- cipher/sm4-aesni-avx-amd64.S | 68 +++++++++++++++++++++- cipher/sm4-aesni-avx2-amd64.S | 65 ++++++++++++++++++++- cipher/sm4-gfni-avx2-amd64.S | 65 ++++++++++++++++++++- cipher/sm4-gfni-avx512-amd64.S | 103 ++++++++++++++++++++++++++++++++- 4 files changed, 295 insertions(+), 6 deletions(-) diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S index c09b205d..ca9be44a 100644 --- a/cipher/sm4-aesni-avx-amd64.S +++ b/cipher/sm4-aesni-avx-amd64.S @@ -1,6 +1,6 @@ /* sm4-avx-aesni-amd64.S - AES-NI/AVX implementation of SM4 cipher * - * Copyright (C) 2020 Jussi Kivilinna + * Copyright (C) 2020,2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -150,6 +150,38 @@ _sm4_aesni_avx_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.Lbige_addb_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 +.Lbige_addb_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 +.Lbige_addb_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 +.Lbige_addb_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 +.Lbige_addb_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 +.Lbige_addb_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 +.Lbige_addb_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 +.Lbige_addb_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .align 4 /* 4-bit mask */ .L0f0f0f0f: @@ -529,6 +561,9 @@ _gcry_sm4_aesni_avx_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 8), 15(%rcx); + jbe .Lctr_byteadd; + /* load IV and byteswap */ vmovdqu (%rcx), RA0; @@ -565,6 +600,8 @@ _gcry_sm4_aesni_avx_ctr_enc: /* store new IV */ vmovdqu RTMP1, (%rcx); +.align 8 +.Lload_ctr_done: call __sm4_crypt_blk8; vpxor (0 * 16)(%rdx), RA0, RA0; @@ -588,6 +625,35 @@ _gcry_sm4_aesni_avx_ctr_enc: vzeroall; ret_spec_stop; + .align 8 + +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $8, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_xmm; +.align 8 +.Lctr_byteadd: + vmovdqu (%rcx), RA0; + je .Lctr_byteadd_full_ctr_carry; + addb $8, 15(%rcx); +.Lctr_byteadd_xmm: + vpaddb .Lbige_addb_1 rRIP, RA0, RA1; + vpaddb .Lbige_addb_2 rRIP, RA0, RA2; + vpaddb .Lbige_addb_3 rRIP, RA0, RA3; + vpaddb .Lbige_addb_4 rRIP, RA0, RB0; + vpaddb .Lbige_addb_5 rRIP, RA0, RB1; + vpaddb .Lbige_addb_6 rRIP, RA0, RB2; + vpaddb .Lbige_addb_7 rRIP, RA0, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;) diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index acd37cff..03f979fa 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -1,6 +1,6 @@ /* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher * - * Copyright (C) 2020, 2022 Jussi Kivilinna + * Copyright (C) 2020, 2022-2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -171,6 +171,33 @@ _sm4_aesni_avx2_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .align 4 /* 4-bit mask */ .L0f0f0f0f: @@ -371,6 +398,9 @@ _gcry_sm4_aesni_avx2_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd; + movq 8(%rcx), %rax; bswapq %rax; @@ -438,11 +468,12 @@ _gcry_sm4_aesni_avx2_ctr_enc: vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ -.align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); +.align 8 +.Lload_ctr_done: call __sm4_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -466,6 +497,36 @@ _gcry_sm4_aesni_avx2_ctr_enc: vzeroall; ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm; +.align 8 +.Lctr_byteadd: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S index 2fbaffd5..464da399 100644 --- a/cipher/sm4-gfni-avx2-amd64.S +++ b/cipher/sm4-gfni-avx2-amd64.S @@ -1,6 +1,6 @@ /* sm4-gfni-avx2-amd64.S - GFNI/AVX2 implementation of SM4 cipher * - * Copyright (C) 2022 Jussi Kivilinna + * Copyright (C) 2022-2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -136,6 +136,33 @@ _sm4_gfni_avx2_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .text .align 16 @@ -658,6 +685,9 @@ _gcry_sm4_gfni_avx2_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd; + movq 8(%rcx), %rax; bswapq %rax; @@ -725,11 +755,12 @@ _gcry_sm4_gfni_avx2_ctr_enc: vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ -.align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); +.align 8 +.Lload_ctr_done: call __sm4_gfni_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -753,6 +784,36 @@ _gcry_sm4_gfni_avx2_ctr_enc: vzeroall; ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm; +.align 8 +.Lctr_byteadd: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;) diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S index b095f85d..91f6e80b 100644 --- a/cipher/sm4-gfni-avx512-amd64.S +++ b/cipher/sm4-gfni-avx512-amd64.S @@ -1,6 +1,6 @@ /* sm4-gfni-avx512-amd64.S - GFNI/AVX512 implementation of SM4 cipher * - * Copyright (C) 2022 Jussi Kivilinna + * Copyright (C) 2022-2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -146,6 +146,35 @@ SECTION_RODATA .quad 2, 0 .quad 3, 0 +/* CTR byte addition constants */ +.align 64 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lbige_addb_16: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + .text .align 16 @@ -627,6 +656,9 @@ _gcry_sm4_gfni_avx512_ctr_enc: CFI_STARTPROC(); spec_stop_avx512; + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd16; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP0; vmovdqa .Lcounter0123_lo rRIP, RTMP1; vbroadcasti128 .Lcounter2222_lo rRIP, RTMP2; @@ -695,6 +727,8 @@ _gcry_sm4_gfni_avx512_ctr_enc: vpshufb RTMP0, RB2, RB2; vpshufb RTMP0, RB3, RB3; +.align 16 +.Lload_ctr_done16: call __sm4_gfni_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -719,6 +753,36 @@ _gcry_sm4_gfni_avx512_ctr_enc: kxorq %k1, %k1, %k1; ret_spec_stop; + +.align 16 +.Lctr_byteadd_full_ctr_carry16: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm16; +.align 16 +.Lctr_byteadd16: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry16; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm16: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done16; CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx512_ctr_enc,.-_gcry_sm4_gfni_avx512_ctr_enc;) @@ -1304,6 +1368,9 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: CFI_STARTPROC(); spec_stop_avx512; + cmpb $(0x100 - 32), 15(%rcx); + jbe .Lctr_byteadd32; + vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z; vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z; vbroadcasti64x2 .Lcounter4444_lo rRIP, RTMP2z; @@ -1372,6 +1439,8 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: vpshufb RTMP0z, RB2z, RB2z; vpshufb RTMP0z, RB3z, RB3z; +.align 16 +.Lload_ctr_done32: call __sm4_gfni_crypt_blk32; vpxord (0 * 64)(%rdx), RA0z, RA0z; @@ -1396,6 +1465,38 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: kxorq %k1, %k1, %k1; ret_spec_stop; + +.align 16 +.Lctr_byteadd_full_ctr_carry32: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $32, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_zmm32; +.align 16 +.Lctr_byteadd32: + vbroadcasti64x2 (%rcx), RA3z; + je 
.Lctr_byteadd_full_ctr_carry32; + addb $32, 15(%rcx); +.Lctr_byteadd_zmm32: + vbroadcasti64x2 .Lbige_addb_16 rRIP, RB3z; + vpaddb RB3z, RA3z, RB3z; + vpaddb .Lbige_addb_0_1 rRIP, RA3z, RA0z; + vpaddb .Lbige_addb_4_5 rRIP, RA3z, RA1z; + vpaddb .Lbige_addb_8_9 rRIP, RA3z, RA2z; + vpaddb .Lbige_addb_12_13 rRIP, RA3z, RA3z; + vpaddb .Lbige_addb_0_1 rRIP, RB3z, RB0z; + vpaddb .Lbige_addb_4_5 rRIP, RB3z, RB1z; + vpaddb .Lbige_addb_8_9 rRIP, RB3z, RB2z; + vpaddb .Lbige_addb_12_13 rRIP, RB3z, RB3z; + + jmp .Lload_ctr_done32; CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx512_ctr_enc_blk32,.-_gcry_sm4_gfni_avx512_ctr_enc_blk32;) -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:23 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:23 +0200 Subject: [PATCH 7/8] camellia-avx2: speed up for round key broadcasting In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-7-jussi.kivilinna@iki.fi> * cipher/camellia-aesni-avx2-amd64.h (roundsm32, fls32): Use 'vpbroadcastb' for loading round key. * cipher/camellia-glue.c (camellia_encrypt_blk1_32) (camellia_decrypt_blk1_32): Adjust num_blks thresholds for AVX2 implementations, 2 blks for GFNI, 4 blks for VAES and 5 blks for AESNI. -- Benchmark on AMD Ryzen 9 7900X (turbo-freq off): Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.213 ns/B 4469 MiB/s 1.00 c/B 4700 ECB dec | 0.215 ns/B 4440 MiB/s 1.01 c/B 4700 After (~10% faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.194 ns/B 4919 MiB/s 0.911 c/B 4700 ECB dec | 0.195 ns/B 4896 MiB/s 0.916 c/B 4700 Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx2-amd64.h | 120 +++++++++++------------------ cipher/camellia-glue.c | 24 +++--- 2 files changed, 55 insertions(+), 89 deletions(-) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 92f0ce5f..003c4496 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -1,6 +1,6 @@ /* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia * - * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna + * Copyright (C) 2013-2015,2020-2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -145,8 +145,6 @@ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ - vpxor t7##_x, t7##_x, t7##_x; \ - vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ \ /* prefilter sboxes */ \ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ @@ -172,10 +170,8 @@ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ \ - vpsrldq $1, t0, t1; \ - vpsrldq $2, t0, t2; \ - vpshufb t7, t1, t1; \ - vpsrldq $3, t0, t3; \ + vpbroadcastb 7+key, t7; \ + vpbroadcastb 6+key, t6; \ \ /* P-function */ \ vpxor x5, x0, x0; \ @@ -183,26 +179,25 @@ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ - vpshufb t7, t2, t2; \ - vpsrldq $4, t0, t4; \ - vpshufb t7, t3, t3; \ - vpsrldq $5, t0, t5; \ - vpshufb t7, t4, t4; \ + vpbroadcastb 5+key, t5; \ + vpbroadcastb 4+key, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ - vpsrldq $6, t0, t6; \ - vpshufb t7, t5, t5; \ - vpshufb t7, t6, t6; \ + vpbroadcastb 3+key, t3; \ + vpbroadcastb 2+key, t2; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ + vpbroadcastb 1+key, t1; \ + vpbroadcastb 0+key, t0; \ + \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ @@ -210,16 +205,12 @@ \ /* Add key material and result to CD (x becomes new CD) */ \ \ - vpxor t6, x1, x1; \ - vpxor 5 * 32(mem_cd), x1, x1; \ - \ - vpsrldq $7, t0, t6; \ - vpshufb t7, t0, t0; \ - vpshufb t7, t6, t7; \ - \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ + vpxor t6, x1, x1; \ + vpxor 5 * 32(mem_cd), x1, x1; \ + \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ @@ -285,7 +276,7 @@ filter_8bit(x1, t5, t6, t7, t4); \ filter_8bit(x4, t5, t6, t7, t4); \ \ - vpxor t4##_x, t4##_x, t4##_x; \ + vpxor t4, t4, t4; \ \ /* AES subbytes + AES shift rows */ \ IF_AESNI(vextracti128 $1, x2, t6##_x; \ @@ -341,17 +332,12 @@ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ - vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ - \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ - vpxor t7##_x, t7##_x, t7##_x; \ \ - vpsrldq $1, t0, t1; \ - vpsrldq $2, t0, t2; \ - vpshufb t7, t1, t1; \ - vpsrldq $3, t0, t3; \ + vpbroadcastb 7+key, t7; \ + vpbroadcastb 6+key, t6; \ \ /* P-function */ \ vpxor x5, x0, x0; \ @@ -359,26 +345,25 @@ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ - vpshufb t7, t2, t2; \ - vpsrldq $4, t0, t4; \ - vpshufb t7, t3, t3; \ - vpsrldq $5, t0, t5; \ - vpshufb t7, t4, t4; \ + vpbroadcastb 5+key, t5; \ + vpbroadcastb 4+key, t4; \ \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ - vpsrldq $6, t0, t6; \ - vpshufb t7, t5, t5; \ - vpshufb t7, t6, t6; \ + vpbroadcastb 3+key, t3; \ + vpbroadcastb 2+key, t2; \ \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ + vpbroadcastb 1+key, t1; \ + vpbroadcastb 0+key, t0; \ + \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ @@ -386,16 +371,12 @@ \ /* Add key material and result to CD (x becomes new CD) */ \ \ - vpxor t6, x1, x1; \ - vpxor 5 * 32(mem_cd), x1, x1; \ - \ - vpsrldq $7, t0, t6; \ - vpshufb t7, t0, t0; \ - vpshufb t7, t6, t7; \ - \ vpxor t7, x0, x0; \ vpxor 4 * 32(mem_cd), x0, x0; \ \ + vpxor t6, x1, x1; \ + vpxor 5 * 32(mem_cd), x1, x1; \ + \ vpxor t5, x2, x2; \ vpxor 6 * 32(mem_cd), x2, x2; \ \ @@ -515,15 +496,11 @@ * t0 &= ll; \ * lr ^= 
rol32(t0, 1); \ */ \ - vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ vpxor tt0, tt0, tt0; \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpbroadcastb 0+kll, t3; \ + vpbroadcastb 1+kll, t2; \ + vpbroadcastb 2+kll, t1; \ + vpbroadcastb 3+kll, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ @@ -533,7 +510,6 @@ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ \ vpxor l4, t0, l4; \ - vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ vmovdqu l4, 4 * 32(l); \ vpxor l5, t1, l5; \ vmovdqu l5, 5 * 32(l); \ @@ -548,13 +524,10 @@ * rl ^= t2; \ */ \ \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpbroadcastb 0+krr, t3; \ + vpbroadcastb 1+krr, t2; \ + vpbroadcastb 2+krr, t1; \ + vpbroadcastb 3+krr, t0; \ \ vpor 4 * 32(r), t0, t0; \ vpor 5 * 32(r), t1, t1; \ @@ -566,7 +539,6 @@ vpxor 2 * 32(r), t2, t2; \ vpxor 3 * 32(r), t3, t3; \ vmovdqu t0, 0 * 32(r); \ - vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 1 * 32(r); \ vmovdqu t2, 2 * 32(r); \ vmovdqu t3, 3 * 32(r); \ @@ -576,13 +548,10 @@ * t2 &= rl; \ * rr ^= rol32(t2, 1); \ */ \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpbroadcastb 0+krl, t3; \ + vpbroadcastb 1+krl, t2; \ + vpbroadcastb 2+krl, t1; \ + vpbroadcastb 3+krl, t0; \ \ vpand 0 * 32(r), t0, t0; \ vpand 1 * 32(r), t1, t1; \ @@ -596,7 +565,6 @@ vpxor 6 * 32(r), t2, t2; \ vpxor 7 * 32(r), t3, t3; \ vmovdqu t0, 4 * 32(r); \ - vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ vmovdqu t1, 5 * 32(r); \ vmovdqu t2, 6 * 32(r); \ vmovdqu t3, 7 * 32(r); \ @@ -607,13 +575,10 @@ * ll ^= t0; \ */ \ \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpbroadcastb 0+klr, t3; \ + vpbroadcastb 1+klr, t2; \ + vpbroadcastb 2+klr, t1; \ + vpbroadcastb 3+klr, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ @@ -837,6 +802,7 @@ ELF(.type FUNC_NAME(_constants), at object;) #ifdef CAMELLIA_GFNI_BUILD +.align 64 /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 76a09eb1..b87faa91 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -630,27 +630,27 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, gcry_assert (num_blks <= 32); #ifdef USE_GFNI_AVX2 - if (ctx->use_gfni_avx2 && num_blks >= 3) + if (ctx->use_gfni_avx2 && num_blks >= 2) { - /* 3 or more parallel block GFNI processing is faster than + /* 2 or more parallel block GFNI processing is faster than * generic C implementation. */ _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_VAES_AVX2 - if (ctx->use_vaes_avx2 && num_blks >= 6) + if (ctx->use_vaes_avx2 && num_blks >= 4) { - /* 6 or more parallel block VAES processing is faster than + /* 4 or more parallel block VAES processing is faster than * generic C implementation. 
*/ _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2 && num_blks >= 6) + if (ctx->use_aesni_avx2 && num_blks >= 5) { - /* 6 or more parallel block AESNI processing is faster than + /* 5 or more parallel block AESNI processing is faster than * generic C implementation. */ _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; @@ -721,27 +721,27 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, gcry_assert (num_blks <= 32); #ifdef USE_GFNI_AVX2 - if (ctx->use_gfni_avx2 && num_blks >= 3) + if (ctx->use_gfni_avx2 && num_blks >= 2) { - /* 3 or more parallel block GFNI processing is faster than + /* 2 or more parallel block GFNI processing is faster than * generic C implementation. */ _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_VAES_AVX2 - if (ctx->use_vaes_avx2 && num_blks >= 6) + if (ctx->use_vaes_avx2 && num_blks >= 4) { - /* 6 or more parallel block VAES processing is faster than + /* 4 or more parallel block VAES processing is faster than * generic C implementation. */ _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; } #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2 && num_blks >= 6) + if (ctx->use_aesni_avx2 && num_blks >= 5) { - /* 6 or more parallel block AESNI processing is faster than + /* 5 or more parallel block AESNI processing is faster than * generic C implementation. */ _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks); return avx2_burn_stack_depth; -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:24 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:24 +0200 Subject: [PATCH 8/8] camellia-aesni-avx: speed up for round key broadcasting In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-8-jussi.kivilinna@iki.fi> * cipher/camellia-aesni-avx2-amd64.h (roundsm16, fls16): Broadcast round key bytes directly with 'vpshufb'. -- Benchmark on AMD Ryzen 9 7900X (turbo-freq off): Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.837 ns/B 1139 MiB/s 3.94 c/B 4700 ECB dec | 0.839 ns/B 1137 MiB/s 3.94 c/B 4700 After (~3% faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.808 ns/B 1180 MiB/s 3.80 c/B 4700 ECB dec | 0.810 ns/B 1177 MiB/s 3.81 c/B 4700 Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx-amd64.S | 89 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 5ec33b9b..76e62ea8 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1,6 +1,6 @@ /* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher * - * Copyright (C) 2013-2015,2020 Jussi Kivilinna + * Copyright (C) 2013-2015,2020,2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -121,25 +121,14 @@ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ - vpxor t6, t6, t6; \ vmovq key, t0; \ \ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ \ - vpsrldq $5, t0, t5; \ - vpsrldq $1, t0, t1; \ - vpsrldq $2, t0, t2; \ - vpsrldq $3, t0, t3; \ - vpsrldq $4, t0, t4; \ - vpshufb t6, t0, t0; \ - vpshufb t6, t1, t1; \ - vpshufb t6, t2, t2; \ - vpshufb t6, t3, t3; \ - vpshufb t6, t4, t4; \ - vpsrldq $2, t5, t7; \ - vpshufb t6, t7, t7; \ + vpshufb .Lbyte_threes rRIP, t0, t3; \ + vpshufb .Lbyte_twos rRIP, t0, t2; \ \ /* P-function */ \ vpxor x5, x0, x0; \ @@ -147,16 +136,23 @@ vpxor x7, x2, x2; \ vpxor x4, x3, x3; \ \ + vpshufb .Lbyte_ones rRIP, t0, t1; \ + vpshufb .Lbyte_sevens rRIP, t0, t7; \ + \ vpxor x2, x4, x4; \ vpxor x3, x5, x5; \ vpxor x0, x6, x6; \ vpxor x1, x7, x7; \ \ + vpshufb .Lbyte_sixs rRIP, t0, t6; \ + vpshufb .Lbyte_fives rRIP, t0, t5; \ vpxor x7, x0, x0; \ vpxor x4, x1, x1; \ vpxor x5, x2, x2; \ vpxor x6, x3, x3; \ \ + vpshufb .Lbyte_fours rRIP, t0, t4; \ + \ vpxor x3, x4, x4; \ vpxor x0, x5, x5; \ vpxor x1, x6, x6; \ @@ -165,15 +161,14 @@ /* Add key material and result to CD (x becomes new CD) */ \ \ vpxor t3, x4, x4; \ + vpxor t3, t3, t3; \ vpxor 0 * 16(mem_cd), x4, x4; \ \ + vpshufb t3, t0, t0; \ + \ vpxor t2, x5, x5; \ vpxor 1 * 16(mem_cd), x5, x5; \ \ - vpsrldq $1, t5, t3; \ - vpshufb t6, t5, t5; \ - vpshufb t6, t3, t6; \ - \ vpxor t1, x6, x6; \ vpxor 2 * 16(mem_cd), x6, x6; \ \ @@ -294,12 +289,9 @@ vpxor tt0, tt0, tt0; \ vmovd kll, t0; \ vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpshufb .Lbyte_ones rRIP, t0, t2; \ + vpshufb .Lbyte_twos rRIP, t0, t1; \ + vpshufb .Lbyte_threes rRIP, t0, t0; \ \ vpand l0, t0, t0; \ vpand l1, t1, t1; \ @@ -325,12 +317,9 @@ \ vmovd krr, t0; \ vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpshufb .Lbyte_ones rRIP, t0, t2; \ + vpshufb .Lbyte_twos rRIP, t0, t1; \ + vpshufb .Lbyte_threes rRIP, t0, t0; \ \ vpor 4 * 16(r), t0, t0; \ vpor 5 * 16(r), t1, t1; \ @@ -353,12 +342,9 @@ */ \ vmovd krl, t0; \ vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpshufb .Lbyte_ones rRIP, t0, t2; \ + vpshufb .Lbyte_twos rRIP, t0, t1; \ + vpshufb .Lbyte_threes rRIP, t0, t0; \ \ vpand 0 * 16(r), t0, t0; \ vpand 1 * 16(r), t1, t1; \ @@ -384,12 +370,9 @@ \ vmovd klr, t0; \ vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ + vpshufb .Lbyte_ones rRIP, t0, t2; \ + vpshufb .Lbyte_twos rRIP, t0, t1; \ + vpshufb .Lbyte_threes rRIP, t0, t0; \ \ vpor l4, t0, t0; \ vpor l5, t1, t1; \ @@ -637,6 +620,28 @@ _camellia_aesni_avx_data: .long 0x80808080 .long 0x80808080 +.Lbyte_ones: + .quad 1 * 0x0101010101010101 + .quad 1 * 0x0101010101010101 +.Lbyte_twos: + .quad 2 * 0x0101010101010101 + .quad 2 * 0x0101010101010101 +.Lbyte_threes: + .quad 3 * 0x0101010101010101 + .quad 3 * 0x0101010101010101 +.Lbyte_fours: + .quad 4 * 0x0101010101010101 + .quad 4 * 0x0101010101010101 +.Lbyte_fives: + .quad 5 * 0x0101010101010101 + .quad 5 * 0x0101010101010101 +.Lbyte_sixs: + .quad 6 * 0x0101010101010101 + .quad 6 * 
0x0101010101010101 +.Lbyte_sevens: + .quad 7 * 0x0101010101010101 + .quad 7 * 0x0101010101010101 + /* For CTR-mode IV byteswap */ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:19 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:19 +0200 Subject: [PATCH 3/8] camellia-aesni-avx: add acceleration for ECB/XTS/CTR32LE modes In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-3-jussi.kivilinna@iki.fi> * cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_ecb_enc) (_gcry_camellia_aesni_avx_ecb_dec): New. * cipher/camellia-glue.c (_gcry_camellia_aesni_avx_ecb_enc) (_gcry_camellia_aesni_avx_ecb_dec): New. (camellia_setkey): Always enable XTS/ECB/CTR32LE bulk functions. (camellia_encrypt_blk1_32, camellia_decrypt_blk1_32) [USE_AESNI_AVX]: Add AESNI/AVX code-path. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx-amd64.S | 92 +++++++++++++++++++++++++++++++ cipher/camellia-glue.c | 59 ++++++++++++++------ 2 files changed, 133 insertions(+), 18 deletions(-) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 1f241e03..93c96791 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1029,6 +1029,98 @@ _gcry_camellia_aesni_avx_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) +.align 16 +.globl _gcry_camellia_aesni_avx_ecb_enc +ELF(.type _gcry_camellia_aesni_avx_ecb_enc, at function;) + +_gcry_camellia_aesni_avx_ecb_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX)); + + subq $(16 * 16), %rsp; + andq $~31, %rsp; + movq %rsp, %rax; + + call __camellia_enc_blk16; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + vzeroall; + + leave; + CFI_LEAVE(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_camellia_aesni_avx_ecb_enc,.-_gcry_camellia_aesni_avx_ecb_enc;) + +.align 16 +.globl _gcry_camellia_aesni_avx_ecb_dec +ELF(.type _gcry_camellia_aesni_avx_ecb_dec, at function;) + +_gcry_camellia_aesni_avx_ecb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + */ + CFI_STARTPROC(); + + pushq %rbp; + CFI_PUSH(%rbp); + movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); + + vzeroupper; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX, %r8, 8)); + + subq $(16 * 16), %rsp; + andq $~31, %rsp; + movq %rsp, %rax; + + call __camellia_dec_blk16; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + vzeroall; + + leave; + CFI_LEAVE(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size 
_gcry_camellia_aesni_avx_ecb_dec,.-_gcry_camellia_aesni_avx_ecb_dec;) + .align 16 .globl _gcry_camellia_aesni_avx_cbc_dec ELF(.type _gcry_camellia_aesni_avx_cbc_dec, at function;) diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 2e00f563..8b4b4b3c 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -172,15 +172,25 @@ extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx, const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx, - const unsigned char *abuf, - unsigned char *offset, - unsigned char *checksum, - const u64 Ls[16]) ASM_FUNC_ABI; + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx, const unsigned char *key, unsigned int keylen) ASM_FUNC_ABI; +extern void _gcry_camellia_aesni_avx_ecb_enc(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; + +extern void _gcry_camellia_aesni_avx_ecb_dec(const CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in) + ASM_FUNC_ABI; + static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *) + ASM_EXTRA_STACK; @@ -473,18 +483,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, bulk_ops->ctr_enc = _gcry_camellia_ctr_enc; bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt; bulk_ops->ocb_auth = _gcry_camellia_ocb_auth; -#ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) - { - bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; - bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt; - bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; - } -#else - (void)_gcry_camellia_xts_crypt; - (void)_gcry_camellia_ecb_crypt; - (void)_gcry_camellia_ctr32le_enc; -#endif + bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; + bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt; + bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; if (0) { } @@ -651,10 +652,21 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, return avx2_burn_stack_depth; } #endif +#ifdef USE_AESNI_AVX + while (ctx->use_aesni_avx && num_blks >= 16) + { + _gcry_camellia_aesni_avx_ecb_enc (ctx, outbuf, inbuf); + stack_burn_size = avx_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { - stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf); + unsigned int nburn = camellia_encrypt((void *)ctx, outbuf, inbuf); + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; @@ -731,10 +743,21 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, return avx2_burn_stack_depth; } #endif +#ifdef USE_AESNI_AVX + while (ctx->use_aesni_avx && num_blks >= 16) + { + _gcry_camellia_aesni_avx_ecb_dec (ctx, outbuf, inbuf); + stack_burn_size = avx_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { - stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf); + unsigned int nburn = camellia_decrypt((void *)ctx, outbuf, inbuf); + stack_burn_size = nburn > stack_burn_size ? 
nburn : stack_burn_size; outbuf += CAMELLIA_BLOCK_SIZE; inbuf += CAMELLIA_BLOCK_SIZE; num_blks--; -- 2.37.2 From jussi.kivilinna at iki.fi Wed Feb 22 20:29:20 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 Feb 2023 21:29:20 +0200 Subject: [PATCH 4/8] camellia: add CTR-mode byte addition for AVX/AVX2/AVX512 impl. In-Reply-To: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> References: <20230222192924.2291776-1-jussi.kivilinna@iki.fi> Message-ID: <20230222192924.2291776-4-jussi.kivilinna@iki.fi> * cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_ctr_enc): Add byte addition fast-path. * cipher/camellia-aesni-avx2-amd64.h (ctr_enc): Likewise. * cipher/camellia-gfni-avx512-amd64.S (_gcry_camellia_gfni_avx512_ctr_enc): Likewise. * cipher/camellia-glue.c (CAMELLIA_context): Add 'use_avx2'. (camellia_setkey, _gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec) (_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt) (_gcry_camellia_ocb_auth) [USE_AESNI_AVX2]: Use 'use_avx2' to check if any of the AVX2 implementations is enabled. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx-amd64.S | 78 +++++++++++++++++++++++ cipher/camellia-aesni-avx2-amd64.h | 83 ++++++++++++++++++++++-- cipher/camellia-gfni-avx512-amd64.S | 97 +++++++++++++++++++++++++++-- cipher/camellia-glue.c | 14 +++-- 4 files changed, 257 insertions(+), 15 deletions(-) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 93c96791..5ec33b9b 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -761,6 +761,38 @@ _camellia_aesni_avx_data: .Ltranspose_8x8_shuf: .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7 +/* CTR byte addition constants */ +.Lbige_addb_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 +.Lbige_addb_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 +.Lbige_addb_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 +.Lbige_addb_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 +.Lbige_addb_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 +.Lbige_addb_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 +.Lbige_addb_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 +.Lbige_addb_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .align 4 /* 4-bit mask */ .L0f0f0f0f: @@ -930,6 +962,9 @@ _gcry_camellia_aesni_avx_ctr_enc: andq $~31, %rsp; movq %rsp, %rax; + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd; + vmovdqa .Lbswap128_mask rRIP, %xmm14; /* load IV and byteswap */ @@ -978,6 +1013,8 @@ _gcry_camellia_aesni_avx_ctr_enc: vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */ vmovdqu %xmm13, (%rcx); +.align 8 +.Lload_ctr_done: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; @@ -1026,6 +1063,47 @@ _gcry_camellia_aesni_avx_ctr_enc: leave; CFI_LEAVE(); ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq 
$16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_xmm; +.align 8 +.Lctr_byteadd: + vmovdqu (%rcx), %xmm15; + je .Lctr_byteadd_full_ctr_carry; + addb $16, 15(%rcx); +.Lctr_byteadd_xmm: + vmovdqa %xmm15, %xmm0; + vpaddb .Lbige_addb_1 rRIP, %xmm15, %xmm14; + vmovdqu %xmm15, 15 * 16(%rax); + vpaddb .Lbige_addb_2 rRIP, %xmm15, %xmm13; + vmovdqu %xmm14, 14 * 16(%rax); + vpaddb .Lbige_addb_3 rRIP, %xmm15, %xmm12; + vmovdqu %xmm13, 13 * 16(%rax); + vpaddb .Lbige_addb_4 rRIP, %xmm15, %xmm11; + vpaddb .Lbige_addb_5 rRIP, %xmm15, %xmm10; + vpaddb .Lbige_addb_6 rRIP, %xmm15, %xmm9; + vpaddb .Lbige_addb_7 rRIP, %xmm15, %xmm8; + vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm7; + vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm6; + vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm5; + vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm4; + vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm3; + vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm2; + vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm1; + vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm0; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index c92a0559..7d451c09 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -805,6 +805,36 @@ ELF(.type FUNC_NAME(_constants), at object;) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lbige_addb_16_16: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + #ifdef CAMELLIA_GFNI_BUILD /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 @@ -1151,9 +1181,6 @@ FUNC_NAME(ctr_enc): movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); - movq 8(%rcx), %r11; - bswapq %r11; - cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; @@ -1163,6 +1190,12 @@ FUNC_NAME(ctr_enc): andq $~63, %rsp; movq %rsp, %rax; + cmpb $(0x100 - 32), 15(%rcx); + jbe .Lctr_byteadd; + + movq 8(%rcx), %r11; + bswapq %r11; + vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ @@ -1275,7 +1308,7 @@ FUNC_NAME(ctr_enc): vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); -.align 4 +.align 8 .Lload_ctr_done: /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; @@ -1325,6 +1358,48 @@ FUNC_NAME(ctr_enc): leave; CFI_LEAVE(); ret_spec_stop; 
+ +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $32, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm; +.align 8 +.Lctr_byteadd: + vbroadcasti128 (%rcx), %ymm8; + je .Lctr_byteadd_full_ctr_carry; + addb $32, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_16_16 rRIP, %ymm8, %ymm0; + vpaddb .Lbige_addb_0_1 rRIP, %ymm8, %ymm15; + vpaddb .Lbige_addb_2_3 rRIP, %ymm8, %ymm14; + vmovdqu %ymm15, 15 * 32(%rax); + vpaddb .Lbige_addb_4_5 rRIP, %ymm8, %ymm13; + vmovdqu %ymm14, 14 * 32(%rax); + vpaddb .Lbige_addb_6_7 rRIP, %ymm8, %ymm12; + vmovdqu %ymm13, 13 * 32(%rax); + vpaddb .Lbige_addb_8_9 rRIP, %ymm8, %ymm11; + vpaddb .Lbige_addb_10_11 rRIP, %ymm8, %ymm10; + vpaddb .Lbige_addb_12_13 rRIP, %ymm8, %ymm9; + vpaddb .Lbige_addb_14_15 rRIP, %ymm8, %ymm8; + vpaddb .Lbige_addb_0_1 rRIP, %ymm0, %ymm7; + vpaddb .Lbige_addb_2_3 rRIP, %ymm0, %ymm6; + vpaddb .Lbige_addb_4_5 rRIP, %ymm0, %ymm5; + vpaddb .Lbige_addb_6_7 rRIP, %ymm0, %ymm4; + vpaddb .Lbige_addb_8_9 rRIP, %ymm0, %ymm3; + vpaddb .Lbige_addb_10_11 rRIP, %ymm0, %ymm2; + vpaddb .Lbige_addb_12_13 rRIP, %ymm0, %ymm1; + vpaddb .Lbige_addb_14_15 rRIP, %ymm0, %ymm0; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index 64fef8b6..c62b7848 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -689,6 +689,35 @@ ELF(.type _gcry_camellia_gfni_avx512__constants, at object;) BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +/* CTR byte addition constants */ +.align 64 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lbige_addb_16: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;) .text @@ -836,6 +865,14 @@ _gcry_camellia_gfni_avx512_ctr_enc: CFI_STARTPROC(); spec_stop_avx512; + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + cmpb $(0x100 - 64), 15(%rcx); + jbe .Lctr_byteadd; + vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19; vmovdqa64 .Lcounter0123_lo rRIP, %zmm21; vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22; @@ -851,11 +888,6 @@ _gcry_camellia_gfni_avx512_ctr_enc: vbroadcasti64x2 (%rcx), %zmm0; vpshufb %zmm19, %zmm0, %zmm0; - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* max 
*/ - /* check need for handling 64-bit overflow and carry */ cmpq $(0xffffffffffffffff - 64), %r11; ja .Lload_ctr_carry; @@ -901,8 +933,9 @@ _gcry_camellia_gfni_avx512_ctr_enc: .align 4 .Lload_ctr_done: + vbroadcasti64x2 .Lpack_bswap rRIP, %zmm17; vpbroadcastq (key_table)(CTX), %zmm16; - vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16; + vpshufb %zmm17, %zmm16, %zmm16; /* Byte-swap IVs and update counter. */ addq $64, %r11; @@ -928,6 +961,8 @@ _gcry_camellia_gfni_avx512_ctr_enc: movq %r11, 8(%rcx); movq %r10, (%rcx); +.align 16 +.Lctr_inpack64_pre: /* inpack64_pre: */ vpxorq %zmm0, %zmm16, %zmm0; vpxorq %zmm1, %zmm16, %zmm1; @@ -972,6 +1007,56 @@ _gcry_camellia_gfni_avx512_ctr_enc: clear_regs(); ret_spec_stop; + +.align 16 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $64, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_zmm; +.align 16 +.Lctr_byteadd: + vbroadcasti64x2 (%rcx), %zmm12; + je .Lctr_byteadd_full_ctr_carry; + addb $64, 15(%rcx); +.Lctr_byteadd_zmm: + vbroadcasti64x2 .Lbige_addb_16 rRIP, %zmm16; + vmovdqa64 .Lbige_addb_0_1 rRIP, %zmm17; + vmovdqa64 .Lbige_addb_4_5 rRIP, %zmm18; + vmovdqa64 .Lbige_addb_8_9 rRIP, %zmm19; + vmovdqa64 .Lbige_addb_12_13 rRIP, %zmm20; + vpaddb %zmm16, %zmm12, %zmm8; + vpaddb %zmm17, %zmm12, %zmm15; + vpaddb %zmm18, %zmm12, %zmm14; + vpaddb %zmm19, %zmm12, %zmm13; + vpaddb %zmm20, %zmm12, %zmm12; + vpaddb %zmm16, %zmm8, %zmm4; + vpaddb %zmm17, %zmm8, %zmm11; + vpaddb %zmm18, %zmm8, %zmm10; + vpaddb %zmm19, %zmm8, %zmm9; + vpaddb %zmm20, %zmm8, %zmm8; + vpaddb %zmm16, %zmm4, %zmm0; + vpaddb %zmm17, %zmm4, %zmm7; + vpaddb %zmm18, %zmm4, %zmm6; + vpaddb %zmm19, %zmm4, %zmm5; + vpaddb %zmm20, %zmm4, %zmm4; + vpaddb %zmm17, %zmm0, %zmm3; + vpaddb %zmm18, %zmm0, %zmm2; + vpaddb %zmm19, %zmm0, %zmm1; + vpaddb %zmm20, %zmm0, %zmm0; + + vbroadcasti64x2 .Lpack_bswap rRIP, %zmm17 + vpbroadcastq (key_table)(CTX), %zmm16; + vpshufb %zmm17, %zmm16, %zmm16; + + jmp .Lctr_inpack64_pre; CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;) diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 8b4b4b3c..76a09eb1 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -117,6 +117,7 @@ typedef struct unsigned int use_aesni_avx:1; /* AES-NI/AVX implementation shall be used. */ #endif /*USE_AESNI_AVX*/ #ifdef USE_AESNI_AVX2 + unsigned int use_avx2:1; /* If any of AVX2 implementation is enabled. */ unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */ unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */ unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. 
*/ @@ -463,12 +464,15 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, ctx->use_vaes_avx2 = 0; ctx->use_gfni_avx2 = 0; ctx->use_gfni_avx512 = 0; + ctx->use_avx2 = ctx->use_aesni_avx2; #endif #ifdef USE_VAES_AVX2 ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); + ctx->use_avx2 |= ctx->use_vaes_avx2; #endif #ifdef USE_GFNI_AVX2 ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); + ctx->use_avx2 |= ctx->use_gfni_avx2; #endif #ifdef USE_GFNI_AVX512 ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512); @@ -838,7 +842,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2) + if (ctx->use_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn = @@ -956,7 +960,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2) + if (ctx->use_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn = @@ -1074,7 +1078,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2) + if (ctx->use_avx2) { int did_use_aesni_avx2 = 0; typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn = @@ -1301,7 +1305,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2) + if (ctx->use_avx2) { int did_use_aesni_avx2 = 0; u64 Ls[32]; @@ -1435,7 +1439,7 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, #endif #ifdef USE_AESNI_AVX2 - if (ctx->use_aesni_avx2) + if (ctx->use_avx2) { int did_use_aesni_avx2 = 0; u64 Ls[32]; -- 2.37.2 From ap420073 at gmail.com Wed Feb 22 13:07:43 2023 From: ap420073 at gmail.com (Taehee Yoo) Date: Wed, 22 Feb 2023 21:07:43 +0900 Subject: [PATCH 4/5] aria-avx512: small optimization for aria_diff_m In-Reply-To: <821ad653-4b27-036c-4938-f3c2904eea02@iki.fi> References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> <20230219084910.1302701-4-jussi.kivilinna@iki.fi> <0b0367e5-d5f2-3ed5-58c9-be59a7d770d3@gmail.com> <821ad653-4b27-036c-4938-f3c2904eea02@iki.fi> Message-ID: On 2023. 2. 21. ?? 2:38, Jussi Kivilinna wrote: Hi Jussi, > Hello, > > On 20.2.2023 12.54, Taehee Yoo wrote: >> On 2/19/23 17:49, Jussi Kivilinna wrote: >> >> Hi Jussi, >> Thank you so much for this optimization! >> >> I tested this optimization in the kernel. >> It works very well. >> In my machine(i3-12100), it improves performance ~9%, awesome! > > Interesting.. I'd expect alderlake to behave similarly to tigerlake. Did > you > test with version that has unrolled round functions? > > In libgcrypt, I changed from round unrolling to using loops in order to > reduce > code size and to allow code to fit into uop-cache. Maybe speed increase > happens > since vpternlogq reduces code-size for unrolled version enough and > algorithm fits > into i3-12100's uop-cache, giving the extra performance. > After your response, I retested it and found my benchmark data is wrong. When I implement aria-avx512, the benchmark result is below. 
testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption tcrypt: 1 operation in 1504 cycles (1024 bytes) tcrypt: 1 operation in 4595 cycles (4096 bytes) tcrypt: 1 operation in 1763 cycles (1024 bytes) tcrypt: 1 operation in 5540 cycles (4096 bytes) testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption tcrypt: 1 operation in 1502 cycles (1024 bytes) tcrypt: 1 operation in 4615 cycles (4096 bytes) tcrypt: 1 operation in 1759 cycles (1024 bytes) tcrypt: 1 operation in 5554 cycles (4096 bytes) But the current result is like this. tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption tcrypt: 1 operation in 1443 cycles (1024 bytes) tcrypt: 1 operation in 4396 cycles (4096 bytes) tcrypt: 1 operation in 1683 cycles (1024 bytes) tcrypt: 1 operation in 5368 cycles (4096 bytes) tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption tcrypt: 1 operation in 1458 cycles (1024 bytes) tcrypt: 1 operation in 4416 cycles (4096 bytes) tcrypt: 1 operation in 1723 cycles (1024 bytes) tcrypt: 1 operation in 5358 cycles (4096 bytes) So, after your optimization, the result is like this. tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption tcrypt: 1 operation in 1388 cycles (1024 bytes) tcrypt: 1 operation in 4107 cycles (4096 bytes) tcrypt: 1 operation in 1595 cycles (1024 bytes) tcrypt: 1 operation in 5011 cycles (4096 bytes) tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption tcrypt: 1 operation in 1379 cycles (1024 bytes) tcrypt: 1 operation in 4163 cycles (4096 bytes) tcrypt: 1 operation in 1603 cycles (1024 bytes) tcrypt: 1 operation in 5098 cycles (4096 bytes) The 9% performance gain I reported is actually wrong. I don't know why the results changed... anyway, this optimization increases performance by 5~7%. Also, I tested both the loop and unrolled versions but couldn't find any performance gap between them. I don't have enough knowledge about the uop-cache, so I can't provide anything useful about it. Sorry that the previous benchmark result was wrong. Thank you so much! Taehee Yoo > -Jussi > >> It will be really helpful to the kernel side aria-avx512 driver for >> improving performance. >> >> > * cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for >> > 3-way XOR operation. >> > --- >> > >> > Using vpternlogq gives small performance improvement on AMD Zen4. With >> > Intel tiger-lake speed is the same as before. 
>> > >> > Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): >> > >> > Before: >> > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte >> auto Mhz >> > ECB enc | 0.204 ns/B 4682 MiB/s 0.957 >> c/B 4700 >> > ECB dec | 0.204 ns/B 4668 MiB/s 0.960 >> c/B 4700 >> > CTR enc | 0.212 ns/B 4509 MiB/s 0.994 >> c/B 4700 >> > CTR dec | 0.212 ns/B 4490 MiB/s 0.998 >> c/B 4700 >> > >> > After (~3% faster): >> > ARIA128 | nanosecs/byte mebibytes/sec cycles/byte >> auto Mhz >> > ECB enc | 0.198 ns/B 4812 MiB/s 0.932 >> c/B 4700 >> > ECB dec | 0.198 ns/B 4824 MiB/s 0.929 >> c/B 4700 >> > CTR enc | 0.204 ns/B 4665 MiB/s 0.961 >> c/B 4700 >> > CTR dec | 0.206 ns/B 4631 MiB/s 0.968 >> c/B 4700 >> > >> > Cc: Taehee Yoo >> > Signed-off-by: Jussi Kivilinna >> > --- >> > cipher/aria-gfni-avx512-amd64.S | 16 ++++++---------- >> > 1 file changed, 6 insertions(+), 10 deletions(-) >> > >> > diff --git a/cipher/aria-gfni-avx512-amd64.S >> b/cipher/aria-gfni-avx512-amd64.S >> > index 849c744b..24a49a89 100644 >> > --- a/cipher/aria-gfni-avx512-amd64.S >> > +++ b/cipher/aria-gfni-avx512-amd64.S >> > @@ -406,21 +406,17 @@ >> > vgf2p8affineinvqb $0, t2, y3, y3; \ >> > vgf2p8affineinvqb $0, t2, y7, y7; >> > >> > - >> > #define aria_diff_m(x0, x1, x2, x3, \ >> > t0, t1, t2, t3) \ >> > /* T = rotr32(X, 8); */ \ >> > /* X ^= T */ \ >> > - vpxorq x0, x3, t0; \ >> > - vpxorq x1, x0, t1; \ >> > - vpxorq x2, x1, t2; \ >> > - vpxorq x3, x2, t3; \ >> > /* X = T ^ rotr(X, 16); */ \ >> > - vpxorq t2, x0, x0; \ >> > - vpxorq x1, t3, t3; \ >> > - vpxorq t0, x2, x2; \ >> > - vpxorq t1, x3, x1; \ >> > - vmovdqu64 t3, x3; >> > + vmovdqa64 x0, t0; \ >> > + vmovdqa64 x3, t3; \ >> > + vpternlogq $0x96, x2, x1, x0; \ >> > + vpternlogq $0x96, x2, x1, x3; \ >> > + vpternlogq $0x96, t0, t3, x2; \ >> > + vpternlogq $0x96, t0, t3, x1; >> > >> > #define aria_diff_word(x0, x1, x2, x3, \ >> > x4, x5, x6, x7, \ >> >> Thank you so much! >> Taehee Yoo >> > From jussi.kivilinna at iki.fi Thu Feb 23 17:30:32 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 23 Feb 2023 18:30:32 +0200 Subject: [PATCH 4/5] aria-avx512: small optimization for aria_diff_m In-Reply-To: References: <20230219084910.1302701-1-jussi.kivilinna@iki.fi> <20230219084910.1302701-4-jussi.kivilinna@iki.fi> <0b0367e5-d5f2-3ed5-58c9-be59a7d770d3@gmail.com> <821ad653-4b27-036c-4938-f3c2904eea02@iki.fi> Message-ID: On 22.2.2023 14.07, Taehee Yoo wrote: > On 2023. 2. 21. ?? 2:38, Jussi Kivilinna wrote: > > Hi Jussi, > > > Hello, > > > > On 20.2.2023 12.54, Taehee Yoo wrote: > >> On 2/19/23 17:49, Jussi Kivilinna wrote: > >> > >> Hi Jussi, > >> Thank you so much for this optimization! > >> > >> I tested this optimization in the kernel. > >> It works very well. > >> In my machine(i3-12100), it improves performance ~9%, awesome! > > > > Interesting.. I'd expect alderlake to behave similarly to tigerlake. Did > > you > > test with version that has unrolled round functions? > > > > In libgcrypt, I changed from round unrolling to using loops in order to > > reduce > > code size and to allow code to fit into uop-cache. Maybe speed increase > > happens > > since vpternlogq reduces code-size for unrolled version enough and > > algorithm fits > > into i3-12100's uop-cache, giving the extra performance. > > > > After your response, I retested it and found my benchmark data is wrong. > When I implement aria-avx512, the benchmark result is below. 
> > testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption > tcrypt: 1 operation in 1504 cycles (1024 bytes) > tcrypt: 1 operation in 4595 cycles (4096 bytes) > tcrypt: 1 operation in 1763 cycles (1024 bytes) > tcrypt: 1 operation in 5540 cycles (4096 bytes) > testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption > tcrypt: 1 operation in 1502 cycles (1024 bytes) > tcrypt: 1 operation in 4615 cycles (4096 bytes) > tcrypt: 1 operation in 1759 cycles (1024 bytes) > tcrypt: 1 operation in 5554 cycles (4096 bytes) > > But, the current result is like this. > tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption > tcrypt: 1 operation in 1443 cycles (1024 bytes) > tcrypt: 1 operation in 4396 cycles (4096 bytes) > tcrypt: 1 operation in 1683 cycles (1024 bytes) > tcrypt: 1 operation in 5368 cycles (4096 bytes) > tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption > tcrypt: 1 operation in 1458 cycles (1024 bytes) > tcrypt: 1 operation in 4416 cycles (4096 bytes) > tcrypt: 1 operation in 1723 cycles (1024 bytes) > tcrypt: 1 operation in 5358 cycles (4096 bytes) > > So, after your optimization is like this. > tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) encryption > tcrypt: 1 operation in 1388 cycles (1024 bytes) > tcrypt: 1 operation in 4107 cycles (4096 bytes) > tcrypt: 1 operation in 1595 cycles (1024 bytes) > tcrypt: 1 operation in 5011 cycles (4096 bytes) > tcrypt: testing speed of multibuffer ecb(aria) (ecb-aria-avx512) decryption > tcrypt: 1 operation in 1379 cycles (1024 bytes) > tcrypt: 1 operation in 4163 cycles (4096 bytes) > tcrypt: 1 operation in 1603 cycles (1024 bytes) > tcrypt: 1 operation in 5098 cycles (4096 bytes) > > The 9% performance gap I said is actually wrong. > I don't know why the result is changed... anyway, this optimization increases performance by 5~7%. > Also, I tested it on the both loop and unroll but I couldn't find any performance gap. > I haven't enough knowledge about uop-cache, so I couldn't provide useful for focusing on the uop-cache. > Sorry for that the previous benchmark result is wrong. Ok, thanks for testing. I was just wondering from where the improvement came. Anyway, good to see that there was performance increase on other CPU in addition to AMD Zen4. -Jussi > > Thank you so much! > Taehee Yoo > > > > -Jussi > > > >> It will be really helpful to the kernel side aria-avx512 driver for > >> improving performance. > >> > >>? > * cipher/aria-gfni-avx512-amd64.S (aria_diff_m): Use 'vpternlogq' for > >>? > 3-way XOR operation. > >>? > --- > >>? > > >>? > Using vpternlogq gives small performance improvement on AMD Zen4. With > >>? > Intel tiger-lake speed is the same as before. > >>? > > >>? > Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off): > >>? > > >>? > Before: > >>? >?? ARIA128??????? |? nanosecs/byte?? mebibytes/sec?? cycles/byte > >> auto Mhz > >>? >????????? ECB enc |???? 0.204 ns/B????? 4682 MiB/s???? 0.957 > >> c/B????? 4700 > >>? >????????? ECB dec |???? 0.204 ns/B????? 4668 MiB/s???? 0.960 > >> c/B????? 4700 > >>? >????????? CTR enc |???? 0.212 ns/B????? 4509 MiB/s???? 0.994 > >> c/B????? 4700 > >>? >????????? CTR dec |???? 0.212 ns/B????? 4490 MiB/s???? 0.998 > >> c/B????? 4700 > >>? > > >>? > After (~3% faster): > >>? >?? ARIA128??????? |? nanosecs/byte?? mebibytes/sec?? cycles/byte > >> auto Mhz > >>? >????????? ECB enc |???? 0.198 ns/B????? 4812 MiB/s???? 0.932 > >> c/B????? 4700 > >>? >????????? ECB dec |???? 0.198 ns/B????? 4824 MiB/s???? 
0.929 > >> c/B????? 4700 > >>? >????????? CTR enc |???? 0.204 ns/B????? 4665 MiB/s???? 0.961 > >> c/B????? 4700 > >>? >????????? CTR dec |???? 0.206 ns/B????? 4631 MiB/s???? 0.968 > >> c/B????? 4700 > >>? > > >>? > Cc: Taehee Yoo > >>? > Signed-off-by: Jussi Kivilinna > >>? > --- > >>? >?? cipher/aria-gfni-avx512-amd64.S | 16 ++++++---------- > >>? >?? 1 file changed, 6 insertions(+), 10 deletions(-) > >>? > > >>? > diff --git a/cipher/aria-gfni-avx512-amd64.S > >> b/cipher/aria-gfni-avx512-amd64.S > >>? > index 849c744b..24a49a89 100644 > >>? > --- a/cipher/aria-gfni-avx512-amd64.S > >>? > +++ b/cipher/aria-gfni-avx512-amd64.S > >>? > @@ -406,21 +406,17 @@ > >>? >?????? vgf2p8affineinvqb $0, t2, y3, y3;??????? \ > >>? >?????? vgf2p8affineinvqb $0, t2, y7, y7; > >>? > > >>? > - > >>? >?? #define aria_diff_m(x0, x1, x2, x3,??????????? \ > >>? >?????????????? t0, t1, t2, t3)??????????? \ > >>? >?????? /* T = rotr32(X, 8); */??????????????? \ > >>? >?????? /* X ^= T */??????????????????? \ > >>? > -??? vpxorq x0, x3, t0;??????????????? \ > >>? > -??? vpxorq x1, x0, t1;??????????????? \ > >>? > -??? vpxorq x2, x1, t2;??????????????? \ > >>? > -??? vpxorq x3, x2, t3;??????????????? \ > >>? >?????? /* X = T ^ rotr(X, 16); */??????????? \ > >>? > -??? vpxorq t2, x0, x0;??????????????? \ > >>? > -??? vpxorq x1, t3, t3;??????????????? \ > >>? > -??? vpxorq t0, x2, x2;??????????????? \ > >>? > -??? vpxorq t1, x3, x1;??????????????? \ > >>? > -??? vmovdqu64 t3, x3; > >>? > +??? vmovdqa64 x0, t0;??????????????? \ > >>? > +??? vmovdqa64 x3, t3;??????????????? \ > >>? > +??? vpternlogq $0x96, x2, x1, x0;??????????? \ > >>? > +??? vpternlogq $0x96, x2, x1, x3;??????????? \ > >>? > +??? vpternlogq $0x96, t0, t3, x2;??????????? \ > >>? > +??? vpternlogq $0x96, t0, t3, x1; > >>? > > >>? >?? #define aria_diff_word(x0, x1, x2, x3,??????????? \ > >>? >????????????????? x4, x5, x6, x7,??????????? \ > >> > >> Thank you so much! > >> Taehee Yoo > >> > > > From jussi.kivilinna at iki.fi Sun Feb 26 14:00:34 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 26 Feb 2023 15:00:34 +0200 Subject: [PATCH 2/5] aes-ppc: add ECB bulk acceleration for benchmarking purposes In-Reply-To: <20230226130037.847546-1-jussi.kivilinna@iki.fi> References: <20230226130037.847546-1-jussi.kivilinna@iki.fi> Message-ID: <20230226130037.847546-2-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-functions.h (ECB_CRYPT_FUNC): New. * cipher/rijndael-ppc.c (_gcry_aes_ppc8_ecb_crypt): New. * cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ecb_crypt): New. * cipher/rijndael.c (_gcry_aes_ppc8_ecb_crypt) (_gcry_aes_ppc9le_ecb_crypt): New. (do_setkey): Set up _gcry_aes_ppc8_ecb_crypt for POWER8 and _gcry_aes_ppc9le_ecb_crypt for POWER9. 
-- Benchmark on POWER9: Before: AES | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 0.875 ns/B 1090 MiB/s 2.01 c/B ECB dec | 1.06 ns/B 899.8 MiB/s 2.44 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 0.305 ns/B 3126 MiB/s 0.702 c/B ECB dec | 0.305 ns/B 3126 MiB/s 0.702 c/B Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-functions.h | 257 ++++++++++++++++++++++++++++++++ cipher/rijndael-ppc.c | 1 + cipher/rijndael-ppc9le.c | 1 + cipher/rijndael.c | 10 ++ 4 files changed, 269 insertions(+) diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h index 063c5358..8a05d3c9 100644 --- a/cipher/rijndael-ppc-functions.h +++ b/cipher/rijndael-ppc-functions.h @@ -118,6 +118,263 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg, VEC_STORE_BE (iv_arg, 0, outiv, bige_const); } + +void ECB_CRYPT_FUNC (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + const block bige_const = asm_load_be_const(); + RIJNDAEL_context *ctx = context; + const u128_t *rk = encrypt ? (u128_t *)&ctx->keyschenc + : (u128_t *)&ctx->keyschdec; + const u128_t *in = (const u128_t *)inbuf_arg; + u128_t *out = (u128_t *)outbuf_arg; + int rounds = ctx->rounds; + ROUND_KEY_VARIABLES; + block b0, b1, b2, b3, b4, b5, b6, b7; + block rkey; + + if (!encrypt && !ctx->decryption_prepared) + { + internal_aes_ppc_prepare_decryption (ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rounds); + + for (; nblocks >= 8; nblocks -= 8) + { + b0 = VEC_LOAD_BE (in, 0, bige_const); + b1 = VEC_LOAD_BE (in, 1, bige_const); + b2 = VEC_LOAD_BE (in, 2, bige_const); + b3 = VEC_LOAD_BE (in, 3, bige_const); + b0 = asm_xor (rkey0, b0); + b1 = asm_xor (rkey0, b1); + b4 = VEC_LOAD_BE (in, 4, bige_const); + b5 = VEC_LOAD_BE (in, 5, bige_const); + b2 = asm_xor (rkey0, b2); + b3 = asm_xor (rkey0, b3); + b6 = VEC_LOAD_BE (in, 6, bige_const); + b7 = VEC_LOAD_BE (in, 7, bige_const); + in += 8; + b4 = asm_xor (rkey0, b4); + b5 = asm_xor (rkey0, b5); + b6 = asm_xor (rkey0, b6); + b7 = asm_xor (rkey0, b7); + + if (encrypt) + { +#define DO_ROUND(r) \ + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); \ + b4 = asm_cipher_be (b4, rkey); \ + b5 = asm_cipher_be (b5, rkey); \ + b6 = asm_cipher_be (b6, rkey); \ + b7 = asm_cipher_be (b7, rkey); + + DO_ROUND(1); + DO_ROUND(2); + DO_ROUND(3); + DO_ROUND(4); + DO_ROUND(5); + DO_ROUND(6); + DO_ROUND(7); + DO_ROUND(8); + DO_ROUND(9); + if (rounds >= 12) + { + DO_ROUND(10); + DO_ROUND(11); + if (rounds > 12) + { + DO_ROUND(12); + DO_ROUND(13); + } + } + +#undef DO_ROUND + + b0 = asm_cipherlast_be (b0, rkeylast); + b1 = asm_cipherlast_be (b1, rkeylast); + b2 = asm_cipherlast_be (b2, rkeylast); + b3 = asm_cipherlast_be (b3, rkeylast); + b4 = asm_cipherlast_be (b4, rkeylast); + b5 = asm_cipherlast_be (b5, rkeylast); + b6 = asm_cipherlast_be (b6, rkeylast); + b7 = asm_cipherlast_be (b7, rkeylast); + } + else + { +#define DO_ROUND(r) \ + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_ncipher_be (b0, rkey); \ + b1 = asm_ncipher_be (b1, rkey); \ + b2 = asm_ncipher_be (b2, rkey); \ + b3 = asm_ncipher_be (b3, rkey); \ + b4 = asm_ncipher_be (b4, rkey); \ + b5 = asm_ncipher_be (b5, rkey); \ + b6 = asm_ncipher_be (b6, rkey); \ + b7 = asm_ncipher_be (b7, rkey); + + DO_ROUND(1); + DO_ROUND(2); + DO_ROUND(3); + DO_ROUND(4); + DO_ROUND(5); + DO_ROUND(6); + DO_ROUND(7); + DO_ROUND(8); + DO_ROUND(9); + if (rounds >= 12) + { + 
DO_ROUND(10); + DO_ROUND(11); + if (rounds > 12) + { + DO_ROUND(12); + DO_ROUND(13); + } + } + +#undef DO_ROUND + + b0 = asm_ncipherlast_be (b0, rkeylast); + b1 = asm_ncipherlast_be (b1, rkeylast); + b2 = asm_ncipherlast_be (b2, rkeylast); + b3 = asm_ncipherlast_be (b3, rkeylast); + b4 = asm_ncipherlast_be (b4, rkeylast); + b5 = asm_ncipherlast_be (b5, rkeylast); + b6 = asm_ncipherlast_be (b6, rkeylast); + b7 = asm_ncipherlast_be (b7, rkeylast); + } + + VEC_STORE_BE (out, 0, b0, bige_const); + VEC_STORE_BE (out, 1, b1, bige_const); + VEC_STORE_BE (out, 2, b2, bige_const); + VEC_STORE_BE (out, 3, b3, bige_const); + VEC_STORE_BE (out, 4, b4, bige_const); + VEC_STORE_BE (out, 5, b5, bige_const); + VEC_STORE_BE (out, 6, b6, bige_const); + VEC_STORE_BE (out, 7, b7, bige_const); + out += 8; + } + + if (nblocks >= 4) + { + b0 = VEC_LOAD_BE (in, 0, bige_const); + b1 = VEC_LOAD_BE (in, 1, bige_const); + b2 = VEC_LOAD_BE (in, 2, bige_const); + b3 = VEC_LOAD_BE (in, 3, bige_const); + + b0 = asm_xor (rkey0, b0); + b1 = asm_xor (rkey0, b1); + b2 = asm_xor (rkey0, b2); + b3 = asm_xor (rkey0, b3); + + if (encrypt) + { +#define DO_ROUND(r) \ + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); + + DO_ROUND(1); + DO_ROUND(2); + DO_ROUND(3); + DO_ROUND(4); + DO_ROUND(5); + DO_ROUND(6); + DO_ROUND(7); + DO_ROUND(8); + DO_ROUND(9); + if (rounds >= 12) + { + DO_ROUND(10); + DO_ROUND(11); + if (rounds > 12) + { + DO_ROUND(12); + DO_ROUND(13); + } + } +#undef DO_ROUND + + b0 = asm_cipherlast_be (b0, rkeylast); + b1 = asm_cipherlast_be (b1, rkeylast); + b2 = asm_cipherlast_be (b2, rkeylast); + b3 = asm_cipherlast_be (b3, rkeylast); + } + else + { +#define DO_ROUND(r) \ + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_ncipher_be (b0, rkey); \ + b1 = asm_ncipher_be (b1, rkey); \ + b2 = asm_ncipher_be (b2, rkey); \ + b3 = asm_ncipher_be (b3, rkey); + + DO_ROUND(1); + DO_ROUND(2); + DO_ROUND(3); + DO_ROUND(4); + DO_ROUND(5); + DO_ROUND(6); + DO_ROUND(7); + DO_ROUND(8); + DO_ROUND(9); + if (rounds >= 12) + { + DO_ROUND(10); + DO_ROUND(11); + if (rounds > 12) + { + DO_ROUND(12); + DO_ROUND(13); + } + } +#undef DO_ROUND + + b0 = asm_ncipherlast_be (b0, rkeylast); + b1 = asm_ncipherlast_be (b1, rkeylast); + b2 = asm_ncipherlast_be (b2, rkeylast); + b3 = asm_ncipherlast_be (b3, rkeylast); + } + + VEC_STORE_BE (out, 0, b0, bige_const); + VEC_STORE_BE (out, 1, b1, bige_const); + VEC_STORE_BE (out, 2, b2, bige_const); + VEC_STORE_BE (out, 3, b3, bige_const); + + in += 4; + out += 4; + nblocks -= 4; + } + + for (; nblocks; nblocks--) + { + b0 = VEC_LOAD_BE (in, 0, bige_const); + + if (encrypt) + { + AES_ENCRYPT (b0, rounds); + } + else + { + AES_DECRYPT (b0, rounds); + } + + VEC_STORE_BE (out, 0, b0, bige_const); + + out++; + in++; + } +} + + void CFB_DEC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index 19f6a7e1..53c4f126 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -189,6 +189,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx) #define GCRY_AES_PPC8 1 #define ENCRYPT_BLOCK_FUNC _gcry_aes_ppc8_encrypt #define DECRYPT_BLOCK_FUNC _gcry_aes_ppc8_decrypt +#define ECB_CRYPT_FUNC _gcry_aes_ppc8_ecb_crypt #define CFB_ENC_FUNC _gcry_aes_ppc8_cfb_enc #define CFB_DEC_FUNC _gcry_aes_ppc8_cfb_dec #define CBC_ENC_FUNC _gcry_aes_ppc8_cbc_enc diff --git a/cipher/rijndael-ppc9le.c 
b/cipher/rijndael-ppc9le.c index facdedd4..9ce9c224 100644 --- a/cipher/rijndael-ppc9le.c +++ b/cipher/rijndael-ppc9le.c @@ -88,6 +88,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr) #define GCRY_AES_PPC9LE 1 #define ENCRYPT_BLOCK_FUNC _gcry_aes_ppc9le_encrypt #define DECRYPT_BLOCK_FUNC _gcry_aes_ppc9le_decrypt +#define ECB_CRYPT_FUNC _gcry_aes_ppc9le_ecb_crypt #define CFB_ENC_FUNC _gcry_aes_ppc9le_cfb_enc #define CFB_DEC_FUNC _gcry_aes_ppc9le_cfb_dec #define CBC_ENC_FUNC _gcry_aes_ppc9le_cbc_enc diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 84cb7109..071d4a16 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -250,6 +250,10 @@ extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); +extern void _gcry_aes_ppc8_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); + extern void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); @@ -287,6 +291,10 @@ extern unsigned int _gcry_aes_ppc9le_decrypt(const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src); +extern void _gcry_aes_ppc9le_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); + extern void _gcry_aes_ppc9le_cfb_enc (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); @@ -616,6 +624,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption; /* Setup PPC9LE bulk encryption routines. */ + bulk_ops->ecb_crypt = _gcry_aes_ppc9le_ecb_crypt; bulk_ops->cfb_enc = _gcry_aes_ppc9le_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_ppc9le_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_ppc9le_cbc_enc; @@ -645,6 +654,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption; /* Setup PPC8 bulk encryption routines. */ + bulk_ops->ecb_crypt = _gcry_aes_ppc8_ecb_crypt; bulk_ops->cfb_enc = _gcry_aes_ppc8_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_ppc8_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_ppc8_cbc_enc; -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 26 14:00:37 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 26 Feb 2023 15:00:37 +0200 Subject: [PATCH 5/5] chacha20-ppc: use target and optimize attributes for P8 and P9 In-Reply-To: <20230226130037.847546-1-jussi.kivilinna@iki.fi> References: <20230226130037.847546-1-jussi.kivilinna@iki.fi> Message-ID: <20230226130037.847546-5-jussi.kivilinna@iki.fi> * cipher/chacha20-ppc.c (_gcry_chacha20_ppc8_blocks1): Rename to... (chacha20_ppc_blocks1): ...this; Add 'always inline' attribute. (_gcry_chacha20_ppc8_blocks4): Rename to... (chacha20_ppc_blocks4): ...this; Add 'always inline' attribute. (_gcry_chacha20_poly1305_ppc8_blocks4): Rename to... (chacha20_poly1305_ppc_blocks4): ...this; Add 'always inline' attribute. (FUNC_ATTR_OPT_O2, FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New. (_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4) (_gcry_chacha20_poly1305_ppc8_blocks4): New. (_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4) (_gcry_chacha20_poly1305_ppc9_blocks4): New. * cipher/chacha20.c (CHACHA20_context_t): Add 'use_p9'. (_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4) (_gcry_chacha20_poly1305_ppc9_blocks4): New. (chacha20_do_setkey): Set 'use_p9' if HW has HWF_PPC_ARCH_3_00. 
(chacha20_blocks, do_chacha20_encrypt_stream_tail) (_gcry_chacha20_poly1305_encrypt) (_gcry_chacha20_poly1305_decrypt) [USE_PPC_VEC]: Add 'use_p9' paths. -- This change makes sure that chacha20-ppc gets compiled with proper optimization level and right target setting. Benchmark on POWER9: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.11 ns/B 856.0 MiB/s 2.56 c/B STREAM dec | 1.11 ns/B 856.0 MiB/s 2.56 c/B POLY1305 enc | 1.57 ns/B 606.2 MiB/s 3.62 c/B POLY1305 dec | 1.56 ns/B 610.4 MiB/s 3.59 c/B POLY1305 auth | 0.876 ns/B 1089 MiB/s 2.02 c/B Signed-off-by: Jussi Kivilinna --- cipher/chacha20-ppc.c | 118 ++++++++++++++++++++++++++++++++++++++---- cipher/chacha20.c | 55 ++++++++++++++++---- 2 files changed, 154 insertions(+), 19 deletions(-) diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c index 4a21b837..3fe7bc8c 100644 --- a/cipher/chacha20-ppc.c +++ b/cipher/chacha20-ppc.c @@ -136,9 +136,8 @@ vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a) #define ADD_U64(v,a) \ (v = vec_add_ctr_u64(v, a)) -unsigned int ASM_FUNC_ATTR -_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, - size_t nblks) +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) { vector4x_u32 counter_1 = { 1, 0, 0, 0 }; vector4x_u32 rotate_16 = { 16, 16, 16, 16 }; @@ -283,9 +282,8 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE(b1, rotate_7); ROTATE(b2, rotate_7); -unsigned int ASM_FUNC_ATTR -_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, - size_t nblks) +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) { vector4x_u32 counters_0123 = { 0, 1, 2, 3 }; vector4x_u32 counter_4 = { 4, 0, 0, 0 }; @@ -470,10 +468,10 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \ } while (0) -unsigned int ASM_FUNC_ATTR -_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, - size_t nblks, POLY1305_STATE *st, - const byte *poly1305_src) +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) { vector4x_u32 counters_0123 = { 0, 1, 2, 3 }; vector4x_u32 counter_4 = { 4, 0, 0, 0 }; @@ -641,6 +639,106 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, return 0; } +#else + +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ +} + #endif /* SIZEOF_UNSIGNED_LONG == 8 */ + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) +#else +# define FUNC_ATTR_TARGET_P8 +# define FUNC_ATTR_TARGET_P9 +#endif + + +/* Functions targetting POWER8. 
*/ +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks1(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks4(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ + return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st, + poly1305_src); +} + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +/* Functions targetting POWER9. */ +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks1(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks4(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ + return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st, + poly1305_src); +} +#else +/* Compiler does not support target attribute, use same functions for POWER9 + * as for POWER8. */ +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return _gcry_chacha20_ppc8_blocks1(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return _gcry_chacha20_ppc8_blocks4(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ + return _gcry_chacha20_poly1305_ppc8_blocks4(state, dst, src, nblks, st, + poly1305_src); +} +#endif /* HAVE_GCC_ATTRIBUTE_PPC_TARGET */ + #endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index a7e0dd63..d979d263 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -134,6 +134,7 @@ typedef struct CHACHA20_context_s unsigned int use_avx512:1; unsigned int use_neon:1; unsigned int use_ppc:1; + unsigned int use_p9:1; unsigned int use_p10:1; unsigned int use_s390x:1; } CHACHA20_context_t; @@ -195,12 +196,24 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks); +unsigned int _gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks); + +unsigned int _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, + const byte *src, + size_t nblks); + #undef USE_PPC_VEC_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_PPC_VEC_POLY1305 1 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); + +unsigned int _gcry_chacha20_poly1305_ppc9_blocks4( + u32 *state, byte *dst, const byte *src, size_t nblks, + POLY1305_STATE *st, const byte *poly1305_src); #endif /* 
SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ @@ -369,7 +382,10 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, #ifdef USE_PPC_VEC if (ctx->use_ppc) { - return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); + if (ctx->use_p9) + return _gcry_chacha20_ppc9_blocks1(ctx->input, dst, src, nblks); + else + return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); } #endif @@ -509,6 +525,7 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #endif #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; + ctx->use_p9 = (features & HWF_PPC_ARCH_3_00) != 0; # ifndef WORDS_BIGENDIAN ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; # ifdef ENABLE_FORCE_SOFT_HWFEATURES @@ -626,18 +643,25 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; + if (0) + {} #ifndef WORDS_BIGENDIAN /* * A workaround to skip counter overflow. This is rare. */ - if (ctx->use_p10 && nblocks >= 8 - && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) + else if (ctx->use_p10 && nblocks >= 8 + && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) { size_t len = nblocks * CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len); } - else #endif + else if (ctx->use_p9) + { + nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, + nblocks); + } + else { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); @@ -844,7 +868,10 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, } else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { - nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); + if (ctx->use_p9) + nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, 4); + else + nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; @@ -986,7 +1013,12 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; - nburn = _gcry_chacha20_poly1305_ppc8_blocks4( + if (ctx->use_p9) + nburn = _gcry_chacha20_poly1305_ppc9_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + else + nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? nburn : burn; @@ -1212,9 +1244,14 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; - nburn = _gcry_chacha20_poly1305_ppc8_blocks4( - ctx->input, outbuf, inbuf, nblocks, - &c->u_mode.poly1305.ctx.state, inbuf); + if (ctx->use_p9) + nburn = _gcry_chacha20_poly1305_ppc9_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + else + nburn = _gcry_chacha20_poly1305_ppc8_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? 
nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 26 14:00:35 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 26 Feb 2023 15:00:35 +0200 Subject: [PATCH 3/5] aes-ppc: add CTR32LE bulk acceleration In-Reply-To: <20230226130037.847546-1-jussi.kivilinna@iki.fi> References: <20230226130037.847546-1-jussi.kivilinna@iki.fi> Message-ID: <20230226130037.847546-3-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-functions.h (CTR32LE_ENC_FUNC): New. * cipher/rijndael-ppc.c (_gcry_aes_ppc8_ctr32le_enc): New. * cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ctr32le_enc): New. * cipher/rijndael.c (_gcry_aes_ppc8_ctr32le_enc) (_gcry_aes_ppc9le_ctr32le_enc): New. (do_setkey): Setup _gcry_aes_ppc8_ctr32le_enc for POWER8 and _gcry_aes_ppc9le_ctr32le_enc for POWER9. -- Benchmark on POWER9: Before: AES | nanosecs/byte mebibytes/sec cycles/byte GCM-SIV enc | 1.42 ns/B 672.2 MiB/s 3.26 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte GCM-SIV enc | 0.725 ns/B 1316 MiB/s 1.67 c/B Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-functions.h | 245 ++++++++++++++++++++++++++++++++ cipher/rijndael-ppc.c | 1 + cipher/rijndael-ppc9le.c | 1 + cipher/rijndael.c | 11 ++ 4 files changed, 258 insertions(+) diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h index 8a05d3c9..79eca7a2 100644 --- a/cipher/rijndael-ppc-functions.h +++ b/cipher/rijndael-ppc-functions.h @@ -2292,3 +2292,248 @@ void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg, #undef GEN_TWEAK } + + +void CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ +#ifndef WORDS_BIGENDIAN + static const vec_u32 vec_u32_one = { 1, 0, 0, 0 }; +#else + static const vec_u32 vec_u32_one = { 0, 0, 0, 1 }; +#endif + const block bige_const = asm_load_be_const(); + RIJNDAEL_context *ctx = context; + const u128_t *rk = (u128_t *)&ctx->keyschenc; + const u128_t *in = (const u128_t *)inbuf_arg; + u128_t *out = (u128_t *)outbuf_arg; + int rounds = ctx->rounds; + ROUND_KEY_VARIABLES; + block rkeylast_orig; + block b; + vec_u32 ctr, one; + + ctr = (vec_u32)vec_reve (VEC_LOAD_BE (ctr_arg, 0, bige_const)); + one = vec_u32_one; + + PRELOAD_ROUND_KEYS (rounds); + rkeylast_orig = rkeylast; + +#define VEC_ADD_CTRLE32(ctrv_u32, addv_u32) \ + vec_reve((block)((ctrv_u32) + (addv_u32))) + + if (nblocks >= 4) + { + block in0, in1, in2, in3, in4, in5, in6, in7; + block b0, b1, b2, b3, b4, b5, b6, b7; + vec_u32 two, three, four, five, six, seven, eight; + block rkey; + + two = one + one; + three = two + one; + four = two + two; + five = three + two; + six = three + three; + seven = four + three; + eight = four + four; + + for (; nblocks >= 8; nblocks -= 8) + { + b1 = VEC_ADD_CTRLE32 (ctr, one); + b2 = VEC_ADD_CTRLE32 (ctr, two); + b3 = VEC_ADD_CTRLE32 (ctr, three); + b4 = VEC_ADD_CTRLE32 (ctr, four); + b5 = VEC_ADD_CTRLE32 (ctr, five); + b6 = VEC_ADD_CTRLE32 (ctr, six); + b7 = VEC_ADD_CTRLE32 (ctr, seven); + b0 = asm_xor (rkey0, vec_reve((block)ctr)); + rkey = ALIGNED_LOAD (rk, 1); + ctr = ctr + eight; + b1 = asm_xor (rkey0, b1); + b2 = asm_xor (rkey0, b2); + b3 = asm_xor (rkey0, b3); + b0 = asm_cipher_be (b0, rkey); + b1 = asm_cipher_be (b1, rkey); + b2 = asm_cipher_be (b2, rkey); + b3 = asm_cipher_be (b3, rkey); + b4 = asm_xor (rkey0, b4); + b5 = asm_xor (rkey0, b5); + b6 = asm_xor (rkey0, b6); + b7 = asm_xor (rkey0, b7); + b4 = asm_cipher_be (b4, rkey); + b5 = asm_cipher_be (b5, rkey); + b6 = asm_cipher_be (b6, 
rkey); + b7 = asm_cipher_be (b7, rkey); + +#define DO_ROUND(r) \ + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); \ + b4 = asm_cipher_be (b4, rkey); \ + b5 = asm_cipher_be (b5, rkey); \ + b6 = asm_cipher_be (b6, rkey); \ + b7 = asm_cipher_be (b7, rkey); + + in0 = VEC_LOAD_BE_NOSWAP (in, 0); + DO_ROUND(2); + in1 = VEC_LOAD_BE_NOSWAP (in, 1); + DO_ROUND(3); + in2 = VEC_LOAD_BE_NOSWAP (in, 2); + DO_ROUND(4); + in3 = VEC_LOAD_BE_NOSWAP (in, 3); + DO_ROUND(5); + in4 = VEC_LOAD_BE_NOSWAP (in, 4); + DO_ROUND(6); + in5 = VEC_LOAD_BE_NOSWAP (in, 5); + DO_ROUND(7); + in6 = VEC_LOAD_BE_NOSWAP (in, 6); + DO_ROUND(8); + in7 = VEC_LOAD_BE_NOSWAP (in, 7); + in += 8; + DO_ROUND(9); + + if (rounds >= 12) + { + DO_ROUND(10); + DO_ROUND(11); + if (rounds > 12) + { + DO_ROUND(12); + DO_ROUND(13); + } + } + +#undef DO_ROUND + + in0 = VEC_BE_SWAP (in0, bige_const); + in1 = VEC_BE_SWAP (in1, bige_const); + in2 = VEC_BE_SWAP (in2, bige_const); + in3 = VEC_BE_SWAP (in3, bige_const); + in4 = VEC_BE_SWAP (in4, bige_const); + in5 = VEC_BE_SWAP (in5, bige_const); + in6 = VEC_BE_SWAP (in6, bige_const); + in7 = VEC_BE_SWAP (in7, bige_const); + + in0 = asm_xor (rkeylast, in0); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + in3 = asm_xor (rkeylast, in3); + b0 = asm_cipherlast_be (b0, in0); + b1 = asm_cipherlast_be (b1, in1); + in4 = asm_xor (rkeylast, in4); + in5 = asm_xor (rkeylast, in5); + b2 = asm_cipherlast_be (b2, in2); + b3 = asm_cipherlast_be (b3, in3); + in6 = asm_xor (rkeylast, in6); + in7 = asm_xor (rkeylast, in7); + b4 = asm_cipherlast_be (b4, in4); + b5 = asm_cipherlast_be (b5, in5); + b6 = asm_cipherlast_be (b6, in6); + b7 = asm_cipherlast_be (b7, in7); + + b0 = VEC_BE_SWAP (b0, bige_const); + b1 = VEC_BE_SWAP (b1, bige_const); + b2 = VEC_BE_SWAP (b2, bige_const); + b3 = VEC_BE_SWAP (b3, bige_const); + b4 = VEC_BE_SWAP (b4, bige_const); + b5 = VEC_BE_SWAP (b5, bige_const); + b6 = VEC_BE_SWAP (b6, bige_const); + b7 = VEC_BE_SWAP (b7, bige_const); + VEC_STORE_BE_NOSWAP (out, 0, b0); + VEC_STORE_BE_NOSWAP (out, 1, b1); + VEC_STORE_BE_NOSWAP (out, 2, b2); + VEC_STORE_BE_NOSWAP (out, 3, b3); + VEC_STORE_BE_NOSWAP (out, 4, b4); + VEC_STORE_BE_NOSWAP (out, 5, b5); + VEC_STORE_BE_NOSWAP (out, 6, b6); + VEC_STORE_BE_NOSWAP (out, 7, b7); + out += 8; + } + + if (nblocks >= 4) + { + b1 = VEC_ADD_CTRLE32 (ctr, one); + b2 = VEC_ADD_CTRLE32 (ctr, two); + b3 = VEC_ADD_CTRLE32 (ctr, three); + b0 = asm_xor (rkey0, vec_reve((block)ctr)); + ctr = ctr + four; + b1 = asm_xor (rkey0, b1); + b2 = asm_xor (rkey0, b2); + b3 = asm_xor (rkey0, b3); + +#define DO_ROUND(r) \ + rkey = ALIGNED_LOAD (rk, r); \ + b0 = asm_cipher_be (b0, rkey); \ + b1 = asm_cipher_be (b1, rkey); \ + b2 = asm_cipher_be (b2, rkey); \ + b3 = asm_cipher_be (b3, rkey); + + DO_ROUND(1); + DO_ROUND(2); + DO_ROUND(3); + DO_ROUND(4); + DO_ROUND(5); + DO_ROUND(6); + DO_ROUND(7); + DO_ROUND(8); + + in0 = VEC_LOAD_BE (in, 0, bige_const); + in1 = VEC_LOAD_BE (in, 1, bige_const); + in2 = VEC_LOAD_BE (in, 2, bige_const); + in3 = VEC_LOAD_BE (in, 3, bige_const); + + DO_ROUND(9); + if (rounds >= 12) + { + DO_ROUND(10); + DO_ROUND(11); + if (rounds > 12) + { + DO_ROUND(12); + DO_ROUND(13); + } + } + +#undef DO_ROUND + + in0 = asm_xor (rkeylast, in0); + in1 = asm_xor (rkeylast, in1); + in2 = asm_xor (rkeylast, in2); + in3 = asm_xor (rkeylast, in3); + + b0 = asm_cipherlast_be (b0, in0); + b1 = asm_cipherlast_be (b1, in1); + b2 = 
asm_cipherlast_be (b2, in2); + b3 = asm_cipherlast_be (b3, in3); + + VEC_STORE_BE (out, 0, b0, bige_const); + VEC_STORE_BE (out, 1, b1, bige_const); + VEC_STORE_BE (out, 2, b2, bige_const); + VEC_STORE_BE (out, 3, b3, bige_const); + + in += 4; + out += 4; + nblocks -= 4; + } + } + + for (; nblocks; nblocks--) + { + b = vec_reve((block)ctr); + ctr = ctr + one; + rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const); + + AES_ENCRYPT (b, rounds); + + VEC_STORE_BE (out, 0, b, bige_const); + + out++; + in++; + } + +#undef VEC_ADD_CTRLE32 + + VEC_STORE_BE (ctr_arg, 0, vec_reve((block)ctr), bige_const); +} diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index 53c4f126..d16fbb40 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -198,6 +198,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx) #define OCB_CRYPT_FUNC _gcry_aes_ppc8_ocb_crypt #define OCB_AUTH_FUNC _gcry_aes_ppc8_ocb_auth #define XTS_CRYPT_FUNC _gcry_aes_ppc8_xts_crypt +#define CTR32LE_ENC_FUNC _gcry_aes_ppc8_ctr32le_enc #include diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c index 9ce9c224..f7055290 100644 --- a/cipher/rijndael-ppc9le.c +++ b/cipher/rijndael-ppc9le.c @@ -97,6 +97,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr) #define OCB_CRYPT_FUNC _gcry_aes_ppc9le_ocb_crypt #define OCB_AUTH_FUNC _gcry_aes_ppc9le_ocb_auth #define XTS_CRYPT_FUNC _gcry_aes_ppc9le_xts_crypt +#define CTR32LE_ENC_FUNC _gcry_aes_ppc9le_ctr32le_enc #include diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 071d4a16..b49a0642 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -280,6 +280,10 @@ extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); + +extern void _gcry_aes_ppc8_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); #endif /*USE_PPC_CRYPTO*/ #ifdef USE_PPC_CRYPTO_WITH_PPC9LE @@ -322,6 +326,11 @@ extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak, const void *inbuf_arg, size_t nblocks, int encrypt); +extern void _gcry_aes_ppc9le_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); + extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); @@ -633,6 +642,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt; + bulk_ops->ctr32le_enc = _gcry_aes_ppc9le_ctr32le_enc; if (hwfeatures & HWF_PPC_ARCH_3_10) /* for P10 */ bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt; # ifdef ENABLE_FORCE_SOFT_HWFEATURES @@ -663,6 +673,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt; + bulk_ops->ctr32le_enc = _gcry_aes_ppc8_ctr32le_enc; } #endif #ifdef USE_S390X_CRYPTO -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 26 14:00:36 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 26 Feb 2023 15:00:36 +0200 Subject: [PATCH 4/5] aes-ppc: use target and optimize attributes for P8 and P9 In-Reply-To: <20230226130037.847546-1-jussi.kivilinna@iki.fi> References: <20230226130037.847546-1-jussi.kivilinna@iki.fi> Message-ID: 
<20230226130037.847546-4-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-functions.h: Add PPC_OPT_ATTR attribute macro for all functions. * cipher/rijndael-ppc.c (FUNC_ATTR_OPT, PPC_OPT_ATTR): New. (_gcry_aes_ppc8_setkey, _gcry_aes_ppc8_prepare_decryption): Add PPC_OPT_ATTR attribute macro. * cipher/rijndael-ppc9le.c (FUNC_ATTR_OPT, PPC_OPT_ATTR): New. -- This change makes sure that PPC accelerated AES gets compiled with proper optimization level and right target setting. Benchmark on POWER9: AES | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 0.305 ns/B 3129 MiB/s 0.701 c/B ECB dec | 0.305 ns/B 3127 MiB/s 0.701 c/B CBC enc | 1.66 ns/B 575.3 MiB/s 3.81 c/B CBC dec | 0.318 ns/B 2997 MiB/s 0.732 c/B CFB enc | 1.66 ns/B 574.7 MiB/s 3.82 c/B CFB dec | 0.319 ns/B 2987 MiB/s 0.734 c/B OFB enc | 2.15 ns/B 443.4 MiB/s 4.95 c/B OFB dec | 2.15 ns/B 443.3 MiB/s 4.95 c/B CTR enc | 0.328 ns/B 2907 MiB/s 0.754 c/B CTR dec | 0.328 ns/B 2906 MiB/s 0.755 c/B XTS enc | 0.516 ns/B 1849 MiB/s 1.19 c/B XTS dec | 0.515 ns/B 1850 MiB/s 1.19 c/B CCM enc | 1.98 ns/B 480.6 MiB/s 4.56 c/B CCM dec | 1.98 ns/B 480.5 MiB/s 4.56 c/B CCM auth | 1.66 ns/B 574.9 MiB/s 3.82 c/B EAX enc | 1.99 ns/B 480.2 MiB/s 4.57 c/B EAX dec | 1.99 ns/B 480.2 MiB/s 4.57 c/B EAX auth | 1.66 ns/B 575.2 MiB/s 3.81 c/B GCM enc | 0.552 ns/B 1727 MiB/s 1.27 c/B GCM dec | 0.552 ns/B 1728 MiB/s 1.27 c/B GCM auth | 0.225 ns/B 4240 MiB/s 0.517 c/B OCB enc | 0.381 ns/B 2504 MiB/s 0.876 c/B OCB dec | 0.385 ns/B 2477 MiB/s 0.886 c/B OCB auth | 0.356 ns/B 2682 MiB/s 0.818 c/B SIV enc | 1.98 ns/B 480.9 MiB/s 4.56 c/B SIV dec | 2.11 ns/B 452.9 MiB/s 4.84 c/B SIV auth | 1.66 ns/B 575.4 MiB/s 3.81 c/B GCM-SIV enc | 0.726 ns/B 1314 MiB/s 1.67 c/B GCM-SIV dec | 0.843 ns/B 1131 MiB/s 1.94 c/B GCM-SIV auth | 0.377 ns/B 2527 MiB/s 0.868 c/B Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-functions.h | 71 ++++++++++++++++++--------------- cipher/rijndael-ppc.c | 17 +++++++- cipher/rijndael-ppc9le.c | 13 ++++++ 3 files changed, 66 insertions(+), 35 deletions(-) diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h index 79eca7a2..ec5cda73 100644 --- a/cipher/rijndael-ppc-functions.h +++ b/cipher/rijndael-ppc-functions.h @@ -1,6 +1,6 @@ /* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation * Copyright (C) 2019 Shawn Landden - * Copyright (C) 2019-2020, 2022 Jussi Kivilinna + * Copyright (C) 2019-2020, 2022-2023 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -23,9 +23,9 @@ * is released under. 
*/ -unsigned int ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, - unsigned char *out, - const unsigned char *in) +unsigned int PPC_OPT_ATTR +ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, unsigned char *out, + const unsigned char *in) { const block bige_const = asm_load_be_const(); const u128_t *rk = (u128_t *)&ctx->keyschenc; @@ -44,9 +44,9 @@ unsigned int ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, } -unsigned int DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, - unsigned char *out, - const unsigned char *in) +unsigned int PPC_OPT_ATTR +DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, unsigned char *out, + const unsigned char *in) { const block bige_const = asm_load_be_const(); const u128_t *rk = (u128_t *)&ctx->keyschdec; @@ -65,9 +65,9 @@ unsigned int DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx, } -void CFB_ENC_FUNC (void *context, unsigned char *iv_arg, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks) +void PPC_OPT_ATTR +CFB_ENC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; @@ -119,8 +119,9 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg, } -void ECB_CRYPT_FUNC (void *context, void *outbuf_arg, const void *inbuf_arg, - size_t nblocks, int encrypt) +void PPC_OPT_ATTR +ECB_CRYPT_FUNC (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; @@ -375,9 +376,9 @@ void ECB_CRYPT_FUNC (void *context, void *outbuf_arg, const void *inbuf_arg, } -void CFB_DEC_FUNC (void *context, unsigned char *iv_arg, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks) +void PPC_OPT_ATTR +CFB_DEC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; @@ -573,9 +574,9 @@ void CFB_DEC_FUNC (void *context, unsigned char *iv_arg, } -void CBC_ENC_FUNC (void *context, unsigned char *iv_arg, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks, int cbc_mac) +void PPC_OPT_ATTR +CBC_ENC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int cbc_mac) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; @@ -641,9 +642,10 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg, VEC_STORE_BE (iv_arg, 0, outiv, bige_const); } -void CBC_DEC_FUNC (void *context, unsigned char *iv_arg, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks) + +void PPC_OPT_ATTR +CBC_DEC_FUNC (void *context, unsigned char *iv_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = context; @@ -846,9 +848,9 @@ void CBC_DEC_FUNC (void *context, unsigned char *iv_arg, } -void CTR_ENC_FUNC (void *context, unsigned char *ctr_arg, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks) +void PPC_OPT_ATTR +CTR_ENC_FUNC (void *context, unsigned char *ctr_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) { static const unsigned char vec_one_const[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; @@ -1079,9 +1081,9 @@ void CTR_ENC_FUNC (void *context, unsigned char *ctr_arg, } -size_t OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg, - const void *inbuf_arg, size_t nblocks, - int encrypt) +size_t PPC_OPT_ATTR +OCB_CRYPT_FUNC 
(gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = (void *)&c->context.c; @@ -1585,7 +1587,9 @@ size_t OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg, return 0; } -size_t OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks) + +size_t PPC_OPT_ATTR +OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks) { const block bige_const = asm_load_be_const(); RIJNDAEL_context *ctx = (void *)&c->context.c; @@ -1794,9 +1798,9 @@ size_t OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks) } -void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks, int encrypt) +void PPC_OPT_ATTR +XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) { #ifdef WORDS_BIGENDIAN static const block vec_bswap128_const = @@ -2294,8 +2298,9 @@ void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg, } -void CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg, - const void *inbuf_arg, size_t nblocks) +void PPC_OPT_ATTR +CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) { #ifndef WORDS_BIGENDIAN static const vec_u32 vec_u32_one = { 1, 0, 0, 0 }; diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index d16fbb40..f376e0f1 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -34,6 +34,19 @@ #include "rijndael-ppc-common.h" +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +# define PPC_OPT_ATTR __attribute__((target("cpu=power8"))) FUNC_ATTR_OPT +#else +# define PPC_OPT_ATTR FUNC_ATTR_OPT +#endif + + #ifndef WORDS_BIGENDIAN static const block vec_bswap32_const_neg = { ~3, ~2, ~1, ~0, ~7, ~6, ~5, ~4, ~11, ~10, ~9, ~8, ~15, ~14, ~13, ~12 }; @@ -124,7 +137,7 @@ keysched_idx(unsigned int in) } -void +void PPC_OPT_ATTR _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key) { u32 tk_u32[MAXKC]; @@ -179,7 +192,7 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key) } -void +void PPC_OPT_ATTR _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx) { internal_aes_ppc_prepare_decryption (ctx); diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c index f7055290..e462befc 100644 --- a/cipher/rijndael-ppc9le.c +++ b/cipher/rijndael-ppc9le.c @@ -34,6 +34,19 @@ #include "rijndael-ppc-common.h" +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +# define PPC_OPT_ATTR __attribute__((target("cpu=power9"))) FUNC_ATTR_OPT +#else +# define PPC_OPT_ATTR FUNC_ATTR_OPT +#endif + + static ASM_FUNC_ATTR_INLINE block asm_load_be_const(void) { -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 26 14:38:40 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 26 Feb 2023 15:38:40 +0200 Subject: [PATCH] ppc: add support for clang target attribute Message-ID: <20230226133840.968496-1-jussi.kivilinna@iki.fi> * configure.ac (gcry_cv_clang_attribute_ppc_target): New. * cipher/chacha20-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET] (FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New. * cipher/rijndael-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET] (FPC_OPT_ATTR): New. 
* cipher/rijndael-ppc9le.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET] (FPC_OPT_ATTR): New. * cipher/sha256-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET] (FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New. * cipher/sha512-ppc.c [HAVE_CLANG_ATTRIBUTE_PPC_TARGET] (FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New. (ror64): Remove unused function. -- Signed-off-by: Jussi Kivilinna --- cipher/chacha20-ppc.c | 5 ++++- cipher/rijndael-ppc.c | 4 +++- cipher/rijndael-ppc9le.c | 4 +++- cipher/sha256-ppc.c | 5 ++++- cipher/sha512-ppc.c | 13 +++++-------- configure.ac | 22 ++++++++++++++++++++++ 6 files changed, 41 insertions(+), 12 deletions(-) diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c index 3fe7bc8c..243c12ff 100644 --- a/cipher/chacha20-ppc.c +++ b/cipher/chacha20-ppc.c @@ -657,7 +657,10 @@ chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, # define FUNC_ATTR_OPT_O2 #endif -#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9"))) +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) # define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) # define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) #else diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index f376e0f1..7530209d 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -40,7 +40,9 @@ # define FUNC_ATTR_OPT #endif -#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define PPC_OPT_ATTR __attribute__((target("arch=pwr8"))) FUNC_ATTR_OPT +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) # define PPC_OPT_ATTR __attribute__((target("cpu=power8"))) FUNC_ATTR_OPT #else # define PPC_OPT_ATTR FUNC_ATTR_OPT diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c index e462befc..6a44bcf3 100644 --- a/cipher/rijndael-ppc9le.c +++ b/cipher/rijndael-ppc9le.c @@ -40,7 +40,9 @@ # define FUNC_ATTR_OPT #endif -#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define PPC_OPT_ATTR __attribute__((target("arch=pwr9"))) FUNC_ATTR_OPT +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) # define PPC_OPT_ATTR __attribute__((target("cpu=power9"))) FUNC_ATTR_OPT #else # define PPC_OPT_ATTR FUNC_ATTR_OPT diff --git a/cipher/sha256-ppc.c b/cipher/sha256-ppc.c index 7b17b943..fd69380f 100644 --- a/cipher/sha256-ppc.c +++ b/cipher/sha256-ppc.c @@ -48,7 +48,10 @@ typedef vector unsigned long long vector2x_u64; # define FUNC_ATTR_OPT_O2 #endif -#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9"))) +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) # define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) # define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) #else diff --git a/cipher/sha512-ppc.c b/cipher/sha512-ppc.c index b03aa6aa..6e69ddb9 100644 --- a/cipher/sha512-ppc.c +++ b/cipher/sha512-ppc.c @@ -47,7 +47,11 @@ typedef vector unsigned long long vector2x_u64; # define FUNC_ATTR_OPT_O2 #endif -#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET + +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9"))) 
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) # define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) # define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) #else @@ -101,13 +105,6 @@ static const vector2x_u64 K[80] = }; -static ASM_FUNC_ATTR_INLINE u64 -ror64 (u64 v, u64 shift) -{ - return (v >> (shift & 63)) ^ (v << ((64 - shift) & 63)); -} - - static ASM_FUNC_ATTR_INLINE vector2x_u64 vec_rol_elems(vector2x_u64 v, unsigned int idx) { diff --git a/configure.ac b/configure.ac index 63f705ea..b9ac99bb 100644 --- a/configure.ac +++ b/configure.ac @@ -2291,6 +2291,28 @@ if test "$gcry_cv_gcc_attribute_ppc_target" = "yes" ; then fi +# +# Check whether compiler supports clang PowerPC target attributes +# +AC_CACHE_CHECK([whether compiler supports clang PowerPC target attributes], + [gcry_cv_clang_attribute_ppc_target], + [if test "$mpi_cpu_arch" != "ppc" ; then + gcry_cv_clang_attribute_ppc_target="n/a" + else + gcry_cv_clang_attribute_ppc_target=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[void __attribute__((target("arch=pwr8"))) testfn8(void) {} + void __attribute__((target("arch=pwr9"))) testfn9(void) + { testfn8(); } + ]], [ testfn9(); ])], + [gcry_cv_clang_attribute_ppc_target=yes]) + fi]) +if test "$gcry_cv_clang_attribute_ppc_target" = "yes" ; then + AC_DEFINE(HAVE_CLANG_ATTRIBUTE_PPC_TARGET,1, + [Defined if compiler supports clang PowerPC target attributes]) +fi + + # # Check whether GCC inline assembler supports zSeries instructions # -- 2.37.2 From jussi.kivilinna at iki.fi Sun Feb 26 14:00:33 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 26 Feb 2023 15:00:33 +0200 Subject: [PATCH 1/5] sha2-ppc: better optimization for POWER9 Message-ID: <20230226130037.847546-1-jussi.kivilinna@iki.fi> * cipher/sha256-ppc.c: Change to use vector registers, generate POWER8 and POWER9 from same code with help of 'target' and 'optimize' attribute. * cipher/sha512-ppc.c: Likewise. * configure.ac (gcry_cv_gcc_attribute_optimize) (gcry_cv_gcc_attribute_ppc_target): New. -- Benchmark on POWER9: Before: | nanosecs/byte mebibytes/sec cycles/byte SHA256 | 5.22 ns/B 182.8 MiB/s 12.00 c/B SHA512 | 3.53 ns/B 269.9 MiB/s 8.13 c/B After (sha256 ~12% faster, sha512 ~19% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA256 | 4.65 ns/B 204.9 MiB/s 10.71 c/B SHA512 | 2.97 ns/B 321.1 MiB/s 6.83 c/B Signed-off-by: Jussi Kivilinna --- cipher/sha256-ppc.c | 1017 +++++++++++++++--------------------- cipher/sha512-ppc.c | 1211 +++++++++++++++++-------------------------- configure.ac | 37 ++ 3 files changed, 940 insertions(+), 1325 deletions(-) diff --git a/cipher/sha256-ppc.c b/cipher/sha256-ppc.c index c49d9ff2..7b17b943 100644 --- a/cipher/sha256-ppc.c +++ b/cipher/sha256-ppc.c @@ -1,5 +1,5 @@ /* sha256-ppc.c - PowerPC vcrypto implementation of SHA-256 transform - * Copyright (C) 2019 Jussi Kivilinna + * Copyright (C) 2019,2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -42,26 +42,40 @@ typedef vector unsigned long long vector2x_u64; #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) +#else +# define FUNC_ATTR_TARGET_P8 +# define FUNC_ATTR_TARGET_P9 +#endif -static const u32 K[64] = + +static const vector4x_u32 K[64 / 4] = { #define TBL(v) v - TBL(0x428a2f98), TBL(0x71374491), TBL(0xb5c0fbcf), TBL(0xe9b5dba5), - TBL(0x3956c25b), TBL(0x59f111f1), TBL(0x923f82a4), TBL(0xab1c5ed5), - TBL(0xd807aa98), TBL(0x12835b01), TBL(0x243185be), TBL(0x550c7dc3), - TBL(0x72be5d74), TBL(0x80deb1fe), TBL(0x9bdc06a7), TBL(0xc19bf174), - TBL(0xe49b69c1), TBL(0xefbe4786), TBL(0x0fc19dc6), TBL(0x240ca1cc), - TBL(0x2de92c6f), TBL(0x4a7484aa), TBL(0x5cb0a9dc), TBL(0x76f988da), - TBL(0x983e5152), TBL(0xa831c66d), TBL(0xb00327c8), TBL(0xbf597fc7), - TBL(0xc6e00bf3), TBL(0xd5a79147), TBL(0x06ca6351), TBL(0x14292967), - TBL(0x27b70a85), TBL(0x2e1b2138), TBL(0x4d2c6dfc), TBL(0x53380d13), - TBL(0x650a7354), TBL(0x766a0abb), TBL(0x81c2c92e), TBL(0x92722c85), - TBL(0xa2bfe8a1), TBL(0xa81a664b), TBL(0xc24b8b70), TBL(0xc76c51a3), - TBL(0xd192e819), TBL(0xd6990624), TBL(0xf40e3585), TBL(0x106aa070), - TBL(0x19a4c116), TBL(0x1e376c08), TBL(0x2748774c), TBL(0x34b0bcb5), - TBL(0x391c0cb3), TBL(0x4ed8aa4a), TBL(0x5b9cca4f), TBL(0x682e6ff3), - TBL(0x748f82ee), TBL(0x78a5636f), TBL(0x84c87814), TBL(0x8cc70208), - TBL(0x90befffa), TBL(0xa4506ceb), TBL(0xbef9a3f7), TBL(0xc67178f2) + { TBL(0x428a2f98), TBL(0x71374491), TBL(0xb5c0fbcf), TBL(0xe9b5dba5) }, + { TBL(0x3956c25b), TBL(0x59f111f1), TBL(0x923f82a4), TBL(0xab1c5ed5) }, + { TBL(0xd807aa98), TBL(0x12835b01), TBL(0x243185be), TBL(0x550c7dc3) }, + { TBL(0x72be5d74), TBL(0x80deb1fe), TBL(0x9bdc06a7), TBL(0xc19bf174) }, + { TBL(0xe49b69c1), TBL(0xefbe4786), TBL(0x0fc19dc6), TBL(0x240ca1cc) }, + { TBL(0x2de92c6f), TBL(0x4a7484aa), TBL(0x5cb0a9dc), TBL(0x76f988da) }, + { TBL(0x983e5152), TBL(0xa831c66d), TBL(0xb00327c8), TBL(0xbf597fc7) }, + { TBL(0xc6e00bf3), TBL(0xd5a79147), TBL(0x06ca6351), TBL(0x14292967) }, + { TBL(0x27b70a85), TBL(0x2e1b2138), TBL(0x4d2c6dfc), TBL(0x53380d13) }, + { TBL(0x650a7354), TBL(0x766a0abb), TBL(0x81c2c92e), TBL(0x92722c85) }, + { TBL(0xa2bfe8a1), TBL(0xa81a664b), TBL(0xc24b8b70), TBL(0xc76c51a3) }, + { TBL(0xd192e819), TBL(0xd6990624), TBL(0xf40e3585), TBL(0x106aa070) }, + { TBL(0x19a4c116), TBL(0x1e376c08), TBL(0x2748774c), TBL(0x34b0bcb5) }, + { TBL(0x391c0cb3), TBL(0x4ed8aa4a), TBL(0x5b9cca4f), TBL(0x682e6ff3) }, + { TBL(0x748f82ee), TBL(0x78a5636f), TBL(0x84c87814), TBL(0x8cc70208) }, + { TBL(0x90befffa), TBL(0xa4506ceb), TBL(0xbef9a3f7), TBL(0xc67178f2) } #undef TBL }; @@ -97,19 +111,75 @@ vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b) } +static ASM_FUNC_ATTR_INLINE vector4x_u32 +vec_add_u32(vector4x_u32 v, vector4x_u32 w) +{ + __asm__ ("vadduwm %0,%1,%2" + : "=v" (v) + : "v" (v), "v" (w) + : "memory"); + return v; +} + + +static ASM_FUNC_ATTR_INLINE vector4x_u32 +vec_u32_load_be(unsigned long offset, const void *ptr) +{ + vector4x_u32 vecu32; +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("lxvw4x %x0,0,%1\n\t" + : "=wa" (vecu32) + : "r" ((uintptr_t)ptr) + : "memory"); + else 
+#endif + __asm__ volatile ("lxvw4x %x0,%1,%2\n\t" + : "=wa" (vecu32) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); +#ifndef WORDS_BIGENDIAN + return (vector4x_u32)vec_reve((vector16x_u8)vecu32); +#else + return vecu32; +#endif +} + + /* SHA2 round in vector registers */ -#define R(a,b,c,d,e,f,g,h,k,w) do \ +#define R(a,b,c,d,e,f,g,h,ki,w) do \ { \ - t1 = (h); \ - t1 += ((k) + (w)); \ - t1 += Cho((e),(f),(g)); \ - t1 += Sum1((e)); \ - t2 = Sum0((a)); \ - t2 += Maj((a),(b),(c)); \ - d += t1; \ - h = t1 + t2; \ + t1 = vec_add_u32((h), (w)); \ + t2 = Cho((e),(f),(g)); \ + t1 = vec_add_u32(t1, GETK(ki)); \ + t1 = vec_add_u32(t1, t2); \ + t1 = Sum1add(t1, e); \ + t2 = Maj((a),(b),(c)); \ + t2 = Sum0add(t2, a); \ + h = vec_add_u32(t1, t2); \ + d += t1; \ } while (0) +#define GETK(kidx) \ + ({ \ + vector4x_u32 rk; \ + if (((kidx) % 4) == 0) \ + { \ + rk = ktmp = *(kptr++); \ + if ((kidx) < 63) \ + asm volatile("" : "+r" (kptr) :: "memory"); \ + } \ + else if (((kidx) % 4) == 1) \ + { \ + rk = vec_mergeo(ktmp, ktmp); \ + } \ + else \ + { \ + rk = vec_rol_elems(ktmp, ((kidx) % 4)); \ + } \ + rk; \ + }) + #define Cho(b, c, d) (vec_sel(d, c, b)) #define Maj(c, d, b) (vec_sel(c, b, c ^ d)) @@ -118,52 +188,119 @@ vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b) #define Sum1(x) (vec_vshasigma_u32(x, 1, 15)) - -/* Message expansion on general purpose registers */ -#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3)) -#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10)) - -#define I(i) ( w[i] = buf_get_be32(data + i * 4) ) -#define W(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \ - w[i&0x0f] += S0(w[(i-15)&0x0f]); \ - w[i&0x0f] += S1(w[(i-2) &0x0f]); \ - w[i&0x0f]; }) - -#define I2(i) ( w2[i] = buf_get_be32(64 + data + i * 4), I(i) ) -#define W2(i) ({ w2[i] = w2[i-7]; \ - w2[i] += S1(w2[i-2]); \ - w2[i] += S0(w2[i-15]); \ - w2[i] += w2[i-16]; \ - W(i); }) -#define R2(i) ( w2[i] ) - - -unsigned int ASM_FUNC_ATTR -_gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, - size_t nblks) +#define S0(x) (vec_vshasigma_u32(x, 0, 0)) + +#define S1(x) (vec_vshasigma_u32(x, 0, 15)) + +#define Xadd(X, d, x) vec_add_u32(d, X(x)) + +#define Sum0add(d, x) Xadd(Sum0, d, x) + +#define Sum1add(d, x) Xadd(Sum1, d, x) + +#define S0add(d, x) Xadd(S0, d, x) + +#define S1add(d, x) Xadd(S1, d, x) + +#define I(i) \ + ({ \ + if (((i) % 4) == 0) \ + { \ + w[i] = vec_u32_load_be(0, data); \ + data += 4 * 4; \ + if ((i) / 4 < 3) \ + asm volatile("" : "+r"(data) :: "memory"); \ + } \ + else if (((i) % 4) == 1) \ + { \ + w[i] = vec_mergeo(w[(i) - 1], w[(i) - 1]); \ + } \ + else \ + { \ + w[i] = vec_rol_elems(w[(i) - (i) % 4], (i)); \ + } \ + }) + +#define WN(i) ({ w[(i)&0x0f] += w[((i)-7) &0x0f]; \ + w[(i)&0x0f] = S0add(w[(i)&0x0f], w[((i)-15)&0x0f]); \ + w[(i)&0x0f] = S1add(w[(i)&0x0f], w[((i)-2) &0x0f]); }) + +#define W(i) ({ vector4x_u32 r = w[(i)&0x0f]; WN(i); r; }) + +#define L(i) w[(i)&0x0f] + +#define I2(i) \ + ({ \ + if ((i) % 4 == 0) \ + { \ + vector4x_u32 iw = vec_u32_load_be(0, data); \ + vector4x_u32 iw2 = vec_u32_load_be(64, data); \ + if ((i) / 4 < 3) \ + { \ + data += 4 * 4; \ + asm volatile("" : "+r"(data) :: "memory"); \ + } \ + else \ + { \ + data += 4 * 4 + 64; \ + asm volatile("" : "+r"(data) :: "memory"); \ + } \ + w[(i) + 0] = vec_mergeh(iw, iw2); \ + w[(i) + 1] = vec_rol_elems(w[(i) + 0], 2); \ + w[(i) + 2] = vec_mergel(iw, iw2); \ + w[(i) + 3] = vec_rol_elems(w[(i) + 2], 2); \ + } \ + }) + +#define W2(i) \ + ({ \ + vector4x_u32 wt1 = w[(i)&0x0f]; \ + WN(i); \ + 
w2[(i) / 2] = (((i) % 2) == 0) ? wt1 : vec_mergeo(w2[(i) / 2], wt1); \ + wt1; \ + }) + +#define L2(i) \ + ({ \ + vector4x_u32 lt1 = w[(i)&0x0f]; \ + w2[(i) / 2] = (((i) % 2) == 0) ? lt1 : vec_mergeo(w2[(i) / 2], lt1); \ + lt1; \ + }) + +#define WL(i) \ + ({ \ + vector4x_u32 wlt1 = w2[(i) / 2]; \ + if (((i) % 2) == 0 && (i) < 63) \ + w2[(i) / 2] = vec_mergeo(wlt1, wlt1); \ + wlt1; \ + }) + +static unsigned int ASM_FUNC_ATTR ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 +sha256_transform_ppc(u32 state[8], const unsigned char *data, size_t nblks) { - /* GPRs used for message expansion as vector intrinsics based generates - * slower code. */ vector4x_u32 h0, h1, h2, h3, h4, h5, h6, h7; vector4x_u32 h0_h3, h4_h7; vector4x_u32 a, b, c, d, e, f, g, h, t1, t2; - u32 w[16]; - u32 w2[64]; + vector4x_u32 w[16]; + vector4x_u32 w2[64 / 2]; h0_h3 = vec_vsx_ld (4 * 0, state); h4_h7 = vec_vsx_ld (4 * 4, state); h0 = h0_h3; - h1 = vec_rol_elems (h0_h3, 1); + h1 = vec_mergeo (h0_h3, h0_h3); h2 = vec_rol_elems (h0_h3, 2); h3 = vec_rol_elems (h0_h3, 3); h4 = h4_h7; - h5 = vec_rol_elems (h4_h7, 1); + h5 = vec_mergeo (h4_h7, h4_h7); h6 = vec_rol_elems (h4_h7, 2); h7 = vec_rol_elems (h4_h7, 3); while (nblks >= 2) { + const vector4x_u32 *kptr = K; + vector4x_u32 ktmp; + a = h0; b = h1; c = h2; @@ -173,74 +310,78 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, g = h6; h = h7; - R(a, b, c, d, e, f, g, h, K[0], I2(0)); - R(h, a, b, c, d, e, f, g, K[1], I2(1)); - R(g, h, a, b, c, d, e, f, K[2], I2(2)); - R(f, g, h, a, b, c, d, e, K[3], I2(3)); - R(e, f, g, h, a, b, c, d, K[4], I2(4)); - R(d, e, f, g, h, a, b, c, K[5], I2(5)); - R(c, d, e, f, g, h, a, b, K[6], I2(6)); - R(b, c, d, e, f, g, h, a, K[7], I2(7)); - R(a, b, c, d, e, f, g, h, K[8], I2(8)); - R(h, a, b, c, d, e, f, g, K[9], I2(9)); - R(g, h, a, b, c, d, e, f, K[10], I2(10)); - R(f, g, h, a, b, c, d, e, K[11], I2(11)); - R(e, f, g, h, a, b, c, d, K[12], I2(12)); - R(d, e, f, g, h, a, b, c, K[13], I2(13)); - R(c, d, e, f, g, h, a, b, K[14], I2(14)); - R(b, c, d, e, f, g, h, a, K[15], I2(15)); - data += 64 * 2; - - R(a, b, c, d, e, f, g, h, K[16], W2(16)); - R(h, a, b, c, d, e, f, g, K[17], W2(17)); - R(g, h, a, b, c, d, e, f, K[18], W2(18)); - R(f, g, h, a, b, c, d, e, K[19], W2(19)); - R(e, f, g, h, a, b, c, d, K[20], W2(20)); - R(d, e, f, g, h, a, b, c, K[21], W2(21)); - R(c, d, e, f, g, h, a, b, K[22], W2(22)); - R(b, c, d, e, f, g, h, a, K[23], W2(23)); - R(a, b, c, d, e, f, g, h, K[24], W2(24)); - R(h, a, b, c, d, e, f, g, K[25], W2(25)); - R(g, h, a, b, c, d, e, f, K[26], W2(26)); - R(f, g, h, a, b, c, d, e, K[27], W2(27)); - R(e, f, g, h, a, b, c, d, K[28], W2(28)); - R(d, e, f, g, h, a, b, c, K[29], W2(29)); - R(c, d, e, f, g, h, a, b, K[30], W2(30)); - R(b, c, d, e, f, g, h, a, K[31], W2(31)); - - R(a, b, c, d, e, f, g, h, K[32], W2(32)); - R(h, a, b, c, d, e, f, g, K[33], W2(33)); - R(g, h, a, b, c, d, e, f, K[34], W2(34)); - R(f, g, h, a, b, c, d, e, K[35], W2(35)); - R(e, f, g, h, a, b, c, d, K[36], W2(36)); - R(d, e, f, g, h, a, b, c, K[37], W2(37)); - R(c, d, e, f, g, h, a, b, K[38], W2(38)); - R(b, c, d, e, f, g, h, a, K[39], W2(39)); - R(a, b, c, d, e, f, g, h, K[40], W2(40)); - R(h, a, b, c, d, e, f, g, K[41], W2(41)); - R(g, h, a, b, c, d, e, f, K[42], W2(42)); - R(f, g, h, a, b, c, d, e, K[43], W2(43)); - R(e, f, g, h, a, b, c, d, K[44], W2(44)); - R(d, e, f, g, h, a, b, c, K[45], W2(45)); - R(c, d, e, f, g, h, a, b, K[46], W2(46)); - R(b, c, d, e, f, g, h, a, K[47], W2(47)); - - R(a, b, c, d, e, f, g, h, K[48], W2(48)); - 
R(h, a, b, c, d, e, f, g, K[49], W2(49)); - R(g, h, a, b, c, d, e, f, K[50], W2(50)); - R(f, g, h, a, b, c, d, e, K[51], W2(51)); - R(e, f, g, h, a, b, c, d, K[52], W2(52)); - R(d, e, f, g, h, a, b, c, K[53], W2(53)); - R(c, d, e, f, g, h, a, b, K[54], W2(54)); - R(b, c, d, e, f, g, h, a, K[55], W2(55)); - R(a, b, c, d, e, f, g, h, K[56], W2(56)); - R(h, a, b, c, d, e, f, g, K[57], W2(57)); - R(g, h, a, b, c, d, e, f, K[58], W2(58)); - R(f, g, h, a, b, c, d, e, K[59], W2(59)); - R(e, f, g, h, a, b, c, d, K[60], W2(60)); - R(d, e, f, g, h, a, b, c, K[61], W2(61)); - R(c, d, e, f, g, h, a, b, K[62], W2(62)); - R(b, c, d, e, f, g, h, a, K[63], W2(63)); + I2(0); I2(1); I2(2); I2(3); + I2(4); I2(5); I2(6); I2(7); + I2(8); I2(9); I2(10); I2(11); + I2(12); I2(13); I2(14); I2(15); + + R(a, b, c, d, e, f, g, h, 0, W2(0)); + R(h, a, b, c, d, e, f, g, 1, W2(1)); + R(g, h, a, b, c, d, e, f, 2, W2(2)); + R(f, g, h, a, b, c, d, e, 3, W2(3)); + R(e, f, g, h, a, b, c, d, 4, W2(4)); + R(d, e, f, g, h, a, b, c, 5, W2(5)); + R(c, d, e, f, g, h, a, b, 6, W2(6)); + R(b, c, d, e, f, g, h, a, 7, W2(7)); + R(a, b, c, d, e, f, g, h, 8, W2(8)); + R(h, a, b, c, d, e, f, g, 9, W2(9)); + R(g, h, a, b, c, d, e, f, 10, W2(10)); + R(f, g, h, a, b, c, d, e, 11, W2(11)); + R(e, f, g, h, a, b, c, d, 12, W2(12)); + R(d, e, f, g, h, a, b, c, 13, W2(13)); + R(c, d, e, f, g, h, a, b, 14, W2(14)); + R(b, c, d, e, f, g, h, a, 15, W2(15)); + + R(a, b, c, d, e, f, g, h, 16, W2(16)); + R(h, a, b, c, d, e, f, g, 17, W2(17)); + R(g, h, a, b, c, d, e, f, 18, W2(18)); + R(f, g, h, a, b, c, d, e, 19, W2(19)); + R(e, f, g, h, a, b, c, d, 20, W2(20)); + R(d, e, f, g, h, a, b, c, 21, W2(21)); + R(c, d, e, f, g, h, a, b, 22, W2(22)); + R(b, c, d, e, f, g, h, a, 23, W2(23)); + R(a, b, c, d, e, f, g, h, 24, W2(24)); + R(h, a, b, c, d, e, f, g, 25, W2(25)); + R(g, h, a, b, c, d, e, f, 26, W2(26)); + R(f, g, h, a, b, c, d, e, 27, W2(27)); + R(e, f, g, h, a, b, c, d, 28, W2(28)); + R(d, e, f, g, h, a, b, c, 29, W2(29)); + R(c, d, e, f, g, h, a, b, 30, W2(30)); + R(b, c, d, e, f, g, h, a, 31, W2(31)); + + R(a, b, c, d, e, f, g, h, 32, W2(32)); + R(h, a, b, c, d, e, f, g, 33, W2(33)); + R(g, h, a, b, c, d, e, f, 34, W2(34)); + R(f, g, h, a, b, c, d, e, 35, W2(35)); + R(e, f, g, h, a, b, c, d, 36, W2(36)); + R(d, e, f, g, h, a, b, c, 37, W2(37)); + R(c, d, e, f, g, h, a, b, 38, W2(38)); + R(b, c, d, e, f, g, h, a, 39, W2(39)); + R(a, b, c, d, e, f, g, h, 40, W2(40)); + R(h, a, b, c, d, e, f, g, 41, W2(41)); + R(g, h, a, b, c, d, e, f, 42, W2(42)); + R(f, g, h, a, b, c, d, e, 43, W2(43)); + R(e, f, g, h, a, b, c, d, 44, W2(44)); + R(d, e, f, g, h, a, b, c, 45, W2(45)); + R(c, d, e, f, g, h, a, b, 46, W2(46)); + R(b, c, d, e, f, g, h, a, 47, W2(47)); + + R(a, b, c, d, e, f, g, h, 48, L2(48)); + R(h, a, b, c, d, e, f, g, 49, L2(49)); + R(g, h, a, b, c, d, e, f, 50, L2(50)); + R(f, g, h, a, b, c, d, e, 51, L2(51)); + R(e, f, g, h, a, b, c, d, 52, L2(52)); + R(d, e, f, g, h, a, b, c, 53, L2(53)); + R(c, d, e, f, g, h, a, b, 54, L2(54)); + R(b, c, d, e, f, g, h, a, 55, L2(55)); + R(a, b, c, d, e, f, g, h, 56, L2(56)); + R(h, a, b, c, d, e, f, g, 57, L2(57)); + R(g, h, a, b, c, d, e, f, 58, L2(58)); + R(f, g, h, a, b, c, d, e, 59, L2(59)); + R(e, f, g, h, a, b, c, d, 60, L2(60)); + R(d, e, f, g, h, a, b, c, 61, L2(61)); + R(c, d, e, f, g, h, a, b, 62, L2(62)); + R(b, c, d, e, f, g, h, a, 63, L2(63)); h0 += a; h1 += b; @@ -251,6 +392,8 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, h6 += g; h7 += h; + kptr = K; + a = h0; b = h1; c = 
h2; @@ -260,73 +403,73 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, g = h6; h = h7; - R(a, b, c, d, e, f, g, h, K[0], R2(0)); - R(h, a, b, c, d, e, f, g, K[1], R2(1)); - R(g, h, a, b, c, d, e, f, K[2], R2(2)); - R(f, g, h, a, b, c, d, e, K[3], R2(3)); - R(e, f, g, h, a, b, c, d, K[4], R2(4)); - R(d, e, f, g, h, a, b, c, K[5], R2(5)); - R(c, d, e, f, g, h, a, b, K[6], R2(6)); - R(b, c, d, e, f, g, h, a, K[7], R2(7)); - R(a, b, c, d, e, f, g, h, K[8], R2(8)); - R(h, a, b, c, d, e, f, g, K[9], R2(9)); - R(g, h, a, b, c, d, e, f, K[10], R2(10)); - R(f, g, h, a, b, c, d, e, K[11], R2(11)); - R(e, f, g, h, a, b, c, d, K[12], R2(12)); - R(d, e, f, g, h, a, b, c, K[13], R2(13)); - R(c, d, e, f, g, h, a, b, K[14], R2(14)); - R(b, c, d, e, f, g, h, a, K[15], R2(15)); - - R(a, b, c, d, e, f, g, h, K[16], R2(16)); - R(h, a, b, c, d, e, f, g, K[17], R2(17)); - R(g, h, a, b, c, d, e, f, K[18], R2(18)); - R(f, g, h, a, b, c, d, e, K[19], R2(19)); - R(e, f, g, h, a, b, c, d, K[20], R2(20)); - R(d, e, f, g, h, a, b, c, K[21], R2(21)); - R(c, d, e, f, g, h, a, b, K[22], R2(22)); - R(b, c, d, e, f, g, h, a, K[23], R2(23)); - R(a, b, c, d, e, f, g, h, K[24], R2(24)); - R(h, a, b, c, d, e, f, g, K[25], R2(25)); - R(g, h, a, b, c, d, e, f, K[26], R2(26)); - R(f, g, h, a, b, c, d, e, K[27], R2(27)); - R(e, f, g, h, a, b, c, d, K[28], R2(28)); - R(d, e, f, g, h, a, b, c, K[29], R2(29)); - R(c, d, e, f, g, h, a, b, K[30], R2(30)); - R(b, c, d, e, f, g, h, a, K[31], R2(31)); - - R(a, b, c, d, e, f, g, h, K[32], R2(32)); - R(h, a, b, c, d, e, f, g, K[33], R2(33)); - R(g, h, a, b, c, d, e, f, K[34], R2(34)); - R(f, g, h, a, b, c, d, e, K[35], R2(35)); - R(e, f, g, h, a, b, c, d, K[36], R2(36)); - R(d, e, f, g, h, a, b, c, K[37], R2(37)); - R(c, d, e, f, g, h, a, b, K[38], R2(38)); - R(b, c, d, e, f, g, h, a, K[39], R2(39)); - R(a, b, c, d, e, f, g, h, K[40], R2(40)); - R(h, a, b, c, d, e, f, g, K[41], R2(41)); - R(g, h, a, b, c, d, e, f, K[42], R2(42)); - R(f, g, h, a, b, c, d, e, K[43], R2(43)); - R(e, f, g, h, a, b, c, d, K[44], R2(44)); - R(d, e, f, g, h, a, b, c, K[45], R2(45)); - R(c, d, e, f, g, h, a, b, K[46], R2(46)); - R(b, c, d, e, f, g, h, a, K[47], R2(47)); - - R(a, b, c, d, e, f, g, h, K[48], R2(48)); - R(h, a, b, c, d, e, f, g, K[49], R2(49)); - R(g, h, a, b, c, d, e, f, K[50], R2(50)); - R(f, g, h, a, b, c, d, e, K[51], R2(51)); - R(e, f, g, h, a, b, c, d, K[52], R2(52)); - R(d, e, f, g, h, a, b, c, K[53], R2(53)); - R(c, d, e, f, g, h, a, b, K[54], R2(54)); - R(b, c, d, e, f, g, h, a, K[55], R2(55)); - R(a, b, c, d, e, f, g, h, K[56], R2(56)); - R(h, a, b, c, d, e, f, g, K[57], R2(57)); - R(g, h, a, b, c, d, e, f, K[58], R2(58)); - R(f, g, h, a, b, c, d, e, K[59], R2(59)); - R(e, f, g, h, a, b, c, d, K[60], R2(60)); - R(d, e, f, g, h, a, b, c, K[61], R2(61)); - R(c, d, e, f, g, h, a, b, K[62], R2(62)); - R(b, c, d, e, f, g, h, a, K[63], R2(63)); + R(a, b, c, d, e, f, g, h, 0, WL(0)); + R(h, a, b, c, d, e, f, g, 1, WL(1)); + R(g, h, a, b, c, d, e, f, 2, WL(2)); + R(f, g, h, a, b, c, d, e, 3, WL(3)); + R(e, f, g, h, a, b, c, d, 4, WL(4)); + R(d, e, f, g, h, a, b, c, 5, WL(5)); + R(c, d, e, f, g, h, a, b, 6, WL(6)); + R(b, c, d, e, f, g, h, a, 7, WL(7)); + R(a, b, c, d, e, f, g, h, 8, WL(8)); + R(h, a, b, c, d, e, f, g, 9, WL(9)); + R(g, h, a, b, c, d, e, f, 10, WL(10)); + R(f, g, h, a, b, c, d, e, 11, WL(11)); + R(e, f, g, h, a, b, c, d, 12, WL(12)); + R(d, e, f, g, h, a, b, c, 13, WL(13)); + R(c, d, e, f, g, h, a, b, 14, WL(14)); + R(b, c, d, e, f, g, h, a, 15, WL(15)); + + 
R(a, b, c, d, e, f, g, h, 16, WL(16)); + R(h, a, b, c, d, e, f, g, 17, WL(17)); + R(g, h, a, b, c, d, e, f, 18, WL(18)); + R(f, g, h, a, b, c, d, e, 19, WL(19)); + R(e, f, g, h, a, b, c, d, 20, WL(20)); + R(d, e, f, g, h, a, b, c, 21, WL(21)); + R(c, d, e, f, g, h, a, b, 22, WL(22)); + R(b, c, d, e, f, g, h, a, 23, WL(23)); + R(a, b, c, d, e, f, g, h, 24, WL(24)); + R(h, a, b, c, d, e, f, g, 25, WL(25)); + R(g, h, a, b, c, d, e, f, 26, WL(26)); + R(f, g, h, a, b, c, d, e, 27, WL(27)); + R(e, f, g, h, a, b, c, d, 28, WL(28)); + R(d, e, f, g, h, a, b, c, 29, WL(29)); + R(c, d, e, f, g, h, a, b, 30, WL(30)); + R(b, c, d, e, f, g, h, a, 31, WL(31)); + + R(a, b, c, d, e, f, g, h, 32, WL(32)); + R(h, a, b, c, d, e, f, g, 33, WL(33)); + R(g, h, a, b, c, d, e, f, 34, WL(34)); + R(f, g, h, a, b, c, d, e, 35, WL(35)); + R(e, f, g, h, a, b, c, d, 36, WL(36)); + R(d, e, f, g, h, a, b, c, 37, WL(37)); + R(c, d, e, f, g, h, a, b, 38, WL(38)); + R(b, c, d, e, f, g, h, a, 39, WL(39)); + R(a, b, c, d, e, f, g, h, 40, WL(40)); + R(h, a, b, c, d, e, f, g, 41, WL(41)); + R(g, h, a, b, c, d, e, f, 42, WL(42)); + R(f, g, h, a, b, c, d, e, 43, WL(43)); + R(e, f, g, h, a, b, c, d, 44, WL(44)); + R(d, e, f, g, h, a, b, c, 45, WL(45)); + R(c, d, e, f, g, h, a, b, 46, WL(46)); + R(b, c, d, e, f, g, h, a, 47, WL(47)); + + R(a, b, c, d, e, f, g, h, 48, WL(48)); + R(h, a, b, c, d, e, f, g, 49, WL(49)); + R(g, h, a, b, c, d, e, f, 50, WL(50)); + R(f, g, h, a, b, c, d, e, 51, WL(51)); + R(e, f, g, h, a, b, c, d, 52, WL(52)); + R(d, e, f, g, h, a, b, c, 53, WL(53)); + R(c, d, e, f, g, h, a, b, 54, WL(54)); + R(b, c, d, e, f, g, h, a, 55, WL(55)); + R(a, b, c, d, e, f, g, h, 56, WL(56)); + R(h, a, b, c, d, e, f, g, 57, WL(57)); + R(g, h, a, b, c, d, e, f, 58, WL(58)); + R(f, g, h, a, b, c, d, e, 59, WL(59)); + R(e, f, g, h, a, b, c, d, 60, WL(60)); + R(d, e, f, g, h, a, b, c, 61, WL(61)); + R(c, d, e, f, g, h, a, b, 62, WL(62)); + R(b, c, d, e, f, g, h, a, 63, WL(63)); h0 += a; h1 += b; @@ -340,8 +483,11 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, nblks -= 2; } - while (nblks) + if (nblks) { + const vector4x_u32 *kptr = K; + vector4x_u32 ktmp; + a = h0; b = h1; c = h2; @@ -351,74 +497,78 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, g = h6; h = h7; - R(a, b, c, d, e, f, g, h, K[0], I(0)); - R(h, a, b, c, d, e, f, g, K[1], I(1)); - R(g, h, a, b, c, d, e, f, K[2], I(2)); - R(f, g, h, a, b, c, d, e, K[3], I(3)); - R(e, f, g, h, a, b, c, d, K[4], I(4)); - R(d, e, f, g, h, a, b, c, K[5], I(5)); - R(c, d, e, f, g, h, a, b, K[6], I(6)); - R(b, c, d, e, f, g, h, a, K[7], I(7)); - R(a, b, c, d, e, f, g, h, K[8], I(8)); - R(h, a, b, c, d, e, f, g, K[9], I(9)); - R(g, h, a, b, c, d, e, f, K[10], I(10)); - R(f, g, h, a, b, c, d, e, K[11], I(11)); - R(e, f, g, h, a, b, c, d, K[12], I(12)); - R(d, e, f, g, h, a, b, c, K[13], I(13)); - R(c, d, e, f, g, h, a, b, K[14], I(14)); - R(b, c, d, e, f, g, h, a, K[15], I(15)); - data += 64; - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - 
R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); + I(0); I(1); I(2); I(3); + I(4); I(5); I(6); I(7); + I(8); I(9); I(10); I(11); + I(12); I(13); I(14); I(15); + + R(a, b, c, d, e, f, g, h, 0, W(0)); + R(h, a, b, c, d, e, f, g, 1, W(1)); + R(g, h, a, b, c, d, e, f, 2, W(2)); + R(f, g, h, a, b, c, d, e, 3, W(3)); + R(e, f, g, h, a, b, c, d, 4, W(4)); + R(d, e, f, g, h, a, b, c, 5, W(5)); + R(c, d, e, f, g, h, a, b, 6, W(6)); + R(b, c, d, e, f, g, h, a, 7, W(7)); + R(a, b, c, d, e, f, g, h, 8, W(8)); + R(h, a, b, c, d, e, f, g, 9, W(9)); + R(g, h, a, b, c, d, e, f, 10, W(10)); + R(f, g, h, a, b, c, d, e, 11, W(11)); + R(e, f, g, h, a, b, c, d, 12, W(12)); + R(d, e, f, g, h, a, b, c, 13, W(13)); + R(c, d, e, f, g, h, a, b, 14, W(14)); + R(b, c, d, e, f, g, h, a, 15, W(15)); + + R(a, b, c, d, e, f, g, h, 16, W(16)); + R(h, a, b, c, d, e, f, g, 17, W(17)); + R(g, h, a, b, c, d, e, f, 18, W(18)); + R(f, g, h, a, b, c, d, e, 19, W(19)); + R(e, f, g, h, a, b, c, d, 20, W(20)); + R(d, e, f, g, h, a, b, c, 21, W(21)); + R(c, d, e, f, g, h, a, b, 22, W(22)); + R(b, c, d, e, f, g, h, a, 23, W(23)); + R(a, b, c, d, e, f, g, h, 24, W(24)); + R(h, a, b, c, d, e, f, g, 25, W(25)); + R(g, h, a, b, c, d, e, f, 26, W(26)); + R(f, g, h, a, b, c, d, e, 27, W(27)); + R(e, f, g, h, a, b, c, d, 28, W(28)); + R(d, e, f, g, h, a, b, c, 29, W(29)); + R(c, d, e, f, g, h, a, b, 30, W(30)); + R(b, c, d, e, f, g, h, a, 31, W(31)); + + R(a, b, c, d, e, f, g, h, 32, W(32)); + R(h, a, b, c, d, e, f, g, 33, W(33)); + R(g, h, a, b, c, d, e, f, 34, W(34)); + R(f, g, h, a, b, c, d, e, 35, W(35)); + R(e, f, g, h, a, b, c, d, 36, W(36)); + R(d, e, f, g, h, a, b, c, 37, W(37)); + R(c, d, e, f, g, h, a, b, 38, W(38)); + R(b, c, d, e, f, g, h, a, 39, W(39)); + R(a, b, c, d, e, f, g, h, 40, W(40)); + R(h, a, b, c, d, e, f, g, 41, W(41)); + R(g, h, a, b, c, d, e, f, 42, W(42)); + R(f, g, h, a, b, c, d, e, 43, W(43)); + R(e, f, g, h, a, b, c, d, 44, W(44)); + R(d, e, f, g, h, a, b, c, 45, W(45)); + R(c, d, e, f, g, h, a, b, 46, W(46)); + R(b, c, d, e, f, g, h, a, 47, W(47)); + + R(a, b, c, d, e, f, g, h, 
48, L(48)); + R(h, a, b, c, d, e, f, g, 49, L(49)); + R(g, h, a, b, c, d, e, f, 50, L(50)); + R(f, g, h, a, b, c, d, e, 51, L(51)); + R(e, f, g, h, a, b, c, d, 52, L(52)); + R(d, e, f, g, h, a, b, c, 53, L(53)); + R(c, d, e, f, g, h, a, b, 54, L(54)); + R(b, c, d, e, f, g, h, a, 55, L(55)); + R(a, b, c, d, e, f, g, h, 56, L(56)); + R(h, a, b, c, d, e, f, g, 57, L(57)); + R(g, h, a, b, c, d, e, f, 58, L(58)); + R(f, g, h, a, b, c, d, e, 59, L(59)); + R(e, f, g, h, a, b, c, d, 60, L(60)); + R(d, e, f, g, h, a, b, c, 61, L(61)); + R(c, d, e, f, g, h, a, b, 62, L(62)); + R(b, c, d, e, f, g, h, a, 63, L(63)); h0 += a; h1 += b; @@ -439,350 +589,19 @@ _gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, return sizeof(w2) + sizeof(w); } -#undef R -#undef Cho -#undef Maj -#undef Sum0 -#undef Sum1 -#undef S0 -#undef S1 -#undef I -#undef W -#undef I2 -#undef W2 -#undef R2 - - -/* SHA2 round in general purpose registers */ -#define R(a,b,c,d,e,f,g,h,k,w) do \ - { \ - t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + ((k) + (w));\ - t2 = Sum0((a)) + Maj((a),(b),(c)); \ - d += t1; \ - h = t1 + t2; \ - } while (0) - -#define Cho(x, y, z) ((x & y) + (~x & z)) - -#define Maj(z, x, y) ((x & y) + (z & (x ^ y))) - -#define Sum0(x) (ror (x, 2) ^ ror (x ^ ror (x, 22-13), 13)) - -#define Sum1(x) (ror (x, 6) ^ ror (x, 11) ^ ror (x, 25)) - - -/* Message expansion on general purpose registers */ -#define S0(x) (ror ((x), 7) ^ ror ((x), 18) ^ ((x) >> 3)) -#define S1(x) (ror ((x), 17) ^ ror ((x), 19) ^ ((x) >> 10)) - -#define I(i) ( w[i] = buf_get_be32(data + i * 4) ) -#define WN(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \ - w[i&0x0f] += S0(w[(i-15)&0x0f]); \ - w[i&0x0f] += S1(w[(i-2) &0x0f]); \ - w[i&0x0f]; }) -#define W(i) ({ u32 r = w[i&0x0f]; WN(i); r; }) -#define L(i) w[i&0x0f] +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_sha256_transform_ppc8(u32 state[8], const unsigned char *data, + size_t nblks) +{ + return sha256_transform_ppc(state, data, nblks); +} -unsigned int ASM_FUNC_ATTR +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 _gcry_sha256_transform_ppc9(u32 state[8], const unsigned char *data, size_t nblks) { - /* GPRs used for round function and message expansion as vector intrinsics - * based generates slower code for POWER9. 
*/ - u32 a, b, c, d, e, f, g, h, t1, t2; - u32 w[16]; - - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - f = state[5]; - g = state[6]; - h = state[7]; - - while (nblks >= 2) - { - I(0); I(1); I(2); I(3); - I(4); I(5); I(6); I(7); - I(8); I(9); I(10); I(11); - I(12); I(13); I(14); I(15); - data += 64; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], L(48)); - R(h, a, b, c, d, e, f, g, K[49], L(49)); - R(g, h, a, b, c, d, e, f, K[50], L(50)); - R(f, g, h, a, b, c, d, e, K[51], L(51)); - I(0); I(1); I(2); I(3); - R(e, f, g, h, a, b, c, d, K[52], L(52)); - R(d, e, f, g, h, a, b, c, K[53], L(53)); - R(c, d, e, f, g, h, a, b, K[54], L(54)); - R(b, c, d, e, f, g, h, a, K[55], L(55)); - I(4); I(5); I(6); I(7); - R(a, b, c, d, e, f, g, h, K[56], L(56)); - R(h, a, b, c, d, e, f, g, K[57], L(57)); - R(g, h, a, b, c, d, e, f, K[58], L(58)); - R(f, g, h, a, b, c, d, e, K[59], L(59)); - I(8); I(9); I(10); I(11); - R(e, f, g, h, a, b, c, d, K[60], L(60)); - R(d, e, f, g, h, a, b, c, K[61], L(61)); - R(c, d, e, f, g, h, a, b, K[62], L(62)); - R(b, c, d, e, f, g, h, a, K[63], L(63)); - I(12); I(13); I(14); I(15); - data += 64; - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, 
h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], L(48)); - R(h, a, b, c, d, e, f, g, K[49], L(49)); - R(g, h, a, b, c, d, e, f, K[50], L(50)); - R(f, g, h, a, b, c, d, e, K[51], L(51)); - R(e, f, g, h, a, b, c, d, K[52], L(52)); - R(d, e, f, g, h, a, b, c, K[53], L(53)); - R(c, d, e, f, g, h, a, b, K[54], L(54)); - R(b, c, d, e, f, g, h, a, K[55], L(55)); - R(a, b, c, d, e, f, g, h, K[56], L(56)); - R(h, a, b, c, d, e, f, g, K[57], L(57)); - R(g, h, a, b, c, d, e, f, K[58], L(58)); - R(f, g, h, a, b, c, d, e, K[59], L(59)); - R(e, f, g, h, a, b, c, d, K[60], L(60)); - R(d, e, f, g, h, a, b, c, K[61], L(61)); - R(c, d, e, f, g, h, a, b, K[62], L(62)); - R(b, c, d, e, f, g, h, a, K[63], L(63)); - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - nblks -= 2; - } - - while (nblks) - { - I(0); I(1); I(2); I(3); - I(4); I(5); I(6); I(7); - I(8); I(9); I(10); I(11); - I(12); I(13); I(14); I(15); - data += 64; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, 
c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], L(48)); - R(h, a, b, c, d, e, f, g, K[49], L(49)); - R(g, h, a, b, c, d, e, f, K[50], L(50)); - R(f, g, h, a, b, c, d, e, K[51], L(51)); - R(e, f, g, h, a, b, c, d, K[52], L(52)); - R(d, e, f, g, h, a, b, c, K[53], L(53)); - R(c, d, e, f, g, h, a, b, K[54], L(54)); - R(b, c, d, e, f, g, h, a, K[55], L(55)); - R(a, b, c, d, e, f, g, h, K[56], L(56)); - R(h, a, b, c, d, e, f, g, K[57], L(57)); - R(g, h, a, b, c, d, e, f, K[58], L(58)); - R(f, g, h, a, b, c, d, e, K[59], L(59)); - R(e, f, g, h, a, b, c, d, K[60], L(60)); - R(d, e, f, g, h, a, b, c, K[61], L(61)); - R(c, d, e, f, g, h, a, b, K[62], L(62)); - R(b, c, d, e, f, g, h, a, K[63], L(63)); - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - nblks--; - } - - return sizeof(w); + return sha256_transform_ppc(state, data, nblks); } #endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/cipher/sha512-ppc.c b/cipher/sha512-ppc.c index 31ea25bf..b03aa6aa 100644 --- a/cipher/sha512-ppc.c +++ b/cipher/sha512-ppc.c @@ -1,5 +1,5 @@ /* sha512-ppc.c - PowerPC vcrypto implementation of SHA-512 transform - * Copyright (C) 2019 Jussi Kivilinna + * Copyright (C) 2019,2023 Jussi Kivilinna * * This file is part of Libgcrypt. 
* @@ -41,49 +41,63 @@ typedef vector unsigned long long vector2x_u64; #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE #define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) +#else +# define FUNC_ATTR_TARGET_P8 +# define FUNC_ATTR_TARGET_P9 +#endif + -static const u64 K[80] = +static const vector2x_u64 K[80] = { - U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), - U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), - U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), - U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), - U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), - U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), - U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), - U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), - U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), - U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), - U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), - U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), - U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), - U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), - U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), - U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), - U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), - U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), - U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8), - U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), - U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), - U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), - U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), - U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), - U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), - U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), - U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), - U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), - U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), - U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), - U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), - U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), - U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), - U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), - U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), - U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), - U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), - U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), - U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), - U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) + { U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd) }, + { U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc) }, + { U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019) }, + { U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118) }, + { U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe) }, + { U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2) }, + { U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1) }, + { U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694) }, + { U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3) }, + { U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65) }, + { U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483) }, + { 
U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5) }, + { U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210) }, + { U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4) }, + { U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725) }, + { U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70) }, + { U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926) }, + { U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df) }, + { U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8) }, + { U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b) }, + { U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001) }, + { U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30) }, + { U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910) }, + { U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8) }, + { U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53) }, + { U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8) }, + { U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb) }, + { U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3) }, + { U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60) }, + { U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec) }, + { U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9) }, + { U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b) }, + { U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207) }, + { U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178) }, + { U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6) }, + { U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b) }, + { U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493) }, + { U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c) }, + { U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a) }, + { U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) } }; @@ -123,6 +137,17 @@ vec_vshasigma_u64(vector2x_u64 v, unsigned int a, unsigned int b) } +static ASM_FUNC_ATTR_INLINE vector2x_u64 +vec_add_u64(vector2x_u64 v, vector2x_u64 w) +{ + __asm__ ("vaddudm %0,%1,%2" + : "=v" (v) + : "v" (v), "v" (w) + : "memory"); + return v; +} + + static ASM_FUNC_ATTR_INLINE vector2x_u64 vec_u64_load(unsigned long offset, const void *ptr) { @@ -171,19 +196,59 @@ vec_u64_store(vector2x_u64 vecu64, unsigned long offset, void *ptr) } +static ASM_FUNC_ATTR_INLINE vector2x_u64 +vec_u64_load_be(unsigned long offset, const void *ptr) +{ + vector2x_u64 vecu64; +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("lxvd2x %x0,0,%1\n\t" + : "=wa" (vecu64) + : "r" ((uintptr_t)ptr) + : "memory"); + else +#endif + __asm__ volatile ("lxvd2x %x0,%1,%2\n\t" + : "=wa" (vecu64) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); +#ifndef WORDS_BIGENDIAN + return (vector2x_u64)vec_reve((vector16x_u8)vecu64); +#else + return vecu64; +#endif +} + + /* SHA2 round in vector registers */ -#define R(a,b,c,d,e,f,g,h,k,w) do \ +#define R(a,b,c,d,e,f,g,h,ki,w) do \ { \ - t1 = (h); \ - t1 += ((k) + (w)); \ - t1 += Cho((e),(f),(g)); \ - t1 += Sum1((e)); \ - t2 = Sum0((a)); \ - t2 += Maj((a),(b),(c)); \ - d += t1; \ - h = t1 + t2; \ + t1 = vec_add_u64((h), (w)); \ + t2 = Cho((e),(f),(g)); \ + t1 = vec_add_u64(t1, GETK(ki)); \ + t1 = vec_add_u64(t1, t2); \ + t1 = Sum1add(t1, e); \ + t2 = Maj((a),(b),(c)); \ + t2 = Sum0add(t2, a); \ + h = vec_add_u64(t1, t2); \ + d += t1; \ } while (0) +#define GETK(kidx) \ + ({ \ + if (((kidx) % 2) == 0) \ + { \ + ktmp = *(kptr++); \ + if ((kidx) < 79) \ + asm volatile("" : "+r" (kptr) :: "memory"); \ + } \ + else \ + { \ + ktmp = vec_mergel(ktmp, ktmp); \ + } \ + ktmp; \ + }) + #define Cho(b, c, d) (vec_sel(d, 
c, b)) #define Maj(c, d, b) (vec_sel(c, b, c ^ d)) @@ -192,29 +257,98 @@ vec_u64_store(vector2x_u64 vecu64, unsigned long offset, void *ptr) #define Sum1(x) (vec_vshasigma_u64(x, 1, 15)) - -/* Message expansion on general purpose registers */ -#define S0(x) (ror64 ((x), 1) ^ ror64 ((x), 8) ^ ((x) >> 7)) -#define S1(x) (ror64 ((x), 19) ^ ror64 ((x), 61) ^ ((x) >> 6)) - -#define I(i) ( w[i] = buf_get_be64(data + i * 8) ) -#define WN(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \ - w[i&0x0f] += S0(w[(i-15)&0x0f]); \ - w[i&0x0f] += S1(w[(i-2) &0x0f]); \ - w[i&0x0f]; }) -#define W(i) ({ u64 r = w[i&0x0f]; WN(i); r; }) -#define L(i) w[i&0x0f] - - -unsigned int ASM_FUNC_ATTR -_gcry_sha512_transform_ppc8(u64 state[8], - const unsigned char *data, size_t nblks) +#define S0(x) (vec_vshasigma_u64(x, 0, 0)) + +#define S1(x) (vec_vshasigma_u64(x, 0, 15)) + +#define Xadd(X, d, x) vec_add_u64(d, X(x)) + +#define Sum0add(d, x) Xadd(Sum0, d, x) + +#define Sum1add(d, x) Xadd(Sum1, d, x) + +#define S0add(d, x) Xadd(S0, d, x) + +#define S1add(d, x) Xadd(S1, d, x) + +#define I(i) \ + ({ \ + if (((i) % 2) == 0) \ + { \ + w[i] = vec_u64_load_be(0, data); \ + data += 2 * 8; \ + if ((i) / 2 < 7) \ + asm volatile("" : "+r"(data) :: "memory"); \ + } \ + else \ + { \ + w[i] = vec_mergel(w[(i) - 1], w[(i) - 1]); \ + } \ + }) + +#define WN(i) ({ w[(i)&0x0f] += w[((i)-7) &0x0f]; \ + w[(i)&0x0f] = S0add(w[(i)&0x0f], w[((i)-15)&0x0f]); \ + w[(i)&0x0f] = S1add(w[(i)&0x0f], w[((i)-2) &0x0f]); }) + +#define W(i) ({ vector2x_u64 r = w[(i)&0x0f]; WN(i); r; }) + +#define L(i) w[(i)&0x0f] + +#define I2(i) \ + ({ \ + if (((i) % 2) == 0) \ + { \ + w[i] = vec_u64_load_be(0, data); \ + } \ + else \ + { \ + vector2x_u64 it1 = vec_u64_load_be(128, data); \ + vector2x_u64 it2 = vec_mergeh(w[(i) - 1], it1); \ + w[i] = vec_mergel(w[(i) - 1], it1); \ + w[(i) - 1] = it2; \ + if ((i) < 15) \ + { \ + data += 2 * 8; \ + asm volatile("" : "+r"(data) :: "memory"); \ + } \ + else \ + { \ + data += 2 * 8 + 128; \ + asm volatile("" : "+r"(data) :: "memory"); \ + } \ + } \ + }) + +#define W2(i) \ + ({ \ + vector2x_u64 wt1 = w[(i)&0x0f]; \ + WN(i); \ + w2[(i) / 2] = (((i) % 2) == 0) ? wt1 : vec_mergel(w2[(i) / 2], wt1); \ + wt1; \ + }) + +#define L2(i) \ + ({ \ + vector2x_u64 lt1 = w[(i)&0x0f]; \ + w2[(i) / 2] = (((i) % 2) == 0) ? lt1 : vec_mergel(w2[(i) / 2], lt1); \ + lt1; \ + }) + +#define WL(i) \ + ({ \ + vector2x_u64 wlt1 = w2[(i) / 2]; \ + if (((i) % 2) == 0 && (i) < 79) \ + w2[(i) / 2] = vec_mergel(wlt1, wlt1); \ + wlt1; \ + }) + +static unsigned int ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 +sha512_transform_ppc(u64 state[8], const unsigned char *data, size_t nblks) { - /* GPRs used for message expansion as vector intrinsics based generates - * slower code. 
*/ vector2x_u64 h0, h1, h2, h3, h4, h5, h6, h7; vector2x_u64 a, b, c, d, e, f, g, h, t1, t2; - u64 w[16]; + vector2x_u64 w[16]; + vector2x_u64 w2[80 / 2]; h0 = vec_u64_load (8 * 0, (unsigned long long *)state); h1 = vec_rol_elems (h0, 1); @@ -227,6 +361,9 @@ _gcry_sha512_transform_ppc8(u64 state[8], while (nblks >= 2) { + const vector2x_u64 *kptr = K; + vector2x_u64 ktmp; + a = h0; b = h1; c = h2; @@ -236,100 +373,96 @@ _gcry_sha512_transform_ppc8(u64 state[8], g = h6; h = h7; - I(0); I(1); I(2); I(3); - I(4); I(5); I(6); I(7); - I(8); I(9); I(10); I(11); - I(12); I(13); I(14); I(15); - data += 128; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - R(a, b, c, d, e, f, g, h, K[64], L(64)); - R(h, a, b, c, d, e, f, g, K[65], L(65)); - R(g, h, a, b, c, d, e, f, K[66], L(66)); - R(f, g, h, a, b, c, d, e, K[67], L(67)); - I(0); I(1); I(2); I(3); - R(e, f, 
g, h, a, b, c, d, K[68], L(68)); - R(d, e, f, g, h, a, b, c, K[69], L(69)); - R(c, d, e, f, g, h, a, b, K[70], L(70)); - R(b, c, d, e, f, g, h, a, K[71], L(71)); - I(4); I(5); I(6); I(7); - R(a, b, c, d, e, f, g, h, K[72], L(72)); - R(h, a, b, c, d, e, f, g, K[73], L(73)); - R(g, h, a, b, c, d, e, f, K[74], L(74)); - R(f, g, h, a, b, c, d, e, K[75], L(75)); - I(8); I(9); I(10); I(11); - R(e, f, g, h, a, b, c, d, K[76], L(76)); - R(d, e, f, g, h, a, b, c, K[77], L(77)); - R(c, d, e, f, g, h, a, b, K[78], L(78)); - R(b, c, d, e, f, g, h, a, K[79], L(79)); - I(12); I(13); I(14); I(15); - data += 128; + I2(0); I2(1); I2(2); I2(3); + I2(4); I2(5); I2(6); I2(7); + I2(8); I2(9); I2(10); I2(11); + I2(12); I2(13); I2(14); I2(15); + + R(a, b, c, d, e, f, g, h, 0, W2(0)); + R(h, a, b, c, d, e, f, g, 1, W2(1)); + R(g, h, a, b, c, d, e, f, 2, W2(2)); + R(f, g, h, a, b, c, d, e, 3, W2(3)); + R(e, f, g, h, a, b, c, d, 4, W2(4)); + R(d, e, f, g, h, a, b, c, 5, W2(5)); + R(c, d, e, f, g, h, a, b, 6, W2(6)); + R(b, c, d, e, f, g, h, a, 7, W2(7)); + + R(a, b, c, d, e, f, g, h, 8, W2(8)); + R(h, a, b, c, d, e, f, g, 9, W2(9)); + R(g, h, a, b, c, d, e, f, 10, W2(10)); + R(f, g, h, a, b, c, d, e, 11, W2(11)); + R(e, f, g, h, a, b, c, d, 12, W2(12)); + R(d, e, f, g, h, a, b, c, 13, W2(13)); + R(c, d, e, f, g, h, a, b, 14, W2(14)); + R(b, c, d, e, f, g, h, a, 15, W2(15)); + + R(a, b, c, d, e, f, g, h, 16, W2(16)); + R(h, a, b, c, d, e, f, g, 17, W2(17)); + R(g, h, a, b, c, d, e, f, 18, W2(18)); + R(f, g, h, a, b, c, d, e, 19, W2(19)); + R(e, f, g, h, a, b, c, d, 20, W2(20)); + R(d, e, f, g, h, a, b, c, 21, W2(21)); + R(c, d, e, f, g, h, a, b, 22, W2(22)); + R(b, c, d, e, f, g, h, a, 23, W2(23)); + R(a, b, c, d, e, f, g, h, 24, W2(24)); + R(h, a, b, c, d, e, f, g, 25, W2(25)); + R(g, h, a, b, c, d, e, f, 26, W2(26)); + R(f, g, h, a, b, c, d, e, 27, W2(27)); + R(e, f, g, h, a, b, c, d, 28, W2(28)); + R(d, e, f, g, h, a, b, c, 29, W2(29)); + R(c, d, e, f, g, h, a, b, 30, W2(30)); + R(b, c, d, e, f, g, h, a, 31, W2(31)); + + R(a, b, c, d, e, f, g, h, 32, W2(32)); + R(h, a, b, c, d, e, f, g, 33, W2(33)); + R(g, h, a, b, c, d, e, f, 34, W2(34)); + R(f, g, h, a, b, c, d, e, 35, W2(35)); + R(e, f, g, h, a, b, c, d, 36, W2(36)); + R(d, e, f, g, h, a, b, c, 37, W2(37)); + R(c, d, e, f, g, h, a, b, 38, W2(38)); + R(b, c, d, e, f, g, h, a, 39, W2(39)); + R(a, b, c, d, e, f, g, h, 40, W2(40)); + R(h, a, b, c, d, e, f, g, 41, W2(41)); + R(g, h, a, b, c, d, e, f, 42, W2(42)); + R(f, g, h, a, b, c, d, e, 43, W2(43)); + R(e, f, g, h, a, b, c, d, 44, W2(44)); + R(d, e, f, g, h, a, b, c, 45, W2(45)); + R(c, d, e, f, g, h, a, b, 46, W2(46)); + R(b, c, d, e, f, g, h, a, 47, W2(47)); + + R(a, b, c, d, e, f, g, h, 48, W2(48)); + R(h, a, b, c, d, e, f, g, 49, W2(49)); + R(g, h, a, b, c, d, e, f, 50, W2(50)); + R(f, g, h, a, b, c, d, e, 51, W2(51)); + R(e, f, g, h, a, b, c, d, 52, W2(52)); + R(d, e, f, g, h, a, b, c, 53, W2(53)); + R(c, d, e, f, g, h, a, b, 54, W2(54)); + R(b, c, d, e, f, g, h, a, 55, W2(55)); + R(a, b, c, d, e, f, g, h, 56, W2(56)); + R(h, a, b, c, d, e, f, g, 57, W2(57)); + R(g, h, a, b, c, d, e, f, 58, W2(58)); + R(f, g, h, a, b, c, d, e, 59, W2(59)); + R(e, f, g, h, a, b, c, d, 60, W2(60)); + R(d, e, f, g, h, a, b, c, 61, W2(61)); + R(c, d, e, f, g, h, a, b, 62, W2(62)); + R(b, c, d, e, f, g, h, a, 63, W2(63)); + + R(a, b, c, d, e, f, g, h, 64, L2(64)); + R(h, a, b, c, d, e, f, g, 65, L2(65)); + R(g, h, a, b, c, d, e, f, 66, L2(66)); + R(f, g, h, a, b, c, d, e, 67, L2(67)); + R(e, f, g, h, a, b, c, d, 68, L2(68)); + 
R(d, e, f, g, h, a, b, c, 69, L2(69)); + R(c, d, e, f, g, h, a, b, 70, L2(70)); + R(b, c, d, e, f, g, h, a, 71, L2(71)); + R(a, b, c, d, e, f, g, h, 72, L2(72)); + R(h, a, b, c, d, e, f, g, 73, L2(73)); + R(g, h, a, b, c, d, e, f, 74, L2(74)); + R(f, g, h, a, b, c, d, e, 75, L2(75)); + R(e, f, g, h, a, b, c, d, 76, L2(76)); + R(d, e, f, g, h, a, b, c, 77, L2(77)); + R(c, d, e, f, g, h, a, b, 78, L2(78)); + R(b, c, d, e, f, g, h, a, 79, L2(79)); h0 += a; h1 += b; @@ -339,6 +472,9 @@ _gcry_sha512_transform_ppc8(u64 state[8], h5 += f; h6 += g; h7 += h; + + kptr = K; + a = h0; b = h1; c = h2; @@ -348,90 +484,91 @@ _gcry_sha512_transform_ppc8(u64 state[8], g = h6; h = h7; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - R(a, b, c, d, e, f, g, h, K[64], L(64)); - R(h, a, b, c, d, e, f, g, K[65], L(65)); - R(g, h, a, b, c, d, e, f, K[66], L(66)); - R(f, g, 
h, a, b, c, d, e, K[67], L(67)); - R(e, f, g, h, a, b, c, d, K[68], L(68)); - R(d, e, f, g, h, a, b, c, K[69], L(69)); - R(c, d, e, f, g, h, a, b, K[70], L(70)); - R(b, c, d, e, f, g, h, a, K[71], L(71)); - R(a, b, c, d, e, f, g, h, K[72], L(72)); - R(h, a, b, c, d, e, f, g, K[73], L(73)); - R(g, h, a, b, c, d, e, f, K[74], L(74)); - R(f, g, h, a, b, c, d, e, K[75], L(75)); - R(e, f, g, h, a, b, c, d, K[76], L(76)); - R(d, e, f, g, h, a, b, c, K[77], L(77)); - R(c, d, e, f, g, h, a, b, K[78], L(78)); - R(b, c, d, e, f, g, h, a, K[79], L(79)); + R(a, b, c, d, e, f, g, h, 0, WL(0)); + R(h, a, b, c, d, e, f, g, 1, WL(1)); + R(g, h, a, b, c, d, e, f, 2, WL(2)); + R(f, g, h, a, b, c, d, e, 3, WL(3)); + R(e, f, g, h, a, b, c, d, 4, WL(4)); + R(d, e, f, g, h, a, b, c, 5, WL(5)); + R(c, d, e, f, g, h, a, b, 6, WL(6)); + R(b, c, d, e, f, g, h, a, 7, WL(7)); + + R(a, b, c, d, e, f, g, h, 8, WL(8)); + R(h, a, b, c, d, e, f, g, 9, WL(9)); + R(g, h, a, b, c, d, e, f, 10, WL(10)); + R(f, g, h, a, b, c, d, e, 11, WL(11)); + R(e, f, g, h, a, b, c, d, 12, WL(12)); + R(d, e, f, g, h, a, b, c, 13, WL(13)); + R(c, d, e, f, g, h, a, b, 14, WL(14)); + R(b, c, d, e, f, g, h, a, 15, WL(15)); + + R(a, b, c, d, e, f, g, h, 16, WL(16)); + R(h, a, b, c, d, e, f, g, 17, WL(17)); + R(g, h, a, b, c, d, e, f, 18, WL(18)); + R(f, g, h, a, b, c, d, e, 19, WL(19)); + R(e, f, g, h, a, b, c, d, 20, WL(20)); + R(d, e, f, g, h, a, b, c, 21, WL(21)); + R(c, d, e, f, g, h, a, b, 22, WL(22)); + R(b, c, d, e, f, g, h, a, 23, WL(23)); + R(a, b, c, d, e, f, g, h, 24, WL(24)); + R(h, a, b, c, d, e, f, g, 25, WL(25)); + R(g, h, a, b, c, d, e, f, 26, WL(26)); + R(f, g, h, a, b, c, d, e, 27, WL(27)); + R(e, f, g, h, a, b, c, d, 28, WL(28)); + R(d, e, f, g, h, a, b, c, 29, WL(29)); + R(c, d, e, f, g, h, a, b, 30, WL(30)); + R(b, c, d, e, f, g, h, a, 31, WL(31)); + + R(a, b, c, d, e, f, g, h, 32, WL(32)); + R(h, a, b, c, d, e, f, g, 33, WL(33)); + R(g, h, a, b, c, d, e, f, 34, WL(34)); + R(f, g, h, a, b, c, d, e, 35, WL(35)); + R(e, f, g, h, a, b, c, d, 36, WL(36)); + R(d, e, f, g, h, a, b, c, 37, WL(37)); + R(c, d, e, f, g, h, a, b, 38, WL(38)); + R(b, c, d, e, f, g, h, a, 39, WL(39)); + R(a, b, c, d, e, f, g, h, 40, WL(40)); + R(h, a, b, c, d, e, f, g, 41, WL(41)); + R(g, h, a, b, c, d, e, f, 42, WL(42)); + R(f, g, h, a, b, c, d, e, 43, WL(43)); + R(e, f, g, h, a, b, c, d, 44, WL(44)); + R(d, e, f, g, h, a, b, c, 45, WL(45)); + R(c, d, e, f, g, h, a, b, 46, WL(46)); + R(b, c, d, e, f, g, h, a, 47, WL(47)); + + R(a, b, c, d, e, f, g, h, 48, WL(48)); + R(h, a, b, c, d, e, f, g, 49, WL(49)); + R(g, h, a, b, c, d, e, f, 50, WL(50)); + R(f, g, h, a, b, c, d, e, 51, WL(51)); + R(e, f, g, h, a, b, c, d, 52, WL(52)); + R(d, e, f, g, h, a, b, c, 53, WL(53)); + R(c, d, e, f, g, h, a, b, 54, WL(54)); + R(b, c, d, e, f, g, h, a, 55, WL(55)); + R(a, b, c, d, e, f, g, h, 56, WL(56)); + R(h, a, b, c, d, e, f, g, 57, WL(57)); + R(g, h, a, b, c, d, e, f, 58, WL(58)); + R(f, g, h, a, b, c, d, e, 59, WL(59)); + R(e, f, g, h, a, b, c, d, 60, WL(60)); + R(d, e, f, g, h, a, b, c, 61, WL(61)); + R(c, d, e, f, g, h, a, b, 62, WL(62)); + R(b, c, d, e, f, g, h, a, 63, WL(63)); + + R(a, b, c, d, e, f, g, h, 64, WL(64)); + R(h, a, b, c, d, e, f, g, 65, WL(65)); + R(g, h, a, b, c, d, e, f, 66, WL(66)); + R(f, g, h, a, b, c, d, e, 67, WL(67)); + R(e, f, g, h, a, b, c, d, 68, WL(68)); + R(d, e, f, g, h, a, b, c, 69, WL(69)); + R(c, d, e, f, g, h, a, b, 70, WL(70)); + R(b, c, d, e, f, g, h, a, 71, WL(71)); + R(a, b, c, d, e, f, g, h, 72, WL(72)); + R(h, a, b, c, d, e, 
f, g, 73, WL(73)); + R(g, h, a, b, c, d, e, f, 74, WL(74)); + R(f, g, h, a, b, c, d, e, 75, WL(75)); + R(e, f, g, h, a, b, c, d, 76, WL(76)); + R(d, e, f, g, h, a, b, c, 77, WL(77)); + R(c, d, e, f, g, h, a, b, 78, WL(78)); + R(b, c, d, e, f, g, h, a, 79, WL(79)); h0 += a; h1 += b; @@ -445,8 +582,11 @@ _gcry_sha512_transform_ppc8(u64 state[8], nblks -= 2; } - while (nblks) + if (nblks) { + const vector2x_u64 *kptr = K; + vector2x_u64 ktmp; + a = h0; b = h1; c = h2; @@ -460,91 +600,92 @@ _gcry_sha512_transform_ppc8(u64 state[8], I(4); I(5); I(6); I(7); I(8); I(9); I(10); I(11); I(12); I(13); I(14); I(15); - data += 128; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - R(a, b, c, d, e, f, g, h, K[64], L(64)); - R(h, a, b, c, d, e, f, g, K[65], L(65)); - R(g, h, a, b, c, d, e, f, K[66], L(66)); - R(f, g, h, a, b, c, d, e, K[67], L(67)); - R(e, f, g, h, 
a, b, c, d, K[68], L(68)); - R(d, e, f, g, h, a, b, c, K[69], L(69)); - R(c, d, e, f, g, h, a, b, K[70], L(70)); - R(b, c, d, e, f, g, h, a, K[71], L(71)); - R(a, b, c, d, e, f, g, h, K[72], L(72)); - R(h, a, b, c, d, e, f, g, K[73], L(73)); - R(g, h, a, b, c, d, e, f, K[74], L(74)); - R(f, g, h, a, b, c, d, e, K[75], L(75)); - R(e, f, g, h, a, b, c, d, K[76], L(76)); - R(d, e, f, g, h, a, b, c, K[77], L(77)); - R(c, d, e, f, g, h, a, b, K[78], L(78)); - R(b, c, d, e, f, g, h, a, K[79], L(79)); + + R(a, b, c, d, e, f, g, h, 0, W(0)); + R(h, a, b, c, d, e, f, g, 1, W(1)); + R(g, h, a, b, c, d, e, f, 2, W(2)); + R(f, g, h, a, b, c, d, e, 3, W(3)); + R(e, f, g, h, a, b, c, d, 4, W(4)); + R(d, e, f, g, h, a, b, c, 5, W(5)); + R(c, d, e, f, g, h, a, b, 6, W(6)); + R(b, c, d, e, f, g, h, a, 7, W(7)); + + R(a, b, c, d, e, f, g, h, 8, W(8)); + R(h, a, b, c, d, e, f, g, 9, W(9)); + R(g, h, a, b, c, d, e, f, 10, W(10)); + R(f, g, h, a, b, c, d, e, 11, W(11)); + R(e, f, g, h, a, b, c, d, 12, W(12)); + R(d, e, f, g, h, a, b, c, 13, W(13)); + R(c, d, e, f, g, h, a, b, 14, W(14)); + R(b, c, d, e, f, g, h, a, 15, W(15)); + + R(a, b, c, d, e, f, g, h, 16, W(16)); + R(h, a, b, c, d, e, f, g, 17, W(17)); + R(g, h, a, b, c, d, e, f, 18, W(18)); + R(f, g, h, a, b, c, d, e, 19, W(19)); + R(e, f, g, h, a, b, c, d, 20, W(20)); + R(d, e, f, g, h, a, b, c, 21, W(21)); + R(c, d, e, f, g, h, a, b, 22, W(22)); + R(b, c, d, e, f, g, h, a, 23, W(23)); + R(a, b, c, d, e, f, g, h, 24, W(24)); + R(h, a, b, c, d, e, f, g, 25, W(25)); + R(g, h, a, b, c, d, e, f, 26, W(26)); + R(f, g, h, a, b, c, d, e, 27, W(27)); + R(e, f, g, h, a, b, c, d, 28, W(28)); + R(d, e, f, g, h, a, b, c, 29, W(29)); + R(c, d, e, f, g, h, a, b, 30, W(30)); + R(b, c, d, e, f, g, h, a, 31, W(31)); + + R(a, b, c, d, e, f, g, h, 32, W(32)); + R(h, a, b, c, d, e, f, g, 33, W(33)); + R(g, h, a, b, c, d, e, f, 34, W(34)); + R(f, g, h, a, b, c, d, e, 35, W(35)); + R(e, f, g, h, a, b, c, d, 36, W(36)); + R(d, e, f, g, h, a, b, c, 37, W(37)); + R(c, d, e, f, g, h, a, b, 38, W(38)); + R(b, c, d, e, f, g, h, a, 39, W(39)); + R(a, b, c, d, e, f, g, h, 40, W(40)); + R(h, a, b, c, d, e, f, g, 41, W(41)); + R(g, h, a, b, c, d, e, f, 42, W(42)); + R(f, g, h, a, b, c, d, e, 43, W(43)); + R(e, f, g, h, a, b, c, d, 44, W(44)); + R(d, e, f, g, h, a, b, c, 45, W(45)); + R(c, d, e, f, g, h, a, b, 46, W(46)); + R(b, c, d, e, f, g, h, a, 47, W(47)); + + R(a, b, c, d, e, f, g, h, 48, W(48)); + R(h, a, b, c, d, e, f, g, 49, W(49)); + R(g, h, a, b, c, d, e, f, 50, W(50)); + R(f, g, h, a, b, c, d, e, 51, W(51)); + R(e, f, g, h, a, b, c, d, 52, W(52)); + R(d, e, f, g, h, a, b, c, 53, W(53)); + R(c, d, e, f, g, h, a, b, 54, W(54)); + R(b, c, d, e, f, g, h, a, 55, W(55)); + R(a, b, c, d, e, f, g, h, 56, W(56)); + R(h, a, b, c, d, e, f, g, 57, W(57)); + R(g, h, a, b, c, d, e, f, 58, W(58)); + R(f, g, h, a, b, c, d, e, 59, W(59)); + R(e, f, g, h, a, b, c, d, 60, W(60)); + R(d, e, f, g, h, a, b, c, 61, W(61)); + R(c, d, e, f, g, h, a, b, 62, W(62)); + R(b, c, d, e, f, g, h, a, 63, W(63)); + + R(a, b, c, d, e, f, g, h, 64, L(64)); + R(h, a, b, c, d, e, f, g, 65, L(65)); + R(g, h, a, b, c, d, e, f, 66, L(66)); + R(f, g, h, a, b, c, d, e, 67, L(67)); + R(e, f, g, h, a, b, c, d, 68, L(68)); + R(d, e, f, g, h, a, b, c, 69, L(69)); + R(c, d, e, f, g, h, a, b, 70, L(70)); + R(b, c, d, e, f, g, h, a, 71, L(71)); + R(a, b, c, d, e, f, g, h, 72, L(72)); + R(h, a, b, c, d, e, f, g, 73, L(73)); + R(g, h, a, b, c, d, e, f, 74, L(74)); + R(f, g, h, a, b, c, d, e, 75, L(75)); + R(e, f, g, h, a, b, 
c, d, 76, L(76)); + R(d, e, f, g, h, a, b, c, 77, L(77)); + R(c, d, e, f, g, h, a, b, 78, L(78)); + R(b, c, d, e, f, g, h, a, 79, L(79)); h0 += a; h1 += b; @@ -567,403 +708,21 @@ _gcry_sha512_transform_ppc8(u64 state[8], vec_u64_store (h4, 8 * 4, (unsigned long long *)state); vec_u64_store (h6, 8 * 6, (unsigned long long *)state); - return sizeof(w); + return sizeof(w) + sizeof(w2); } -#undef R -#undef Cho -#undef Maj -#undef Sum0 -#undef Sum1 -#undef S0 -#undef S1 -#undef I -#undef W -#undef I2 -#undef W2 -#undef R2 - - -/* SHA2 round in general purpose registers */ -#define R(a,b,c,d,e,f,g,h,k,w) do \ - { \ - t1 = (h) + Sum1((e)) + Cho((e),(f),(g)) + ((k) + (w));\ - t2 = Sum0((a)) + Maj((a),(b),(c)); \ - d += t1; \ - h = t1 + t2; \ - } while (0) - -#define Cho(x, y, z) ((x & y) + (~x & z)) - -#define Maj(z, x, y) ((x & y) + (z & (x ^ y))) - -#define Sum0(x) (ror64(x, 28) ^ ror64(x ^ ror64(x, 39-34), 34)) - -#define Sum1(x) (ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41)) - - -/* Message expansion on general purpose registers */ -#define S0(x) (ror64 ((x), 1) ^ ror64 ((x), 8) ^ ((x) >> 7)) -#define S1(x) (ror64 ((x), 19) ^ ror64 ((x), 61) ^ ((x) >> 6)) - -#define I(i) ( w[i] = buf_get_be64(data + i * 8) ) -#define WN(i) ({ w[i&0x0f] += w[(i-7) &0x0f]; \ - w[i&0x0f] += S0(w[(i-15)&0x0f]); \ - w[i&0x0f] += S1(w[(i-2) &0x0f]); \ - w[i&0x0f]; }) -#define W(i) ({ u64 r = w[i&0x0f]; WN(i); r; }) -#define L(i) w[i&0x0f] +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_sha512_transform_ppc8(u64 state[8], const unsigned char *data, + size_t nblks) +{ + return sha512_transform_ppc(state, data, nblks); +} -unsigned int ASM_FUNC_ATTR +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 _gcry_sha512_transform_ppc9(u64 state[8], const unsigned char *data, size_t nblks) { - /* GPRs used for round function and message expansion as vector intrinsics - * based generates slower code for POWER9. 
*/ - u64 a, b, c, d, e, f, g, h, t1, t2; - u64 w[16]; - - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - f = state[5]; - g = state[6]; - h = state[7]; - - while (nblks >= 2) - { - I(0); I(1); I(2); I(3); - I(4); I(5); I(6); I(7); - I(8); I(9); I(10); I(11); - I(12); I(13); I(14); I(15); - data += 128; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - R(a, b, c, d, e, f, g, h, K[64], L(64)); - R(h, a, b, c, d, e, f, g, K[65], L(65)); - R(g, h, a, b, c, d, e, f, K[66], L(66)); - R(f, g, h, a, b, c, d, e, K[67], L(67)); - I(0); I(1); I(2); I(3); - R(e, f, g, h, a, b, c, d, K[68], L(68)); - R(d, e, f, g, h, a, b, c, K[69], L(69)); - R(c, d, e, f, g, h, a, b, K[70], L(70)); - R(b, c, d, e, f, g, h, a, K[71], L(71)); - I(4); I(5); I(6); I(7); - R(a, b, c, d, e, f, g, h, K[72], L(72)); - R(h, a, b, c, d, e, f, g, K[73], 
L(73)); - R(g, h, a, b, c, d, e, f, K[74], L(74)); - R(f, g, h, a, b, c, d, e, K[75], L(75)); - I(8); I(9); I(10); I(11); - R(e, f, g, h, a, b, c, d, K[76], L(76)); - R(d, e, f, g, h, a, b, c, K[77], L(77)); - R(c, d, e, f, g, h, a, b, K[78], L(78)); - R(b, c, d, e, f, g, h, a, K[79], L(79)); - I(12); I(13); I(14); I(15); - data += 128; - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - R(a, b, c, d, e, f, g, h, K[64], L(64)); - R(h, a, b, c, d, e, f, g, K[65], L(65)); - R(g, h, a, b, c, d, e, f, K[66], L(66)); - R(f, g, h, a, b, c, d, e, K[67], L(67)); - R(e, f, g, h, a, b, c, d, K[68], 
L(68)); - R(d, e, f, g, h, a, b, c, K[69], L(69)); - R(c, d, e, f, g, h, a, b, K[70], L(70)); - R(b, c, d, e, f, g, h, a, K[71], L(71)); - R(a, b, c, d, e, f, g, h, K[72], L(72)); - R(h, a, b, c, d, e, f, g, K[73], L(73)); - R(g, h, a, b, c, d, e, f, K[74], L(74)); - R(f, g, h, a, b, c, d, e, K[75], L(75)); - R(e, f, g, h, a, b, c, d, K[76], L(76)); - R(d, e, f, g, h, a, b, c, K[77], L(77)); - R(c, d, e, f, g, h, a, b, K[78], L(78)); - R(b, c, d, e, f, g, h, a, K[79], L(79)); - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - nblks -= 2; - } - - while (nblks) - { - I(0); I(1); I(2); I(3); - I(4); I(5); I(6); I(7); - I(8); I(9); I(10); I(11); - I(12); I(13); I(14); I(15); - data += 128; - R(a, b, c, d, e, f, g, h, K[0], W(0)); - R(h, a, b, c, d, e, f, g, K[1], W(1)); - R(g, h, a, b, c, d, e, f, K[2], W(2)); - R(f, g, h, a, b, c, d, e, K[3], W(3)); - R(e, f, g, h, a, b, c, d, K[4], W(4)); - R(d, e, f, g, h, a, b, c, K[5], W(5)); - R(c, d, e, f, g, h, a, b, K[6], W(6)); - R(b, c, d, e, f, g, h, a, K[7], W(7)); - R(a, b, c, d, e, f, g, h, K[8], W(8)); - R(h, a, b, c, d, e, f, g, K[9], W(9)); - R(g, h, a, b, c, d, e, f, K[10], W(10)); - R(f, g, h, a, b, c, d, e, K[11], W(11)); - R(e, f, g, h, a, b, c, d, K[12], W(12)); - R(d, e, f, g, h, a, b, c, K[13], W(13)); - R(c, d, e, f, g, h, a, b, K[14], W(14)); - R(b, c, d, e, f, g, h, a, K[15], W(15)); - - R(a, b, c, d, e, f, g, h, K[16], W(16)); - R(h, a, b, c, d, e, f, g, K[17], W(17)); - R(g, h, a, b, c, d, e, f, K[18], W(18)); - R(f, g, h, a, b, c, d, e, K[19], W(19)); - R(e, f, g, h, a, b, c, d, K[20], W(20)); - R(d, e, f, g, h, a, b, c, K[21], W(21)); - R(c, d, e, f, g, h, a, b, K[22], W(22)); - R(b, c, d, e, f, g, h, a, K[23], W(23)); - R(a, b, c, d, e, f, g, h, K[24], W(24)); - R(h, a, b, c, d, e, f, g, K[25], W(25)); - R(g, h, a, b, c, d, e, f, K[26], W(26)); - R(f, g, h, a, b, c, d, e, K[27], W(27)); - R(e, f, g, h, a, b, c, d, K[28], W(28)); - R(d, e, f, g, h, a, b, c, K[29], W(29)); - R(c, d, e, f, g, h, a, b, K[30], W(30)); - R(b, c, d, e, f, g, h, a, K[31], W(31)); - - R(a, b, c, d, e, f, g, h, K[32], W(32)); - R(h, a, b, c, d, e, f, g, K[33], W(33)); - R(g, h, a, b, c, d, e, f, K[34], W(34)); - R(f, g, h, a, b, c, d, e, K[35], W(35)); - R(e, f, g, h, a, b, c, d, K[36], W(36)); - R(d, e, f, g, h, a, b, c, K[37], W(37)); - R(c, d, e, f, g, h, a, b, K[38], W(38)); - R(b, c, d, e, f, g, h, a, K[39], W(39)); - R(a, b, c, d, e, f, g, h, K[40], W(40)); - R(h, a, b, c, d, e, f, g, K[41], W(41)); - R(g, h, a, b, c, d, e, f, K[42], W(42)); - R(f, g, h, a, b, c, d, e, K[43], W(43)); - R(e, f, g, h, a, b, c, d, K[44], W(44)); - R(d, e, f, g, h, a, b, c, K[45], W(45)); - R(c, d, e, f, g, h, a, b, K[46], W(46)); - R(b, c, d, e, f, g, h, a, K[47], W(47)); - - R(a, b, c, d, e, f, g, h, K[48], W(48)); - R(h, a, b, c, d, e, f, g, K[49], W(49)); - R(g, h, a, b, c, d, e, f, K[50], W(50)); - R(f, g, h, a, b, c, d, e, K[51], W(51)); - R(e, f, g, h, a, b, c, d, K[52], W(52)); - R(d, e, f, g, h, a, b, c, K[53], W(53)); - R(c, d, e, f, g, h, a, b, K[54], W(54)); - R(b, c, d, e, f, g, h, a, K[55], W(55)); - R(a, b, c, d, e, f, g, h, K[56], W(56)); - R(h, a, b, c, d, e, f, g, K[57], W(57)); - R(g, h, a, b, c, d, e, f, K[58], W(58)); - R(f, g, h, a, b, c, d, e, K[59], W(59)); - R(e, f, g, h, a, b, c, d, K[60], W(60)); - R(d, e, f, g, h, a, b, c, 
K[61], W(61)); - R(c, d, e, f, g, h, a, b, K[62], W(62)); - R(b, c, d, e, f, g, h, a, K[63], W(63)); - - R(a, b, c, d, e, f, g, h, K[64], L(64)); - R(h, a, b, c, d, e, f, g, K[65], L(65)); - R(g, h, a, b, c, d, e, f, K[66], L(66)); - R(f, g, h, a, b, c, d, e, K[67], L(67)); - R(e, f, g, h, a, b, c, d, K[68], L(68)); - R(d, e, f, g, h, a, b, c, K[69], L(69)); - R(c, d, e, f, g, h, a, b, K[70], L(70)); - R(b, c, d, e, f, g, h, a, K[71], L(71)); - R(a, b, c, d, e, f, g, h, K[72], L(72)); - R(h, a, b, c, d, e, f, g, K[73], L(73)); - R(g, h, a, b, c, d, e, f, K[74], L(74)); - R(f, g, h, a, b, c, d, e, K[75], L(75)); - R(e, f, g, h, a, b, c, d, K[76], L(76)); - R(d, e, f, g, h, a, b, c, K[77], L(77)); - R(c, d, e, f, g, h, a, b, K[78], L(78)); - R(b, c, d, e, f, g, h, a, K[79], L(79)); - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - nblks--; - } - - return sizeof(w); + return sha512_transform_ppc(state, data, nblks); } #endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/configure.ac b/configure.ac index 75622e50..63f705ea 100644 --- a/configure.ac +++ b/configure.ac @@ -1363,6 +1363,21 @@ _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -Werror" +# +# Check whether compiler supports 'optimize' function attribute +# +AC_CACHE_CHECK([whether compiler supports 'optimize' function attribute], + [gcry_cv_gcc_attribute_optimize], + [gcry_cv_gcc_attribute_optimize=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[int __attribute__ ((optimize("-O2"))) fn(int i){return i;}]])], + [gcry_cv_gcc_attribute_optimize=yes])]) +if test "$gcry_cv_gcc_attribute_optimize" = "yes" ; then + AC_DEFINE(HAVE_GCC_ATTRIBUTE_OPTIMIZE,1, + [Defined if compiler supports "__attribute__ ((optimize))" function attribute]) +fi + + # # Check whether compiler supports 'ms_abi' function attribute. # @@ -2254,6 +2269,28 @@ if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then fi +# +# Check whether compiler supports GCC PowerPC target attributes +# +AC_CACHE_CHECK([whether compiler supports GCC PowerPC target attributes], + [gcry_cv_gcc_attribute_ppc_target], + [if test "$mpi_cpu_arch" != "ppc" ; then + gcry_cv_gcc_attribute_ppc_target="n/a" + else + gcry_cv_gcc_attribute_ppc_target=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[void __attribute__((target("cpu=power8"))) testfn8(void) {} + void __attribute__((target("cpu=power9"))) testfn9(void) + { testfn8(); } + ]], [ testfn9(); ])], + [gcry_cv_gcc_attribute_ppc_target=yes]) + fi]) +if test "$gcry_cv_gcc_attribute_ppc_target" = "yes" ; then + AC_DEFINE(HAVE_GCC_ATTRIBUTE_PPC_TARGET,1, + [Defined if compiler supports GCC PowerPC target attributes]) +fi + + # # Check whether GCC inline assembler supports zSeries instructions # -- 2.37.2 From jussi.kivilinna at iki.fi Tue Feb 28 15:15:22 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 28 Feb 2023 16:15:22 +0200 Subject: [PATCH 2/2] camellia: add AArch64 crypto-extension implementation In-Reply-To: <20230228141522.143025-1-jussi.kivilinna@iki.fi> References: <20230228141522.143025-1-jussi.kivilinna@iki.fi> Message-ID: <20230228141522.143025-2-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'camellia-aarch64-ce.(c|o|lo)'. (aarch64_neon_cflags): New. * cipher/camellia-aarch64-ce.c: New. * cipher/camellia-glue.c (USE_AARCH64_CE): New. (CAMELLIA_context): Add 'use_aarch64ce'. 
(_gcry_camellia_aarch64ce_encrypt_blk16) (_gcry_camellia_aarch64ce_decrypt_blk16) (_gcry_camellia_aarch64ce_keygen, camellia_aarch64ce_enc_blk16) (camellia_aarch64ce_dec_blk16, aarch64ce_burn_stack_depth): New. (camellia_setkey) [USE_AARCH64_CE]: Set use_aarch64ce if HW has HWF_ARM_AES; Use AArch64/CE key generation if supported by HW. (camellia_encrypt_blk1_32, camellia_decrypt_blk1_32) [USE_AARCH64_CE]: Add AArch64/CE code path. -- Patch enables 128-bit vector instrinsics implementation of Camellia cipher for AArch64. Benchmark on AWS Graviton2: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 5.99 ns/B 159.2 MiB/s 14.97 c/B 2500 ECB dec | 5.99 ns/B 159.1 MiB/s 14.98 c/B 2500 CBC enc | 6.16 ns/B 154.7 MiB/s 15.41 c/B 2500 CBC dec | 6.12 ns/B 155.8 MiB/s 15.29 c/B 2499 CFB enc | 6.49 ns/B 147.0 MiB/s 16.21 c/B 2500 CFB dec | 6.05 ns/B 157.6 MiB/s 15.13 c/B 2500 CTR enc | 6.09 ns/B 156.7 MiB/s 15.22 c/B 2500 CTR dec | 6.09 ns/B 156.6 MiB/s 15.22 c/B 2500 XTS enc | 6.16 ns/B 154.9 MiB/s 15.39 c/B 2500 XTS dec | 6.16 ns/B 154.8 MiB/s 15.40 c/B 2499 GCM enc | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500 GCM dec | 6.31 ns/B 151.1 MiB/s 15.78 c/B 2500 GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500 OCB enc | 6.63 ns/B 143.9 MiB/s 16.57 c/B 2499 OCB dec | 6.63 ns/B 143.9 MiB/s 16.56 c/B 2499 OCB auth | 6.55 ns/B 145.7 MiB/s 16.37 c/B 2499 After (ecb ~2.1x faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.77 ns/B 344.2 MiB/s 6.93 c/B 2499 ECB dec | 2.76 ns/B 345.3 MiB/s 6.90 c/B 2499 CBC enc | 6.17 ns/B 154.7 MiB/s 15.41 c/B 2499 CBC dec | 2.89 ns/B 330.3 MiB/s 7.22 c/B 2500 CFB enc | 6.48 ns/B 147.1 MiB/s 16.21 c/B 2499 CFB dec | 2.84 ns/B 336.1 MiB/s 7.09 c/B 2499 CTR enc | 2.90 ns/B 328.8 MiB/s 7.25 c/B 2499 CTR dec | 2.90 ns/B 328.9 MiB/s 7.25 c/B 2500 XTS enc | 2.93 ns/B 325.3 MiB/s 7.33 c/B 2500 XTS dec | 2.92 ns/B 326.2 MiB/s 7.31 c/B 2500 GCM enc | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2500 GCM dec | 3.10 ns/B 307.2 MiB/s 7.76 c/B 2499 GCM auth | 0.206 ns/B 4635 MiB/s 0.514 c/B 2500 Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 14 ++++- cipher/camellia-aarch64-ce.c | 42 ++++++++++++++ cipher/camellia-glue.c | 70 +++++++++++++++++++++++ configure.ac | 106 +++++++++++++++++++++++++++++++++-- 4 files changed, 227 insertions(+), 5 deletions(-) create mode 100644 cipher/camellia-aarch64-ce.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 52435ed5..dcaa68bb 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -148,7 +148,7 @@ EXTRA_libcipher_la_SOURCES = \ camellia-aesni-avx2-amd64.h \ camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \ camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \ - camellia-arm.S camellia-aarch64.S \ + camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \ camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \ blake2.c \ blake2b-amd64-avx2.S blake2b-amd64-avx512.S \ @@ -238,6 +238,12 @@ else ppc_vcrypto_cflags = endif +if ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS +aarch64_neon_cflags = -O2 -march=armv8-a+crypto +else +aarch64_neon_cflags = +endif + rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` @@ -297,3 +303,9 @@ camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile + `echo $(COMPILE) 
$(aarch64_neon_cflags) -c $< | $(instrumentation_munging) ` + +camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile + `echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/camellia-aarch64-ce.c b/cipher/camellia-aarch64-ce.c new file mode 100644 index 00000000..76813e94 --- /dev/null +++ b/cipher/camellia-aarch64-ce.c @@ -0,0 +1,42 @@ +/* camellia-aarch64-ce.c - ARMv8/CE Camellia implementation + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \ + (__GNUC__ >= 4) + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#define SIMD128_OPT_ATTR FUNC_ATTR_OPT + +#define FUNC_ENC_BLK16 _gcry_camellia_aarch64ce_encrypt_blk16 +#define FUNC_DEC_BLK16 _gcry_camellia_aarch64ce_decrypt_blk16 +#define FUNC_KEY_SETUP _gcry_camellia_aarch64ce_keygen + +#include "camellia-simd128.h" + +#endif /* __AARCH64EL__ */ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 46bbe182..0b07f2d1 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -119,6 +119,16 @@ # define USE_PPC_CRYPTO 1 #endif +/* USE_AARCH64_CE indicates whether to enable ARMv8/CE accelerated code. 
*/ +#undef USE_AARCH64_CE +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \ + (__GNUC__ >= 4) +# define USE_AARCH64_CE 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; @@ -138,6 +148,9 @@ typedef struct unsigned int use_ppc8:1; unsigned int use_ppc9:1; #endif /*USE_PPC_CRYPTO*/ +#ifdef USE_AARCH64_CE + unsigned int use_aarch64ce:1; +#endif /*USE_AARCH64_CE*/ } CAMELLIA_context; /* Assembly implementations use SystemV ABI, ABI conversion and additional @@ -472,6 +485,36 @@ static const int ppc_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + 2 * sizeof(void *); #endif /*USE_PPC_CRYPTO*/ +#ifdef USE_AARCH64_CE +extern void _gcry_camellia_aarch64ce_encrypt_blk16(const void *key_table, + void *out, const void *in, + int key_length); + +extern void _gcry_camellia_aarch64ce_decrypt_blk16(const void *key_table, + void *out, const void *in, + int key_length); + +extern void _gcry_camellia_aarch64ce_keygen(void *key_table, const void *vkey, + unsigned int keylen); + +void camellia_aarch64ce_enc_blk16(const CAMELLIA_context *ctx, + unsigned char *out, const unsigned char *in) +{ + _gcry_camellia_aarch64ce_encrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); +} + +void camellia_aarch64ce_dec_blk16(const CAMELLIA_context *ctx, + unsigned char *out, const unsigned char *in) +{ + _gcry_camellia_aarch64ce_decrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); +} + +static const int aarch64ce_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + + 2 * sizeof(void *); +#endif /*USE_AARCH64_CE*/ + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, @@ -549,6 +592,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, ctx->use_ppc9 = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00); ctx->use_ppc = ctx->use_ppc8 || ctx->use_ppc9; #endif +#ifdef USE_AARCH64_CE + ctx->use_aarch64ce = (hwf & HWF_ARM_AES) != 0; +#endif ctx->keybitlength=keylen*8; @@ -574,6 +620,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, _gcry_camellia_ppc9_keygen(ctx->keytable, key, keylen); else if (ctx->use_ppc8) _gcry_camellia_ppc8_keygen(ctx->keytable, key, keylen); +#endif +#ifdef USE_AARCH64_CE + else if (ctx->use_aarch64ce) + _gcry_camellia_aarch64ce_keygen(ctx->keytable, key, keylen); #endif else { @@ -754,6 +804,16 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, num_blks -= 16; } #endif +#ifdef USE_AARCH64_CE + while (ctx->use_aarch64ce && num_blks >= 16) + { + camellia_aarch64ce_enc_blk16 (ctx, outbuf, inbuf); + stack_burn_size = aarch64ce_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { @@ -855,6 +915,16 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, num_blks -= 16; } #endif +#ifdef USE_AARCH64_CE + while (ctx->use_aarch64ce && num_blks >= 16) + { + camellia_aarch64ce_dec_blk16 (ctx, outbuf, inbuf); + stack_burn_size = aarch64ce_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { diff --git a/configure.ac b/configure.ac index a40a8135..0d5c9160 100644 --- a/configure.ac +++ b/configure.ac @@ -2136,7 +2136,103 @@ fi # -# Check whether PowerPC AltiVec/VSX intrinsics +# Check whether compiler supports AArch64/NEON/crypto intrinsics +# 
+AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics], + [gcry_cv_cc_aarch64_neon_intrinsics], + [if test "$mpi_cpu_arch" != "aarch64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_cc_aarch64_neon_intrinsics="n/a" + else + gcry_cv_cc_aarch64_neon_intrinsics=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[#include + #define __m128i uint64x2_t + #define vpsrldq128(s, a, o) \ + ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15); }) + #define vaesenclast128(a, b, o) \ + (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) + #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + vpsrldq128(2, a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + vaesenclast128(in, x, in); + memory_barrier_with_vec(in); + return in; + } + ]])], + [gcry_cv_cc_aarch64_neon_intrinsics=yes]) + fi]) +if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1, + [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics]) +fi + +_gcc_cflags_save=$CFLAGS +CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto" + +if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" && + test "$mpi_cpu_arch" = "aarch64" && + test "$try_asm_modules" = "yes" ; then + AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[#include + #define __m128i uint64x2_t + #define vpsrldq128(s, a, o) \ + ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15); }) + #define vaesenclast128(a, b, o) \ + (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) + #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + vpsrldq128(2, a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + vaesenclast128(in, x, in); + memory_barrier_with_vec(in); + return in; + } + ]])], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])]) + if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1, + [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics]) + AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS,1, + [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags]) + fi +fi + +AM_CONDITIONAL(ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS, + test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes") + +# Restore flags. 
+CFLAGS=$_gcc_cflags_save; + + +# +# Check whether compiler supports PowerPC AltiVec/VSX intrinsics # AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], @@ -2173,8 +2269,8 @@ _gcc_cflags_save=$CFLAGS CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto" if test "$gcry_cv_cc_ppc_altivec" = "no" && - test "$mpi_cpu_arch" = "ppc" && - test "$try_asm_modules" == "yes" ; then + test "$mpi_cpu_arch" = "ppc" && + test "$try_asm_modules" = "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], [gcry_cv_cc_ppc_altivec_cflags=no @@ -2193,7 +2289,8 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" && vecu32 y = vec_vsx_ld (0, (unsigned int*)0); y = vec_sld_u32 (y, y, 3); return vec_cipher_be (t, in) ^ (block)y; - }]])], + } + ]])], [gcry_cv_cc_ppc_altivec_cflags=yes])]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, @@ -2966,6 +3063,7 @@ if test "$found" = "1" ; then aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo" ;; powerpc64le-*-*) # Build with the POWER vector implementations -- 2.37.2 From jussi.kivilinna at iki.fi Tue Feb 28 15:15:21 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 28 Feb 2023 16:15:21 +0200 Subject: [PATCH 1/2] camellia: add POWER8/POWER9 vcrypto implementation Message-ID: <20230228141522.143025-1-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'camellia-simd128.h', 'camellia-ppc8le.c' and 'camellia-ppc9le.c'. * cipher/camellia-glue.c (USE_PPC_CRYPTO): New. (CAMELLIA_context) [USE_PPC_CRYPTO]: Add 'use_ppc', 'use_ppc8' and 'use_ppc9'. [USE_PPC_CRYPTO] (_gcry_camellia_ppc8_encrypt_blk16) (_gcry_camellia_ppc8_decrypt_blk16, _gcry_camellia_ppc8_keygen) (_gcry_camellia_ppc9_encrypt_blk16) (_gcry_camellia_ppc9_decrypt_blk16, _gcry_camellia_ppc9_keygen) (camellia_ppc_enc_blk16, camellia_ppc_dec_blk16) (ppc_burn_stack_depth): New. (camellia_setkey) [USE_PPC_CRYPTO]: Setup 'use_ppc', 'use_ppc8' and 'use_ppc9' and use PPC key-generation if HWF is available. (camellia_encrypt_blk1_32) (camellia_decrypt_blk1_32) [USE_PPC_CRYPTO]: Add 'use_ppc' paths. (_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Enable generic bulk path when USE_PPC_CRYPTO is defined. * cipher/camellia-ppc8le.c: New. * cipher/camellia-ppc9le.c: New. * cipher/camellia-simd128.h: New. * configure.ac: Add 'camellia-ppc8le.lo' and 'camellia-ppc9le.lo'. -- Patch adds 128-bit vector instrinsics implementation of Camellia cipher and enables implementation for POWER8 and POWER9. 
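As a quick orientation to the glue-layer changes listed above: the new camellia_ppc_enc_blk16/camellia_ppc_dec_blk16 wrappers are driven from camellia_encrypt_blk1_32/camellia_decrypt_blk1_32 in batches of 16 blocks, with the existing scalar code finishing any tail. The following stand-alone sketch only illustrates that dispatch shape; enc_blk16, enc_blk1 and encrypt_blocks are made-up stand-ins, not libgcrypt functions.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

/* Illustrative stand-ins for the real 16-way and 1-way primitives;
 * they only copy data so that the control flow can be compiled and run. */
static void enc_blk16(const void *keytab, unsigned char *out,
                      const unsigned char *in)
{
  (void)keytab;
  memcpy(out, in, 16 * BLOCK_SIZE);
}

static void enc_blk1(const void *keytab, unsigned char *out,
                     const unsigned char *in)
{
  (void)keytab;
  memcpy(out, in, BLOCK_SIZE);
}

/* Same shape as the bulk helpers: drain 16 blocks per call while
 * possible, then finish the remainder one block at a time. */
static void encrypt_blocks(const void *keytab, unsigned char *out,
                           const unsigned char *in, size_t num_blks)
{
  while (num_blks >= 16)
    {
      enc_blk16(keytab, out, in);
      out += 16 * BLOCK_SIZE;
      in += 16 * BLOCK_SIZE;
      num_blks -= 16;
    }
  while (num_blks)
    {
      enc_blk1(keytab, out, in);
      out += BLOCK_SIZE;
      in += BLOCK_SIZE;
      num_blks--;
    }
}

int main(void)
{
  unsigned char in[20 * BLOCK_SIZE] = { 0 };
  unsigned char out[20 * BLOCK_SIZE];

  /* One 16-way batch plus a 4-block scalar tail. */
  encrypt_blocks(NULL, out, in, 20);
  printf("processed 20 blocks\n");
  return 0;
}

Processing 16 blocks per call is what lets the byte-slicing transpose and the constant loads inside camellia-simd128.h be amortized over a whole batch instead of being paid per block.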
Benchmark on POWER9: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 13.45 ns/B 70.90 MiB/s 30.94 c/B ECB dec | 13.45 ns/B 70.92 MiB/s 30.93 c/B CBC enc | 15.22 ns/B 62.66 MiB/s 35.00 c/B CBC dec | 13.54 ns/B 70.41 MiB/s 31.15 c/B CFB enc | 15.24 ns/B 62.59 MiB/s 35.04 c/B CFB dec | 13.53 ns/B 70.48 MiB/s 31.12 c/B CTR enc | 13.60 ns/B 70.15 MiB/s 31.27 c/B CTR dec | 13.62 ns/B 70.02 MiB/s 31.33 c/B XTS enc | 13.67 ns/B 69.74 MiB/s 31.45 c/B XTS dec | 13.74 ns/B 69.41 MiB/s 31.60 c/B GCM enc | 18.18 ns/B 52.45 MiB/s 41.82 c/B GCM dec | 17.76 ns/B 53.69 MiB/s 40.86 c/B GCM auth | 4.12 ns/B 231.7 MiB/s 9.47 c/B OCB enc | 14.40 ns/B 66.22 MiB/s 33.12 c/B OCB dec | 14.40 ns/B 66.23 MiB/s 33.12 c/B OCB auth | 14.37 ns/B 66.37 MiB/s 33.05 c/B After (ECB ~4.1x faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 3.25 ns/B 293.7 MiB/s 7.47 c/B ECB dec | 3.25 ns/B 293.4 MiB/s 7.48 c/B CBC enc | 15.22 ns/B 62.68 MiB/s 35.00 c/B CBC dec | 3.36 ns/B 284.1 MiB/s 7.72 c/B CFB enc | 15.25 ns/B 62.55 MiB/s 35.07 c/B CFB dec | 3.36 ns/B 284.0 MiB/s 7.72 c/B CTR enc | 3.47 ns/B 275.1 MiB/s 7.97 c/B CTR dec | 3.47 ns/B 275.1 MiB/s 7.97 c/B XTS enc | 3.54 ns/B 269.0 MiB/s 8.15 c/B XTS dec | 3.54 ns/B 269.6 MiB/s 8.14 c/B GCM enc | 3.69 ns/B 258.2 MiB/s 8.49 c/B GCM dec | 3.69 ns/B 258.2 MiB/s 8.50 c/B GCM auth | 0.226 ns/B 4220 MiB/s 0.520 c/B OCB enc | 3.81 ns/B 250.2 MiB/s 8.77 c/B OCB dec | 4.08 ns/B 233.8 MiB/s 9.38 c/B OCB auth | 3.53 ns/B 270.0 MiB/s 8.12 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 13 + cipher/camellia-glue.c | 114 +- cipher/camellia-ppc8le.c | 47 + cipher/camellia-ppc9le.c | 47 + cipher/camellia-simd128.h | 2224 +++++++++++++++++++++++++++++++++++++ configure.ac | 5 + 6 files changed, 2442 insertions(+), 8 deletions(-) create mode 100644 cipher/camellia-ppc8le.c create mode 100644 cipher/camellia-ppc9le.c create mode 100644 cipher/camellia-simd128.h diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 163c1f0f..52435ed5 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -149,6 +149,7 @@ EXTRA_libcipher_la_SOURCES = \ camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \ camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \ camellia-arm.S camellia-aarch64.S \ + camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \ blake2.c \ blake2b-amd64-avx2.S blake2b-amd64-avx512.S \ blake2s-amd64-avx.S blake2s-amd64-avx512.S @@ -284,3 +285,15 @@ cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +camellia-ppc8le.o: $(srcdir)/camellia-ppc8le.c Makefile + `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +camellia-ppc8le.lo: $(srcdir)/camellia-ppc8le.c Makefile + `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile + `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` + +camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile + `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index b87faa91..46bbe182 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -109,6 +109,16 @@ # define USE_GFNI_AVX512 1 #endif +/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto + * accelerated code. 
*/ +#undef USE_PPC_CRYPTO +#if !defined(WORDS_BIGENDIAN) && defined(ENABLE_PPC_CRYPTO_SUPPORT) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \ + (SIZEOF_UNSIGNED_LONG == 8) && (__GNUC__ >= 4) +# define USE_PPC_CRYPTO 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; @@ -123,6 +133,11 @@ typedef struct unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. */ unsigned int use_gfni_avx512:1; /* GFNI/AVX512 implementation shall be used. */ #endif /*USE_AESNI_AVX2*/ +#ifdef USE_PPC_CRYPTO + unsigned int use_ppc:1; + unsigned int use_ppc8:1; + unsigned int use_ppc9:1; +#endif /*USE_PPC_CRYPTO*/ } CAMELLIA_context; /* Assembly implementations use SystemV ABI, ABI conversion and additional @@ -404,6 +419,59 @@ extern void _gcry_camellia_gfni_avx512_dec_blk64(const CAMELLIA_context *ctx, static const int avx512_burn_stack_depth = 0; #endif +#ifdef USE_PPC_CRYPTO +extern void _gcry_camellia_ppc8_encrypt_blk16(const void *key_table, + void *out, + const void *in, + int key_length); + +extern void _gcry_camellia_ppc8_decrypt_blk16(const void *key_table, + void *out, + const void *in, + int key_length); + +extern void _gcry_camellia_ppc9_encrypt_blk16(const void *key_table, + void *out, + const void *in, + int key_length); + +extern void _gcry_camellia_ppc9_decrypt_blk16(const void *key_table, + void *out, + const void *in, + int key_length); + +extern void _gcry_camellia_ppc8_keygen(void *key_table, const void *vkey, + unsigned int keylen); + +extern void _gcry_camellia_ppc9_keygen(void *key_table, const void *vkey, + unsigned int keylen); + +void camellia_ppc_enc_blk16(const CAMELLIA_context *ctx, unsigned char *out, + const unsigned char *in) +{ + if (ctx->use_ppc9) + _gcry_camellia_ppc9_encrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); + else + _gcry_camellia_ppc8_encrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); +} + +void camellia_ppc_dec_blk16(const CAMELLIA_context *ctx, unsigned char *out, + const unsigned char *in) +{ + if (ctx->use_ppc9) + _gcry_camellia_ppc9_decrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); + else + _gcry_camellia_ppc8_decrypt_blk16 (ctx->keytable, out, in, + ctx->keybitlength / 8); +} + +static const int ppc_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 + + 2 * sizeof(void *); +#endif /*USE_PPC_CRYPTO*/ + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, @@ -437,10 +505,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, CAMELLIA_context *ctx=c; static int initialized=0; static const char *selftest_failed=NULL; -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \ - || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2) unsigned int hwf = _gcry_get_hw_features (); -#endif + + (void)hwf; if(keylen!=16 && keylen!=24 && keylen!=32) return GPG_ERR_INV_KEYLEN; @@ -477,6 +544,11 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, #ifdef USE_GFNI_AVX512 ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512); #endif +#ifdef USE_PPC_CRYPTO + ctx->use_ppc8 = (hwf & HWF_PPC_VCRYPTO) != 0; + ctx->use_ppc9 = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00); + ctx->use_ppc = ctx->use_ppc8 || ctx->use_ppc9; +#endif ctx->keybitlength=keylen*8; @@ -496,8 +568,14 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, #ifdef USE_AESNI_AVX else if (ctx->use_aesni_avx) _gcry_camellia_aesni_avx_keygen(ctx, key, keylen); - else #endif +#ifdef USE_PPC_CRYPTO + else if 
(ctx->use_ppc9) + _gcry_camellia_ppc9_keygen(ctx->keytable, key, keylen); + else if (ctx->use_ppc8) + _gcry_camellia_ppc8_keygen(ctx->keytable, key, keylen); +#endif + else { Camellia_Ekeygen(ctx->keybitlength,key,ctx->keytable); _gcry_burn_stack @@ -666,6 +744,16 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, num_blks -= 16; } #endif +#ifdef USE_PPC_CRYPTO + while (ctx->use_ppc && num_blks >= 16) + { + camellia_ppc_enc_blk16 (ctx, outbuf, inbuf); + stack_burn_size = ppc_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { @@ -757,6 +845,16 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, num_blks -= 16; } #endif +#ifdef USE_PPC_CRYPTO + while (ctx->use_ppc && num_blks >= 16) + { + camellia_ppc_dec_blk16 (ctx, outbuf, inbuf); + stack_burn_size = ppc_burn_stack_depth; + outbuf += CAMELLIA_BLOCK_SIZE * 16; + inbuf += CAMELLIA_BLOCK_SIZE * 16; + num_blks -= 16; + } +#endif while (num_blks) { @@ -1251,7 +1349,7 @@ static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt) { -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) CAMELLIA_context *ctx = (void *)&c->context.c; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; @@ -1395,7 +1493,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, } #endif -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) /* Process remaining blocks. */ if (nblocks) { @@ -1428,7 +1526,7 @@ static size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) { -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) CAMELLIA_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; int burn_stack_depth = 0; @@ -1523,7 +1621,7 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, } #endif -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +#if defined(USE_PPC_CRYPTO) || defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) /* Process remaining blocks. */ if (nblocks) { diff --git a/cipher/camellia-ppc8le.c b/cipher/camellia-ppc8le.c new file mode 100644 index 00000000..3eeb91ae --- /dev/null +++ b/cipher/camellia-ppc8le.c @@ -0,0 +1,47 @@ +/* camellia-ppc8le.c - POWER8 Vector Crypto Camellia implementation + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#if !defined(WORDS_BIGENDIAN) && defined(ENABLE_PPC_CRYPTO_SUPPORT) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \ + (SIZEOF_UNSIGNED_LONG == 8) && (__GNUC__ >= 4) + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define SIMD128_OPT_ATTR __attribute__((target("arch=pwr8"))) FUNC_ATTR_OPT +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) +# define SIMD128_OPT_ATTR __attribute__((target("cpu=power8"))) FUNC_ATTR_OPT +#else +# define SIMD128_OPT_ATTR FUNC_ATTR_OPT +#endif + +#define FUNC_ENC_BLK16 _gcry_camellia_ppc8_encrypt_blk16 +#define FUNC_DEC_BLK16 _gcry_camellia_ppc8_decrypt_blk16 +#define FUNC_KEY_SETUP _gcry_camellia_ppc8_keygen + +#include "camellia-simd128.h" + +#endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/cipher/camellia-ppc9le.c b/cipher/camellia-ppc9le.c new file mode 100644 index 00000000..6d571733 --- /dev/null +++ b/cipher/camellia-ppc9le.c @@ -0,0 +1,47 @@ +/* camellia-ppc9le.c - POWER9 Vector Crypto Camellia implementation + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if !defined(WORDS_BIGENDIAN) && defined(ENABLE_PPC_CRYPTO_SUPPORT) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \ + (SIZEOF_UNSIGNED_LONG == 8) && (__GNUC__ >= 4) + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define SIMD128_OPT_ATTR __attribute__((target("arch=pwr9"))) FUNC_ATTR_OPT +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) +# define SIMD128_OPT_ATTR __attribute__((target("cpu=power9"))) FUNC_ATTR_OPT +#else +# define SIMD128_OPT_ATTR FUNC_ATTR_OPT +#endif + +#define FUNC_ENC_BLK16 _gcry_camellia_ppc9_encrypt_blk16 +#define FUNC_DEC_BLK16 _gcry_camellia_ppc9_decrypt_blk16 +#define FUNC_KEY_SETUP _gcry_camellia_ppc9_keygen + +#include "camellia-simd128.h" + +#endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h new file mode 100644 index 00000000..9cb7b987 --- /dev/null +++ b/cipher/camellia-simd128.h @@ -0,0 +1,2224 @@ +/* camellia-simd128.h - Camellia cipher SIMD128 intrinsics implementation + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * SSE/AVX/NEON implementation of Camellia cipher, using AES-NI/ARMv8-CE/ + * PPC-crypto for sbox calculations. This implementation takes 16 input blocks + * and process them in parallel. Vectorized key setup is also available at + * the end of file. This implementation is from + * - https://github.com/jkivilin/camellia-simd-aesni + * + * This work was originally presented in Master's Thesis, + * "Block Ciphers: Fast Implementations on x86-64 Architecture" (pages 42-50) + * http://urn.fi/URN:NBN:fi:oulu-201305311409 + */ + +#include +#include "types.h" + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE SIMD128_OPT_ATTR + + +#if defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && !defined(WORDS_BIGENDIAN) + +/********************************************************************** + AT&T x86 asm to intrinsics conversion macros (PowerPC VSX+crypto) + **********************************************************************/ +#include + +typedef vector signed char int8x16_t; +typedef vector unsigned char uint8x16_t; +typedef vector unsigned short uint16x8_t; +typedef vector unsigned int uint32x4_t; +typedef vector unsigned long long uint64x2_t; +typedef uint64x2_t __m128i; + +#ifdef __clang__ +/* clang has mismatching prototype for vec_sbox_be. 
*/ +static ASM_FUNC_ATTR_INLINE uint8x16_t +asm_sbox_be(uint8x16_t b) +{ + uint8x16_t o; + __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b)); + return o; +} +#undef vec_sbox_be +#define vec_sbox_be asm_sbox_be +#endif + +#define vec_bswap(a) ((__m128i)vec_reve((uint8x16_t)a)) + +#define vpand128(a, b, o) (o = vec_and(b, a)) +#define vpandn128(a, b, o) (o = vec_andc(a, b)) +#define vpxor128(a, b, o) (o = vec_xor(b, a)) +#define vpor128(a, b, o) (o = vec_or(b, a)) + +#define vpsrlb128(s, a, o) ({ o = (__m128i)((uint8x16_t)a >> s); }) +#define vpsllb128(s, a, o) ({ o = (__m128i)((uint8x16_t)a << s); }) +#define vpsrlw128(s, a, o) ({ o = (__m128i)((uint16x8_t)a >> s); }) +#define vpsllw128(s, a, o) ({ o = (__m128i)((uint16x8_t)a << s); }) +#define vpsrld128(s, a, o) ({ o = (__m128i)((uint32x4_t)a >> s); }) +#define vpslld128(s, a, o) ({ o = (__m128i)((uint32x4_t)a << s); }) +#define vpsrlq128(s, a, o) ({ o = (__m128i)((uint64x2_t)a >> s); }) +#define vpsllq128(s, a, o) ({ o = (__m128i)((uint64x2_t)a << s); }) +#define vpsrldq128(s, a, o) ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vec_sld((uint8x16_t)__tmp, \ + (uint8x16_t)a, (16 - (s)) & 15);}) +#define vpslldq128(s, a, o) ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vec_sld((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15);}) + +#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o) +#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o) + +#define vpaddb128(a, b, o) (o = (__m128i)vec_add((uint8x16_t)b, (uint8x16_t)a)) + +#define vpcmpgtb128(a, b, o) (o = (__m128i)vec_cmpgt((int8x16_t)b, (int8x16_t)a)) +#define vpabsb128(a, o) (o = (__m128i)vec_abs((int8x16_t)a)) + +#define vpshufd128_0x4e(a, o) (o = (__m128i)vec_reve((uint64x2_t)a)) +#define vpshufd128_0x1b(a, o) (o = (__m128i)vec_reve((uint32x4_t)a)) + +#define vpshufb128(m, a, o) \ + ({ uint64x2_t __tmpz = { 0, 0 }; \ + o = (__m128i)vec_perm((uint8x16_t)a, (uint8x16_t)__tmpz, (uint8x16_t)m); }) + +#define vpunpckhdq128(a, b, o) (o = (__m128i)vec_mergel((uint32x4_t)b, (uint32x4_t)a)) +#define vpunpckldq128(a, b, o) (o = (__m128i)vec_mergeh((uint32x4_t)b, (uint32x4_t)a)) +#define vpunpckhqdq128(a, b, o) (o = (__m128i)vec_mergel((uint64x2_t)b, (uint64x2_t)a)) +#define vpunpcklqdq128(a, b, o) (o = (__m128i)vec_mergeh((uint64x2_t)b, (uint64x2_t)a)) + +#define vmovdqa128(a, o) (o = a) +#define vmovd128(a, o) ({ uint32x4_t __tmp = { (a), 0, 0, 0 }; \ + o = (__m128i)(__tmp); }) +#define vmovq128(a, o) ({ uint64x2_t __tmp = { (a), 0 }; \ + o = (__m128i)(__tmp); }) + +#define vmovdqa128_memld(a, o) (o = *(const __m128i *)(a)) +#define vmovdqa128_memst(a, o) (*(__m128i *)(o) = (a)) +#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o) + +/* Following operations may have unaligned memory input */ +#define vmovdqu128_memld(a, o) (o = (__m128i)vec_xl(0, (const uint8_t *)(a))) +#define vpxor128_memld(a, b, o) vpxor128(b, (__m128i)vec_xl(0, (const uint8_t *)(a)), o) + +/* Following operations may have unaligned memory output */ +#define vmovdqu128_memst(a, o) vec_xst((uint8x16_t)(a), 0, (uint8_t *)(o)) +#define vmovq128_memst(a, o) (((uint64_unaligned_t *)(o))[0] = ((__m128i)(a))[0]) + +/* PowerPC AES encrypt last round => ShiftRows + SubBytes + XOR round key */ +static const uint8x16_t shift_row = + { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 }; +#define vaesenclast128(a, b, o) \ + ({ uint64x2_t __tmp = (__m128i)vec_sbox_be((uint8x16_t)(b)); \ + vpshufb128(shift_row, __tmp, __tmp); \ + vpxor128(a, __tmp, o); }) + +/* Macros for exposing SubBytes from PowerPC crypto 
instructions. */ +#define aes_subbytes(a, o) \ + (o = (__m128i)vec_sbox_be((uint8x16_t)(a))) +#define aes_subbytes_and_shuf_and_xor(zero, a, o) \ + vaesenclast128((zero), (a), (o)) +/*#define aes_load_inv_shufmask(shufmask_reg) \ + load_frequent_const(inv_shift_row, (shufmask_reg))*/ +#define aes_inv_shuf(shufmask_reg, a, o) \ + vpshufb128(shufmask_reg, (a), (o)) +#define if_aes_subbytes(...) __VA_ARGS__ +#define if_not_aes_subbytes(...) /*_*/ + +#define memory_barrier_with_vec(a) __asm__("" : "+wa"(a) :: "memory") + +#endif /* __powerpc__ */ + +#ifdef __ARM_NEON + +/********************************************************************** + AT&T x86 asm to intrinsics conversion macros (ARMv8-CE) + **********************************************************************/ +#include + +#define __m128i uint64x2_t + +#define vpand128(a, b, o) (o = vandq_u64(b, a)) +#define vpandn128(a, b, o) (o = vbicq_u64(a, b)) +#define vpxor128(a, b, o) (o = veorq_u64(b, a)) +#define vpor128(a, b, o) (o = vorrq_u64(b, a)) + +#define vpsrlb128(s, a, o) (o = (__m128i)vshrq_n_u8((uint8x16_t)a, s)) +#define vpsllb128(s, a, o) (o = (__m128i)vshlq_n_u8((uint8x16_t)a, s)) +#define vpsrlw128(s, a, o) (o = (__m128i)vshrq_n_u16((uint16x8_t)a, s)) +#define vpsllw128(s, a, o) (o = (__m128i)vshlq_n_u16((uint16x8_t)a, s)) +#define vpsrld128(s, a, o) (o = (__m128i)vshrq_n_u32((uint32x4_t)a, s)) +#define vpslld128(s, a, o) (o = (__m128i)vshlq_n_u32((uint32x4_t)a, s)) +#define vpsrlq128(s, a, o) (o = (__m128i)vshrq_n_u64(a, s)) +#define vpsllq128(s, a, o) (o = (__m128i)vshlq_n_u64(a, s)) +#define vpsrldq128(s, a, o) ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15);}) +#define vpslldq128(s, a, o) ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)__tmp, \ + (uint8x16_t)a, (16 - (s)) & 15);}) + +#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o) +#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o) + +#define vpaddb128(a, b, o) (o = (__m128i)vaddq_u8((uint8x16_t)b, (uint8x16_t)a)) + +#define vpcmpgtb128(a, b, o) (o = (__m128i)vcgtq_s8((int8x16_t)b, (int8x16_t)a)) +#define vpabsb128(a, o) (o = (__m128i)vabsq_s8((int8x16_t)a)) + +#define vpshufd128_0x4e(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8)) +#define vpshufd128_0x1b(a, o) (o = (__m128i)vrev64q_u32((uint32x4_t)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8))) +#define vpshufb128(m, a, o) (o = (__m128i)vqtbl1q_u8((uint8x16_t)a, (uint8x16_t)m)) + +#define vpunpckhdq128(a, b, o) (o = (__m128i)vzip2q_u32((uint32x4_t)b, (uint32x4_t)a)) +#define vpunpckldq128(a, b, o) (o = (__m128i)vzip1q_u32((uint32x4_t)b, (uint32x4_t)a)) +#define vpunpckhqdq128(a, b, o) (o = (__m128i)vzip2q_u64(b, a)) +#define vpunpcklqdq128(a, b, o) (o = (__m128i)vzip1q_u64(b, a)) + +/* CE AES encrypt last round => ShiftRows + SubBytes + XOR round key */ +#define vaesenclast128(a, b, o) (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) + +#define vmovdqa128(a, o) (o = a) +#define vmovd128(a, o) ({ uint32x4_t __tmp = { a, 0, 0, 0 }; o = (__m128i)__tmp; }) +#define vmovq128(a, o) ({ uint64x2_t __tmp = { a, 0 }; o = (__m128i)__tmp; }) + +#define vmovdqa128_memld(a, o) (o = (*(const __m128i *)(a))) +#define vmovdqa128_memst(a, o) (*(__m128i *)(o) = (a)) +#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o) + +/* Following operations may have unaligned memory input */ +#define vmovdqu128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a))) +#define vpxor128_memld(a, b, o) vpxor128(b, 
(__m128i)vld1q_u8((const uint8_t *)(a)), o) + +/* Following operations may have unaligned memory output */ +#define vmovdqu128_memst(a, o) vst1q_u8((uint8_t *)(o), (uint8x16_t)a) +#define vmovq128_memst(a, o) (((uint64_unaligned_t *)(o))[0] = (a)[0]) + +/* Macros for exposing SubBytes from Crypto-Extension instruction set. */ +#define aes_subbytes_and_shuf_and_xor(zero, a, o) \ + vaesenclast128(zero, a, o) +#define aes_load_inv_shufmask(shufmask_reg) \ + load_frequent_const(inv_shift_row, shufmask_reg) +#define aes_inv_shuf(shufmask_reg, a, o) \ + vpshufb128(shufmask_reg, a, o) +#define if_aes_subbytes(...) /*_*/ +#define if_not_aes_subbytes(...) __VA_ARGS__ + +#define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") + +#endif /* __ARM_NEON */ + +#if defined(__x86_64__) || defined(__i386__) + +/********************************************************************** + AT&T x86 asm to intrinsics conversion macros + **********************************************************************/ +#include + +#define vpand128(a, b, o) (o = _mm_and_si128(b, a)) +#define vpandn128(a, b, o) (o = _mm_andnot_si128(b, a)) +#define vpxor128(a, b, o) (o = _mm_xor_si128(b, a)) +#define vpor128(a, b, o) (o = _mm_or_si128(b, a)) + +#define vpsrlw128(s, a, o) (o = _mm_srli_epi16(a, s)) +#define vpsllw128(s, a, o) (o = _mm_slli_epi16(a, s)) +#define vpsrld128(s, a, o) (o = _mm_srli_epi32(a, s)) +#define vpslld128(s, a, o) (o = _mm_slli_epi32(a, s)) +#define vpsrlq128(s, a, o) (o = _mm_srli_epi64(a, s)) +#define vpsllq128(s, a, o) (o = _mm_slli_epi64(a, s)) +#define vpsrldq128(s, a, o) (o = _mm_srli_si128(a, s)) +#define vpslldq128(s, a, o) (o = _mm_slli_si128(a, s)) + +#define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o) +#define vpsll_byte_128(s, a, o) vpslld128(s, a, o) + +#define vpaddb128(a, b, o) (o = _mm_add_epi8(b, a)) + +#define vpcmpgtb128(a, b, o) (o = _mm_cmpgt_epi8(b, a)) +#define vpabsb128(a, o) (o = _mm_abs_epi8(a)) + +#define vpshufd128_0x1b(a, o) (o = _mm_shuffle_epi32(a, 0x1b)) +#define vpshufd128_0x4e(a, o) (o = _mm_shuffle_epi32(a, 0x4e)) +#define vpshufb128(m, a, o) (o = _mm_shuffle_epi8(a, m)) + +#define vpunpckhdq128(a, b, o) (o = _mm_unpackhi_epi32(b, a)) +#define vpunpckldq128(a, b, o) (o = _mm_unpacklo_epi32(b, a)) +#define vpunpckhqdq128(a, b, o) (o = _mm_unpackhi_epi64(b, a)) +#define vpunpcklqdq128(a, b, o) (o = _mm_unpacklo_epi64(b, a)) + +/* AES-NI encrypt last round => ShiftRows + SubBytes + XOR round key */ +#define vaesenclast128(a, b, o) (o = _mm_aesenclast_si128(b, a)) + +#define vmovdqa128(a, o) (o = a) +#define vmovd128(a, o) (o = _mm_set_epi32(0, 0, 0, a)) +#define vmovq128(a, o) (o = _mm_set_epi64x(0, a)) + +#define vmovdqa128_memld(a, o) (o = (*(const __m128i *)(a))) +#define vmovdqa128_memst(a, o) (*(__m128i *)(o) = (a)) +#define vpshufb128_amemld(m, a, o) vpshufb128(*(const __m128i *)(m), a, o) + +/* Following operations may have unaligned memory input */ +#define vmovdqu128_memld(a, o) (o = _mm_loadu_si128((const __m128i *)(a))) +#define vpxor128_memld(a, b, o) \ + vpxor128(b, _mm_loadu_si128((const __m128i *)(a)), o) + +/* Following operations may have unaligned memory output */ +#define vmovdqu128_memst(a, o) _mm_storeu_si128((__m128i *)(o), a) +#define vmovq128_memst(a, o) _mm_storel_epi64((__m128i *)(o), a) + +/* Macros for exposing SubBytes from AES-NI instruction set. 
*/ +#define aes_subbytes_and_shuf_and_xor(zero, a, o) \ + vaesenclast128(zero, a, o) +#define aes_load_inv_shufmask(shufmask_reg) \ + load_frequent_const(inv_shift_row, shufmask_reg) +#define aes_inv_shuf(shufmask_reg, a, o) \ + vpshufb128(shufmask_reg, a, o) +#define if_aes_subbytes(...) /*_*/ +#define if_not_aes_subbytes(...) __VA_ARGS__ + +#define memory_barrier_with_vec(a) __asm__("" : "+x"(a) :: "memory") + +#endif /* defined(__x86_64__) || defined(__i386__) */ + +/********************************************************************** + helper macros + **********************************************************************/ +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand128(x, mask4bit, tmp0); \ + vpandn128(x, mask4bit, x); \ + vpsrl_byte_128(4, x, x); \ + \ + vpshufb128(tmp0, lo_t, tmp0); \ + vpshufb128(x, hi_t, x); \ + vpxor128(tmp0, x, x); + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq128(x1, x0, t2); \ + vpunpckldq128(x1, x0, x0); \ + \ + vpunpckldq128(x3, x2, t1); \ + vpunpckhdq128(x3, x2, x2); \ + \ + vpunpckhqdq128(t1, x0, x1); \ + vpunpcklqdq128(t1, x0, x0); \ + \ + vpunpckhqdq128(x2, t2, x3); \ + vpunpcklqdq128(x2, t2, x2); + +#define load_zero(o) vmovq128(0, o) + +#define load_frequent_const(constant, o) vmovdqa128(constant ## _stack, o) + +#define prepare_frequent_const(constant) \ + vmovdqa128_memld(&(constant), constant ## _stack); \ + memory_barrier_with_vec(constant ## _stack) + +#define prepare_frequent_constants() \ + prepare_frequent_const(inv_shift_row); \ + prepare_frequent_const(pack_bswap); \ + prepare_frequent_const(shufb_16x16b); \ + prepare_frequent_const(mask_0f); \ + prepare_frequent_const(pre_tf_lo_s1); \ + prepare_frequent_const(pre_tf_hi_s1); \ + prepare_frequent_const(pre_tf_lo_s4); \ + prepare_frequent_const(pre_tf_hi_s4); \ + prepare_frequent_const(post_tf_lo_s1); \ + prepare_frequent_const(post_tf_hi_s1); \ + prepare_frequent_const(post_tf_lo_s3); \ + prepare_frequent_const(post_tf_hi_s3); \ + prepare_frequent_const(post_tf_lo_s2); \ + prepare_frequent_const(post_tf_hi_s2) + +#define frequent_constants_declare \ + __m128i inv_shift_row_stack; \ + __m128i pack_bswap_stack; \ + __m128i shufb_16x16b_stack; \ + __m128i mask_0f_stack; \ + __m128i pre_tf_lo_s1_stack; \ + __m128i pre_tf_hi_s1_stack; \ + __m128i pre_tf_lo_s4_stack; \ + __m128i pre_tf_hi_s4_stack; \ + __m128i post_tf_lo_s1_stack; \ + __m128i post_tf_hi_s1_stack; \ + __m128i post_tf_lo_s3_stack; \ + __m128i post_tf_hi_s3_stack; \ + __m128i post_tf_lo_s2_stack; \ + __m128i post_tf_hi_s2_stack + +/********************************************************************** + 16-way camellia macros + **********************************************************************/ + +/* + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ + t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + if_not_aes_subbytes(aes_load_inv_shufmask(t4);) \ + load_frequent_const(mask_0f, t7); \ + load_frequent_const(pre_tf_lo_s1, t0); \ + load_frequent_const(pre_tf_hi_s1, t1); \ + \ + /* AES inverse shift rows */ \ + if_not_aes_subbytes( \ + aes_inv_shuf(t4, x0, x0); \ + aes_inv_shuf(t4, x7, x7); \ + aes_inv_shuf(t4, x1, x1); \ + aes_inv_shuf(t4, x4, x4); \ + aes_inv_shuf(t4, x2, x2); \ + aes_inv_shuf(t4, x5, x5); \ + aes_inv_shuf(t4, x3, x3); \ + aes_inv_shuf(t4, x6, x6); \ + ) \ + \ + 
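+	/* filter_8bit() implements a byte-wise affine transform as two \
+	 * 16-entry vpshufb nibble lookups (low and high nibble) XORed \
+	 * together; it maps bytes to/from the AES SubBytes representation \
+	 * so hardware AES SubBytes can stand in for the Camellia s-boxes. */ \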
/* prefilter sboxes 1, 2 and 3 */ \ + load_frequent_const(pre_tf_lo_s4, t2); \ + load_frequent_const(pre_tf_hi_s4, t3); \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x1, t0, t1, t7, t6); \ + filter_8bit(x4, t0, t1, t7, t6); \ + filter_8bit(x2, t0, t1, t7, t6); \ + filter_8bit(x5, t0, t1, t7, t6); \ + \ + /* prefilter sbox 4 */ \ + if_not_aes_subbytes(load_zero(t4);) \ + filter_8bit(x3, t2, t3, t7, t6); \ + filter_8bit(x6, t2, t3, t7, t6); \ + \ + /* AES subbytes + AES shift rows */ \ + load_frequent_const(post_tf_lo_s1, t0); \ + load_frequent_const(post_tf_hi_s1, t1); \ + if_not_aes_subbytes( \ + aes_subbytes_and_shuf_and_xor(t4, x0, x0); \ + aes_subbytes_and_shuf_and_xor(t4, x7, x7); \ + aes_subbytes_and_shuf_and_xor(t4, x1, x1); \ + aes_subbytes_and_shuf_and_xor(t4, x4, x4); \ + aes_subbytes_and_shuf_and_xor(t4, x2, x2); \ + aes_subbytes_and_shuf_and_xor(t4, x5, x5); \ + aes_subbytes_and_shuf_and_xor(t4, x3, x3); \ + aes_subbytes_and_shuf_and_xor(t4, x6, x6); \ + ) \ + if_aes_subbytes( \ + aes_subbytes(x0, x0); \ + aes_subbytes(x7, x7); \ + aes_subbytes(x1, x1); \ + aes_subbytes(x4, x4); \ + aes_subbytes(x2, x2); \ + aes_subbytes(x5, x5); \ + aes_subbytes(x3, x3); \ + aes_subbytes(x6, x6); \ + ) \ + \ + /* postfilter sboxes 1 and 4 */ \ + load_frequent_const(post_tf_lo_s3, t2); \ + load_frequent_const(post_tf_hi_s3, t3); \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x3, t0, t1, t7, t6); \ + filter_8bit(x6, t0, t1, t7, t6); \ + \ + /* postfilter sbox 3 */ \ + load_frequent_const(post_tf_lo_s2, t4); \ + load_frequent_const(post_tf_hi_s2, t5); \ + filter_8bit(x2, t2, t3, t7, t6); \ + filter_8bit(x5, t2, t3, t7, t6); \ + \ + vmovq128((key), t0); \ + \ + /* postfilter sbox 2 */ \ + filter_8bit(x1, t4, t5, t7, t2); \ + filter_8bit(x4, t4, t5, t7, t2); \ + \ + /* P-function */ \ + vpxor128(x5, x0, x0); \ + vpxor128(x6, x1, x1); \ + vpxor128(x7, x2, x2); \ + vpxor128(x4, x3, x3); \ + \ + vpxor128(x2, x4, x4); \ + vpxor128(x3, x5, x5); \ + vpxor128(x0, x6, x6); \ + vpxor128(x1, x7, x7); \ + \ + vpxor128(x7, x0, x0); \ + vpxor128(x4, x1, x1); \ + vpxor128(x5, x2, x2); \ + vpxor128(x6, x3, x3); \ + \ + vpxor128(x3, x4, x4); \ + vpxor128(x0, x5, x5); \ + vpxor128(x1, x6, x6); \ + vpxor128(x2, x7, x7); /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpshufb128(bcast[7], t0, t7); \ + vpshufb128(bcast[6], t0, t6); \ + vpshufb128(bcast[5], t0, t5); \ + vpshufb128(bcast[4], t0, t4); \ + vpshufb128(bcast[3], t0, t3); \ + vpshufb128(bcast[2], t0, t2); \ + vpshufb128(bcast[1], t0, t1); \ + \ + vpxor128(t3, x4, x4); \ + vpxor128(mem_cd[0], x4, x4); \ + \ + load_zero(t3); \ + vpshufb128(t3, t0, t0); \ + \ + vpxor128(t2, x5, x5); \ + vpxor128(mem_cd[1], x5, x5); \ + \ + vpxor128(t1, x6, x6); \ + vpxor128(mem_cd[2], x6, x6); \ + \ + vpxor128(t0, x7, x7); \ + vpxor128(mem_cd[3], x7, x7); \ + \ + vpxor128(t7, x0, x0); \ + vpxor128(mem_cd[4], x0, x0); \ + \ + vpxor128(t6, x1, x1); \ + vpxor128(mem_cd[5], x1, x1); \ + \ + vpxor128(t5, x2, x2); \ + vpxor128(mem_cd[6], x2, x2); \ + \ + vpxor128(t4, x3, x3); \ + vpxor128(mem_cd[7], x3, x3); + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, 
y2, y3, y4, y5, \ + y6, y7, mem_cd, ctx->key_table[(i)]); \ + \ + vmovdqa128(x4, mem_cd[0]); \ + vmovdqa128(x5, mem_cd[1]); \ + vmovdqa128(x6, mem_cd[2]); \ + vmovdqa128(x7, mem_cd[3]); \ + vmovdqa128(x0, mem_cd[4]); \ + vmovdqa128(x1, mem_cd[5]); \ + vmovdqa128(x2, mem_cd[6]); \ + vmovdqa128(x3, mem_cd[7]); \ + \ + roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, ctx->key_table[(i) + (dir)]); \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqa128(x0, mem_ab[0]); \ + vmovdqa128(x1, mem_ab[1]); \ + vmovdqa128(x2, mem_ab[2]); \ + vmovdqa128(x3, mem_ab[3]); \ + vmovdqa128(x4, mem_ab[4]); \ + vmovdqa128(x5, mem_ab[5]); \ + vmovdqa128(x6, mem_ab[6]); \ + vmovdqa128(x7, mem_ab[7]); + +#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +#define LE64_LO32(x) ((x) & 0xffffffffU) +#define LE64_HI32(x) ((x >> 32) & 0xffffffffU) + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN <<< 1) + */ +#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ + vpcmpgtb128(v0, zero, t0); \ + vpaddb128(v0, v0, v0); \ + vpabsb128(t0, t0); \ + \ + vpcmpgtb128(v1, zero, t1); \ + vpaddb128(v1, v1, v1); \ + vpabsb128(t1, t1); \ + \ + vpcmpgtb128(v2, zero, t2); \ + vpaddb128(v2, v2, v2); \ + vpabsb128(t2, t2); \ + \ + vpor128(t0, v1, v1); \ + \ + vpcmpgtb128(v3, zero, t0); \ + vpaddb128(v3, v3, v3); \ + vpabsb128(t0, t0); \ + \ + vpor128(t1, v2, v2); \ + vpor128(t2, v3, v3); \ + vpor128(t0, v0, v0); + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kl, kr) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + load_zero(tt0); \ + vmovd128(LE64_LO32(*(kl)), t0); \ + vpshufb128(tt0, t0, t3); \ + vpshufb128(bcast[1], t0, t2); \ + vpshufb128(bcast[2], t0, t1); \ + vpshufb128(bcast[3], t0, t0); \ + \ + vpand128(l0, t0, t0); \ + vpand128(l1, t1, t1); \ + vpand128(l2, t2, t2); \ + vpand128(l3, t3, t3); \ + \ + rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor128(l4, t0, l4); \ + vmovdqa128(l4, l[4]); \ + vpxor128(l5, t1, l5); \ + vmovdqa128(l5, l[5]); \ + vpxor128(l6, t2, l6); \ + vmovdqa128(l6, l[6]); \ + vpxor128(l7, t3, l7); \ + vmovdqa128(l7, l[7]); \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + 
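+	/* vmovd128 loads one 32-bit subkey half into lane 0; the vpshufb \
+	 * lookups with bcast[1..3] (and the all-zero mask in tt0) then \
+	 * replicate each of its four bytes across a full vector, matching \
+	 * the byte-sliced state layout. */ \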
vmovd128(LE64_HI32(*(kr)), t0); \ + vpshufb128(tt0, t0, t3); \ + vpshufb128(bcast[1], t0, t2); \ + vpshufb128(bcast[2], t0, t1); \ + vpshufb128(bcast[3], t0, t0); \ + \ + vpor128(r[4], t0, t0); \ + vpor128(r[5], t1, t1); \ + vpor128(r[6], t2, t2); \ + vpor128(r[7], t3, t3); \ + \ + vpxor128(r[0], t0, t0); \ + vpxor128(r[1], t1, t1); \ + vpxor128(r[2], t2, t2); \ + vpxor128(r[3], t3, t3); \ + vmovdqa128(t0, r[0]); \ + vmovdqa128(t1, r[1]); \ + vmovdqa128(t2, r[2]); \ + vmovdqa128(t3, r[3]); \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vmovd128(LE64_LO32(*(kr)), t0); \ + vpshufb128(tt0, t0, t3); \ + vpshufb128(bcast[1], t0, t2); \ + vpshufb128(bcast[2], t0, t1); \ + vpshufb128(bcast[3], t0, t0); \ + \ + vpand128(r[0], t0, t0); \ + vpand128(r[1], t1, t1); \ + vpand128(r[2], t2, t2); \ + vpand128(r[3], t3, t3); \ + \ + rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor128(r[4], t0, t0); \ + vpxor128(r[5], t1, t1); \ + vpxor128(r[6], t2, t2); \ + vpxor128(r[7], t3, t3); \ + vmovdqa128(t0, r[4]); \ + vmovdqa128(t1, r[5]); \ + vmovdqa128(t2, r[6]); \ + vmovdqa128(t3, r[7]); \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vmovd128(LE64_HI32(*(kl)), t0); \ + vpshufb128(tt0, t0, t3); \ + vpshufb128(bcast[1], t0, t2); \ + vpshufb128(bcast[2], t0, t1); \ + vpshufb128(bcast[3], t0, t0); \ + \ + vpor128(l4, t0, t0); \ + vpor128(l5, t1, t1); \ + vpor128(l6, t2, t2); \ + vpor128(l7, t3, t3); \ + \ + vpxor128(l0, t0, l0); \ + vmovdqa128(l0, l[0]); \ + vpxor128(l1, t1, l1); \ + vmovdqa128(l1, l[1]); \ + vpxor128(l2, t2, l2); \ + vmovdqa128(l2, l[2]); \ + vpxor128(l3, t3, l3); \ + vmovdqa128(l3, l[3]); + +#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ + a3, b3, c3, d3, st0, st1) \ + vmovdqa128(d2, st0); \ + vmovdqa128(d3, st1); \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqa128(st0, d2); \ + vmovdqa128(st1, d3); \ + \ + vmovdqa128(a0, st0); \ + vmovdqa128(a1, st1); \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vmovdqa128(shufb_16x16b_stack, a0); \ + vmovdqa128(st1, a1); \ + vpshufb128(a0, a2, a2); \ + vpshufb128(a0, a3, a3); \ + vpshufb128(a0, b0, b0); \ + vpshufb128(a0, b1, b1); \ + vpshufb128(a0, b2, b2); \ + vpshufb128(a0, b3, b3); \ + vpshufb128(a0, a1, a1); \ + vpshufb128(a0, c0, c0); \ + vpshufb128(a0, c1, c1); \ + vpshufb128(a0, c2, c2); \ + vpshufb128(a0, c3, c3); \ + vpshufb128(a0, d0, d0); \ + vpshufb128(a0, d1, d1); \ + vpshufb128(a0, d2, d2); \ + vpshufb128(a0, d3, d3); \ + vmovdqa128(d3, st1); \ + vmovdqa128(st0, d3); \ + vpshufb128(a0, d3, a0); \ + vmovdqa128(d2, st0); \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqa128(st0, d2); \ + vmovdqa128(st1, d3); \ + \ + vmovdqa128(b0, st0); \ + vmovdqa128(b1, st1); \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqa128(st0, b0); \ + vmovdqa128(st1, b1); \ + /* does not adjust output bytes inside vectors */ + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vmovq128((key), x0); \ + vpshufb128(pack_bswap_stack, x0, x0); \ + \ + vpxor128_memld((rio) + 0 * 16, x0, y7); \ + vpxor128_memld((rio) + 1 * 16, x0, y6); \ + vpxor128_memld((rio) + 2 * 16, x0, y5); \ + vpxor128_memld((rio) + 3 * 16, x0, y4); \ + vpxor128_memld((rio) + 4 * 16, x0, y3); \ + 
vpxor128_memld((rio) + 5 * 16, x0, y2); \ + vpxor128_memld((rio) + 6 * 16, x0, y1); \ + vpxor128_memld((rio) + 7 * 16, x0, y0); \ + vpxor128_memld((rio) + 8 * 16, x0, x7); \ + vpxor128_memld((rio) + 9 * 16, x0, x6); \ + vpxor128_memld((rio) + 10 * 16, x0, x5); \ + vpxor128_memld((rio) + 11 * 16, x0, x4); \ + vpxor128_memld((rio) + 12 * 16, x0, x3); \ + vpxor128_memld((rio) + 13 * 16, x0, x2); \ + vpxor128_memld((rio) + 14 * 16, x0, x1); \ + vpxor128_memld((rio) + 15 * 16, x0, x0); + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd) \ + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ + y4, y5, y6, y7, mem_ab[0], mem_cd[0]); \ + \ + vmovdqa128(x0, mem_ab[0]); \ + vmovdqa128(x1, mem_ab[1]); \ + vmovdqa128(x2, mem_ab[2]); \ + vmovdqa128(x3, mem_ab[3]); \ + vmovdqa128(x4, mem_ab[4]); \ + vmovdqa128(x5, mem_ab[5]); \ + vmovdqa128(x6, mem_ab[6]); \ + vmovdqa128(x7, mem_ab[7]); \ + vmovdqa128(y0, mem_cd[0]); \ + vmovdqa128(y1, mem_cd[1]); \ + vmovdqa128(y2, mem_cd[2]); \ + vmovdqa128(y3, mem_cd[3]); \ + vmovdqa128(y4, mem_cd[4]); \ + vmovdqa128(y5, mem_cd[5]); \ + vmovdqa128(y6, mem_cd[6]); \ + vmovdqa128(y7, mem_cd[7]); + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ + y5, y6, y7, key, stack_tmp0, stack_tmp1) \ + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ + y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ + \ + vmovdqa128(x0, stack_tmp0); \ + \ + vmovq128((key), x0); \ + vpshufb128(pack_bswap_stack, x0, x0); \ + \ + vpxor128(x0, y7, y7); \ + vpxor128(x0, y6, y6); \ + vpxor128(x0, y5, y5); \ + vpxor128(x0, y4, y4); \ + vpxor128(x0, y3, y3); \ + vpxor128(x0, y2, y2); \ + vpxor128(x0, y1, y1); \ + vpxor128(x0, y0, y0); \ + vpxor128(x0, x7, x7); \ + vpxor128(x0, x6, x6); \ + vpxor128(x0, x5, x5); \ + vpxor128(x0, x4, x4); \ + vpxor128(x0, x3, x3); \ + vpxor128(x0, x2, x2); \ + vpxor128(x0, x1, x1); \ + vpxor128(stack_tmp0, x0, x0); + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu128_memst(x0, (rio) + 0 * 16); \ + vmovdqu128_memst(x1, (rio) + 1 * 16); \ + vmovdqu128_memst(x2, (rio) + 2 * 16); \ + vmovdqu128_memst(x3, (rio) + 3 * 16); \ + vmovdqu128_memst(x4, (rio) + 4 * 16); \ + vmovdqu128_memst(x5, (rio) + 5 * 16); \ + vmovdqu128_memst(x6, (rio) + 6 * 16); \ + vmovdqu128_memst(x7, (rio) + 7 * 16); \ + vmovdqu128_memst(y0, (rio) + 8 * 16); \ + vmovdqu128_memst(y1, (rio) + 9 * 16); \ + vmovdqu128_memst(y2, (rio) + 10 * 16); \ + vmovdqu128_memst(y3, (rio) + 11 * 16); \ + vmovdqu128_memst(y4, (rio) + 12 * 16); \ + vmovdqu128_memst(y5, (rio) + 13 * 16); \ + vmovdqu128_memst(y6, (rio) + 14 * 16); \ + vmovdqu128_memst(y7, (rio) + 15 * 16); + +/********************************************************************** + macros for defining constant vectors + **********************************************************************/ +#define SWAP_LE64(x) (x) + +#define M128I_BYTE(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7) \ + { \ + SWAP_LE64((((a0) & 0xffULL) << 0) | \ + (((a1) & 0xffULL) << 8) | \ + (((a2) & 0xffULL) << 16) | \ + (((a3) & 0xffULL) << 24) | \ + (((a4) & 0xffULL) << 32) | \ + (((a5) & 0xffULL) << 40) | \ + (((a6) & 0xffULL) << 48) | \ + (((a7) & 0xffULL) << 56)), \ + SWAP_LE64((((b0) & 0xffULL) << 0) | \ + (((b1) & 0xffULL) << 8) | \ + (((b2) & 0xffULL) << 16) | \ + 
(((b3) & 0xffULL) << 24) | \ + (((b4) & 0xffULL) << 32) | \ + (((b5) & 0xffULL) << 40) | \ + (((b6) & 0xffULL) << 48) | \ + (((b7) & 0xffULL) << 56)) \ + } + +#define M128I_U32(a0, a1, b0, b1) \ + { \ + SWAP_LE64((((a0) & 0xffffffffULL) << 0) | \ + (((a1) & 0xffffffffULL) << 32)), \ + SWAP_LE64((((b0) & 0xffffffffULL) << 0) | \ + (((b1) & 0xffffffffULL) << 32)) \ + } + +#define M128I_REP16(x) { (0x0101010101010101ULL * (x)), (0x0101010101010101ULL * (x)) } + +#define SHUFB_BYTES(idx) \ + (((0 + (idx)) << 0) | ((4 + (idx)) << 8) | \ + ((8 + (idx)) << 16) | ((12 + (idx)) << 24)) + +typedef u64 uint64_unaligned_t __attribute__((aligned(1), may_alias)); + +static const __m128i shufb_16x16b = + M128I_U32(SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)); + +static const __m128i pack_bswap = + M128I_U32(0x00010203, 0x04050607, 0x0f0f0f0f, 0x0f0f0f0f); + +static const __m128i bcast[8] = +{ + M128I_REP16(0), M128I_REP16(1), M128I_REP16(2), M128I_REP16(3), + M128I_REP16(4), M128I_REP16(5), M128I_REP16(6), M128I_REP16(7) +}; + +/* + * pre-SubByte transform + * + * pre-lookup for sbox1, sbox2, sbox3: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in) + * ) + * ) + * ) + * + * (note: '? 0xc5' inside camellia_f()) + */ +static const __m128i pre_tf_lo_s1 = + M128I_BYTE(0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86, + 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88); + +static const __m128i pre_tf_hi_s1 = + M128I_BYTE(0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a, + 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23); + +/* + * pre-SubByte transform + * + * pre-lookup for sbox4: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in <<< 1) + * ) + * ) + * ) + * + * (note: '? 0xc5' inside camellia_f()) + */ +static const __m128i pre_tf_lo_s4 = + M128I_BYTE(0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25, + 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74); + +static const __m128i pre_tf_hi_s4 = + M128I_BYTE(0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72, + 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf); + +/* + * post-SubByte transform + * + * post-lookup for sbox1, sbox4: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) + * + * (note: '? 0x6e' inside camellia_h()) + */ +static const __m128i post_tf_lo_s1 = + M128I_BYTE(0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31, + 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1); + +static const __m128i post_tf_hi_s1 = + M128I_BYTE(0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8, + 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c); + +/* + * post-SubByte transform + * + * post-lookup for sbox2: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) <<< 1 + * + * (note: '? 0x6e' inside camellia_h()) + */ +static const __m128i post_tf_lo_s2 = + M128I_BYTE(0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62, + 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3); + +static const __m128i post_tf_hi_s2 = + M128I_BYTE(0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51, + 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18); + +/* + * post-SubByte transform + * + * post-lookup for sbox3: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) >>> 1 + * + * (note: '? 
0x6e' inside camellia_h()) + */ +static const __m128i post_tf_lo_s3 = + M128I_BYTE(0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98, + 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8); + +static const __m128i post_tf_hi_s3 = + M128I_BYTE(0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54, + 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06); + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +static const __m128i inv_shift_row = + M128I_BYTE(0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b, + 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03); + +/* 4-bit mask */ +static const __m128i mask_0f = + M128I_U32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); + +/* Encrypts 16 input block from IN and writes result to OUT. IN and OUT may + * unaligned pointers. */ +void ASM_FUNC_ATTR_NOINLINE +FUNC_ENC_BLK16(const void *key_table, void *vout, const void *vin, + int key_length) +{ + const struct enc_ctx_s + { + const u64 *key_table; + int key_length; + } sctx = + { + .key_table = (const u64 *)key_table, + .key_length = key_length + }; + const struct enc_ctx_s *ctx = &sctx; + char *out = vout; + const char *in = vin; + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + __m128i ab[8]; + __m128i cd[8]; + __m128i tmp0, tmp1; + unsigned int lastk, k; + frequent_constants_declare; + + prepare_frequent_constants(); + + if (ctx->key_length > 16) + lastk = 32; + else + lastk = 24; + + inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, in, ctx->key_table[0]); + + inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, ab, cd); + + k = 0; + while (1) + { + enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, ab, cd, k); + + if (k == lastk - 8) + break; + + fls16(ab, x0, x1, x2, x3, x4, x5, x6, x7, cd, x8, x9, x10, x11, x12, x13, x14, + x15, &ctx->key_table[k + 8], &ctx->key_table[k + 9]); + + k += 8; + } + + /* load CD for output */ + vmovdqa128(cd[0], x8); + vmovdqa128(cd[1], x9); + vmovdqa128(cd[2], x10); + vmovdqa128(cd[3], x11); + vmovdqa128(cd[4], x12); + vmovdqa128(cd[5], x13); + vmovdqa128(cd[6], x14); + vmovdqa128(cd[7], x15); + + outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, ctx->key_table[lastk], tmp0, tmp1); + + write_output(x7, x6, x5, x4, x3, x2, x1, x0, x15, x14, x13, x12, x11, x10, x9, + x8, out); +} + +/* Decrypts 16 input block from IN and writes result to OUT. IN and OUT may + * unaligned pointers. 
*/ +void ASM_FUNC_ATTR_NOINLINE +FUNC_DEC_BLK16(const void *key_table, void *vout, const void *vin, + int key_length) +{ + const struct dec_ctx_s + { + const u64 *key_table; + int key_length; + } sctx = + { + .key_table = (const u64 *)key_table, + .key_length = key_length + }; + const struct dec_ctx_s *ctx = &sctx; + char *out = vout; + const char *in = vin; + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + __m128i ab[8]; + __m128i cd[8]; + __m128i tmp0, tmp1; + unsigned int firstk, k; + frequent_constants_declare; + + prepare_frequent_constants(); + + if (ctx->key_length > 16) + firstk = 32; + else + firstk = 24; + + inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, in, ctx->key_table[firstk]); + + inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, ab, cd); + + k = firstk - 8; + while (1) + { + dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, + x14, x15, ab, cd, k); + + if (k == 0) + break; + + fls16(ab, x0, x1, x2, x3, x4, x5, x6, x7, cd, x8, x9, x10, x11, x12, x13, + x14, x15, &ctx->key_table[k + 1], &ctx->key_table[k]); + + k -= 8; + } + + /* load CD for output */ + vmovdqa128(cd[0], x8); + vmovdqa128(cd[1], x9); + vmovdqa128(cd[2], x10); + vmovdqa128(cd[3], x11); + vmovdqa128(cd[4], x12); + vmovdqa128(cd[5], x13); + vmovdqa128(cd[6], x14); + vmovdqa128(cd[7], x15); + + outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15, ctx->key_table[0], tmp0, tmp1); + + write_output(x7, x6, x5, x4, x3, x2, x1, x0, x15, x14, x13, x12, x11, x10, x9, + x8, out); +} + +/********* Key setup **********************************************************/ + +/* + * Camellia F-function, 1-way SIMD/AESNI. + * + * IN: + * ab: 64-bit AB state + * cd: 64-bit CD state + */ +#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \ + _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \ + vmovq128((key), t0); \ + load_zero(t3); \ + \ + vpxor128(ab, t0, x); \ + \ + /* \ + * S-function with AES subbytes \ + */ \ + \ + /* input rotation for sbox4 (<<< 1) */ \ + vpand128(x, sbox4mask, t0); \ + vpandn128(x, sbox4mask, x); \ + vpaddb128(t0, t0, t1); \ + vpsrl_byte_128(7, t0, t0); \ + vpor128(t0, t1, t0); \ + vpand128(sbox4mask, t0, t0); \ + vpor128(t0, x, x); \ + \ + vmovdqa128_memld(&post_tf_lo_s1, t0); \ + vmovdqa128_memld(&post_tf_hi_s1, t1); \ + \ + /* prefilter sboxes */ \ + filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \ + \ + /* AES subbytes + AES shift rows + AES inv shift rows */ \ + aes_subbytes_and_shuf_and_xor(t3, x, x); \ + \ + /* postfilter sboxes */ \ + filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \ + \ + /* output rotation for sbox2 (<<< 1) */ \ + /* output rotation for sbox3 (>>> 1) */ \ + aes_inv_shuf(inv_shift_row, x, t1); \ + vpshufb128_amemld(&sp0044440444044404mask, x, t4); \ + vpshufb128_amemld(&sp1110111010011110mask, x, x); \ + vpaddb128(t1, t1, t2); \ + vpsrl_byte_128(7, t1, t0); \ + vpsll_byte_128(7, t1, t3); \ + vpor128(t0, t2, t0); \ + vpsrl_byte_128(1, t1, t1); \ + vpshufb128_amemld(&sp0222022222000222mask, t0, t0); \ + vpor128(t1, t3, t1); \ + \ + vpxor128(x, t4, t4); \ + vpshufb128_amemld(&sp3033303303303033mask, t1, t1); \ + vpxor128(t4, t0, t0); \ + vpxor128(t1, t0, t0); \ + vpsrldq128(8, t0, x); \ + vpxor128(t0, x, x); \ + +#define vec_rol128(in, out, nrol, t0) \ + vpshufd128_0x4e(in, out); \ + vpsllq128((nrol), in, t0); \ + vpsrlq128((64-(nrol)), out, out); \ + vpaddb128(t0, out, out); + +#define 
vec_ror128(in, out, nror, t0) \ + vpshufd128_0x4e(in, out); \ + vpsrlq128((nror), in, t0); \ + vpsllq128((64-(nror)), out, out); \ + vpaddb128(t0, out, out); + +#define U64_BYTE(a0, a1, a2, a3, b0, b1, b2, b3) \ + ( \ + SWAP_LE64((((a0) & 0xffULL) << 0) | \ + (((a1) & 0xffULL) << 8) | \ + (((a2) & 0xffULL) << 16) | \ + (((a3) & 0xffULL) << 24) | \ + (((b0) & 0xffULL) << 32) | \ + (((b1) & 0xffULL) << 40) | \ + (((b2) & 0xffULL) << 48) | \ + (((b3) & 0xffULL) << 56)) \ + ) + +#define U64_U32(a0, b0) \ + ( \ + SWAP_LE64((((a0) & 0xffffffffULL) << 0) | \ + (((b0) & 0xffffffffULL) << 32)) \ + ) + +static const __m128i bswap128_mask = + M128I_BYTE(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + +static const __m128i inv_shift_row_and_unpcklbw = + M128I_BYTE(0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff, + 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff); + +static const __m128i sp0044440444044404mask = + M128I_U32(0xffff0404, 0x0404ff04, 0x0d0dff0d, 0x0d0dff0d); + +static const __m128i sp1110111010011110mask = + M128I_U32(0x000000ff, 0x000000ff, 0x0bffff0b, 0x0b0b0bff); + +static const __m128i sp0222022222000222mask = + M128I_U32(0xff060606, 0xff060606, 0x0c0cffff, 0xff0c0c0c); + +static const __m128i sp3033303303303033mask = + M128I_U32(0x04ff0404, 0x04ff0404, 0xff0a0aff, 0x0aff0a0a); + +static const u64 sbox4_input_mask = + U64_BYTE(0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00); + +static const u64 sigma1 = + U64_U32(0x3BCC908B, 0xA09E667F); + +static const u64 sigma2 = + U64_U32(0x4CAA73B2, 0xB67AE858); + +static const u64 sigma3 = + U64_U32(0xE94F82BE, 0xC6EF372F); + +static const u64 sigma4 = + U64_U32(0xF1D36F1C, 0x54FF53A5); + +static const u64 sigma5 = + U64_U32(0xDE682D1D, 0x10E527FA); + +static const u64 sigma6 = + U64_U32(0xB3E6C1FD, 0xB05688C2); + +#define cmll_sub(n, ctx) &ctx->key_table[n] + +static ASM_FUNC_ATTR_INLINE void +camellia_setup128(void *key_table, __m128i x0) +{ + struct setup128_ctx_s + { + u64 *key_table; + } sctx = { .key_table = (u64 *)key_table }; + struct setup128_ctx_s *ctx = &sctx; + + /* input: + * ctx: subkey storage at key_table(CTX) + * x0: key + */ + + __m128i x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + __m128i tmp0; + +#define KL128 x0 +#define KA128 x2 + + vpshufb128_amemld(&bswap128_mask, KL128, KL128); + + vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11); + vmovq128(sbox4_input_mask, x12); + vmovdqa128_memld(&mask_0f, x13); + vmovdqa128_memld(&pre_tf_lo_s1, x14); + vmovdqa128_memld(&pre_tf_hi_s1, x15); + + /* + * Generate KA + */ + vpsrldq128(8, KL128, x2); + vmovdqa128(KL128, x3); + vpslldq128(8, x3, x3); + vpsrldq128(8, x3, x3); + + camellia_f(x2, x4, x1, + x5, x6, x7, x8, + x11, x12, x13, x14, x15, sigma1); + vpxor128(x4, x3, x3); + camellia_f(x3, x2, x1, + x5, x6, x7, x8, + x11, x12, x13, x14, x15, sigma2); + camellia_f(x2, x3, x1, + x5, x6, x7, x8, + x11, x12, x13, x14, x15, sigma3); + vpxor128(x4, x3, x3); + camellia_f(x3, x4, x1, + x5, x6, x7, x8, + x11, x12, x13, x14, x15, sigma4); + + vpslldq128(8, x3, x3); + vpxor128(x4, x2, x2); + vpsrldq128(8, x3, x3); + vpslldq128(8, x2, KA128); + vpor128(x3, KA128, KA128); + + /* + * Generate subkeys + */ + vmovdqu128_memst(KA128, cmll_sub(24, ctx)); + vec_rol128(KL128, x3, 15, x15); + vec_rol128(KA128, x4, 15, x15); + vec_rol128(KA128, x5, 30, x15); + vec_rol128(KL128, x6, 45, x15); + vec_rol128(KA128, x7, 45, x15); + vec_rol128(KL128, x8, 60, x15); + vec_rol128(KA128, x9, 60, x15); + vec_ror128(KL128, x10, 128-77, x15); + + /* absorb kw2 to other subkeys */ + 
vpslldq128(8, KL128, x15); + vpsrldq128(8, x15, x15); + vpxor128(x15, KA128, KA128); + vpxor128(x15, x3, x3); + vpxor128(x15, x4, x4); + + /* subl(1) ^= subr(1) & ~subr(9); */ + vpandn128(x15, x5, x13); + vpslldq128(12, x13, x13); + vpsrldq128(8, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x5, x14); + vpslld128(1, x14, x11); + vpsrld128(31, x14, x14); + vpaddb128(x11, x14, x14); + vpslldq128(8, x14, x14); + vpsrldq128(12, x14, x14); + vpxor128(x14, x15, x15); + + vpxor128(x15, x6, x6); + vpxor128(x15, x8, x8); + vpxor128(x15, x9, x9); + + /* subl(1) ^= subr(1) & ~subr(17); */ + vpandn128(x15, x10, x13); + vpslldq128(12, x13, x13); + vpsrldq128(8, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x10, x14); + vpslld128(1, x14, x11); + vpsrld128(31, x14, x14); + vpaddb128(x11, x14, x14); + vpslldq128(8, x14, x14); + vpsrldq128(12, x14, x14); + vpxor128(x14, x15, x15); + + vpshufd128_0x1b(KL128, KL128); + vpshufd128_0x1b(KA128, KA128); + vpshufd128_0x1b(x3, x3); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x5, x5); + vpshufd128_0x1b(x6, x6); + vpshufd128_0x1b(x7, x7); + vpshufd128_0x1b(x8, x8); + vpshufd128_0x1b(x9, x9); + vpshufd128_0x1b(x10, x10); + + vmovdqu128_memst(KL128, cmll_sub(0, ctx)); + vpshufd128_0x1b(KL128, KL128); + vmovdqu128_memst(KA128, cmll_sub(2, ctx)); + vmovdqu128_memst(x3, cmll_sub(4, ctx)); + vmovdqu128_memst(x4, cmll_sub(6, ctx)); + vmovdqu128_memst(x5, cmll_sub(8, ctx)); + vmovdqu128_memst(x6, cmll_sub(10, ctx)); + vpsrldq128(8, x8, x8); + vmovq128_memst(x7, cmll_sub(12, ctx)); + vmovq128_memst(x8, cmll_sub(13, ctx)); + vmovdqu128_memst(x9, cmll_sub(14, ctx)); + vmovdqu128_memst(x10, cmll_sub(16, ctx)); + + vmovdqu128_memld(cmll_sub(24, ctx), KA128); + + vec_ror128(KL128, x3, 128 - 94, x7); + vec_ror128(KA128, x4, 128 - 94, x7); + vec_ror128(KL128, x5, 128 - 111, x7); + vec_ror128(KA128, x6, 128 - 111, x7); + + vpxor128(x15, x3, x3); + vpxor128(x15, x4, x4); + vpxor128(x15, x5, x5); + vpslldq128(8, x15, x15); + vpxor128(x15, x6, x6); + + /* absorb kw4 to other subkeys */ + vpslldq128(8, x6, x15); + vpxor128(x15, x5, x5); + vpxor128(x15, x4, x4); + vpxor128(x15, x3, x3); + + /* subl(25) ^= subr(25) & ~subr(16); */ + vmovdqu128_memld(cmll_sub(16, ctx), tmp0); + vpshufd128_0x1b(tmp0, x10); + vpandn128(x15, x10, x13); + vpslldq128(4, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x10, x14); + vpslld128(1, x14, x11); + vpsrld128(31, x14, x14); + vpaddb128(x11, x14, x14); + vpsrldq128(12, x14, x14); + vpslldq128(8, x14, x14); + vpxor128(x14, x15, x15); + + vpshufd128_0x1b(x3, x3); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x5, x5); + vpshufd128_0x1b(x6, x6); + + vmovdqu128_memst(x3, cmll_sub(18, ctx)); + vmovdqu128_memst(x4, cmll_sub(20, ctx)); + vmovdqu128_memst(x5, cmll_sub(22, ctx)); + vmovdqu128_memst(x6, cmll_sub(24, ctx)); + + vmovdqu128_memld(cmll_sub(14, ctx), tmp0); + vpshufd128_0x1b(tmp0, x3); + vmovdqu128_memld(cmll_sub(12, ctx), tmp0); + vpshufd128_0x1b(tmp0, x4); + vmovdqu128_memld(cmll_sub(10, ctx), tmp0); + vpshufd128_0x1b(tmp0, x5); + vmovdqu128_memld(cmll_sub(8, ctx), tmp0); + vpshufd128_0x1b(tmp0, x6); + + vpxor128(x15, x3, x3); + vpxor128(x15, x4, x4); + vpxor128(x15, x5, x5); + + /* subl(25) ^= subr(25) & ~subr(8); */ + vpandn128(x15, x6, x13); + vpslldq128(4, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(25) & subl(8), subr(25) ^= 
CAMELLIA_RL1(dw); */ + vpand128(x15, x6, x14); + vpslld128(1, x14, x11); + vpsrld128(31, x14, x14); + vpaddb128(x11, x14, x14); + vpsrldq128(12, x14, x14); + vpslldq128(8, x14, x14); + vpxor128(x14, x15, x15); + + vpshufd128_0x1b(x3, x3); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x5, x5); + + vmovdqu128_memst(x3, cmll_sub(14, ctx)); + vmovdqu128_memst(x4, cmll_sub(12, ctx)); + vmovdqu128_memst(x5, cmll_sub(10, ctx)); + + vmovdqu128_memld(cmll_sub(6, ctx), tmp0); + vpshufd128_0x1b(tmp0, x6); + vmovdqu128_memld(cmll_sub(4, ctx), tmp0); + vpshufd128_0x1b(tmp0, x4); + vmovdqu128_memld(cmll_sub(2, ctx), tmp0); + vpshufd128_0x1b(tmp0, x2); + vmovdqu128_memld(cmll_sub(0, ctx), tmp0); + vpshufd128_0x1b(tmp0, x0); + + vpxor128(x15, x6, x6); + vpxor128(x15, x4, x4); + vpxor128(x15, x2, x2); + vpxor128(x15, x0, x0); + + vpshufd128_0x1b(x6, x6); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x2, x2); + vpshufd128_0x1b(x0, x0); + + vpsrldq128(8, x2, x3); + vpsrldq128(8, x4, x5); + vpsrldq128(8, x6, x7); + + /* + * key XOR is end of F-function. + */ + vpxor128(x2, x0, x0); + vpxor128(x4, x2, x2); + + vmovq128_memst(x0, cmll_sub(0, ctx)); + vmovq128_memst(x3, cmll_sub(2, ctx)); + vpxor128(x5, x3, x3); + vpxor128(x6, x4, x4); + vpxor128(x7, x5, x5); + vmovq128_memst(x2, cmll_sub(3, ctx)); + vmovq128_memst(x3, cmll_sub(4, ctx)); + vmovq128_memst(x4, cmll_sub(5, ctx)); + vmovq128_memst(x5, cmll_sub(6, ctx)); + + vmovq128(*cmll_sub(7, ctx), x7); + vmovq128(*cmll_sub(8, ctx), x8); + vmovq128(*cmll_sub(9, ctx), x9); + vmovq128(*cmll_sub(10, ctx), x10); + /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ + vpandn128(x10, x8, x15); + vpsrldq128(4, x15, x15); + vpxor128(x15, x10, x0); + /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ + vpand128(x8, x0, x15); + vpslld128(1, x15, x14); + vpsrld128(31, x15, x15); + vpaddb128(x14, x15, x15); + vpslldq128(12, x15, x15); + vpsrldq128(8, x15, x15); + vpxor128(x15, x0, x0); + + vpxor128(x0, x6, x6); + vmovq128_memst(x6, cmll_sub(7, ctx)); + + vmovq128(*cmll_sub(11, ctx), x11); + vmovq128(*cmll_sub(12, ctx), x12); + vmovq128(*cmll_sub(13, ctx), x13); + vmovq128(*cmll_sub(14, ctx), x14); + vmovq128(*cmll_sub(15, ctx), x15); + /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ + vpandn128(x7, x9, x1); + vpsrldq128(4, x1, x1); + vpxor128(x1, x7, x0); + /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ + vpand128(x9, x0, x1); + vpslld128(1, x1, x2); + vpsrld128(31, x1, x1); + vpaddb128(x2, x1, x1); + vpslldq128(12, x1, x1); + vpsrldq128(8, x1, x1); + vpxor128(x1, x0, x0); + + vpxor128(x11, x0, x0); + vpxor128(x12, x10, x10); + vpxor128(x13, x11, x11); + vpxor128(x14, x12, x12); + vpxor128(x15, x13, x13); + vmovq128_memst(x0, cmll_sub(10, ctx)); + vmovq128_memst(x10, cmll_sub(11, ctx)); + vmovq128_memst(x11, cmll_sub(12, ctx)); + vmovq128_memst(x12, cmll_sub(13, ctx)); + vmovq128_memst(x13, cmll_sub(14, ctx)); + + vmovq128(*cmll_sub(16, ctx), x6); + vmovq128(*cmll_sub(17, ctx), x7); + vmovq128(*cmll_sub(18, ctx), x8); + vmovq128(*cmll_sub(19, ctx), x9); + vmovq128(*cmll_sub(20, ctx), x10); + /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ + vpandn128(x8, x6, x1); + vpsrldq128(4, x1, x1); + vpxor128(x1, x8, x0); + /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ + vpand128(x6, x0, x1); + vpslld128(1, x1, x2); + vpsrld128(31, x1, x1); + vpaddb128(x2, x1, x1); + vpslldq128(12, x1, x1); + vpsrldq128(8, x1, x1); + vpxor128(x1, x0, x0); + + vpxor128(x14, x0, x0); + vmovq128_memst(x0, cmll_sub(15, ctx)); + + /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ + 
vpandn128(x15, x7, x1); + vpsrldq128(4, x1, x1); + vpxor128(x1, x15, x0); + /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ + vpand128(x7, x0, x1); + vpslld128(1, x1, x2); + vpsrld128(31, x1, x1); + vpaddb128(x2, x1, x1); + vpslldq128(12, x1, x1); + vpsrldq128(8, x1, x1); + vpxor128(x1, x0, x0); + + vmovq128(*cmll_sub(21, ctx), x1); + vmovq128(*cmll_sub(22, ctx), x2); + vmovq128(*cmll_sub(23, ctx), x3); + vmovq128(*cmll_sub(24, ctx), x4); + + vpxor128(x9, x0, x0); + vpxor128(x10, x8, x8); + vpxor128(x1, x9, x9); + vpxor128(x2, x10, x10); + vpxor128(x3, x1, x1); + vpxor128(x4, x3, x3); + + vmovq128_memst(x0, cmll_sub(18, ctx)); + vmovq128_memst(x8, cmll_sub(19, ctx)); + vmovq128_memst(x9, cmll_sub(20, ctx)); + vmovq128_memst(x10, cmll_sub(21, ctx)); + vmovq128_memst(x1, cmll_sub(22, ctx)); + vmovq128_memst(x2, cmll_sub(23, ctx)); + vmovq128_memst(x3, cmll_sub(24, ctx)); + +#undef KL128 +#undef KA128 + + /* kw2 and kw4 are unused now. */ + load_zero(tmp0); + vmovq128_memst(tmp0, cmll_sub(1, ctx)); + vmovq128_memst(tmp0, cmll_sub(25, ctx)); +} + +static ASM_FUNC_ATTR_INLINE void +camellia_setup256(void *key_table, __m128i x0, __m128i x1) +{ + struct setup256_ctx_s + { + u64 *key_table; + } sctx = { .key_table = (u64 *)key_table }; + struct setup256_ctx_s *ctx = &sctx; + + /* input: + * ctx: subkey storage at key_table(CTX) + * x0, x1: key + */ + + __m128i x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + __m128i tmp0; + +#define KL128 x0 +#define KR128 x1 +#define KA128 x2 +#define KB128 x3 + + vpshufb128_amemld(&bswap128_mask, KL128, KL128); + vpshufb128_amemld(&bswap128_mask, KR128, KR128); + + vmovdqa128_memld(&inv_shift_row_and_unpcklbw, x11); + vmovq128(*&sbox4_input_mask, x12); + vmovdqa128_memld(&mask_0f, x13); + vmovdqa128_memld(&pre_tf_lo_s1, x14); + vmovdqa128_memld(&pre_tf_hi_s1, x15); + + /* + * Generate KA + */ + vpxor128(KL128, KR128, x3); + vpsrldq128(8, KR128, x6); + vpsrldq128(8, x3, x2); + vpslldq128(8, x3, x3); + vpsrldq128(8, x3, x3); + + camellia_f(x2, x4, x5, + x7, x8, x9, x10, + x11, x12, x13, x14, x15, sigma1); + vpxor128(x4, x3, x3); + camellia_f(x3, x2, x5, + x7, x8, x9, x10, + x11, x12, x13, x14, x15, sigma2); + vpxor128(x6, x2, x2); + camellia_f(x2, x3, x5, + x7, x8, x9, x10, + x11, x12, x13, x14, x15, sigma3); + vpxor128(x4, x3, x3); + vpxor128(KR128, x3, x3); + camellia_f(x3, x4, x5, + x7, x8, x9, x10, + x11, x12, x13, x14, x15, sigma4); + + vpslldq128(8, x3, x3); + vpxor128(x4, x2, x2); + vpsrldq128(8, x3, x3); + vpslldq128(8, x2, KA128); + vpor128(x3, KA128, KA128); + + /* + * Generate KB + */ + vpxor128(KA128, KR128, x3); + vpsrldq128(8, x3, x4); + vpslldq128(8, x3, x3); + vpsrldq128(8, x3, x3); + + camellia_f(x4, x5, x6, + x7, x8, x9, x10, + x11, x12, x13, x14, x15, sigma5); + vpxor128(x5, x3, x3); + + camellia_f(x3, x5, x6, + x7, x8, x9, x10, + x11, x12, x13, x14, x15, sigma6); + vpslldq128(8, x3, x3); + vpxor128(x5, x4, x4); + vpsrldq128(8, x3, x3); + vpslldq128(8, x4, x4); + vpor128(x3, x4, KB128); + + /* + * Generate subkeys + */ + vmovdqu128_memst(KB128, cmll_sub(32, ctx)); + vec_rol128(KR128, x4, 15, x15); + vec_rol128(KA128, x5, 15, x15); + vec_rol128(KR128, x6, 30, x15); + vec_rol128(KB128, x7, 30, x15); + vec_rol128(KL128, x8, 45, x15); + vec_rol128(KA128, x9, 45, x15); + vec_rol128(KL128, x10, 60, x15); + vec_rol128(KR128, x11, 60, x15); + vec_rol128(KB128, x12, 60, x15); + + /* absorb kw2 to other subkeys */ + vpslldq128(8, KL128, x15); + vpsrldq128(8, x15, x15); + vpxor128(x15, KB128, KB128); + vpxor128(x15, x4, x4); + 
vpxor128(x15, x5, x5); + + /* subl(1) ^= subr(1) & ~subr(9); */ + vpandn128(x15, x6, x13); + vpslldq128(12, x13, x13); + vpsrldq128(8, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x6, x14); + vpslld128(1, x14, x13); + vpsrld128(31, x14, x14); + vpaddb128(x13, x14, x14); + vpslldq128(8, x14, x14); + vpsrldq128(12, x14, x14); + vpxor128(x14, x15, x15); + + vpxor128(x15, x7, x7); + vpxor128(x15, x8, x8); + vpxor128(x15, x9, x9); + + vpshufd128_0x1b(KL128, KL128); + vpshufd128_0x1b(KB128, KB128); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x5, x5); + vpshufd128_0x1b(x6, x6); + vpshufd128_0x1b(x7, x7); + vpshufd128_0x1b(x8, x8); + vpshufd128_0x1b(x9, x9); + + vmovdqu128_memst(KL128, cmll_sub(0, ctx)); + vpshufd128_0x1b(KL128, KL128); + vmovdqu128_memst(KB128, cmll_sub(2, ctx)); + vmovdqu128_memst(x4, cmll_sub(4, ctx)); + vmovdqu128_memst(x5, cmll_sub(6, ctx)); + vmovdqu128_memst(x6, cmll_sub(8, ctx)); + vmovdqu128_memst(x7, cmll_sub(10, ctx)); + vmovdqu128_memst(x8, cmll_sub(12, ctx)); + vmovdqu128_memst(x9, cmll_sub(14, ctx)); + + vmovdqu128_memld(cmll_sub(32, ctx), KB128); + + /* subl(1) ^= subr(1) & ~subr(17); */ + vpandn128(x15, x10, x13); + vpslldq128(12, x13, x13); + vpsrldq128(8, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x10, x14); + vpslld128(1, x14, x13); + vpsrld128(31, x14, x14); + vpaddb128(x13, x14, x14); + vpslldq128(8, x14, x14); + vpsrldq128(12, x14, x14); + vpxor128(x14, x15, x15); + + vpxor128(x15, x11, x11); + vpxor128(x15, x12, x12); + + vec_ror128(KL128, x4, 128-77, x14); + vec_ror128(KA128, x5, 128-77, x14); + vec_ror128(KR128, x6, 128-94, x14); + vec_ror128(KA128, x7, 128-94, x14); + vec_ror128(KL128, x8, 128-111, x14); + vec_ror128(KB128, x9, 128-111, x14); + + vpxor128(x15, x4, x4); + + vpshufd128_0x1b(x10, x10); + vpshufd128_0x1b(x11, x11); + vpshufd128_0x1b(x12, x12); + vpshufd128_0x1b(x4, x4); + + vmovdqu128_memst(x10, cmll_sub(16, ctx)); + vmovdqu128_memst(x11, cmll_sub(18, ctx)); + vmovdqu128_memst(x12, cmll_sub(20, ctx)); + vmovdqu128_memst(x4, cmll_sub(22, ctx)); + + /* subl(1) ^= subr(1) & ~subr(25); */ + vpandn128(x15, x5, x13); + vpslldq128(12, x13, x13); + vpsrldq128(8, x13, x13); + vpxor128(x13, x15, x15); + /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x5, x14); + vpslld128(1, x14, x13); + vpsrld128(31, x14, x14); + vpaddb128(x13, x14, x14); + vpslldq128(8, x14, x14); + vpsrldq128(12, x14, x14); + vpxor128(x14, x15, x15); + + vpxor128(x15, x6, x6); + vpxor128(x15, x7, x7); + vpxor128(x15, x8, x8); + vpslldq128(8, x15, x15); + vpxor128(x15, x9, x9); + + /* absorb kw4 to other subkeys */ + vpslldq128(8, x9, x15); + vpxor128(x15, x8, x8); + vpxor128(x15, x7, x7); + vpxor128(x15, x6, x6); + + /* subl(33) ^= subr(33) & ~subr(24); */ + vpandn128(x15, x5, x14); + vpslldq128(4, x14, x14); + vpxor128(x14, x15, x15); + /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x5, x14); + vpslld128(1, x14, x13); + vpsrld128(31, x14, x14); + vpaddb128(x13, x14, x14); + vpsrldq128(12, x14, x14); + vpslldq128(8, x14, x14); + vpxor128(x14, x15, x15); + + vpshufd128_0x1b(x5, x5); + vpshufd128_0x1b(x6, x6); + vpshufd128_0x1b(x7, x7); + vpshufd128_0x1b(x8, x8); + vpshufd128_0x1b(x9, x9); + + vmovdqu128_memst(x5, cmll_sub(24, ctx)); + vmovdqu128_memst(x6, cmll_sub(26, ctx)); + vmovdqu128_memst(x7, cmll_sub(28, ctx)); + vmovdqu128_memst(x8, cmll_sub(30, ctx)); + vmovdqu128_memst(x9, 
cmll_sub(32, ctx)); + + vmovdqu128_memld(cmll_sub(22, ctx), tmp0); + vpshufd128_0x1b(tmp0, x0); + vmovdqu128_memld(cmll_sub(20, ctx), tmp0); + vpshufd128_0x1b(tmp0, x1); + vmovdqu128_memld(cmll_sub(18, ctx), tmp0); + vpshufd128_0x1b(tmp0, x2); + vmovdqu128_memld(cmll_sub(16, ctx), tmp0); + vpshufd128_0x1b(tmp0, x3); + vmovdqu128_memld(cmll_sub(14, ctx), tmp0); + vpshufd128_0x1b(tmp0, x4); + vmovdqu128_memld(cmll_sub(12, ctx), tmp0); + vpshufd128_0x1b(tmp0, x5); + vmovdqu128_memld(cmll_sub(10, ctx), tmp0); + vpshufd128_0x1b(tmp0, x6); + vmovdqu128_memld(cmll_sub(8, ctx), tmp0); + vpshufd128_0x1b(tmp0, x7); + + vpxor128(x15, x0, x0); + vpxor128(x15, x1, x1); + vpxor128(x15, x2, x2); + + /* subl(33) ^= subr(33) & ~subr(24); */ + vpandn128(x15, x3, x14); + vpslldq128(4, x14, x14); + vpxor128(x14, x15, x15); + /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x3, x14); + vpslld128(1, x14, x13); + vpsrld128(31, x14, x14); + vpaddb128(x13, x14, x14); + vpsrldq128(12, x14, x14); + vpslldq128(8, x14, x14); + vpxor128(x14, x15, x15); + + vpxor128(x15, x4, x4); + vpxor128(x15, x5, x5); + vpxor128(x15, x6, x6); + + vpshufd128_0x1b(x0, x0); + vpshufd128_0x1b(x1, x1); + vpshufd128_0x1b(x2, x2); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x5, x5); + vpshufd128_0x1b(x6, x6); + + vmovdqu128_memst(x0, cmll_sub(22, ctx)); + vmovdqu128_memst(x1, cmll_sub(20, ctx)); + vmovdqu128_memst(x2, cmll_sub(18, ctx)); + vmovdqu128_memst(x4, cmll_sub(14, ctx)); + vmovdqu128_memst(x5, cmll_sub(12, ctx)); + vmovdqu128_memst(x6, cmll_sub(10, ctx)); + + vmovdqu128_memld(cmll_sub(6, ctx), tmp0); + vpshufd128_0x1b(tmp0, x6); + vmovdqu128_memld(cmll_sub(4, ctx), tmp0); + vpshufd128_0x1b(tmp0, x4); + vmovdqu128_memld(cmll_sub(2, ctx), tmp0); + vpshufd128_0x1b(tmp0, x2); + vmovdqu128_memld(cmll_sub(0, ctx), tmp0); + vpshufd128_0x1b(tmp0, x0); + + /* subl(33) ^= subr(33) & ~subr(24); */ + vpandn128(x15, x7, x14); + vpslldq128(4, x14, x14); + vpxor128(x14, x15, x15); + /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */ + vpand128(x15, x7, x14); + vpslld128(1, x14, x13); + vpsrld128(31, x14, x14); + vpaddb128(x13, x14, x14); + vpsrldq128(12, x14, x14); + vpslldq128(8, x14, x14); + vpxor128(x14, x15, x15); + + vpxor128(x15, x6, x6); + vpxor128(x15, x4, x4); + vpxor128(x15, x2, x2); + vpxor128(x15, x0, x0); + + vpshufd128_0x1b(x6, x6); + vpshufd128_0x1b(x4, x4); + vpshufd128_0x1b(x2, x2); + vpshufd128_0x1b(x0, x0); + + vpsrldq128(8, x2, x3); + vpsrldq128(8, x4, x5); + vpsrldq128(8, x6, x7); + + /* + * key XOR is end of F-function. 
+ */ + vpxor128(x2, x0, x0); + vpxor128(x4, x2, x2); + + vmovq128_memst(x0, cmll_sub(0, ctx)); + vmovq128_memst(x3, cmll_sub(2, ctx)); + vpxor128(x5, x3, x3); + vpxor128(x6, x4, x4); + vpxor128(x7, x5, x5); + vmovq128_memst(x2, cmll_sub(3, ctx)); + vmovq128_memst(x3, cmll_sub(4, ctx)); + vmovq128_memst(x4, cmll_sub(5, ctx)); + vmovq128_memst(x5, cmll_sub(6, ctx)); + + vmovq128(*cmll_sub(7, ctx), x7); + vmovq128(*cmll_sub(8, ctx), x8); + vmovq128(*cmll_sub(9, ctx), x9); + vmovq128(*cmll_sub(10, ctx), x10); + /* tl = subl(10) ^ (subr(10) & ~subr(8)); */ + vpandn128(x10, x8, x15); + vpsrldq128(4, x15, x15); + vpxor128(x15, x10, x0); + /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */ + vpand128(x8, x0, x15); + vpslld128(1, x15, x14); + vpsrld128(31, x15, x15); + vpaddb128(x14, x15, x15); + vpslldq128(12, x15, x15); + vpsrldq128(8, x15, x15); + vpxor128(x15, x0, x0); + + vpxor128(x0, x6, x6); + vmovq128_memst(x6, cmll_sub(7, ctx)); + + vmovq128(*cmll_sub(11, ctx), x11); + vmovq128(*cmll_sub(12, ctx), x12); + vmovq128(*cmll_sub(13, ctx), x13); + vmovq128(*cmll_sub(14, ctx), x14); + vmovq128(*cmll_sub(15, ctx), x15); + /* tl = subl(7) ^ (subr(7) & ~subr(9)); */ + vpandn128(x7, x9, x1); + vpsrldq128(4, x1, x1); + vpxor128(x1, x7, x0); + /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */ + vpand128(x9, x0, x1); + vpslld128(1, x1, x2); + vpsrld128(31, x1, x1); + vpaddb128(x2, x1, x1); + vpslldq128(12, x1, x1); + vpsrldq128(8, x1, x1); + vpxor128(x1, x0, x0); + + vpxor128(x11, x0, x0); + vpxor128(x12, x10, x10); + vpxor128(x13, x11, x11); + vpxor128(x14, x12, x12); + vpxor128(x15, x13, x13); + vmovq128_memst(x0, cmll_sub(10, ctx)); + vmovq128_memst(x10, cmll_sub(11, ctx)); + vmovq128_memst(x11, cmll_sub(12, ctx)); + vmovq128_memst(x12, cmll_sub(13, ctx)); + vmovq128_memst(x13, cmll_sub(14, ctx)); + + vmovq128(*cmll_sub(16, ctx), x6); + vmovq128(*cmll_sub(17, ctx), x7); + vmovq128(*cmll_sub(18, ctx), x8); + vmovq128(*cmll_sub(19, ctx), x9); + vmovq128(*cmll_sub(20, ctx), x10); + /* tl = subl(18) ^ (subr(18) & ~subr(16)); */ + vpandn128(x8, x6, x1); + vpsrldq128(4, x1, x1); + vpxor128(x1, x8, x0); + /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */ + vpand128(x6, x0, x1); + vpslld128(1, x1, x2); + vpsrld128(31, x1, x1); + vpaddb128(x2, x1, x1); + vpslldq128(12, x1, x1); + vpsrldq128(8, x1, x1); + vpxor128(x1, x0, x0); + + vpxor128(x14, x0, x0); + vmovq128_memst(x0, cmll_sub(15, ctx)); + + /* tl = subl(15) ^ (subr(15) & ~subr(17)); */ + vpandn128(x15, x7, x1); + vpsrldq128(4, x1, x1); + vpxor128(x1, x15, x0); + /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */ + vpand128(x7, x0, x1); + vpslld128(1, x1, x2); + vpsrld128(31, x1, x1); + vpaddb128(x2, x1, x1); + vpslldq128(12, x1, x1); + vpsrldq128(8, x1, x1); + vpxor128(x1, x0, x0); + + vmovq128(*cmll_sub(21, ctx), x1); + vmovq128(*cmll_sub(22, ctx), x2); + vmovq128(*cmll_sub(23, ctx), x3); + vmovq128(*cmll_sub(24, ctx), x4); + + vpxor128(x9, x0, x0); + vpxor128(x10, x8, x8); + vpxor128(x1, x9, x9); + vpxor128(x2, x10, x10); + vpxor128(x3, x1, x1); + + vmovq128_memst(x0, cmll_sub(18, ctx)); + vmovq128_memst(x8, cmll_sub(19, ctx)); + vmovq128_memst(x9, cmll_sub(20, ctx)); + vmovq128_memst(x10, cmll_sub(21, ctx)); + vmovq128_memst(x1, cmll_sub(22, ctx)); + + vmovq128(*cmll_sub(25, ctx), x5); + vmovq128(*cmll_sub(26, ctx), x6); + vmovq128(*cmll_sub(27, ctx), x7); + vmovq128(*cmll_sub(28, ctx), x8); + vmovq128(*cmll_sub(29, ctx), x9); + vmovq128(*cmll_sub(30, ctx), x10); + vmovq128(*cmll_sub(31, ctx), x11); + 
vmovq128(*cmll_sub(32, ctx), x12); + + /* tl = subl(26) ^ (subr(26) & ~subr(24)); */ + vpandn128(x6, x4, x15); + vpsrldq128(4, x15, x15); + vpxor128(x15, x6, x0); + /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */ + vpand128(x4, x0, x15); + vpslld128(1, x15, x14); + vpsrld128(31, x15, x15); + vpaddb128(x14, x15, x15); + vpslldq128(12, x15, x15); + vpsrldq128(8, x15, x15); + vpxor128(x15, x0, x0); + + vpxor128(x0, x2, x2); + vmovq128_memst(x2, cmll_sub(23, ctx)); + + /* tl = subl(23) ^ (subr(23) & ~subr(25)); */ + vpandn128(x3, x5, x15); + vpsrldq128(4, x15, x15); + vpxor128(x15, x3, x0); + /* dw = tl & subl(26), tr = subr(24) ^ CAMELLIA_RL1(dw); */ + vpand128(x5, x0, x15); + vpslld128(1, x15, x14); + vpsrld128(31, x15, x15); + vpaddb128(x14, x15, x15); + vpslldq128(12, x15, x15); + vpsrldq128(8, x15, x15); + vpxor128(x15, x0, x0); + + vpxor128(x7, x0, x0); + vpxor128(x8, x6, x6); + vpxor128(x9, x7, x7); + vpxor128(x10, x8, x8); + vpxor128(x11, x9, x9); + vpxor128(x12, x11, x11); + + vmovq128_memst(x0, cmll_sub(26, ctx)); + vmovq128_memst(x6, cmll_sub(27, ctx)); + vmovq128_memst(x7, cmll_sub(28, ctx)); + vmovq128_memst(x8, cmll_sub(29, ctx)); + vmovq128_memst(x9, cmll_sub(30, ctx)); + vmovq128_memst(x10, cmll_sub(31, ctx)); + vmovq128_memst(x11, cmll_sub(32, ctx)); + +#undef KL128 +#undef KR128 +#undef KA128 +#undef KB128 + + /* kw2 and kw4 are unused now. */ + load_zero(tmp0); + vmovq128_memst(tmp0, cmll_sub(1, ctx)); + vmovq128_memst(tmp0, cmll_sub(33, ctx)); +} + +void ASM_FUNC_ATTR_NOINLINE +FUNC_KEY_SETUP(void *key_table, const void *vkey, unsigned int keylen) +{ + const char *key = vkey; + + /* input: + * key_table: subkey storage at key_table(CTX) + * key_length_bits: output key length as number of bits + * key: input key buffer + * keylen: key length in bytes + */ + + __m128i x0, x1, x2; + + switch (keylen) + { + default: + return; /* Unsupported key length! */ + + case 16: + vmovdqu128_memld(key, x0); + camellia_setup128(key_table, x0); + return; + + case 24: + vmovdqu128_memld(key, x0); + vmovq128(*(uint64_unaligned_t *)(key + 16), x1); + + x2[0] = -1; + x2[1] = -1; + vpxor128(x1, x2, x2); + vpslldq128(8, x2, x2); + vpor128(x2, x1, x1); + break; + + case 32: + vmovdqu128_memld(key, x0); + vmovdqu128_memld(key + 16, x1); + break; + } + + camellia_setup256(key_table, x0, x1); +} diff --git a/configure.ac b/configure.ac index b9ac99bb..a40a8135 100644 --- a/configure.ac +++ b/configure.ac @@ -2967,6 +2967,11 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo" ;; + powerpc64le-*-*) + # Build with the POWER vector implementations + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-ppc8le.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-ppc9le.lo" + ;; esac if test x"$avxsupport" = xyes ; then -- 2.37.2
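
For reference, the 128-bit rotate that the vec_rol128/vec_ror128 macros implement in the key-schedule code above can be sketched with plain SSE2 intrinsics as below. This is only an illustrative sketch and not part of the patch: the rol128_n/ROL128_N names and the test values are made up here, the patch's portability macros (vpshufd128_0x4e, vpsllq128, vpsrlq128, vpaddb128) are replaced by direct intrinsics, and the byte add used by the macro is replaced by an OR, which gives the same result because the two merged operands never have overlapping set bits.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

#define ROL128_N 15  /* rotate amount; must satisfy 0 < ROL128_N < 64 */

/* Rotate a 128-bit value left by ROL128_N bits: swap the 64-bit halves,
 * shift both copies, and merge the non-overlapping bit ranges. */
static __m128i rol128_n(__m128i in)
{
  __m128i swapped = _mm_shuffle_epi32(in, 0x4e);            /* swap 64-bit halves  */
  __m128i hi_bits = _mm_slli_epi64(in, ROL128_N);           /* lane << N           */
  __m128i lo_bits = _mm_srli_epi64(swapped, 64 - ROL128_N); /* other lane >> 64-N  */
  return _mm_or_si128(hi_bits, lo_bits);                    /* disjoint bit ranges */
}

int main(void)
{
  /* words[0] is the low 64 bits, words[1] the high 64 bits (little endian) */
  uint64_t words[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
  uint64_t out[2];

  _mm_storeu_si128((__m128i *)out,
                   rol128_n(_mm_loadu_si128((const __m128i *)words)));

  /* scalar reference for a 128-bit rotate-left by ROL128_N */
  uint64_t ref_lo = (words[0] << ROL128_N) | (words[1] >> (64 - ROL128_N));
  uint64_t ref_hi = (words[1] << ROL128_N) | (words[0] >> (64 - ROL128_N));

  printf("%s\n", (out[0] == ref_lo && out[1] == ref_hi) ? "match" : "MISMATCH");
  return 0;
}

The same structure with the shift directions exchanged gives the right rotate used for the 128-77, 128-94 and 128-111 offsets in the subkey generation above.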