[PATCH 3/6] Add RISC-V vector permute AES

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Jan 6 16:08:50 CET 2025


* cipher/Makefile.am: Add 'rijndael-vp-riscv.c' and
CFLAG handling for 'rijndael-vp-riscv.o' and 'rijndael-vp-riscv.lo'.
(ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS): New.
* cipher/rijndael-internal.h (USE_VP_RISCV): New.
* cipher/rijndael-vp-simd128.h [__ARM_NEON]: Move ARM NEON macros to ...
* cipher/rijndael-vp-aarch64.c: ... here.
* cipher/rijndael-vp-riscv.c: New.
* cipher/rijndael-vp-simd128.h: Use '__m128i_const' type for constant
vector values and use *_amemld() macros to load these values to vector
registers.
[__x86_64__] (vpaddd128, vpaddb128): Remove.
[__x86_64__] (psrl_byte_128, movdqa128_memld, pand128_amemld)
(paddq128_amemld, paddd128_amemld, pshufb128_amemld): New.
[HAVE_SIMD256] (aes_encrypt_core_4blks_simd256)
(aes_decrypt_core_4blks_simd256): New.
(FUNC_CTR_ENC, FUNC_CTR32LE_ENC, FUNC_CFB_DEC, FUNC_CBC_DEC)
(aes_simd128_ocb_enc, aes_simd128_ocb_dec, FUNC_OCB_AUTH)
(aes_simd128_ecb_enc, aes_simd128_ecb_dec, aes_simd128_xts_enc)
(aes_simd128_xts_dec) [HAVE_SIMD256]: Add 4 block parallel code paths
for HW with 256-bit wide vectors.
* cipher/rijndael.c [USE_VP_RISCV]
(_gcry_aes_vp_riscv_setup_acceleration, _gcry_aes_vp_riscv_do_setkey)
(_gcry_aes_vp_riscv_prepare_decryption, _gcry_aes_vp_riscv_encrypt)
(_gcry_aes_vp_riscv_decrypt, _gcry_aes_vp_riscv_cfb_enc)
(_gcry_aes_vp_riscv_cbc_enc, _gcry_aes_vp_riscv_ctr_enc)
(_gcry_aes_vp_riscv_ctr32le_enc, _gcry_aes_vp_riscv_cfb_dec)
(_gcry_aes_vp_riscv_cbc_dec, _gcry_aes_vp_riscv_ocb_crypt)
(_gcry_aes_vp_riscv_ocb_auth, _gcry_aes_vp_riscv_ecb_crypt)
(_gcry_aes_vp_riscv_xts_crypt): New.
(do_setkey) [USE_VP_RISCV]: Setup vector permute AES for RISC-V with
HWF_RISCV_IMAFDC and HWF_RISCV_V.
* cipher/simd-common-riscv.h: New.
* configure.ac: Add 'rijndael-vp-riscv.lo'.
(gcry_cv_cc_riscv_vector_intrinsics)
(gcry_cv_cc_riscv_vector_intrinsics_cflags): New.
--

Patch adds an AES vector permutation implementation for RISC-V with
fixed vector lengths of 128 and 256 bits.
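
The sketch below is illustrative only and not part of the patch; it shows
the fixed vector length pattern used here: the 128-bit path requires
__riscv_vsetvl_e8m1(16) == 16, the 4-block SIMD256 path requires
__riscv_vsetvl_e8m1(32) == 32, and x86-style byte shuffles (pshufb) are
mapped to vrgather.vv. The standalone program, its demo_pshufb128 helper
and the demo buffers are hypothetical; only the intrinsics match what the
patch uses:

  /* Illustrative only; assumes a compiler with RVV intrinsics
   * (riscv_vector.h) and an rv64gcv target. */
  #include <riscv_vector.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  /* pshufb-style 16-byte table lookup via vrgather.vv, as in the
   * pshufb128 macro of rijndael-vp-riscv.c. */
  static inline vuint8m1_t
  demo_pshufb128 (vuint8m1_t table, vuint8m1_t index)
  {
    return __riscv_vrgather_vv_u8m1 (table, index, 16);
  }

  int
  main (void)
  {
    /* Fixed vector lengths: the SIMD128 path needs VLEN >= 128 bits,
     * the 4-block SIMD256 path additionally needs VLEN >= 256 bits. */
    int have_simd128 = (__riscv_vsetvl_e8m1 (16) == 16);
    int have_simd256 = (__riscv_vsetvl_e8m1 (32) == 32);

    uint8_t table[16], index[16], out[16];
    unsigned int i;

    memset (out, 0, sizeof(out));
    for (i = 0; i < 16; i++)
      {
        table[i] = (uint8_t)(i * 16 + 1); /* arbitrary 16-byte table */
        index[i] = (uint8_t)(15 - i);     /* reverse the table */
      }

    if (have_simd128)
      {
        vuint8m1_t vtab = __riscv_vle8_v_u8m1 (table, 16);
        vuint8m1_t vidx = __riscv_vle8_v_u8m1 (index, 16);
        vuint8m1_t vout = demo_pshufb128 (vtab, vidx);
        __riscv_vse8_v_u8m1 (out, vout, 16);
      }

    printf ("simd128:%d simd256:%d out[0]:%u\n",
            have_simd128, have_simd256, out[0]);
    return 0;
  }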

Benchmark on SpacemiT K1 (1600 MHz):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     35.30 ns/B     27.02 MiB/s     56.48 c/B
        ECB dec |     35.51 ns/B     26.86 MiB/s     56.81 c/B
        CBC enc |     35.40 ns/B     26.94 MiB/s     56.63 c/B
        CBC dec |     36.30 ns/B     26.27 MiB/s     58.08 c/B
        CFB enc |     36.25 ns/B     26.31 MiB/s     58.00 c/B
        CFB dec |     36.25 ns/B     26.31 MiB/s     58.00 c/B
        OFB enc |     38.28 ns/B     24.91 MiB/s     61.25 c/B
        OFB dec |     38.28 ns/B     24.91 MiB/s     61.26 c/B
        CTR enc |     39.81 ns/B     23.96 MiB/s     63.69 c/B
        CTR dec |     39.81 ns/B     23.96 MiB/s     63.69 c/B
        XTS enc |     36.38 ns/B     26.22 MiB/s     58.20 c/B
        XTS dec |     36.26 ns/B     26.30 MiB/s     58.01 c/B
        OCB enc |     40.94 ns/B     23.29 MiB/s     65.50 c/B
        OCB dec |     40.71 ns/B     23.43 MiB/s     65.13 c/B
       OCB auth |     37.34 ns/B     25.54 MiB/s     59.75 c/B

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  speed vs old
        ECB enc |     16.76 ns/B     56.90 MiB/s     26.82 c/B     2.11x
        ECB dec |     19.94 ns/B     47.84 MiB/s     31.90 c/B     1.78x
        CBC enc |     31.72 ns/B     30.06 MiB/s     50.75 c/B     1.12x
        CBC dec |     20.24 ns/B     47.12 MiB/s     32.38 c/B     1.79x
        CFB enc |     31.80 ns/B     29.99 MiB/s     50.88 c/B     1.14x
        CFB dec |     16.87 ns/B     56.55 MiB/s     26.98 c/B     2.15x
        OFB enc |     38.68 ns/B     24.66 MiB/s     61.88 c/B     0.99x
        OFB dec |     38.65 ns/B     24.67 MiB/s     61.85 c/B     0.99x
        CTR enc |     16.86 ns/B     56.57 MiB/s     26.97 c/B     2.36x
        XTS enc |     17.49 ns/B     54.51 MiB/s     27.99 c/B     2.08x
        XTS dec |     20.80 ns/B     45.86 MiB/s     33.27 c/B     1.74x
        GCM enc |     31.16 ns/B     30.61 MiB/s     49.85 c/B     1.73x
        OCB enc |     17.25 ns/B     55.28 MiB/s     27.60 c/B     2.37x
        OCB dec |     20.64 ns/B     46.21 MiB/s     33.02 c/B     1.97x
       OCB auth |     17.11 ns/B     55.73 MiB/s     27.38 c/B     2.18x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am           |   16 +-
 cipher/rijndael-internal.h   |    8 +-
 cipher/rijndael-vp-aarch64.c |   60 +-
 cipher/rijndael-vp-riscv.c   |  285 ++++++++++
 cipher/rijndael-vp-simd128.h | 1044 +++++++++++++++++++++++++++-------
 cipher/rijndael.c            |   80 +++
 cipher/simd-common-riscv.h   |   48 ++
 configure.ac                 |  118 ++++
 8 files changed, 1463 insertions(+), 196 deletions(-)
 create mode 100644 cipher/rijndael-vp-riscv.c
 create mode 100644 cipher/simd-common-riscv.h

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 90415d83..88b2d17c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -119,7 +119,8 @@ EXTRA_libcipher_la_SOURCES = \
 	rijndael-p10le.c rijndael-gcm-p10le.s              \
 	rijndael-ppc-common.h rijndael-ppc-functions.h     \
 	rijndael-s390x.c                                   \
-	rijndael-vp-aarch64.c rijndael-vp-simd128.h        \
+	rijndael-vp-aarch64.c rijndael-vp-riscv.c          \
+	rijndael-vp-simd128.h                              \
 	rmd160.c \
 	rsa.c \
 	salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
@@ -349,3 +350,16 @@ serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile
 
 serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile
 	`echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
+
+if ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS
+# Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads)
+riscv_vector_cflags = -O2 -march=rv64imafdcv -mstrict-align
+else
+riscv_vector_cflags =
+endif
+
+rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile
+	`echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile
+	`echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 69ef86af..92310fc5 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -124,12 +124,18 @@
 # endif
 #endif /* ENABLE_ARM_CRYPTO_SUPPORT */
 
-/* USE_ARM_CE indicates whether to enable vector permute AArch64 SIMD code. */
+/* USE_VP_AARCH64 indicates whether to enable vector permute AArch64 SIMD code. */
 #undef USE_VP_AARCH64
 #if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS)
 # define USE_VP_AARCH64 1
 #endif
 
+/* USE_VP_RISCV indicates whether to enable vector permute RISC-V code. */
+#undef USE_VP_RISCV
+#if defined (__riscv) && defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS)
+# define USE_VP_RISCV 1
+#endif
+
 /* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
  * accelerated code.  USE_PPC_CRYPTO_WITH_PPC9LE indicates whether to
  * enable POWER9 optimized variant.  */
diff --git a/cipher/rijndael-vp-aarch64.c b/cipher/rijndael-vp-aarch64.c
index 0532c421..9c8b852b 100644
--- a/cipher/rijndael-vp-aarch64.c
+++ b/cipher/rijndael-vp-aarch64.c
@@ -1,5 +1,5 @@
-/* SSSE3 vector permutation AES for Libgcrypt
- * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+/* AArch64 SIMD vector permutation AES for Libgcrypt
+ * Copyright (C) 2014-2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -50,6 +50,62 @@
 #ifdef USE_VP_AARCH64
 
 
+/**********************************************************************
+  AT&T x86 asm to intrinsics conversion macros (ARM)
+ **********************************************************************/
+
+#include "simd-common-aarch64.h"
+#include <arm_neon.h>
+
+#define __m128i uint64x2_t
+
+#define pand128(a, o)           (o = vandq_u64(o, a))
+#define pandn128(a, o)          (o = vbicq_u64(a, o))
+#define pxor128(a, o)           (o = veorq_u64(o, a))
+#define paddq128(a, o)          (o = vaddq_u64(o, a))
+#define paddd128(a, o)          (o = (__m128i)vaddq_u32((uint32x4_t)o, (uint32x4_t)a))
+#define paddb128(a, o)          (o = (__m128i)vaddq_u8((uint8x16_t)o, (uint8x16_t)a))
+
+#define psrld128(s, o)          (o = (__m128i)vshrq_n_u32((uint32x4_t)o, s))
+#define psraq128(s, o)          (o = (__m128i)vshrq_n_s64((int64x2_t)o, s))
+#define psrldq128(s, o)         ({ uint64x2_t __tmp = { 0, 0 }; \
+				   o = (__m128i)vextq_u8((uint8x16_t)o, \
+				                         (uint8x16_t)__tmp, (s) & 15);})
+#define pslldq128(s, o)         ({ uint64x2_t __tmp = { 0, 0 }; \
+                                   o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
+                                                         (uint8x16_t)o, (16 - (s)) & 15);})
+#define psrl_byte_128(s, o)     (o = (__m128i)vshrq_n_u8((uint8x16_t)o, s))
+
+#define pshufb128(m8, o)        (o = (__m128i)vqtbl1q_u8((uint8x16_t)o, (uint8x16_t)m8))
+#define pshufd128(m32, a, o)    ({ static const __m128i __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
+				   __m128i __tmp2; \
+				   movdqa128(a, o); \
+				   movdqa128_memld(&__tmp1, __tmp2); \
+				   pshufb128(__tmp2, o); })
+#define pshufd128_0x93(a, o)    (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 12))
+#define pshufd128_0xFF(a, o)    (o = (__m128i)vdupq_laneq_u32((uint32x4_t)a, 3))
+#define pshufd128_0xFE(a, o)    pshufd128(0xFE, a, o)
+#define pshufd128_0x4E(a, o)    (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8))
+
+#define palignr128(s, a, o)     (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)o, s))
+
+#define movdqa128(a, o)         (o = a)
+
+#define movdqa128_memld(a, o)   (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
+
+#define pand128_amemld(m, o)    pand128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define pxor128_amemld(m, o)    pxor128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define paddq128_amemld(m, o)   paddq128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define paddd128_amemld(m, o)   paddd128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define pshufb128_amemld(m, o)  pshufb128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+
+/* Following operations may have unaligned memory input */
+#define movdqu128_memld(a, o)   (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
+
+/* Following operations may have unaligned memory output */
+#define movdqu128_memst(a, o)   vst1q_u8((uint8_t *)(o), (uint8x16_t)a)
+
+
 #ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
 # define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
 #else
diff --git a/cipher/rijndael-vp-riscv.c b/cipher/rijndael-vp-riscv.c
new file mode 100644
index 00000000..b8c6ed13
--- /dev/null
+++ b/cipher/rijndael-vp-riscv.c
@@ -0,0 +1,285 @@
+/* RISC-V vector permutation AES for Libgcrypt
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The code is based on the public domain library libvpaes version 0.5
+ * available at http://crypto.stanford.edu/vpaes/ and which carries
+ * this notice:
+ *
+ *     libvpaes: constant-time SSSE3 AES encryption and decryption.
+ *     version 0.5
+ *
+ *     By Mike Hamburg, Stanford University, 2009.  Public domain.
+ *     I wrote essentially all of this code.  I did not write the test
+ *     vectors; they are the NIST known answer tests.  I hereby release all
+ *     the code and documentation here that I wrote into the public domain.
+ *
+ *     This is an implementation of AES following my paper,
+ *       "Accelerating AES with Vector Permute Instructions"
+ *       CHES 2009; http://shiftleft.org/papers/vector_aes/
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h"  /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_VP_RISCV
+
+
+/**********************************************************************
+  AT&T x86 asm to intrinsics conversion macros (RISC-V)
+ **********************************************************************/
+
+#include <riscv_vector.h>
+#include "simd-common-riscv.h"
+
+/*
+ * SIMD128
+ */
+
+typedef vuint8m1_t __m128i;
+
+#define cast_m128i_to_s8(a)     (__riscv_vreinterpret_v_u8m1_i8m1(a))
+#define cast_m128i_to_u32(a)    (__riscv_vreinterpret_v_u8m1_u32m1(a))
+#define cast_m128i_to_u64(a)    (__riscv_vreinterpret_v_u8m1_u64m1(a))
+#define cast_m128i_to_s64(a)    (__riscv_vreinterpret_v_u64m1_i64m1(cast_m128i_to_u64(a)))
+
+#define cast_s8_to_m128i(a)     (__riscv_vreinterpret_v_i8m1_u8m1(a))
+#define cast_u32_to_m128i(a)    (__riscv_vreinterpret_v_u32m1_u8m1(a))
+#define cast_u64_to_m128i(a)    (__riscv_vreinterpret_v_u64m1_u8m1(a))
+#define cast_s64_to_m128i(a)    (cast_u64_to_m128i(__riscv_vreinterpret_v_i64m1_u64m1(a)))
+
+#define pand128(a, o)           (o = __riscv_vand_vv_u8m1((o), (a), 16))
+#define pandn128(a, o)          (o = __riscv_vand_vv_u8m1(__riscv_vnot_v_u8m1((o), 16), (a), 16))
+#define pxor128(a, o)           (o = __riscv_vxor_vv_u8m1((o), (a), 16))
+#define paddb128(a, o)          (o = __riscv_vadd_vv_u8m1((o), (a), 16))
+#define paddd128(a, o)          (o = cast_u32_to_m128i(__riscv_vadd_vv_u32m1( \
+							cast_m128i_to_u32(o), \
+							cast_m128i_to_u32(a), 4)))
+#define paddq128(a, o)          (o = cast_u64_to_m128i(__riscv_vadd_vv_u64m1( \
+							cast_m128i_to_u64(o), \
+							cast_m128i_to_u64(a), 2)))
+
+#define psrld128(s, o)          (o = cast_u32_to_m128i(__riscv_vsrl_vx_u32m1(cast_m128i_to_u32(o), (s), 4)))
+#define psraq128(s, o)          (o = cast_s64_to_m128i(__riscv_vsra_vx_i64m1(cast_m128i_to_s64(o), (s), 2)))
+#define psrldq128(s, o)         (o = __riscv_vslidedown_vx_u8m1((o), (s), 16))
+#define pslldq128(s, o)         ({ vuint8m1_t __tmp = __riscv_vmv_v_x_u8m1(0, 16); \
+				   o = __riscv_vslideup_vx_u8m1(__tmp, (o), (s), 16); })
+#define psrl_byte_128(s, o)     (o = __riscv_vsrl_vx_u8m1((o), (s), 16))
+
+#define pshufb128(m8, o)        (o = __riscv_vrgather_vv_u8m1((o), (m8), 16))
+#define pshufd128(m32, a, o)    ({ static const __m128i_const __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
+				   __m128i __tmp2; \
+				   movdqa128(a, o); \
+				   movdqa128_memld(&__tmp1, __tmp2); \
+				   pshufb128(__tmp2, o); })
+
+#define pshufd128_0x93(a, o)    pshufd128(0x93, a, o)
+#define pshufd128_0xFF(a, o)    (o = cast_u32_to_m128i(__riscv_vrgather_vx_u32m1(cast_m128i_to_u32(a), 3, 4)))
+#define pshufd128_0xFE(a, o)    pshufd128(0xFE, a, o)
+#define pshufd128_0x4E(a, o)    pshufd128(0x4E, a, o)
+
+#define palignr128(s, a, o)     (o = __riscv_vslideup_vx_u8m1(__riscv_vslidedown_vx_u8m1((a), (s), 16), (o), 16 - (s), 16))
+
+#define movdqa128(a, o)         (o = (a))
+
+#define movdqa128_memld(a, o)   (o = __riscv_vle8_v_u8m1((const void *)(a), 16))
+
+#define pand128_amemld(m, o)    pand128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define pxor128_amemld(m, o)    pxor128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define paddq128_amemld(m, o)   paddq128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define paddd128_amemld(m, o)   paddd128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define pshufb128_amemld(m, o)  pshufb128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+
+/* Following operations may have unaligned memory input */
+#define movdqu128_memld(a, o)   (o = __riscv_vle8_v_u8m1((const void *)(a), 16))
+
+/* Following operations may have unaligned memory output */
+#define movdqu128_memst(a, o)   (__riscv_vse8_v_u8m1((void *)(o), (a), 16))
+
+/*
+ * SIMD256
+ */
+
+#define PSHUFD256_MASK_TO_PSHUFB256_MASK(m32) { \
+	M128I_BYTE(((((m32) >> 0) & 0x03) * 4) + 0, \
+		   ((((m32) >> 0) & 0x03) * 4) + 1, \
+		   ((((m32) >> 0) & 0x03) * 4) + 2, \
+		   ((((m32) >> 0) & 0x03) * 4) + 3, \
+		   ((((m32) >> 2) & 0x03) * 4) + 0, \
+		   ((((m32) >> 2) & 0x03) * 4) + 1, \
+		   ((((m32) >> 2) & 0x03) * 4) + 2, \
+		   ((((m32) >> 2) & 0x03) * 4) + 3, \
+		   ((((m32) >> 4) & 0x03) * 4) + 0, \
+		   ((((m32) >> 4) & 0x03) * 4) + 1, \
+		   ((((m32) >> 4) & 0x03) * 4) + 2, \
+		   ((((m32) >> 4) & 0x03) * 4) + 3, \
+		   ((((m32) >> 6) & 0x03) * 4) + 0, \
+		   ((((m32) >> 6) & 0x03) * 4) + 1, \
+		   ((((m32) >> 6) & 0x03) * 4) + 2, \
+		   ((((m32) >> 6) & 0x03) * 4) + 3), \
+	M128I_BYTE(((((m32) >> 0) & 0x03) * 4) + 0 + 16, \
+		   ((((m32) >> 0) & 0x03) * 4) + 1 + 16, \
+		   ((((m32) >> 0) & 0x03) * 4) + 2 + 16, \
+		   ((((m32) >> 0) & 0x03) * 4) + 3 + 16, \
+		   ((((m32) >> 2) & 0x03) * 4) + 0 + 16, \
+		   ((((m32) >> 2) & 0x03) * 4) + 1 + 16, \
+		   ((((m32) >> 2) & 0x03) * 4) + 2 + 16, \
+		   ((((m32) >> 2) & 0x03) * 4) + 3 + 16, \
+		   ((((m32) >> 4) & 0x03) * 4) + 0 + 16, \
+		   ((((m32) >> 4) & 0x03) * 4) + 1 + 16, \
+		   ((((m32) >> 4) & 0x03) * 4) + 2 + 16, \
+		   ((((m32) >> 4) & 0x03) * 4) + 3 + 16, \
+		   ((((m32) >> 6) & 0x03) * 4) + 0 + 16, \
+		   ((((m32) >> 6) & 0x03) * 4) + 1 + 16, \
+		   ((((m32) >> 6) & 0x03) * 4) + 2 + 16, \
+		   ((((m32) >> 6) & 0x03) * 4) + 3 + 16) }
+
+typedef vuint8m1_t __m256i;
+
+#define HAVE_SIMD256 1
+
+#define check_simd256_support() (__riscv_vsetvl_e8m1(32) == 32)
+
+#define cast_m256i_to_s8(a)     cast_m128i_to_s8(a)
+#define cast_m256i_to_u32(a)    cast_m128i_to_u32(a)
+#define cast_m256i_to_u64(a)    cast_m128i_to_u64(a)
+#define cast_m256i_to_s64(a)    cast_m128i_to_s64(a)
+
+#define cast_s8_to_m256i(a)     (__riscv_vreinterpret_v_i8m1_u8m1(a))
+#define cast_u32_to_m256i(a)    (__riscv_vreinterpret_v_u32m1_u8m1(a))
+#define cast_u64_to_m256i(a)    (__riscv_vreinterpret_v_u64m1_u8m1(a))
+#define cast_s64_to_m256i(a)    (cast_u64_to_m128i(__riscv_vreinterpret_v_i64m1_u64m1(a)))
+
+#define pand256(a, o)           (o = __riscv_vand_vv_u8m1((o), (a), 32))
+#define pandn256(a, o)          (o = __riscv_vand_vv_u8m1(__riscv_vnot_v_u8m1((o), 32), (a), 32))
+#define pxor256(a, o)           (o = __riscv_vxor_vv_u8m1((o), (a), 32))
+#define paddb256(a, o)          (o = __riscv_vadd_vv_u8m1((o), (a), 32))
+#define paddd256(a, o)          (o = cast_u32_to_m256i(__riscv_vadd_vv_u32m1( \
+							cast_m256i_to_u32(o), \
+							cast_m256i_to_u32(a), 8)))
+#define paddq256(a, o)          (o = cast_u64_to_m256i(__riscv_vadd_vv_u64m1( \
+							cast_m256i_to_u64(o), \
+							cast_m256i_to_u64(a), 4)))
+
+#define psrld256(s, o)          (o = cast_u32_to_m256i(__riscv_vsrl_vx_u32m1(cast_m256i_to_u32(o), (s), 8)))
+#define psraq256(s, o)          (o = cast_s64_to_m256i(__riscv_vsra_vx_i64m1(cast_m256i_to_s64(o), (s), 4)))
+#define psrl_byte_256(s, o)     (o = __riscv_vsrl_vx_u8m1((o), (s), 32))
+
+/* Note: these are not PSHUFB equivalent as full 256-bit vector is used as
+ * 32 byte table. 256-bit PSHUFB on x86 handles 128-bit lanes separately as
+ * 128-bit 16 byte tables. */
+
+/* tab32 variant: indexes have values 0..31. Used when 'm8' is constant and
+ * variable data is in 'o'. */
+#define pshufb256_tab32(m8, o)  (o = __riscv_vrgather_vv_u8m1((o), (m8), 32))
+
+/* tab16 variant: indexes have values 0..16 and only low 128-bit of 'o' is
+ * used. Used when 'o' is constant and variable data is in 'm8'. */
+#define pshufb256_tab16(m8, o)  (o = __riscv_vrgather_vv_u8m1((o), (m8), 32))
+
+/* Load 16 byte mask for 'pshufb256_tab32' usage as if 256-bit PSHUFB was to be
+ * used as on x86 (two separate 128-bit lanes). */
+#define load_tab32_mask(m, o)   ({ __m128i __tmp_lo128; \
+				   __m128i __tmp_hi128; \
+				   movdqu128_memld(m, __tmp_lo128); \
+				   __tmp_hi128 = __riscv_vadd_vx_u8m1(__tmp_lo128, 16, 16); \
+				   o = __riscv_vslideup_vx_u8m1(__tmp_lo128, __tmp_hi128, 16, 32); })
+
+#define broadcast128_256(a, o)  (o = __riscv_vslideup_vx_u8m1((a), (a), 16, 32))
+
+/* Load 16 byte table for 'pshufb256_tab16' usage. On x86 this would splat
+ * 128-bit table from memory to both 128-bit lanes of 256-bit register.
+ * On RISC-V this just loads memory to lower 128-bits. */
+#define load_tab16_table(m, o)  movdqu128_memld(m, o)
+
+#define pshufd256(m32, a, o)    ({ static const __m128i_const __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
+				   __m256i __tmp2; \
+				   movdqa256(a, o); \
+				   load_tab32_mask(&__tmp1, __tmp2); \
+				   pshufb256_tab32(__tmp2, o); })
+
+#define pshufd256_0x93(a, o)    pshufd256(0x93, a, o)
+
+#define insert256_hi128(x, o)   (o = __riscv_vslideup_vx_u8m1((o), (x), 16, 32))
+#define extract256_hi128(y, o)  (o = __riscv_vslidedown_vx_u8m1((y), 16, 32))
+
+#define movdqa256(a, o)         (o = (a))
+
+#define movdqa128_256(a, o)     (o = (a))
+#define movdqa256_128(a, o)     (o = (a))
+
+#define movdqa256_memld(a, o)   (o = __riscv_vle8_v_u8m1((const void *)(a), 32))
+
+#define pand256_amemld(m, o)    pand128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define pxor256_amemld(m, o)    pxor128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define paddq256_amemld(m, o)   paddq128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define paddd256_amemld(m, o)   paddd128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define pshufb256_amemld(m, o)  pshufb128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define broadcast128_256_amemld(m, o) \
+				broadcast128_256(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+
+/* Following operations may have unaligned memory input */
+#define movdqu256_memld(a, o)   (o = __riscv_vle8_v_u8m1((const void *)(a), 32))
+
+/* Following operations may have unaligned memory output */
+#define movdqu256_memst(a, o)   (__riscv_vse8_v_u8m1((void *)(o), (a), 32))
+
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+
+#define FUNC_ENCRYPT _gcry_aes_vp_riscv_encrypt
+#define FUNC_DECRYPT _gcry_aes_vp_riscv_decrypt
+#define FUNC_CFB_ENC _gcry_aes_vp_riscv_cfb_enc
+#define FUNC_CFB_DEC _gcry_aes_vp_riscv_cfb_dec
+#define FUNC_CBC_ENC _gcry_aes_vp_riscv_cbc_enc
+#define FUNC_CBC_DEC _gcry_aes_vp_riscv_cbc_dec
+#define FUNC_CTR_ENC _gcry_aes_vp_riscv_ctr_enc
+#define FUNC_CTR32LE_ENC _gcry_aes_vp_riscv_ctr32le_enc
+#define FUNC_OCB_CRYPT _gcry_aes_vp_riscv_ocb_crypt
+#define FUNC_OCB_AUTH _gcry_aes_vp_riscv_ocb_auth
+#define FUNC_ECB_CRYPT _gcry_aes_vp_riscv_ecb_crypt
+#define FUNC_XTS_CRYPT _gcry_aes_vp_riscv_xts_crypt
+#define FUNC_SETKEY _gcry_aes_vp_riscv_do_setkey
+#define FUNC_PREPARE_DEC _gcry_aes_vp_riscv_prepare_decryption
+
+#include "rijndael-vp-simd128.h"
+
+int
+_gcry_aes_vp_riscv_setup_acceleration(RIJNDAEL_context *ctx)
+{
+  (void)ctx;
+  return (__riscv_vsetvl_e8m1(16) == 16);
+}
+
+#endif /* USE_VP_RISCV */
diff --git a/cipher/rijndael-vp-simd128.h b/cipher/rijndael-vp-simd128.h
index f6fc8d5e..af8ee291 100644
--- a/cipher/rijndael-vp-simd128.h
+++ b/cipher/rijndael-vp-simd128.h
@@ -1,5 +1,5 @@
 /* SIMD128 intrinsics implementation vector permutation AES for Libgcrypt
- * Copyright (C) 2024 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2024-2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -92,55 +92,7 @@
 
 #define M128I_U64(a0, a1) { a0, a1 }
 
-#ifdef __ARM_NEON
-
-/**********************************************************************
-  AT&T x86 asm to intrinsics conversion macros (ARM)
- **********************************************************************/
-
-#include "simd-common-aarch64.h"
-#include <arm_neon.h>
-
-#define __m128i uint64x2_t
-
-#define pand128(a, o)           (o = vandq_u64(o, a))
-#define pandn128(a, o)          (o = vbicq_u64(a, o))
-#define pxor128(a, o)           (o = veorq_u64(o, a))
-#define paddq128(a, o)          (o = vaddq_u64(o, a))
-#define paddd128(a, o)          (o = (__m128i)vaddq_u32((uint32x4_t)o, (uint32x4_t)a))
-#define paddb128(a, o)          (o = (__m128i)vaddq_u8((uint8x16_t)o, (uint8x16_t)a))
-
-#define psrld128(s, o)          (o = (__m128i)vshrq_n_u32((uint32x4_t)o, s))
-#define psraq128(s, o)          (o = (__m128i)vshrq_n_s64((int64x2_t)o, s))
-#define psrldq128(s, o)         ({ uint64x2_t __tmp = { 0, 0 }; \
-				   o = (__m128i)vextq_u8((uint8x16_t)o, \
-				                         (uint8x16_t)__tmp, (s) & 15);})
-#define pslldq128(s, o)         ({ uint64x2_t __tmp = { 0, 0 }; \
-                                   o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
-                                                         (uint8x16_t)o, (16 - (s)) & 15);})
-
-#define pshufb128(m8, o)        (o = (__m128i)vqtbl1q_u8((uint8x16_t)o, (uint8x16_t)m8))
-#define pshufd128(m32, a, o)    ({ static const __m128i __tmp = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
-				   movdqa128(a, o); \
-				   pshufb128(__tmp, o); })
-#define pshufd128_0x93(a, o)    (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 12))
-#define pshufd128_0xFF(a, o)    (o = (__m128i)vdupq_laneq_u32((uint32x4_t)a, 3))
-#define pshufd128_0xFE(a, o)    pshufd128(0xFE, a, o)
-#define pshufd128_0x4E(a, o)    (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8))
-
-#define palignr128(s, a, o)     (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)o, s))
-
-#define movdqa128(a, o)         (o = a)
-
-#define pxor128_amemld(m, o)    pxor128(*(const __m128i *)(m), o)
-
-/* Following operations may have unaligned memory input */
-#define movdqu128_memld(a, o)   (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
-
-/* Following operations may have unaligned memory output */
-#define movdqu128_memst(a, o)   vst1q_u8((uint8_t *)(o), (uint8x16_t)a)
-
-#endif /* __ARM_NEON */
+typedef u64 __m128i_const[2]  __attribute__ ((aligned (16)));
 
 #if defined(__x86_64__) || defined(__i386__)
 
@@ -154,13 +106,12 @@
 #define pandn128(a, o)          (o = _mm_andnot_si128(o, a))
 #define pxor128(a, o)           (o = _mm_xor_si128(o, a))
 #define paddq128(a, o)          (o = _mm_add_epi64(o, a))
-#define vpaddd128(a, o)         (o = _mm_add_epi32(o, a))
-#define vpaddb128(a, o)         (o = _mm_add_epi8(o, a))
 
 #define psrld128(s, o)          (o = _mm_srli_epi32(o, s))
 #define psraq128(s, o)          (o = _mm_srai_epi64(o, s))
 #define psrldq128(s, o)         (o = _mm_srli_si128(o, s))
 #define pslldq128(s, o)         (o = _mm_slli_si128(o, s))
+#define psrl_byte_128(s, o)     psrld128(s, o)
 
 #define pshufb128(m8, o)        (o = _mm_shuffle_epi8(o, m8))
 #define pshufd128(m32, a, o)    (o = _mm_shuffle_epi32(a, m32))
@@ -173,7 +124,13 @@
 
 #define movdqa128(a, o)         (o = a)
 
-#define pxor128_amemld(m, o)    pxor128(*(const __m128i *)(m), o)
+#define movdqa128_memld(a, o)   (o = (__m128i)_mm_load_si128((const void *)(a)))
+
+#define pand128_amemld(m, o)    pand128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define pxor128_amemld(m, o)    pxor128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define paddq128_amemld(m, o)   paddq128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define paddd128_amemld(m, o)   paddd128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define pshufb128_amemld(m, o)  pshufb128((__m128i)_mm_load_si128((const void *)(m)), o)
 
 /* Following operations may have unaligned memory input */
 #define movdqu128_memld(a, o)   (o = _mm_loadu_si128((const __m128i *)(a)))
@@ -225,73 +182,73 @@
   constant vectors
  **********************************************************************/
 
-static const __m128i k_s0F =
+static const __m128i_const k_s0F =
 	M128I_U64(
 		0x0F0F0F0F0F0F0F0F,
 		0x0F0F0F0F0F0F0F0F
 	);
 
-static const __m128i k_iptlo =
+static const __m128i_const k_iptlo =
 	M128I_U64(
 		0xC2B2E8985A2A7000,
 		0xCABAE09052227808
 	);
 
-static const __m128i k_ipthi =
+static const __m128i_const k_ipthi =
 	M128I_U64(
 		0x4C01307D317C4D00,
 		0xCD80B1FCB0FDCC81
 	);
 
-static const __m128i k_inv =
+static const __m128i_const k_inv =
 	M128I_U64(
 		0x0E05060F0D080180,
 		0x040703090A0B0C02
 	);
 
-static const __m128i k_inva =
+static const __m128i_const k_inva =
 	M128I_U64(
 		0x01040A060F0B0780,
 		0x030D0E0C02050809
 	);
 
-static const __m128i k_sb1u =
+static const __m128i_const k_sb1u =
 	M128I_U64(
 		0xB19BE18FCB503E00,
 		0xA5DF7A6E142AF544
 	);
 
-static const __m128i k_sb1t =
+static const __m128i_const k_sb1t =
 	M128I_U64(
 		0x3618D415FAE22300,
 		0x3BF7CCC10D2ED9EF
 	);
 
-static const __m128i k_sb2u =
+static const __m128i_const k_sb2u =
 	M128I_U64(
 		0xE27A93C60B712400,
 		0x5EB7E955BC982FCD
 	);
 
-static const __m128i k_sb2t =
+static const __m128i_const k_sb2t =
 	M128I_U64(
 		0x69EB88400AE12900,
 		0xC2A163C8AB82234A
 	);
 
-static const __m128i k_sbou =
+static const __m128i_const k_sbou =
 	M128I_U64(
 		0xD0D26D176FBDC700,
 		0x15AABF7AC502A878
 	);
 
-static const __m128i k_sbot =
+static const __m128i_const k_sbot =
 	M128I_U64(
 		0xCFE474A55FBB6A00,
 		0x8E1E90D1412B35FA
 	);
 
-static const __m128i k_mc_forward[4] =
+static const __m128i_const k_mc_forward[4] =
 {
 	M128I_U64(
 		0x0407060500030201,
@@ -311,7 +268,7 @@ static const __m128i k_mc_forward[4] =
 	)
 };
 
-static const __m128i k_mc_backward[4] =
+static const __m128i_const k_mc_backward[4] =
 {
 	M128I_U64(
 		0x0605040702010003,
@@ -331,7 +288,7 @@ static const __m128i k_mc_backward[4] =
 	)
 };
 
-static const __m128i k_sr[4] =
+static const __m128i_const k_sr[4] =
 {
 	M128I_U64(
 		0x0706050403020100,
@@ -351,19 +308,19 @@ static const __m128i k_sr[4] =
 	)
 };
 
-static const __m128i k_rcon =
+static const __m128i_const k_rcon =
 	M128I_U64(
 		0x1F8391B9AF9DEEB6,
 		0x702A98084D7C7D81
 	);
 
-static const __m128i k_s63 =
+static const __m128i_const k_s63 =
 	M128I_U64(
 		0x5B5B5B5B5B5B5B5B,
 		0x5B5B5B5B5B5B5B5B
 	);
 
-static const __m128i k_opt[2] =
+static const __m128i_const k_opt[2] =
 {
 	M128I_U64(
 		0xFF9F4929D6B66000,
@@ -375,7 +332,7 @@ static const __m128i k_opt[2] =
 	)
 };
 
-static const __m128i k_deskew[2] =
+static const __m128i_const k_deskew[2] =
 {
 	M128I_U64(
 		0x07E4A34047A4E300,
@@ -387,7 +344,7 @@ static const __m128i k_deskew[2] =
 	)
 };
 
-static const __m128i k_dks_1[2] =
+static const __m128i_const k_dks_1[2] =
 {
 	M128I_U64(
 		0xB6116FC87ED9A700,
@@ -399,7 +356,7 @@ static const __m128i k_dks_1[2] =
 	)
 };
 
-static const __m128i k_dks_2[2] =
+static const __m128i_const k_dks_2[2] =
 {
 	M128I_U64(
 		0x27438FEBCCA86400,
@@ -411,7 +368,7 @@ static const __m128i k_dks_2[2] =
 	)
 };
 
-static const __m128i k_dks_3[2] =
+static const __m128i_const k_dks_3[2] =
 {
 	M128I_U64(
 		0x03C4C50201C6C700,
@@ -423,7 +380,7 @@ static const __m128i k_dks_3[2] =
 	)
 };
 
-static const __m128i k_dks_4[2] =
+static const __m128i_const k_dks_4[2] =
 {
 	M128I_U64(
 		0xE3C390B053732000,
@@ -435,7 +392,7 @@ static const __m128i k_dks_4[2] =
 	)
 };
 
-static const __m128i k_dipt[2] =
+static const __m128i_const k_dipt[2] =
 {
 	M128I_U64(
 		0x0F505B040B545F00,
@@ -447,7 +404,7 @@ static const __m128i k_dipt[2] =
 	)
 };
 
-static const __m128i k_dsb9[2] =
+static const __m128i_const k_dsb9[2] =
 {
 	M128I_U64(
 		0x851C03539A86D600,
@@ -459,7 +416,7 @@ static const __m128i k_dsb9[2] =
 	)
 };
 
-static const __m128i k_dsbd[2] =
+static const __m128i_const k_dsbd[2] =
 {
 	M128I_U64(
 		0x7D57CCDFE6B1A200,
@@ -471,7 +428,7 @@ static const __m128i k_dsbd[2] =
 	)
 };
 
-static const __m128i k_dsbb[2] =
+static const __m128i_const k_dsbb[2] =
 {
 	M128I_U64(
 		0xD022649296B44200,
@@ -483,7 +440,7 @@ static const __m128i k_dsbb[2] =
 	)
 };
 
-static const __m128i k_dsbe[2] =
+static const __m128i_const k_dsbe[2] =
 {
 	M128I_U64(
 		0x46F2929626D4D000,
@@ -495,7 +452,7 @@ static const __m128i k_dsbe[2] =
 	)
 };
 
-static const __m128i k_dsbo[2] =
+static const __m128i_const k_dsbo[2] =
 {
 	M128I_U64(
 		0x1387EA537EF94000,
@@ -551,8 +508,8 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
   if (!low_round_only)
     {
       /* extract rcon from xmm8 */
-      static const __m128i zero = { 0 };
-      xmm1 = zero;
+      static const __m128i_const zero = { 0 };
+      movdqa128_memld(&zero, xmm1);
       palignr128(15, xmm8, xmm1);
       palignr128(15, xmm8, xmm8);
       pxor128(xmm1, xmm7);
@@ -569,12 +526,12 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
   movdqa128(xmm7, xmm1);
   pslldq128(8, xmm7);
   pxor128(xmm1, xmm7);
-  pxor128(k_s63, xmm7);
+  pxor128_amemld(&k_s63, xmm7);
 
   /* subbytes */
   movdqa128(xmm9, xmm1);
   pandn128(xmm0, xmm1);
-  psrld128(4, xmm1);            /* 1 = i */
+  psrl_byte_128(4, xmm1);       /* 1 = i */
   pand128(xmm9, xmm0);          /* 0 = k */
   movdqa128(xmm11, xmm2);       /* 2 : a/k */
   pshufb128(xmm0, xmm2);        /* 2 = a/k */
@@ -591,9 +548,9 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
   movdqa128(xmm10, xmm3);       /* 3 : 1/jak */
   pshufb128(xmm4, xmm3);        /* 3 = 1/jak */
   pxor128(xmm1, xmm3);          /* 3 = jo */
-  movdqa128(k_sb1u, xmm4);      /* 4 : sbou */
+  movdqa128_memld(&k_sb1u, xmm4);      /* 4 : sbou */
   pshufb128(xmm2, xmm4);        /* 4 = sbou */
-  movdqa128(k_sb1t, xmm0);      /* 0 : sbot */
+  movdqa128_memld(&k_sb1t, xmm0);      /* 0 : sbot */
   pshufb128(xmm3, xmm0);        /* 0 = sb1t */
   pxor128(xmm4, xmm0);          /* 0 = sbox output */
 
@@ -608,7 +565,8 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
 
 static ASM_FUNC_ATTR_INLINE __m128i
 aes_schedule_transform(__m128i xmm0, const __m128i xmm9,
-		       const __m128i tablelo, const __m128i tablehi)
+		       const __m128i_const *tablelo,
+		       const __m128i_const *tablehi)
 {
   /* aes_schedule_transform
    *
@@ -622,11 +580,11 @@ aes_schedule_transform(__m128i xmm0, const __m128i xmm9,
 
   movdqa128(xmm9, xmm1);
   pandn128(xmm0, xmm1);
-  psrld128(4, xmm1);
+  psrl_byte_128(4, xmm1);
   pand128(xmm9, xmm0);
-  movdqa128(tablelo, xmm2);
+  movdqa128_memld(tablelo, xmm2);
   pshufb128(xmm0, xmm2);
-  movdqa128(tablehi, xmm0);
+  movdqa128_memld(tablehi, xmm0);
   pshufb128(xmm1, xmm0);
   pxor128(xmm2, xmm0);
 
@@ -662,12 +620,12 @@ aes_schedule_mangle(__m128i xmm0, struct vp_aes_config_s *pconfig, int decrypt,
   unsigned int rotoffs = *protoffs;
 
   movdqa128(xmm0, xmm4);
-  movdqa128(k_mc_forward[0], xmm5);
+  movdqa128_memld(&k_mc_forward[0], xmm5);
 
   if (!decrypt)
     {
       keysched += 16;
-      pxor128(k_s63, xmm4);
+      pxor128_amemld(&k_s63, xmm4);
       pshufb128(xmm5, xmm4);
       movdqa128(xmm4, xmm3);
       pshufb128(xmm5, xmm4);
@@ -678,29 +636,29 @@ aes_schedule_mangle(__m128i xmm0, struct vp_aes_config_s *pconfig, int decrypt,
   else
     {
       /* first table: *9 */
-      xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_1[0], k_dks_1[1]);
+      xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_1[0], &k_dks_1[1]);
       movdqa128(xmm0, xmm3);
       pshufb128(xmm5, xmm3);
 
       /* next table:  *B */
-      xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_2[0], k_dks_2[1]);
+      xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_2[0], &k_dks_2[1]);
       pxor128(xmm0, xmm3);
       pshufb128(xmm5, xmm3);
 
       /* next table:  *D */
-      xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_3[0], k_dks_3[1]);
+      xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_3[0], &k_dks_3[1]);
       pxor128(xmm0, xmm3);
       pshufb128(xmm5, xmm3);
 
       /* next table:  *E */
-      xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_4[0], k_dks_4[1]);
+      xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_4[0], &k_dks_4[1]);
       pxor128(xmm0, xmm3);
       pshufb128(xmm5, xmm3);
 
       keysched -= 16;
     }
 
-  pshufb128(k_sr[rotoffs], xmm3);
+  pshufb128_amemld(&k_sr[rotoffs], xmm3);
   rotoffs -= 16 / 16;
   rotoffs &= 48 / 16;
   movdqu128_memst(xmm3, keysched);
@@ -725,16 +683,16 @@ aes_schedule_mangle_last(__m128i xmm0, struct vp_aes_config_s config,
 
   if (!decrypt)
     {
-      pshufb128(k_sr[rotoffs], xmm0); /* output permute */
+      pshufb128_amemld(&k_sr[rotoffs], xmm0); /* output permute */
       config.keysched += 16;
-      pxor128(k_s63, xmm0);
-      xmm0 = aes_schedule_transform(xmm0, xmm9, k_opt[0], k_opt[1]);
+      pxor128_amemld(&k_s63, xmm0);
+      xmm0 = aes_schedule_transform(xmm0, xmm9, &k_opt[0], &k_opt[1]);
     }
   else
     {
       config.keysched -= 16;
-      pxor128(k_s63, xmm0);
-      xmm0 = aes_schedule_transform(xmm0, xmm9, k_deskew[0], k_deskew[1]);
+      pxor128_amemld(&k_s63, xmm0);
+      xmm0 = aes_schedule_transform(xmm0, xmm9, &k_deskew[0], &k_deskew[1]);
     }
 
   movdqu128_memst(xmm0, config.keysched); /* save last key */
@@ -825,7 +783,7 @@ aes_schedule_192(const byte *key, struct vp_aes_config_s config, int decrypt,
   int r = 4;
 
   movdqu128_memld(key + 8, xmm0); /* load key part 2 (very unaligned) */
-  xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); /* input transform */
+  xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); /* input transform */
   movdqa128(xmm0, xmm6);
   psrldq128(8, xmm6);
   pslldq128(8, xmm6); /* clobber low side with zeros */
@@ -867,7 +825,7 @@ aes_schedule_256(const byte *key, struct vp_aes_config_s config, int decrypt,
   int r = 7;
 
   movdqu128_memld(key + 16, xmm0); /* load key part 2 (unaligned) */
-  xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); /* input transform */
+  xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); /* input transform */
 
   while (1)
     {
@@ -900,16 +858,16 @@ aes_schedule_core(const byte *key, struct vp_aes_config_s config,
   unsigned int keybits = (config.nround - 10) * 32 + 128;
   __m128i xmm0, xmm3, xmm7, xmm8, xmm9, xmm10, xmm11;
 
-  movdqa128(k_s0F, xmm9);
-  movdqa128(k_inv, xmm10);
-  movdqa128(k_inva, xmm11);
-  movdqa128(k_rcon, xmm8);
+  movdqa128_memld(&k_s0F, xmm9);
+  movdqa128_memld(&k_inv, xmm10);
+  movdqa128_memld(&k_inva, xmm11);
+  movdqa128_memld(&k_rcon, xmm8);
 
   movdqu128_memld(key, xmm0);
 
   /* input transform */
   movdqa128(xmm0, xmm3);
-  xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi);
+  xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi);
   movdqa128(xmm0, xmm7);
 
   if (!decrypt)
@@ -920,7 +878,7 @@ aes_schedule_core(const byte *key, struct vp_aes_config_s config,
   else
     {
       /* decrypting, output zeroth round key after shiftrows */
-      pshufb128(k_sr[rotoffs], xmm3);
+      pshufb128_amemld(&k_sr[rotoffs], xmm3);
       movdqu128_memst(xmm3, config.keysched);
       rotoffs ^= 48 / 16;
     }
@@ -998,23 +956,23 @@ FUNC_PREPARE_DEC (RIJNDAEL_context *ctx)
 }
 
 #define enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15) \
-	movdqa128(k_s0F, xmm9); \
-	movdqa128(k_inv, xmm10); \
-	movdqa128(k_inva, xmm11); \
-	movdqa128(k_sb1u, xmm13); \
-	movdqa128(k_sb1t, xmm12); \
-	movdqa128(k_sb2u, xmm15); \
-	movdqa128(k_sb2t, xmm14);
+	movdqa128_memld(&k_s0F, xmm9); \
+	movdqa128_memld(&k_inv, xmm10); \
+	movdqa128_memld(&k_inva, xmm11); \
+	movdqa128_memld(&k_sb1u, xmm13); \
+	movdqa128_memld(&k_sb1t, xmm12); \
+	movdqa128_memld(&k_sb2u, xmm15); \
+	movdqa128_memld(&k_sb2t, xmm14);
 
 #define dec_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm8) \
-	movdqa128(k_s0F, xmm9); \
-	movdqa128(k_inv, xmm10); \
-	movdqa128(k_inva, xmm11); \
-	movdqa128(k_dsb9[0], xmm13); \
-	movdqa128(k_dsb9[1], xmm12); \
-	movdqa128(k_dsbd[0], xmm15); \
-	movdqa128(k_dsbb[0], xmm14); \
-	movdqa128(k_dsbe[0], xmm8);
+	movdqa128_memld(&k_s0F, xmm9); \
+	movdqa128_memld(&k_inv, xmm10); \
+	movdqa128_memld(&k_inva, xmm11); \
+	movdqa128_memld(&k_dsb9[0], xmm13); \
+	movdqa128_memld(&k_dsb9[1], xmm12); \
+	movdqa128_memld(&k_dsbd[0], xmm15); \
+	movdqa128_memld(&k_dsbb[0], xmm14); \
+	movdqa128_memld(&k_dsbe[0], xmm8);
 
 static ASM_FUNC_ATTR_INLINE __m128i
 aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
@@ -1025,13 +983,13 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
   const byte *end_keys = config.sched_keys + 16 * config.nround;
   unsigned int mc_pos = 1;
 
-  movdqa128(k_iptlo, xmm2);
+  movdqa128_memld(&k_iptlo, xmm2);
   movdqa128(xmm9, xmm1);
   pandn128(xmm0, xmm1);
-  psrld128(4, xmm1);
+  psrl_byte_128(4, xmm1);
   pand128(xmm9, xmm0);
   pshufb128(xmm0, xmm2);
-  movdqa128(k_ipthi, xmm0);
+  movdqa128_memld(&k_ipthi, xmm0);
 
   pshufb128(xmm1, xmm0);
   pxor128_amemld(config.sched_keys, xmm2);
@@ -1044,7 +1002,7 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
       /* top of round */
       movdqa128(xmm9, xmm1);                  /* 1 : i */
       pandn128(xmm0, xmm1);                   /* 1 = i<<4 */
-      psrld128(4, xmm1);                      /* 1 = i */
+      psrl_byte_128(4, xmm1);                 /* 1 = i */
       pand128(xmm9, xmm0);                    /* 0 = k */
       movdqa128(xmm11, xmm2);                 /* 2 : a/k */
       pshufb128(xmm0, xmm2);                  /* 2 = a/k */
@@ -1074,14 +1032,14 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
       pxor128(xmm4, xmm0);                    /* 0 = A */
       movdqa128(xmm15, xmm4);                 /* 4 : sb2u */
       pshufb128(xmm2, xmm4);                  /* 4 = sb2u */
-      movdqa128(k_mc_forward[mc_pos], xmm1);
+      movdqa128_memld(&k_mc_forward[mc_pos], xmm1);
       movdqa128(xmm14, xmm2);                 /* 2 : sb2t */
       pshufb128(xmm3, xmm2);                  /* 2 = sb2t */
       pxor128(xmm4, xmm2);                    /* 2 = 2A */
       movdqa128(xmm0, xmm3);                  /* 3 = A */
       pshufb128(xmm1, xmm0);                  /* 0 = B */
       pxor128(xmm2, xmm0);                    /* 0 = 2A+B */
-      pshufb128(k_mc_backward[mc_pos], xmm3); /* 3 = D */
+      pshufb128_amemld(&k_mc_backward[mc_pos], xmm3); /* 3 = D */
       pxor128(xmm0, xmm3);                    /* 3 = 2A+B+D */
       pshufb128(xmm1, xmm0);                  /* 0 = 2B+C */
       pxor128(xmm3, xmm0);                    /* 0 = 2A+3B+C+D */
@@ -1091,13 +1049,13 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
     }
 
   /* middle of last round */
-  movdqa128(k_sbou, xmm4);          /* 3 : sbou */
+  movdqa128_memld(&k_sbou, xmm4);   /* 3 : sbou */
   pshufb128(xmm2, xmm4);            /* 4 = sbou */
   pxor128_amemld(config.sched_keys, xmm4); /* 4 = sb1u + k */
-  movdqa128(k_sbot, xmm0);          /* 0 : sbot */
+  movdqa128_memld(&k_sbot, xmm0);   /* 0 : sbot */
   pshufb128(xmm3, xmm0);            /* 0 = sb1t */
   pxor128(xmm4, xmm0);              /* 0 = A */
-  pshufb128(k_sr[mc_pos], xmm0);
+  pshufb128_amemld(&k_sr[mc_pos], xmm0);
 
   return xmm0;
 }
@@ -1112,20 +1070,20 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
   __m128i xmm0_a, xmm0_b;
   __m128i xmm1_a, xmm2_a, xmm3_a, xmm4_a;
   __m128i xmm1_b, xmm2_b, xmm3_b, xmm4_b;
-  __m128i xmm5;
+  __m128i xmm5, xmm6;
   const byte *end_keys = config.sched_keys + 16 * config.nround;
   unsigned int mc_pos = 1;
 
   xmm0_a = *pxmm0_a;
   xmm0_b = *pxmm0_b;
 
-  movdqa128(k_iptlo, xmm2_a);	movdqa128(k_iptlo, xmm2_b);
+  movdqa128_memld(&k_iptlo, xmm2_a); movdqa128(xmm2_a, xmm2_b);
   movdqa128(xmm9, xmm1_a);	movdqa128(xmm9, xmm1_b);
   pandn128(xmm0_a, xmm1_a);	pandn128(xmm0_b, xmm1_b);
-  psrld128(4, xmm1_a);		psrld128(4, xmm1_b);
+  psrl_byte_128(4, xmm1_a);	psrl_byte_128(4, xmm1_b);
   pand128(xmm9, xmm0_a);	pand128(xmm9, xmm0_b);
   pshufb128(xmm0_a, xmm2_a);	pshufb128(xmm0_b, xmm2_b);
-  movdqa128(k_ipthi, xmm0_a);	movdqa128(k_ipthi, xmm0_b);
+  movdqa128_memld(&k_ipthi, xmm0_a); movdqa128(xmm0_a, xmm0_b);
 
   pshufb128(xmm1_a, xmm0_a);	pshufb128(xmm1_b, xmm0_b);
   movdqu128_memld(config.sched_keys, xmm5);
@@ -1139,7 +1097,7 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
       /* top of round */
       movdqa128(xmm9, xmm1_a);		movdqa128(xmm9, xmm1_b);
       pandn128(xmm0_a, xmm1_a);		pandn128(xmm0_b, xmm1_b);
-      psrld128(4, xmm1_a);		psrld128(4, xmm1_b);
+      psrl_byte_128(4, xmm1_a);		psrl_byte_128(4, xmm1_b);
       pand128(xmm9, xmm0_a);		pand128(xmm9, xmm0_b);
       movdqa128(xmm11, xmm2_a);		movdqa128(xmm11, xmm2_b);
       pshufb128(xmm0_a, xmm2_a);	pshufb128(xmm0_b, xmm2_b);
@@ -1170,18 +1128,17 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
       pxor128(xmm4_a, xmm0_a);		pxor128(xmm4_b, xmm0_b);
       movdqa128(xmm15, xmm4_a);		movdqa128(xmm15, xmm4_b);
       pshufb128(xmm2_a, xmm4_a);	pshufb128(xmm2_b, xmm4_b);
-      movdqa128(k_mc_forward[mc_pos], xmm1_a);
-					movdqa128(k_mc_forward[mc_pos], xmm1_b);
+      movdqa128_memld(&k_mc_forward[mc_pos], xmm6);
       movdqa128(xmm14, xmm2_a);		movdqa128(xmm14, xmm2_b);
       pshufb128(xmm3_a, xmm2_a);	pshufb128(xmm3_b, xmm2_b);
       pxor128(xmm4_a, xmm2_a);		pxor128(xmm4_b, xmm2_b);
       movdqa128(xmm0_a, xmm3_a);	movdqa128(xmm0_b, xmm3_b);
-      pshufb128(xmm1_a, xmm0_a);	pshufb128(xmm1_b, xmm0_b);
+      pshufb128(xmm6, xmm0_a);		pshufb128(xmm6, xmm0_b);
       pxor128(xmm2_a, xmm0_a);		pxor128(xmm2_b, xmm0_b);
-      pshufb128(k_mc_backward[mc_pos], xmm3_a);
-					pshufb128(k_mc_backward[mc_pos], xmm3_b);
+      movdqa128_memld(&k_mc_backward[mc_pos], xmm5);
+      pshufb128(xmm5, xmm3_a);		pshufb128(xmm5, xmm3_b);
       pxor128(xmm0_a, xmm3_a);		pxor128(xmm0_b, xmm3_b);
-      pshufb128(xmm1_a, xmm0_a);	pshufb128(xmm1_b, xmm0_b);
+      pshufb128(xmm6, xmm0_a);		pshufb128(xmm6, xmm0_b);
       pxor128(xmm3_a, xmm0_a);		pxor128(xmm3_b, xmm0_b);
 
       config.sched_keys += 16;
@@ -1189,20 +1146,133 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
     }
 
   /* middle of last round */
-  movdqa128(k_sbou, xmm4_a);	movdqa128(k_sbou, xmm4_b);
+  movdqa128_memld(&k_sbou, xmm4_a); movdqa128_memld(&k_sbou, xmm4_b);
   pshufb128(xmm2_a, xmm4_a);	pshufb128(xmm2_b, xmm4_b);
   movdqu128_memld(config.sched_keys, xmm5);
   pxor128(xmm5, xmm4_a);	pxor128(xmm5, xmm4_b);
-  movdqa128(k_sbot, xmm0_a);	movdqa128(k_sbot, xmm0_b);
+  movdqa128_memld(&k_sbot, xmm0_a); movdqa128_memld(&k_sbot, xmm0_b);
   pshufb128(xmm3_a, xmm0_a);	pshufb128(xmm3_b, xmm0_b);
   pxor128(xmm4_a, xmm0_a);	pxor128(xmm4_b, xmm0_b);
-  pshufb128(k_sr[mc_pos], xmm0_a);
-				pshufb128(k_sr[mc_pos], xmm0_b);
+  movdqa128_memld(&k_sr[mc_pos], xmm5);
+  pshufb128(xmm5, xmm0_a);	pshufb128(xmm5, xmm0_b);
 
   *pxmm0_a = xmm0_a;
   *pxmm0_b = xmm0_b;
 }
 
+#ifdef HAVE_SIMD256
+
+static ASM_FUNC_ATTR_INLINE void
+aes_encrypt_core_4blks_simd256(__m256i *pymm0_a, __m256i *pymm0_b,
+			       struct vp_aes_config_s config,
+			       __m128i xmm9, __m128i xmm10, __m128i xmm11,
+			       __m128i xmm12, __m128i xmm13, __m128i xmm14,
+			       __m128i xmm15)
+{
+  __m256i ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+  __m256i ymm0_a, ymm0_b;
+  __m256i ymm1_a, ymm2_a, ymm3_a, ymm4_a;
+  __m256i ymm1_b, ymm2_b, ymm3_b, ymm4_b;
+  __m256i ymm5, ymm6;
+  const byte *end_keys = config.sched_keys + 16 * config.nround;
+  unsigned int mc_pos = 1;
+
+  broadcast128_256(xmm9, ymm9);
+  movdqa128_256(xmm10, ymm10);
+  movdqa128_256(xmm11, ymm11);
+  movdqa128_256(xmm12, ymm12);
+  movdqa128_256(xmm13, ymm13);
+  movdqa128_256(xmm14, ymm14);
+  movdqa128_256(xmm15, ymm15);
+
+  ymm0_a = *pymm0_a;
+  ymm0_b = *pymm0_b;
+
+  load_tab16_table(&k_iptlo, ymm2_a); 	movdqa256(ymm2_a, ymm2_b);
+  movdqa256(ymm9, ymm1_a);		movdqa256(ymm9, ymm1_b);
+  pandn256(ymm0_a, ymm1_a);		pandn256(ymm0_b, ymm1_b);
+  psrl_byte_256(4, ymm1_a);		psrl_byte_256(4, ymm1_b);
+  pand256(ymm9, ymm0_a);		pand256(ymm9, ymm0_b);
+  pshufb256_tab16(ymm0_a, ymm2_a);	pshufb256_tab16(ymm0_b, ymm2_b);
+  load_tab16_table(&k_ipthi, ymm0_a); 	movdqa256(ymm0_a, ymm0_b);
+
+  pshufb256_tab16(ymm1_a, ymm0_a);	pshufb256_tab16(ymm1_b, ymm0_b);
+  broadcast128_256_amemld(config.sched_keys, ymm5);
+  pxor256(ymm5, ymm2_a);		pxor256(ymm5, ymm2_b);
+  pxor256(ymm2_a, ymm0_a);		pxor256(ymm2_b, ymm0_b);
+
+  config.sched_keys += 16;
+
+  while (1)
+    {
+      /* top of round */
+      movdqa256(ymm9, ymm1_a);		movdqa256(ymm9, ymm1_b);
+      pandn256(ymm0_a, ymm1_a);		pandn256(ymm0_b, ymm1_b);
+      psrl_byte_256(4, ymm1_a);		psrl_byte_256(4, ymm1_b);
+      pand256(ymm9, ymm0_a);		pand256(ymm9, ymm0_b);
+      movdqa256(ymm11, ymm2_a);		movdqa256(ymm11, ymm2_b);
+      pshufb256_tab16(ymm0_a, ymm2_a);	pshufb256_tab16(ymm0_b, ymm2_b);
+      pxor256(ymm1_a, ymm0_a);		pxor256(ymm1_b, ymm0_b);
+      movdqa256(ymm10, ymm3_a);		movdqa256(ymm10, ymm3_b);
+      pshufb256_tab16(ymm1_a, ymm3_a);	pshufb256_tab16(ymm1_b, ymm3_b);
+      pxor256(ymm2_a, ymm3_a);		pxor256(ymm2_b, ymm3_b);
+      movdqa256(ymm10, ymm4_a);		movdqa256(ymm10, ymm4_b);
+      pshufb256_tab16(ymm0_a,  ymm4_a);	pshufb256_tab16(ymm0_b,  ymm4_b);
+      pxor256(ymm2_a, ymm4_a);		pxor256(ymm2_b, ymm4_b);
+      movdqa256(ymm10, ymm2_a);		movdqa256(ymm10, ymm2_b);
+      pshufb256_tab16(ymm3_a, ymm2_a);	pshufb256_tab16(ymm3_b, ymm2_b);
+      pxor256(ymm0_a, ymm2_a);		pxor256(ymm0_b, ymm2_b);
+      movdqa256(ymm10, ymm3_a);		movdqa256(ymm10, ymm3_b);
+      pshufb256_tab16(ymm4_a, ymm3_a);	pshufb256_tab16(ymm4_b, ymm3_b);
+      pxor256(ymm1_a, ymm3_a);		pxor256(ymm1_b, ymm3_b);
+
+      if (config.sched_keys == end_keys)
+	break;
+
+      /* middle of middle round */
+      movdqa256(ymm13, ymm4_a);		movdqa256(ymm13, ymm4_b);
+      pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+      broadcast128_256_amemld(config.sched_keys, ymm5);
+      pxor256(ymm5, ymm4_a);		pxor256(ymm5, ymm4_b);
+      movdqa256(ymm12, ymm0_a);		movdqa256(ymm12, ymm0_b);
+      pshufb256_tab16(ymm3_a, ymm0_a);	pshufb256_tab16(ymm3_b, ymm0_b);
+      pxor256(ymm4_a, ymm0_a);		pxor256(ymm4_b, ymm0_b);
+      movdqa256(ymm15, ymm4_a);		movdqa256(ymm15, ymm4_b);
+      pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+      load_tab32_mask(&k_mc_forward[mc_pos], ymm6);
+      movdqa256(ymm14, ymm2_a);		movdqa256(ymm14, ymm2_b);
+      pshufb256_tab16(ymm3_a, ymm2_a);	pshufb256_tab16(ymm3_b, ymm2_b);
+      pxor256(ymm4_a, ymm2_a);		pxor256(ymm4_b, ymm2_b);
+      movdqa256(ymm0_a, ymm3_a);	movdqa256(ymm0_b, ymm3_b);
+      pshufb256_tab32(ymm6, ymm0_a);	pshufb256_tab32(ymm6, ymm0_b);
+      pxor256(ymm2_a, ymm0_a);		pxor256(ymm2_b, ymm0_b);
+      load_tab32_mask(&k_mc_backward[mc_pos], ymm5);
+      pshufb256_tab32(ymm5, ymm3_a);	pshufb256_tab32(ymm5, ymm3_b);
+      pxor256(ymm0_a, ymm3_a);		pxor256(ymm0_b, ymm3_b);
+      pshufb256_tab32(ymm6, ymm0_a);	pshufb256_tab32(ymm6, ymm0_b);
+      pxor256(ymm3_a, ymm0_a);		pxor256(ymm3_b, ymm0_b);
+
+      config.sched_keys += 16;
+      mc_pos = (mc_pos + 1) % 4; /* next mc mod 4 */
+    }
+
+  /* middle of last round */
+  movdqa256_memld(&k_sbou, ymm4_a); 	movdqa256_memld(&k_sbou, ymm4_b);
+  pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+  broadcast128_256_amemld(config.sched_keys, ymm5);
+  pxor256(ymm5, ymm4_a);		pxor256(ymm5, ymm4_b);
+  movdqa256_memld(&k_sbot, ymm0_a); 	movdqa256_memld(&k_sbot, ymm0_b);
+  pshufb256_tab16(ymm3_a, ymm0_a);	pshufb256_tab16(ymm3_b, ymm0_b);
+  pxor256(ymm4_a, ymm0_a);		pxor256(ymm4_b, ymm0_b);
+  load_tab32_mask(&k_sr[mc_pos], ymm5);
+  pshufb256_tab32(ymm5, ymm0_a);	pshufb256_tab32(ymm5, ymm0_b);
+
+  *pymm0_a = ymm0_a;
+  *pymm0_b = ymm0_b;
+}
+
+#endif /* HAVE_SIMD256 */
+
 static ASM_FUNC_ATTR_INLINE __m128i
 aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
 		 __m128i xmm9, __m128i xmm10, __m128i xmm11, __m128i xmm12,
@@ -1212,17 +1282,17 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
   const byte *end_keys = config.sched_keys + 16 * config.nround;
   unsigned int mc_pos = config.nround % 4;
 
-  movdqa128(k_dipt[0], xmm2);
+  movdqa128_memld(&k_dipt[0], xmm2);
   movdqa128(xmm9, xmm1);
   pandn128(xmm0, xmm1);
-  psrld128(4, xmm1);
+  psrl_byte_128(4, xmm1);
   pand128(xmm9, xmm0);
   pshufb128(xmm0, xmm2);
-  movdqa128(k_dipt[1], xmm0);
+  movdqa128_memld(&k_dipt[1], xmm0);
   pshufb128(xmm1, xmm0);
   pxor128_amemld(config.sched_keys, xmm2);
   pxor128(xmm2, xmm0);
-  movdqa128(k_mc_forward[3], xmm5);
+  movdqa128_memld(&k_mc_forward[3], xmm5);
 
   config.sched_keys += 16;
 
@@ -1231,7 +1301,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
       /* top of round */
       movdqa128(xmm9, xmm1);                  /* 1 : i */
       pandn128(xmm0, xmm1);                   /* 1 = i<<4 */
-      psrld128(4, xmm1);                      /* 1 = i */
+      psrl_byte_128(4, xmm1);                 /* 1 = i */
       pand128(xmm9, xmm0);                    /* 0 = k */
       movdqa128(xmm11, xmm2);                 /* 2 : a/k */
       pshufb128(xmm0, xmm2);                  /* 2 = a/k */
@@ -1258,7 +1328,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
       pxor128_amemld(config.sched_keys, xmm4);
       movdqa128(xmm12, xmm0);                 /* 0 : sb9t */
       pshufb128(xmm3, xmm0);                  /* 0 = sb9t */
-      movdqa128(k_dsbd[1], xmm1);             /* 1 : sbdt */
+      movdqa128_memld(&k_dsbd[1], xmm1);      /* 1 : sbdt */
       pxor128(xmm4, xmm0);                    /* 0 = ch */
 
       pshufb128(xmm5, xmm0);                  /* MC ch */
@@ -1272,7 +1342,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
       movdqa128(xmm14, xmm4);                 /* 4 : sbbu */
       pshufb128(xmm2, xmm4);                  /* 4 = sbbu */
       pxor128(xmm1, xmm4);                    /* 4 = ch */
-      movdqa128(k_dsbb[1], xmm0);             /* 0 : sbbt */
+      movdqa128_memld(&k_dsbb[1], xmm0);      /* 0 : sbbt */
       pshufb128(xmm3, xmm0);                  /* 0 = sbbt */
       pxor128(xmm4, xmm0);                    /* 0 = ch */
 
@@ -1281,7 +1351,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
       pshufb128(xmm2, xmm4);                  /* 4 = sbeu */
       pshufd128_0x93(xmm5, xmm5);
       pxor128(xmm0, xmm4);                    /* 4 = ch */
-      movdqa128(k_dsbe[1], xmm0);             /* 0 : sbet */
+      movdqa128_memld(&k_dsbe[1], xmm0);      /* 0 : sbet */
       pshufb128(xmm3, xmm0);                  /* 0 = sbet */
       pxor128(xmm4, xmm0);                    /* 0 = ch */
 
@@ -1289,13 +1359,13 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
     }
 
   /* middle of last round */
-  movdqa128(k_dsbo[0], xmm4);       /* 3 : sbou */
+  movdqa128_memld(&k_dsbo[0], xmm4);/* 3 : sbou */
   pshufb128(xmm2, xmm4);            /* 4 = sbou */
   pxor128_amemld(config.sched_keys, xmm4); /* 4 = sb1u + k */
-  movdqa128(k_dsbo[1], xmm0);       /* 0 : sbot */
+  movdqa128_memld(&k_dsbo[1], xmm0);/* 0 : sbot */
   pshufb128(xmm3, xmm0);            /* 0 = sb1t */
   pxor128(xmm4, xmm0);              /* 0 = A */
-  pshufb128(k_sr[mc_pos], xmm0);
+  pshufb128_amemld(&k_sr[mc_pos], xmm0);
 
   return xmm0;
 }
@@ -1317,18 +1387,18 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
   xmm0_a = *pxmm0_a;
   xmm0_b = *pxmm0_b;
 
-  movdqa128(k_dipt[0], xmm2_a);	movdqa128(k_dipt[0], xmm2_b);
+  movdqa128_memld(&k_dipt[0], xmm2_a); movdqa128(xmm2_a, xmm2_b);
   movdqa128(xmm9, xmm1_a);	movdqa128(xmm9, xmm1_b);
   pandn128(xmm0_a, xmm1_a);	pandn128(xmm0_b, xmm1_b);
-  psrld128(4, xmm1_a);		psrld128(4, xmm1_b);
+  psrl_byte_128(4, xmm1_a);	psrl_byte_128(4, xmm1_b);
   pand128(xmm9, xmm0_a);	pand128(xmm9, xmm0_b);
   pshufb128(xmm0_a, xmm2_a);	pshufb128(xmm0_b, xmm2_b);
-  movdqa128(k_dipt[1], xmm0_a);	movdqa128(k_dipt[1], xmm0_b);
+  movdqa128_memld(&k_dipt[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
   pshufb128(xmm1_a, xmm0_a);	pshufb128(xmm1_b, xmm0_b);
   movdqu128_memld(config.sched_keys, xmm6);
   pxor128(xmm6, xmm2_a);	pxor128(xmm6, xmm2_b);
   pxor128(xmm2_a, xmm0_a);	pxor128(xmm2_b, xmm0_b);
-  movdqa128(k_mc_forward[3], xmm5);
+  movdqa128_memld(&k_mc_forward[3], xmm5);
 
   config.sched_keys += 16;
 
@@ -1337,7 +1407,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
       /* top of round */
       movdqa128(xmm9, xmm1_a);		movdqa128(xmm9, xmm1_b);
       pandn128(xmm0_a, xmm1_a);		pandn128(xmm0_b, xmm1_b);
-      psrld128(4, xmm1_a);		psrld128(4, xmm1_b);
+      psrl_byte_128(4, xmm1_a);		psrl_byte_128(4, xmm1_b);
       pand128(xmm9, xmm0_a);		pand128(xmm9, xmm0_b);
       movdqa128(xmm11, xmm2_a);		movdqa128(xmm11, xmm2_b);
       pshufb128(xmm0_a, xmm2_a);	pshufb128(xmm0_b, xmm2_b);
@@ -1365,7 +1435,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
       pxor128(xmm6, xmm4_a);		pxor128(xmm6, xmm4_b);
       movdqa128(xmm12, xmm0_a);		movdqa128(xmm12, xmm0_b);
       pshufb128(xmm3_a, xmm0_a);	pshufb128(xmm3_b, xmm0_b);
-      movdqa128(k_dsbd[1], xmm1_a);	movdqa128(k_dsbd[1], xmm1_b);
+      movdqa128_memld(&k_dsbd[1], xmm1_a); movdqa128(xmm1_a, xmm1_b);
       pxor128(xmm4_a, xmm0_a);		pxor128(xmm4_b, xmm0_b);
 
       pshufb128(xmm5, xmm0_a);		pshufb128(xmm5, xmm0_b);
@@ -1379,7 +1449,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
       movdqa128(xmm14, xmm4_a);		movdqa128(xmm14, xmm4_b);
       pshufb128(xmm2_a, xmm4_a);	pshufb128(xmm2_b, xmm4_b);
       pxor128(xmm1_a, xmm4_a);		pxor128(xmm1_b, xmm4_b);
-      movdqa128(k_dsbb[1], xmm0_a);	movdqa128(k_dsbb[1], xmm0_b);
+      movdqa128_memld(&k_dsbb[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
       pshufb128(xmm3_a, xmm0_a);	pshufb128(xmm3_b, xmm0_b);
       pxor128(xmm4_a, xmm0_a);		pxor128(xmm4_b, xmm0_b);
 
@@ -1388,7 +1458,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
       pshufb128(xmm2_a, xmm4_a);	pshufb128(xmm2_b, xmm4_b);
       pshufd128_0x93(xmm5, xmm5);
       pxor128(xmm0_a, xmm4_a);		pxor128(xmm0_b, xmm4_b);
-      movdqa128(k_dsbe[1], xmm0_a);	movdqa128(k_dsbe[1], xmm0_b);
+      movdqa128_memld(&k_dsbe[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
       pshufb128(xmm3_a, xmm0_a);	pshufb128(xmm3_b, xmm0_b);
       pxor128(xmm4_a, xmm0_a);		pxor128(xmm4_b, xmm0_b);
 
@@ -1396,20 +1466,144 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
     }
 
   /* middle of last round */
-  movdqa128(k_dsbo[0], xmm4_a);	movdqa128(k_dsbo[0], xmm4_b);
+  movdqa128_memld(&k_dsbo[0], xmm4_a); movdqa128(xmm4_a, xmm4_b);
   pshufb128(xmm2_a, xmm4_a);	pshufb128(xmm2_b, xmm4_b);
   movdqu128_memld(config.sched_keys, xmm6);
   pxor128(xmm6, xmm4_a);	pxor128(xmm6, xmm4_b);
-  movdqa128(k_dsbo[1], xmm0_a);	movdqa128(k_dsbo[1], xmm0_b);
+  movdqa128_memld(&k_dsbo[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
   pshufb128(xmm3_a, xmm0_a);	pshufb128(xmm3_b, xmm0_b);
   pxor128(xmm4_a, xmm0_a);	pxor128(xmm4_b, xmm0_b);
-  pshufb128(k_sr[mc_pos], xmm0_a);
-				pshufb128(k_sr[mc_pos], xmm0_b);
+  movdqa128_memld(&k_sr[mc_pos], xmm5);
+  pshufb128(xmm5, xmm0_a);	pshufb128(xmm5, xmm0_b);
 
   *pxmm0_a = xmm0_a;
   *pxmm0_b = xmm0_b;
 }
 
+#ifdef HAVE_SIMD256
+
+static ASM_FUNC_ATTR_INLINE void
+aes_decrypt_core_4blks_simd256(__m256i *pymm0_a, __m256i *pymm0_b,
+			       struct vp_aes_config_s config,
+			       __m128i xmm9, __m128i xmm10, __m128i xmm11,
+			       __m128i xmm12, __m128i xmm13, __m128i xmm14,
+			       __m128i xmm15, __m128i xmm8)
+{
+  __m256i ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm8;
+  __m256i ymm0_a, ymm0_b;
+  __m256i ymm1_a, ymm2_a, ymm3_a, ymm4_a;
+  __m256i ymm1_b, ymm2_b, ymm3_b, ymm4_b;
+  __m256i ymm5, ymm6;
+  const byte *end_keys = config.sched_keys + 16 * config.nround;
+  unsigned int mc_pos = config.nround % 4;
+
+  broadcast128_256(xmm9, ymm9);
+  movdqa128_256(xmm10, ymm10);
+  movdqa128_256(xmm11, ymm11);
+  movdqa128_256(xmm12, ymm12);
+  movdqa128_256(xmm13, ymm13);
+  movdqa128_256(xmm14, ymm14);
+  movdqa128_256(xmm15, ymm15);
+  movdqa128_256(xmm8, ymm8);
+
+  ymm0_a = *pymm0_a;
+  ymm0_b = *pymm0_b;
+
+  load_tab16_table(&k_dipt[0], ymm2_a); movdqa256(ymm2_a, ymm2_b);
+  movdqa256(ymm9, ymm1_a);		movdqa256(ymm9, ymm1_b);
+  pandn256(ymm0_a, ymm1_a);		pandn256(ymm0_b, ymm1_b);
+  psrl_byte_256(4, ymm1_a);		psrl_byte_256(4, ymm1_b);
+  pand256(ymm9, ymm0_a);		pand256(ymm9, ymm0_b);
+  pshufb256_tab16(ymm0_a, ymm2_a);	pshufb256_tab16(ymm0_b, ymm2_b);
+  load_tab16_table(&k_dipt[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+  pshufb256_tab16(ymm1_a, ymm0_a);	pshufb256_tab16(ymm1_b, ymm0_b);
+  broadcast128_256_amemld(config.sched_keys, ymm6);
+  pxor256(ymm6, ymm2_a);		pxor256(ymm6, ymm2_b);
+  pxor256(ymm2_a, ymm0_a);		pxor256(ymm2_b, ymm0_b);
+  load_tab32_mask(&k_mc_forward[3], ymm5);
+
+  config.sched_keys += 16;
+
+  while (1)
+    {
+      /* top of round */
+      movdqa256(ymm9, ymm1_a);		movdqa256(ymm9, ymm1_b);
+      pandn256(ymm0_a, ymm1_a);		pandn256(ymm0_b, ymm1_b);
+      psrl_byte_256(4, ymm1_a);		psrl_byte_256(4, ymm1_b);
+      pand256(ymm9, ymm0_a);		pand256(ymm9, ymm0_b);
+      movdqa256(ymm11, ymm2_a);		movdqa256(ymm11, ymm2_b);
+      pshufb256_tab16(ymm0_a, ymm2_a);	pshufb256_tab16(ymm0_b, ymm2_b);
+      pxor256(ymm1_a, ymm0_a);		pxor256(ymm1_b, ymm0_b);
+      movdqa256(ymm10, ymm3_a);		movdqa256(ymm10, ymm3_b);
+      pshufb256_tab16(ymm1_a, ymm3_a);	pshufb256_tab16(ymm1_b, ymm3_b);
+      pxor256(ymm2_a, ymm3_a);		pxor256(ymm2_b, ymm3_b);
+      movdqa256(ymm10, ymm4_a);		movdqa256(ymm10, ymm4_b);
+      pshufb256_tab16(ymm0_a, ymm4_a);	pshufb256_tab16(ymm0_b, ymm4_b);
+      pxor256(ymm2_a, ymm4_a);		pxor256(ymm2_b, ymm4_b);
+      movdqa256(ymm10, ymm2_a);		movdqa256(ymm10, ymm2_b);
+      pshufb256_tab16(ymm3_a, ymm2_a);	pshufb256_tab16(ymm3_b, ymm2_b);
+      pxor256(ymm0_a, ymm2_a);		pxor256(ymm0_b, ymm2_b);
+      movdqa256(ymm10, ymm3_a);		movdqa256(ymm10, ymm3_b);
+      pshufb256_tab16(ymm4_a, ymm3_a);	pshufb256_tab16(ymm4_b, ymm3_b);
+      pxor256(ymm1_a, ymm3_a);		pxor256(ymm1_b, ymm3_b);
+
+      if (config.sched_keys == end_keys)
+	break;
+
+      /* Inverse mix columns */
+      movdqa256(ymm13, ymm4_a);		movdqa256(ymm13, ymm4_b);
+      pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+      broadcast128_256_amemld(config.sched_keys, ymm6);
+      pxor256(ymm6, ymm4_a);		pxor256(ymm6, ymm4_b);
+      movdqa256(ymm12, ymm0_a);		movdqa256(ymm12, ymm0_b);
+      pshufb256_tab16(ymm3_a, ymm0_a);	pshufb256_tab16(ymm3_b, ymm0_b);
+      load_tab16_table(&k_dsbd[1], ymm1_a); movdqa256(ymm1_a, ymm1_b);
+      pxor256(ymm4_a, ymm0_a);		pxor256(ymm4_b, ymm0_b);
+
+      pshufb256_tab32(ymm5, ymm0_a);	pshufb256_tab32(ymm5, ymm0_b);
+      movdqa256(ymm15, ymm4_a);		movdqa256(ymm15, ymm4_b);
+      pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+      pxor256(ymm0_a, ymm4_a);		pxor256(ymm0_b, ymm4_b);
+      pshufb256_tab16(ymm3_a, ymm1_a);	pshufb256_tab16(ymm3_b, ymm1_b);
+      pxor256(ymm4_a, ymm1_a);		pxor256(ymm4_b, ymm1_b);
+
+      pshufb256_tab32(ymm5, ymm1_a);	pshufb256_tab32(ymm5, ymm1_b);
+      movdqa256(ymm14, ymm4_a);		movdqa256(ymm14, ymm4_b);
+      pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+      pxor256(ymm1_a, ymm4_a);		pxor256(ymm1_b, ymm4_b);
+      load_tab16_table(&k_dsbb[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+      pshufb256_tab16(ymm3_a, ymm0_a);	pshufb256_tab16(ymm3_b, ymm0_b);
+      pxor256(ymm4_a, ymm0_a);		pxor256(ymm4_b, ymm0_b);
+
+      pshufb256_tab32(ymm5, ymm0_a);	pshufb256_tab32(ymm5, ymm0_b);
+      movdqa256(ymm8, ymm4_a);		movdqa256(ymm8, ymm4_b);
+      pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+      pshufd256_0x93(ymm5, ymm5);
+      pxor256(ymm0_a, ymm4_a);		pxor256(ymm0_b, ymm4_b);
+      load_tab16_table(&k_dsbe[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+      pshufb256_tab16(ymm3_a, ymm0_a);	pshufb256_tab16(ymm3_b, ymm0_b);
+      pxor256(ymm4_a, ymm0_a);		pxor256(ymm4_b, ymm0_b);
+
+      config.sched_keys += 16;
+    }
+
+  /* middle of last round */
+  load_tab16_table(&k_dsbo[0], ymm4_a); movdqa256(ymm4_a, ymm4_b);
+  pshufb256_tab16(ymm2_a, ymm4_a);	pshufb256_tab16(ymm2_b, ymm4_b);
+  broadcast128_256_amemld(config.sched_keys, ymm6);
+  pxor256(ymm6, ymm4_a);		pxor256(ymm6, ymm4_b);
+  load_tab16_table(&k_dsbo[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+  pshufb256_tab16(ymm3_a, ymm0_a);	pshufb256_tab16(ymm3_b, ymm0_b);
+  pxor256(ymm4_a, ymm0_a);		pxor256(ymm4_b, ymm0_b);
+  load_tab32_mask(&k_sr[mc_pos], ymm5);
+  pshufb256_tab16(ymm5, ymm0_a);	pshufb256_tab16(ymm5, ymm0_b);
+
+  *pymm0_a = ymm0_a;
+  *pymm0_b = ymm0_b;
+}
+
+#endif /* HAVE_SIMD256 */
+
 ASM_FUNC_ATTR_NOINLINE unsigned int
 FUNC_ENCRYPT (const RIJNDAEL_context *ctx, unsigned char *dst,
               const unsigned char *src)
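
For reference, every core above follows the same vector-permute pattern: each state byte is split into its low and high nibble (pand with the low-nibble mask, pandn plus a byte-granular shift right by 4), and each nibble then selects an entry from a 16-byte table via pshufb; the psrld128 -> psrl_byte_128 change makes that byte-granular intent explicit. A minimal plain-C sketch of the pattern, not part of the patch and with purely illustrative tables:

/* Plain-C sketch of the nibble-split table lookup used by these cores.  */
static void nibble_split_lookup (unsigned char blk[16],
                                 const unsigned char tab_lo[16],
                                 const unsigned char tab_hi[16])
{
  int i;

  for (i = 0; i < 16; i++)
    {
      unsigned char lo = blk[i] & 0x0f;          /* pand with 0x0f mask */
      unsigned char hi = (blk[i] >> 4) & 0x0f;   /* pandn + psrl_byte_128(4, ...) */

      blk[i] = tab_lo[lo] ^ tab_hi[hi];          /* two pshufb lookups + pxor */
    }
}
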
@@ -1534,12 +1728,12 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
 {
   __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7, xmm8;
   __m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static const __m128i be_mask =
+  static const __m128i_const be_mask =
     M128I_BYTE(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  static const __m128i bigendian_add =
+  static const __m128i_const bigendian_add =
     M128I_BYTE(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
-  static const __m128i carry_add = M128I_U64(1, 1);
-  static const __m128i nocarry_add = M128I_U64(1, 0);
+  static const __m128i_const carry_add = M128I_U64(1, 1);
+  static const __m128i_const nocarry_add = M128I_U64(1, 0);
   u64 ctrlow = buf_get_be64(ctr + 8);
   struct vp_aes_config_s config;
 
@@ -1548,9 +1742,77 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
 
   enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
 
-  movdqa128(bigendian_add, xmm8); /* Preload byte add */
+  movdqa128_memld(&bigendian_add, xmm8); /* Preload byte add */
   movdqu128_memld(ctr, xmm7); /* Preload CTR */
-  movdqa128(be_mask, xmm6); /* Preload mask */
+  movdqa128_memld(&be_mask, xmm6); /* Preload mask */
+
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1, ymm2, ymm3;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqa128_256(xmm7, ymm0);
+
+	  /* detect if 8-bit carry handling is needed */
+	  if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3))
+	    {
+	      static const __m128i_const *adders[5][4] =
+	      {
+		{ &nocarry_add, &nocarry_add, &nocarry_add, &carry_add },
+		{ &nocarry_add, &nocarry_add, &carry_add, &nocarry_add },
+		{ &nocarry_add, &carry_add, &nocarry_add, &nocarry_add },
+		{ &carry_add, &nocarry_add, &nocarry_add, &nocarry_add },
+		{ &nocarry_add, &nocarry_add, &nocarry_add, &nocarry_add }
+	      };
+	      unsigned int idx = ctrlow <= 3 ? ctrlow : 4;
+
+	      pshufb128(xmm6, xmm7);
+
+	      paddq128_amemld(adders[idx][0], xmm7);
+	      movdqa128(xmm7, xmm2);
+	      pshufb128(xmm6, xmm2);
+	      insert256_hi128(xmm2, ymm0);
+	      paddq128_amemld(adders[idx][1], xmm7);
+	      movdqa128(xmm7, xmm2);
+	      pshufb128(xmm6, xmm2);
+	      movdqa128_256(xmm2, ymm1);
+	      paddq128_amemld(adders[idx][2], xmm7);
+	      movdqa128(xmm7, xmm2);
+	      pshufb128(xmm6, xmm2);
+	      insert256_hi128(xmm2, ymm1);
+	      paddq128_amemld(adders[idx][3], xmm7);
+
+	      pshufb128(xmm6, xmm7);
+	    }
+	  else
+	    {
+	      paddb128(xmm8, xmm7);
+	      insert256_hi128(xmm7, ymm0);
+	      paddb128(xmm8, xmm7);
+	      movdqa128_256(xmm7, ymm1);
+	      paddb128(xmm8, xmm7);
+	      insert256_hi128(xmm7, ymm1);
+	      paddb128(xmm8, xmm7);
+	    }
+
+	  aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
+					xmm15);
+
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3);
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  outbuf += 4 * BLOCKSIZE;
+	  inbuf  += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
 
   for (; nblocks >= 2; nblocks -= 2)
     {
@@ -1564,24 +1826,24 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
 	  /* detect if 64-bit carry handling is needed */
 	  if (UNLIKELY(ctrlow == 1))
 	    {
-	      paddq128(carry_add, xmm7);
+	      paddq128_amemld(&carry_add, xmm7);
 	      movdqa128(xmm7, xmm1);
 	      pshufb128(xmm6, xmm1);
-	      paddq128(nocarry_add, xmm7);
+	      paddq128_amemld(&nocarry_add, xmm7);
 	    }
 	  else if (UNLIKELY(ctrlow == 0))
 	    {
-	      paddq128(nocarry_add, xmm7);
+	      paddq128_amemld(&nocarry_add, xmm7);
 	      movdqa128(xmm7, xmm1);
 	      pshufb128(xmm6, xmm1);
-	      paddq128(carry_add, xmm7);
+	      paddq128_amemld(&carry_add, xmm7);
 	    }
 	  else
 	    {
-	      paddq128(nocarry_add, xmm7);
+	      paddq128_amemld(&nocarry_add, xmm7);
 	      movdqa128(xmm7, xmm1);
 	      pshufb128(xmm6, xmm1);
-	      paddq128(nocarry_add, xmm7);
+	      paddq128_amemld(&nocarry_add, xmm7);
 	    }
 
 	  pshufb128(xmm6, xmm7);
@@ -1617,7 +1879,7 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
 	  pshufb128(xmm6, xmm7);
 
 	  /* detect if 64-bit carry handling is needed */
-	  paddq128(UNLIKELY(ctrlow == 0) ? carry_add : nocarry_add, xmm7);
+	  paddq128_amemld(UNLIKELY(ctrlow == 0) ? &carry_add : &nocarry_add, xmm7);
 
 	  pshufb128(xmm6, xmm7);
 	}
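
All of the carry handling in FUNC_CTR_ENC (the paddb fast path, the adders[][] table in the 4-block loop, and the carry_add/nocarry_add constants) implements nothing more than a big-endian 128-bit counter increment. A plain-C reference, given only to make the intent explicit and not part of the patch:

/* Increment a 16-byte big-endian counter by one.  The SIMD code avoids
 * this byte loop by adding with paddb when no byte overflows and by
 * patching up the rare 8-bit/64-bit wrap-arounds separately.  */
static void ctr_be128_inc (unsigned char ctr[16])
{
  int i;

  for (i = 15; i >= 0; i--)
    if (++ctr[i] != 0)     /* stop once a byte does not wrap to zero */
      break;
}
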
@@ -1649,8 +1911,8 @@ FUNC_CTR32LE_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
 {
   __m128i xmm0, xmm1, xmm2, xmm3, xmm7, xmm8;
   __m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static const __m128i add_one = M128I_U64(1, 0);
-  static const __m128i add_two = M128I_U64(2, 0);
+  static const __m128i_const add_one = M128I_U64(1, 0);
+  static const __m128i_const add_two = M128I_U64(2, 0);
   struct vp_aes_config_s config;
 
   config.nround = ctx->rounds;
@@ -1658,15 +1920,53 @@ FUNC_CTR32LE_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
 
   enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
 
-  movdqa128(add_one, xmm8); /* Preload byte add */
+  movdqa128_memld(&add_one, xmm8); /* Preload byte add */
   movdqu128_memld(ctr, xmm7); /* Preload CTR */
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1, ymm2, ymm3;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqa128(xmm7, xmm0);
+	  movdqa128(xmm7, xmm1);
+	  paddd128(xmm8, xmm1);
+	  paddd128_amemld(&add_two, xmm7);
+	  movdqa128_256(xmm0, ymm0);
+	  insert256_hi128(xmm1, ymm0);
+
+	  movdqa128(xmm7, xmm1);
+	  movdqa128(xmm7, xmm2);
+	  paddd128(xmm8, xmm2);
+	  paddd128_amemld(&add_two, xmm7);
+	  movdqa128_256(xmm1, ymm1);
+	  insert256_hi128(xmm2, ymm1);
+
+	  aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15);
+
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3);
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  outbuf += 4 * BLOCKSIZE;
+	  inbuf  += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqa128(xmm7, xmm0);
       movdqa128(xmm7, xmm1);
       paddd128(xmm8, xmm1);
-      paddd128(add_two, xmm7);
+      paddd128_amemld(&add_two, xmm7);
 
       aes_encrypt_core_2blks(&xmm0, &xmm1, config,
 			      xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
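
The 32-bit little-endian counter handled here only ever advances a single 32-bit lane of the block (paddd with add_one/add_two). Assuming the GCM-SIV-style layout this mode serves, with the counter in the first four bytes as a little-endian word, a plain-C reference of the increment would be the following (not part of the patch):

/* Advance the low 32-bit little-endian counter word; the rest of the
 * block stays untouched, which is why a plain paddd suffices above.  */
static void ctr32le_add (unsigned char ctr[16], unsigned int add)
{
  unsigned int w = (unsigned int)ctr[0]
                 | ((unsigned int)ctr[1] << 8)
                 | ((unsigned int)ctr[2] << 16)
                 | ((unsigned int)ctr[3] << 24);

  w += add;                      /* wraps mod 2^32 */
  ctr[0] = w & 0xff;
  ctr[1] = (w >> 8) & 0xff;
  ctr[2] = (w >> 16) & 0xff;
  ctr[3] = (w >> 24) & 0xff;
}
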
@@ -1719,6 +2019,36 @@ FUNC_CFB_DEC (RIJNDAEL_context *ctx, unsigned char *iv,
 
   movdqu128_memld(iv, xmm0);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm6, ymm1, ymm2, ymm3;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqa128_256(xmm0, ymm6);
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2);
+	  movdqa256_128(ymm2, xmm2);
+	  insert256_hi128(xmm2, ymm6);
+	  movdqu256_memld(inbuf + 1 * BLOCKSIZE, ymm1);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3);
+	  extract256_hi128(ymm3, xmm0);
+
+	  aes_encrypt_core_4blks_simd256(&ymm6, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15);
+
+	  pxor256(ymm2, ymm6);
+	  pxor256(ymm3, ymm1);
+	  movdqu256_memst(ymm6, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  outbuf += 4 * BLOCKSIZE;
+	  inbuf  += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqa128(xmm0, xmm1);
@@ -1779,6 +2109,36 @@ FUNC_CBC_DEC (RIJNDAEL_context *ctx, unsigned char *iv,
 
   movdqu128_memld(iv, xmm7);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1, ymm2, ymm3;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+	  movdqa256_128(ymm0, xmm0);
+	  movdqa128_256(xmm7, ymm2);
+	  insert256_hi128(xmm0, ymm2);
+	  movdqu256_memld(inbuf + 1 * BLOCKSIZE, ymm3);
+	  extract256_hi128(ymm1, xmm7);
+
+	  aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15, xmm8);
+
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  outbuf += 4 * BLOCKSIZE;
+	  inbuf  += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqu128_memld(inbuf, xmm0);
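
In the 4-block CBC path above, ymm2 carries the IV and C0, ymm3 carries C1 and C2, and C3 is kept back in xmm7 as the next IV; this works because CBC decryption, unlike encryption, depends only on ciphertext and therefore parallelizes. A plain-C reference of the formula being vectorized, not part of the patch; dec_block() stands in for single-block AES decryption and is an assumption of this sketch:

#include <stddef.h>
#include <string.h>

typedef void (*dec_block_fn) (unsigned char *block);   /* in-place D_K */

/* P[i] = D_K(C[i]) ^ C[i-1], with C[-1] = IV.  */
static void cbc_dec_ref (dec_block_fn dec_block, unsigned char iv[16],
                         unsigned char *buf, size_t nblocks)
{
  unsigned char prev[16], save[16];
  size_t i, j;

  memcpy (prev, iv, 16);
  for (i = 0; i < nblocks; i++, buf += 16)
    {
      memcpy (save, buf, 16);     /* keep C[i]; it chains into block i+1 */
      dec_block (buf);            /* buf now holds D_K(C[i]) */
      for (j = 0; j < 16; j++)
        buf[j] ^= prev[j];        /* P[i] = D_K(C[i]) ^ C[i-1] */
      memcpy (prev, save, 16);
    }
  memcpy (iv, prev, 16);          /* last ciphertext block becomes next IV */
}
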
@@ -1843,6 +2203,68 @@ aes_simd128_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
   movdqu128_memld(c->u_iv.iv, xmm7);
   movdqu128_memld(c->u_ctr.ctr, xmm6);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support() && nblocks >= 4)
+    {
+      __m256i ymm0, ymm1, ymm3, ymm6, ymm8;
+
+      movdqa128_256(xmm6, ymm6);
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  const unsigned char *l;
+
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  movdqa128_256(xmm7, ymm3);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  insert256_hi128(xmm7, ymm3);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  movdqa128_256(xmm7, ymm8);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  insert256_hi128(xmm7, ymm8);
+
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  pxor256(ymm0, ymm6);
+	  pxor256(ymm1, ymm6);
+
+	  pxor256(ymm3, ymm0);
+	  pxor256(ymm8, ymm1);
+
+	  aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15);
+
+	  pxor256(ymm3, ymm0);
+	  pxor256(ymm8, ymm1);
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  inbuf += 4 * BLOCKSIZE;
+	  outbuf += 4 * BLOCKSIZE;
+	}
+
+      extract256_hi128(ymm6, xmm0);
+      movdqa256_128(ymm6, xmm6);
+      pxor128(xmm0, xmm6);
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       const unsigned char *l;
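
Each 4-block iteration above unrolls the per-block OCB update four times (offset chaining via ocb_get_l) before whitening both 256-bit halves and handing them to the parallel core. As a plain-C sketch of that single-block step, not part of the patch; enc_block() stands in for one-block AES encryption and is an assumption here:

/* Per-block OCB encryption step:
 *   Offset_i   = Offset_{i-1} xor L_{ntz(i)}
 *   Checksum_i = Checksum_{i-1} xor P_i
 *   C_i        = Offset_i xor ENCIPHER(K, P_i xor Offset_i)
 */
static void ocb_enc_block_ref (void (*enc_block) (unsigned char *block),
                               unsigned char offset[16],
                               unsigned char checksum[16],
                               const unsigned char l_ntz[16],
                               const unsigned char p[16],
                               unsigned char c[16])
{
  int j;

  for (j = 0; j < 16; j++)
    {
      offset[j] ^= l_ntz[j];      /* Offset update */
      checksum[j] ^= p[j];        /* Checksum update */
      c[j] = p[j] ^ offset[j];    /* whiten the plaintext block */
    }
  enc_block (c);                  /* ENCIPHER(K, P_i xor Offset_i) */
  for (j = 0; j < 16; j++)
    c[j] ^= offset[j];            /* un-whiten to get C_i */
}
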
@@ -1942,6 +2364,69 @@ aes_simd128_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
   movdqu128_memld(c->u_iv.iv, xmm7);
   movdqu128_memld(c->u_ctr.ctr, xmm6);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support() && nblocks >= 4)
+    {
+      __m256i ymm0, ymm1, ymm3, ymm6, ymm8;
+
+      movdqa128_256(xmm6, ymm6);
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  const unsigned char *l;
+
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  movdqa128_256(xmm7, ymm3);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  insert256_hi128(xmm7, ymm3);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  movdqa128_256(xmm7, ymm8);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  insert256_hi128(xmm7, ymm8);
+
+	  pxor256(ymm3, ymm0);
+	  pxor256(ymm8, ymm1);
+
+	  aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15, xmm8);
+
+	  pxor256(ymm3, ymm0);
+	  pxor256(ymm8, ymm1);
+
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  pxor256(ymm0, ymm6);
+	  pxor256(ymm1, ymm6);
+
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  inbuf += 4 * BLOCKSIZE;
+	  outbuf += 4 * BLOCKSIZE;
+	}
+
+      extract256_hi128(ymm6, xmm0);
+      movdqa256_128(ymm6, xmm6);
+      pxor128(xmm0, xmm6);
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       const unsigned char *l;
@@ -2044,6 +2529,61 @@ FUNC_OCB_AUTH(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
   movdqu128_memld(c->u_mode.ocb.aad_offset, xmm7);
   movdqu128_memld(c->u_mode.ocb.aad_sum, xmm6);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support() && nblocks >= 4)
+    {
+      __m256i ymm0, ymm1, ymm3, ymm6, ymm8;
+
+      movdqa128_256(xmm6, ymm6);
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  const unsigned char *l;
+
+	  movdqu256_memld(abuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(abuf + 2 * BLOCKSIZE, ymm1);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  movdqa128_256(xmm7, ymm3);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  insert256_hi128(xmm7, ymm3);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  movdqa128_256(xmm7, ymm8);
+
+	  l = ocb_get_l(c, ++n);
+	  movdqu128_memld(l, xmm2);
+	  pxor128(xmm2, xmm7);
+	  insert256_hi128(xmm7, ymm8);
+
+	  pxor256(ymm3, ymm0);
+	  pxor256(ymm8, ymm1);
+
+	  aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15);
+
+	  pxor256(ymm0, ymm6);
+	  pxor256(ymm1, ymm6);
+
+	  abuf += 4 * BLOCKSIZE;
+	}
+
+      extract256_hi128(ymm6, xmm0);
+      movdqa256_128(ymm6, xmm6);
+      pxor128(xmm0, xmm6);
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       const unsigned char *l;
@@ -2117,6 +2657,29 @@ aes_simd128_ecb_enc (void *context, void *outbuf_arg, const void *inbuf_arg,
 
   enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+	  aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15);
+
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  inbuf += 4 * BLOCKSIZE;
+	  outbuf += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqu128_memld(inbuf + 0 * BLOCKSIZE, xmm0);
@@ -2171,6 +2734,29 @@ aes_simd128_ecb_dec (void *context, void *outbuf_arg, const void *inbuf_arg,
 
   dec_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm8);
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+	  aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15, xmm8);
+
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  inbuf += 4 * BLOCKSIZE;
+	  outbuf += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqu128_memld(inbuf + 0 * BLOCKSIZE, xmm0);
@@ -2216,13 +2802,13 @@ FUNC_ECB_CRYPT (void *context, void *outbuf_arg, const void *inbuf_arg,
 
 static ASM_FUNC_ATTR_INLINE __m128i xts_gfmul_byA (__m128i xmm5)
 {
-  static const __m128i xts_gfmul_const = M128I_U64(0x87, 0x01);
+  static const __m128i_const xts_gfmul_const = M128I_U64(0x87, 0x01);
   __m128i xmm1;
 
   pshufd128_0x4E(xmm5, xmm1);
   psraq128(63, xmm1);
   paddq128(xmm5, xmm5);
-  pand128(xts_gfmul_const, xmm1);
+  pand128_amemld(&xts_gfmul_const, xmm1);
   pxor128(xmm1, xmm5);
 
   return xmm5;
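
xts_gfmul_byA() is the usual XTS multiply-by-alpha: the pshufd/psraq pair builds per-half carry masks, paddq doubles both 64-bit halves, and the 0x87/0x01 constant folds the reduction and the cross-half carry back in. A plain-C equivalent on a little-endian two-limb tweak, for reference only and not part of the patch:

#include <stdint.h>

/* Multiply the tweak by alpha (x) in GF(2^128): shift left by one bit,
 * carry the low limb's top bit into the high limb, and reduce with 0x87
 * when the high limb's top bit falls out.  tweak[0] is the low limb.  */
static void xts_mul_alpha (uint64_t tweak[2])
{
  uint64_t carry_lo = tweak[0] >> 63;   /* bit moving into the high limb */
  uint64_t carry_hi = tweak[1] >> 63;   /* bit selecting the reduction */

  tweak[1] = (tweak[1] << 1) | carry_lo;
  tweak[0] = (tweak[0] << 1) ^ (carry_hi * 0x87);
}
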
@@ -2246,6 +2832,43 @@ aes_simd128_xts_enc (void *context, unsigned char *tweak, void *outbuf_arg,
 
   movdqu128_memld(tweak, xmm7); /* Preload tweak */
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1, ymm2, ymm3;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+	  movdqa128_256(xmm7, ymm2);
+	  xmm7 = xts_gfmul_byA(xmm7);
+	  insert256_hi128(xmm7, ymm2);
+	  xmm7 = xts_gfmul_byA(xmm7);
+	  movdqa128_256(xmm7, ymm3);
+	  xmm7 = xts_gfmul_byA(xmm7);
+	  insert256_hi128(xmm7, ymm3);
+	  xmm7 = xts_gfmul_byA(xmm7);
+
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+
+	  aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15);
+
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  outbuf += 4 * BLOCKSIZE;
+	  inbuf  += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqu128_memld(inbuf, xmm0);
@@ -2315,6 +2938,43 @@ aes_simd128_xts_dec (void *context, unsigned char *tweak, void *outbuf_arg,
 
   movdqu128_memld(tweak, xmm7); /* Preload tweak */
 
+#ifdef HAVE_SIMD256
+  if (check_simd256_support())
+    {
+      __m256i ymm0, ymm1, ymm2, ymm3;
+
+      for (; nblocks >= 4; nblocks -= 4)
+	{
+	  movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+	  movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+	  movdqa128_256(xmm7, ymm2);
+	  xmm7 = xts_gfmul_byA(xmm7);
+	  insert256_hi128(xmm7, ymm2);
+	  xmm7 = xts_gfmul_byA(xmm7);
+	  movdqa128_256(xmm7, ymm3);
+	  xmm7 = xts_gfmul_byA(xmm7);
+	  insert256_hi128(xmm7, ymm3);
+	  xmm7 = xts_gfmul_byA(xmm7);
+
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+
+	  aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+					 xmm9, xmm10, xmm11, xmm12, xmm13,
+					 xmm14, xmm15, xmm8);
+
+	  pxor256(ymm2, ymm0);
+	  pxor256(ymm3, ymm1);
+	  movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+	  movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+	  outbuf += 4 * BLOCKSIZE;
+	  inbuf  += 4 * BLOCKSIZE;
+	}
+    }
+#endif /* HAVE_SIMD256 */
+
   for (; nblocks >= 2; nblocks -= 2)
     {
       movdqu128_memld(inbuf, xmm0);
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 12c27319..0c48793b 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -224,6 +224,62 @@ extern void _gcry_aes_vp_aarch64_xts_crypt (void *context, unsigned char *tweak,
 					    size_t nblocks, int encrypt);
 #endif
 
+#ifdef USE_VP_RISCV
+/* RISC-V vector permutation implementation of AES */
+extern int _gcry_aes_vp_riscv_setup_acceleration(RIJNDAEL_context *ctx);
+
+extern void _gcry_aes_vp_riscv_do_setkey(RIJNDAEL_context *ctx,
+					 const byte *key);
+extern void _gcry_aes_vp_riscv_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_vp_riscv_encrypt (const RIJNDAEL_context *ctx,
+						unsigned char *dst,
+						const unsigned char *src);
+extern unsigned int _gcry_aes_vp_riscv_decrypt (const RIJNDAEL_context *ctx,
+						unsigned char *dst,
+						const unsigned char *src);
+extern void _gcry_aes_vp_riscv_cfb_enc (void *context, unsigned char *iv,
+					void *outbuf_arg,
+					const void *inbuf_arg,
+					size_t nblocks);
+extern void _gcry_aes_vp_riscv_cbc_enc (void *context, unsigned char *iv,
+					void *outbuf_arg,
+					const void *inbuf_arg,
+					size_t nblocks,
+					int cbc_mac);
+extern void _gcry_aes_vp_riscv_ctr_enc (void *context, unsigned char *ctr,
+					void *outbuf_arg,
+					const void *inbuf_arg,
+					size_t nblocks);
+extern void _gcry_aes_vp_riscv_ctr32le_enc (void *context, unsigned char *ctr,
+					    void *outbuf_arg,
+					    const void *inbuf_arg,
+					    size_t nblocks);
+extern void _gcry_aes_vp_riscv_cfb_dec (void *context, unsigned char *iv,
+					void *outbuf_arg,
+					const void *inbuf_arg,
+					size_t nblocks);
+extern void _gcry_aes_vp_riscv_cbc_dec (void *context, unsigned char *iv,
+					void *outbuf_arg,
+					const void *inbuf_arg,
+					size_t nblocks);
+extern size_t _gcry_aes_vp_riscv_ocb_crypt (gcry_cipher_hd_t c,
+					    void *outbuf_arg,
+					    const void *inbuf_arg,
+					    size_t nblocks,
+					    int encrypt);
+extern size_t _gcry_aes_vp_riscv_ocb_auth (gcry_cipher_hd_t c,
+					   const void *abuf_arg,
+					   size_t nblocks);
+extern void _gcry_aes_vp_riscv_ecb_crypt (void *context, void *outbuf_arg,
+					  const void *inbuf_arg,
+					  size_t nblocks, int encrypt);
+extern void _gcry_aes_vp_riscv_xts_crypt (void *context, unsigned char *tweak,
+					  void *outbuf_arg,
+					  const void *inbuf_arg,
+					  size_t nblocks, int encrypt);
+#endif
+
 #ifdef USE_PADLOCK
 extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx,
                                                unsigned char *bx,
@@ -718,6 +774,30 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->xts_crypt = _gcry_aes_vp_aarch64_xts_crypt;
     }
 #endif
+#ifdef USE_VP_RISCV
+  else if ((hwfeatures & HWF_RISCV_IMAFDC) && (hwfeatures & HWF_RISCV_V) &&
+           _gcry_aes_vp_riscv_setup_acceleration(ctx))
+    {
+      hw_setkey = _gcry_aes_vp_riscv_do_setkey;
+      ctx->encrypt_fn = _gcry_aes_vp_riscv_encrypt;
+      ctx->decrypt_fn = _gcry_aes_vp_riscv_decrypt;
+      ctx->prefetch_enc_fn = NULL;
+      ctx->prefetch_dec_fn = NULL;
+      ctx->prepare_decryption = _gcry_aes_vp_riscv_prepare_decryption;
+
+      /* Setup vector permute RISC-V bulk encryption routines.  */
+      bulk_ops->cfb_enc = _gcry_aes_vp_riscv_cfb_enc;
+      bulk_ops->cfb_dec = _gcry_aes_vp_riscv_cfb_dec;
+      bulk_ops->cbc_enc = _gcry_aes_vp_riscv_cbc_enc;
+      bulk_ops->cbc_dec = _gcry_aes_vp_riscv_cbc_dec;
+      bulk_ops->ctr_enc = _gcry_aes_vp_riscv_ctr_enc;
+      bulk_ops->ctr32le_enc = _gcry_aes_vp_riscv_ctr32le_enc;
+      bulk_ops->ocb_crypt = _gcry_aes_vp_riscv_ocb_crypt;
+      bulk_ops->ocb_auth = _gcry_aes_vp_riscv_ocb_auth;
+      bulk_ops->ecb_crypt = _gcry_aes_vp_riscv_ecb_crypt;
+      bulk_ops->xts_crypt = _gcry_aes_vp_riscv_xts_crypt;
+    }
+#endif
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
   else if ((hwfeatures & HWF_PPC_VCRYPTO) && (hwfeatures & HWF_PPC_ARCH_3_00))
     {
diff --git a/cipher/simd-common-riscv.h b/cipher/simd-common-riscv.h
new file mode 100644
index 00000000..8381000f
--- /dev/null
+++ b/cipher/simd-common-riscv.h
@@ -0,0 +1,48 @@
+/* simd-common-riscv.h  -  Common macros for RISC-V vector code
+ *
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_SIMD_COMMON_RISCV_H
+#define GCRY_SIMD_COMMON_RISCV_H
+
+#include <config.h>
+
+#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+
+#define clear_vec_regs() __asm__ volatile("vsetvli zero, %0, e8, m1, ta, ma;\n" \
+					  "vmv.v.i v0, 0;\n" \
+					  "vmv.v.i v1, 0;\n" \
+					  "vmv2r.v v2, v0;\n" \
+					  "vmv4r.v v4, v0;\n" \
+					  "vmv8r.v v8, v0;\n" \
+					  "vmv8r.v v16, v0;\n" \
+					  "vmv8r.v v24, v0;\n" \
+					  : \
+					  : "r" (~0) \
+					  : "memory", "vl", "vtype", \
+					    "v0", "v1", "v2", "v3", \
+					    "v4", "v5", "v6", "v7", \
+					    "v8", "v9", "v10", "v11", \
+					    "v12", "v13", "v14", "v15", \
+					    "v16", "v17", "v18", "v19", \
+					    "v20", "v21", "v22", "v23", \
+					    "v24", "v25", "v26", "v27", \
+					    "v28", "v29", "v30", "v31")
+
+#endif /* GCRY_SIMD_COMMON_RISCV_H */
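
clear_vec_regs() first raises VL to VLMAX (the ~0 element request is clamped by the hardware), zeroes v0/v1 with vmv.v.i, and then fans the zeros out with the grouped vmv2r/vmv4r/vmv8r whole-register moves, so no key-dependent data is left behind in the vector register file. A hypothetical usage sketch, not part of the patch and with an illustrative function name:

#include "simd-common-riscv.h"

static void example_bulk_tail (void)
{
  /* ... vectorized processing of key-dependent data would happen here ... */
  clear_vec_regs ();   /* zero v0..v31 so nothing secret lingers */
}
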
diff --git a/configure.ac b/configure.ac
index f20d654d..55d15fa3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2705,6 +2705,120 @@ if test "$gcry_cv_gcc_inline_asm_riscv_v" = "yes" ; then
 fi
 
 
+#
+# Check whether compiler supports RISC-V vector intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics],
+      [gcry_cv_cc_riscv_vector_intrinsics],
+      [if test "$mpi_cpu_arch" != "riscv64" ||
+	  test "$try_asm_modules" != "yes" ; then
+	gcry_cv_cc_riscv_vector_intrinsics="n/a"
+      else
+	gcry_cv_cc_riscv_vector_intrinsics=no
+	AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+	[[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000)
+	  #error __riscv_v_intrinsic not defined or too old version
+	  #endif
+	  #include <riscv_vector.h>
+	  typedef vuint8m1_t __m128i;
+	  #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a))
+	  #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a))
+	  #define paddq128(a, o) (o = cast_u64_to_m128i( \
+					__riscv_vadd_vv_u64m1( \
+					  cast_m128i_to_u64(o), \
+					  cast_m128i_to_u64(a), 2)))
+	  #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16))
+	  #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+	  #define clear_vec_reg_v0() \
+	    __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \
+			     "vmv.v.x v0, zero;\n" \
+			     ::: "memory", "vtype", "vl", "v0")
+	  static inline __attribute__((always_inline)) __m128i
+	  fn2(__m128i a)
+	  {
+	    paddq128(a, a);
+	    return a;
+	  }
+	  __m128i fn(__m128i in)
+	  {
+	    __m128i x;
+	    memory_barrier_with_vec(in);
+	    x = fn2(in);
+	    memory_barrier_with_vec(x);
+	    pshufb128(in, x);
+	    memory_barrier_with_vec(in);
+	    clear_vec_reg_v0();
+	    return in;
+	  }
+	  ]])],
+	[gcry_cv_cc_riscv_vector_intrinsics=yes])
+      fi])
+if test "$gcry_cv_cc_riscv_vector_intrinsics" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1,
+	    [Defined if underlying compiler supports RISC-V vector intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=rv64imafdcv -mstrict-align"
+
+if test "$gcry_cv_cc_riscv_vector_intrinsics" = "no" &&
+   test "$mpi_cpu_arch" = "riscv64" &&
+   test "$try_asm_modules" = "yes" ; then
+  AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics with extra GCC flags],
+    [gcry_cv_cc_riscv_vector_intrinsics_cflags],
+    [gcry_cv_cc_riscv_vector_intrinsics_cflags=no
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+      [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000)
+	#error __riscv_v_intrinsic not defined or too old version
+	#endif
+	#include <riscv_vector.h>
+	typedef vuint8m1_t __m128i;
+	#define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a))
+	#define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a))
+	#define paddq128(a, o) (o = cast_u64_to_m128i( \
+					__riscv_vadd_vv_u64m1( \
+					  cast_m128i_to_u64(o), \
+					  cast_m128i_to_u64(a), 2)))
+	#define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16))
+	#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+	#define clear_vec_reg_v0() \
+	  __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \
+			    "vmv.v.x v0, zero;\n" \
+			    ::: "memory", "vl", "v0")
+	static inline __attribute__((always_inline)) __m128i
+	fn2(__m128i a)
+	{
+	  paddq128(a, a);
+	  return a;
+	}
+	__m128i fn(__m128i in)
+	{
+	  __m128i x;
+	  memory_barrier_with_vec(in);
+	  x = fn2(in);
+	  memory_barrier_with_vec(x);
+	  pshufb128(in, x);
+	  memory_barrier_with_vec(in);
+	  clear_vec_reg_v0();
+	  return in;
+	}
+	]])],
+      [gcry_cv_cc_riscv_vector_intrinsics_cflags=yes])])
+  if test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1,
+	      [Defined if underlying compiler supports RISC-V vector intrinsics])
+    AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS_WITH_CFLAGS,1,
+	      [Defined if underlying compiler supports RISC-V vector intrinsics with extra GCC flags])
+  fi
+fi
+
+AM_CONDITIONAL(ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS,
+	       test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
 #######################################
 #### Checks for library functions. ####
 #######################################
@@ -3183,6 +3297,10 @@ if test "$found" = "1" ; then
          # Build with the crypto extension implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo"
       ;;
+      riscv64-*-*)
+         # Build with the vector permute SIMD128 implementation
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vp-riscv.lo"
+      ;;
       s390x-*-*)
          # Big-Endian.
          # Build with the crypto extension implementation
-- 
2.45.2
