[PATCH 6/6] Add RISC-V vector cryptography implementation of AES
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu Aug 7 15:28:55 CEST 2025
* cipher/Makefile.am: Add 'rijndael-riscv-zvkned.c'.
* cipher/rijndael-internal.h (USE_RISCV_V_CRYPTO): New.
* cipher/rijndael-riscv-zvkned.c: New.
* cipher/rijndael.c [USE_RISCV_V_CRYPTO]
(_gcry_aes_riscv_zvkned_setup_acceleration, _gcry_aes_riscv_zvkned_setkey)
(_gcry_aes_riscv_zvkned_prepare_decryption)
(_gcry_aes_riscv_zvkned_encrypt, _gcry_aes_riscv_zvkned_decrypt)
(_gcry_aes_riscv_zvkned_cfb_enc, _gcry_aes_riscv_zvkned_cbc_enc)
(_gcry_aes_riscv_zvkned_ctr_enc, _gcry_aes_riscv_zvkned_ctr32le_enc)
(_gcry_aes_riscv_zvkned_cfb_dec, _gcry_aes_riscv_zvkned_cbc_dec)
(_gcry_aes_riscv_zvkned_ocb_crypt, _gcry_aes_riscv_zvkned_ocb_auth)
(_gcry_aes_riscv_zvkned_ecb_crypt, _gcry_aes_riscv_zvkned_xts_crypt): New.
(do_setkey) [USE_RISCV_V_CRYPTO]: Add setup for RISC-V vector cryptography
extension implementation.
* configure.ac: Add 'rijndael-riscv-zvkned.lo'.
(GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST): Add AES intrinsics.
(gcry_cv_riscv_vaes_vs_intrinsics_work, HAVE_BROKEN_VAES_VS_INTRINSIC): New.
* src/g10lib.h (HWF_RISCV_ZVKNED): Insert before HWF_RISCV_ZVKNHA.
* src/hwf-riscv.c (HWF_RISCV_HWPROBE_EXT_ZVKNED): New.
(hwprobe_features): Add Zvkned.
* src/hwfeatures.c (hwflist): Add "riscv-zvkned".
--
The implementation has been tested against the QEMU emulator, as there is no
actual hardware with these instructions available yet.
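For reference, a QEMU user-mode invocation along the following lines should be
able to exercise this code path; the '-cpu' property names (v, vlen, zvkned)
are an assumption based on recent QEMU versions rather than something taken
from this patch, and the test binary path is only a placeholder:

  # CPU property names assumed; adjust to match your QEMU build.
  qemu-riscv64 -cpu rv64,v=true,vlen=128,zvkned=true /path/to/build/tests/basic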
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 7 +
cipher/rijndael-internal.h | 9 +
cipher/rijndael-riscv-zvkned.c | 1608 ++++++++++++++++++++++++++++++++
cipher/rijndael.c | 90 +-
configure.ac | 76 ++
src/g10lib.h | 5 +-
src/hwf-riscv.c | 2 +
src/hwfeatures.c | 1 +
8 files changed, 1793 insertions(+), 5 deletions(-)
create mode 100644 cipher/rijndael-riscv-zvkned.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index ea91b7b8..7abbd5b3 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -122,6 +122,7 @@ EXTRA_libcipher_la_SOURCES = \
rijndael-ppc.c rijndael-ppc9le.c \
rijndael-p10le.c rijndael-gcm-p10le.s \
rijndael-ppc-common.h rijndael-ppc-functions.h \
+ rijndael-riscv-zvkned.c \
rijndael-s390x.c \
rijndael-vp-aarch64.c rijndael-vp-riscv.c \
rijndael-vp-simd128.h \
@@ -389,6 +390,12 @@ riscv_vector_crypto_cflags =
endif
endif
+rijndael-riscv-zvkned.o: $(srcdir)/rijndael-riscv-zvkned.c Makefile
+ `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-riscv-zvkned.lo: $(srcdir)/rijndael-riscv-zvkned.c Makefile
+ `echo $(LTCOMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) `
+
sha256-riscv-zvknha-zvkb.o: $(srcdir)/sha256-riscv-zvknha-zvkb.c Makefile
`echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 92310fc5..15084a69 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -124,6 +124,15 @@
# endif
#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
+/* USE_RISCV_V_CRYPTO indicates whether to enable RISC-V vector cryptography
+ * extension code. */
+#undef USE_RISCV_V_CRYPTO
+#if defined (__riscv) && \
+ defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \
+ defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS)
+# define USE_RISCV_V_CRYPTO 1
+#endif
+
/* USE_VP_AARCH64 indicates whether to enable vector permute AArch64 SIMD code. */
#undef USE_VP_AARCH64
#if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS)
diff --git a/cipher/rijndael-riscv-zvkned.c b/cipher/rijndael-riscv-zvkned.c
new file mode 100644
index 00000000..e3ba6769
--- /dev/null
+++ b/cipher/rijndael-riscv-zvkned.c
@@ -0,0 +1,1608 @@
+/* rijndael-riscv-zvkned.c - RISC-V vector crypto implementation of AES
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined (__riscv) && \
+ defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \
+ defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS)
+
+#include "g10lib.h"
+#include "simd-common-riscv.h"
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+
+#include <riscv_vector.h>
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ALWAYS_INLINE ASM_FUNC_ATTR
+#define ASM_FUNC_ATTR_NOINLINE NO_INLINE ASM_FUNC_ATTR
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+
+/*
+ * Helper macro and functions
+ */
+
+#define cast_u8m1_u32m1(a) __riscv_vreinterpret_v_u8m1_u32m1(a)
+#define cast_u8m1_u64m1(a) __riscv_vreinterpret_v_u8m1_u64m1(a)
+#define cast_u32m1_u8m1(a) __riscv_vreinterpret_v_u32m1_u8m1(a)
+#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a)
+#define cast_u64m1_u8m1(a) __riscv_vreinterpret_v_u64m1_u8m1(a)
+
+#define cast_u8m2_u32m2(a) __riscv_vreinterpret_v_u8m2_u32m2(a)
+#define cast_u32m2_u8m2(a) __riscv_vreinterpret_v_u32m2_u8m2(a)
+
+#define cast_u8m4_u32m4(a) __riscv_vreinterpret_v_u8m4_u32m4(a)
+#define cast_u32m4_u8m4(a) __riscv_vreinterpret_v_u32m4_u8m4(a)
+
+#define cast_u64m1_u32m1(a) __riscv_vreinterpret_v_u64m1_u32m1(a)
+#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a)
+
+#define cast_u64m1_i64m1(a) __riscv_vreinterpret_v_u64m1_i64m1(a)
+#define cast_i64m1_u64m1(a) __riscv_vreinterpret_v_i64m1_u64m1(a)
+
+#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+
+
+static ASM_FUNC_ATTR_INLINE vuint32m1_t
+bswap128_u32m1(vuint32m1_t vec, size_t vl_u32)
+{
+ static const byte bswap128_arr[16] =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ size_t vl_bytes = vl_u32 * 4;
+ vuint8m1_t bswap128 = __riscv_vle8_v_u8m1(bswap128_arr, vl_bytes);
+
+ return cast_u8m1_u32m1(
+ __riscv_vrgather_vv_u8m1(cast_u32m1_u8m1(vec), bswap128, vl_bytes));
+}
+
+static ASM_FUNC_ATTR_INLINE vuint32m1_t
+unaligned_load_u32m1(const void *ptr, size_t vl_u32)
+{
+ size_t vl_bytes = vl_u32 * 4;
+
+ return cast_u8m1_u32m1(__riscv_vle8_v_u8m1(ptr, vl_bytes));
+}
+
+static ASM_FUNC_ATTR_INLINE void
+unaligned_store_u32m1(void *ptr, vuint32m1_t vec, size_t vl_u32)
+{
+ size_t vl_bytes = vl_u32 * 4;
+
+ __riscv_vse8_v_u8m1(ptr, cast_u32m1_u8m1(vec), vl_bytes);
+}
+
+static ASM_FUNC_ATTR_INLINE vuint32m4_t
+unaligned_load_u32m4(const void *ptr, size_t vl_u32)
+{
+ size_t vl_bytes = vl_u32 * 4;
+
+ return cast_u8m4_u32m4(__riscv_vle8_v_u8m4(ptr, vl_bytes));
+}
+
+static ASM_FUNC_ATTR_INLINE void
+unaligned_store_u32m4(void *ptr, vuint32m4_t vec, size_t vl_u32)
+{
+ size_t vl_bytes = vl_u32 * 4;
+
+ __riscv_vse8_v_u8m4(ptr, cast_u32m4_u8m4(vec), vl_bytes);
+}
+
+static vuint32m1_t
+vxor_u8_u32m1(vuint32m1_t a, vuint32m1_t b, size_t vl_u32)
+{
+ size_t vl_bytes = vl_u32 * 4;
+
+ return cast_u8m1_u32m1(__riscv_vxor_vv_u8m1(cast_u32m1_u8m1(a),
+ cast_u32m1_u8m1(b), vl_bytes));
+}
+
+static vuint32m4_t
+vxor_u8_u32m4(vuint32m4_t a, vuint32m4_t b, size_t vl_u32)
+{
+ size_t vl_bytes = vl_u32 * 4;
+
+ return cast_u8m4_u32m4(__riscv_vxor_vv_u8m4(cast_u32m4_u8m4(a),
+ cast_u32m4_u8m4(b), vl_bytes));
+}
+
+
+/*
+ * HW support detection
+ */
+
+int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_setup_acceleration(RIJNDAEL_context *ctx)
+{
+ (void)ctx;
+ return (__riscv_vsetvl_e32m1(4) == 4);
+}
+
+
+/*
+ * Key expansion
+ */
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+aes128_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ size_t vl = 4;
+
+ vuint32m1_t round_key = unaligned_load_u32m1 (key, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[0][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 1, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[1][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 2, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[2][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 3, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[3][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 4, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[4][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 5, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[5][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 6, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[6][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 7, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[7][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 8, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[8][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 9, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[9][0], round_key, vl);
+
+ round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 10, vl);
+ __riscv_vse32_v_u32m1 (&ctx->keyschenc32[10][0], round_key, vl);
+
+ clear_vec_regs();
+}
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+aes192_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ size_t vl = 4;
+ u32 *w = &ctx->keyschenc32[0][0];
+ u32 wr;
+ vuint32m1_t rk_0_7;
+ vuint32m1_t rk_4_11;
+
+ rk_0_7 = unaligned_load_u32m1 (&key[0], vl);
+ rk_4_11 = unaligned_load_u32m1 (&key[8], vl);
+ __riscv_vse32_v_u32m1 (&w[0], rk_0_7, vl);
+ __riscv_vse32_v_u32m1 (&w[2], rk_4_11, vl);
+
+#define AES192_KF1_GEN(out, input, round192, vl) \
+ ({ \
+ u32 temp_array[4] = { 0, 0, 0, 0 }; \
+ vuint32m1_t temp_vec; \
+ temp_array[3] = (input); \
+ temp_vec = __riscv_vle32_v_u32m1(temp_array, (vl)); \
+ temp_vec = __riscv_vaeskf1_vi_u32m1(temp_vec, (round192), (vl)); \
+ (out) = __riscv_vmv_x_s_u32m1_u32(temp_vec); \
+ })
+
+#define AES192_EXPAND_BLOCK(w, round192, wr, last) \
+ ({ \
+ (w)[(round192) * 6 + 0] = (w)[(round192) * 6 - 6] ^ (wr); \
+ (w)[(round192) * 6 + 1] = (w)[(round192) * 6 - 5] ^ (w)[(round192) * 6 + 0]; \
+ (w)[(round192) * 6 + 2] = (w)[(round192) * 6 - 4] ^ (w)[(round192) * 6 + 1]; \
+ (w)[(round192) * 6 + 3] = (w)[(round192) * 6 - 3] ^ (w)[(round192) * 6 + 2]; \
+ if (!(last)) \
+ { \
+ (w)[(round192) * 6 + 4] = (w)[(round192) * 6 - 2] ^ (w)[(round192) * 6 + 3]; \
+ (w)[(round192) * 6 + 5] = (w)[(round192) * 6 - 1] ^ (w)[(round192) * 6 + 4]; \
+ } \
+ })
+
+ AES192_KF1_GEN(wr, w[5], 1, vl);
+ AES192_EXPAND_BLOCK(w, 1, wr, 0);
+
+ AES192_KF1_GEN(wr, w[11], 2, vl);
+ AES192_EXPAND_BLOCK(w, 2, wr, 0);
+
+ AES192_KF1_GEN(wr, w[17], 3, vl);
+ AES192_EXPAND_BLOCK(w, 3, wr, 0);
+
+ AES192_KF1_GEN(wr, w[23], 4, vl);
+ AES192_EXPAND_BLOCK(w, 4, wr, 0);
+
+ AES192_KF1_GEN(wr, w[29], 5, vl);
+ AES192_EXPAND_BLOCK(w, 5, wr, 0);
+
+ AES192_KF1_GEN(wr, w[35], 6, vl);
+ AES192_EXPAND_BLOCK(w, 6, wr, 0);
+
+ AES192_KF1_GEN(wr, w[41], 7, vl);
+ AES192_EXPAND_BLOCK(w, 7, wr, 0);
+
+ AES192_KF1_GEN(wr, w[47], 8, vl);
+ AES192_EXPAND_BLOCK(w, 8, wr, 1);
+
+#undef AES192_KF1_GEN
+#undef AES192_EXPAND_BLOCK
+
+ clear_vec_regs();
+}
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+aes256_riscv_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ size_t vl = 4;
+
+ vuint32m1_t rk_a = unaligned_load_u32m1 (&key[0], vl);
+ vuint32m1_t rk_b = unaligned_load_u32m1 (&key[16], vl);
+
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[0][0], rk_a, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[1][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 2, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[2][0], rk_a, vl);
+
+ rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 3, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[3][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 4, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[4][0], rk_a, vl);
+
+ rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 5, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[5][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 6, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[6][0], rk_a, vl);
+
+ rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 7, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[7][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 8, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[8][0], rk_a, vl);
+
+ rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 9, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[9][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 10, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[10][0], rk_a, vl);
+
+ rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 11, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[11][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 12, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[12][0], rk_a, vl);
+
+ rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 13, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[13][0], rk_b, vl);
+
+ rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 14, vl);
+ __riscv_vse32_v_u32m1(&ctx->keyschenc32[14][0], rk_a, vl);
+
+ clear_vec_regs();
+}
+
+void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ unsigned int rounds = ctx->rounds;
+
+ if (rounds < 12)
+ {
+ aes128_riscv_setkey(ctx, key);
+ }
+ else if (rounds == 12)
+ {
+ aes192_riscv_setkey(ctx, key);
+ _gcry_burn_stack(64);
+ }
+ else
+ {
+ aes256_riscv_setkey(ctx, key);
+ }
+}
+
+static ASM_FUNC_ATTR_INLINE void
+do_prepare_decryption(RIJNDAEL_context *ctx)
+{
+ u32 *ekey = (u32 *)(void *)ctx->keyschenc;
+ u32 *dkey = (u32 *)(void *)ctx->keyschdec;
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ int rr;
+ int r;
+
+  for (r = 0, rr = rounds; r <= rounds; r++, rr--)
+ {
+ __riscv_vse32_v_u32m1(dkey + r * 4,
+ __riscv_vle32_v_u32m1(ekey + rr * 4, vl),
+ vl);
+ }
+}
+
+void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_prepare_decryption(RIJNDAEL_context *ctx)
+{
+ do_prepare_decryption(ctx);
+ clear_vec_regs();
+}
+
+
+/*
+ * Encryption / Decryption
+ */
+
+#define ROUND_KEY_VARIABLES \
+ vuint32m1_t rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7, rk8; \
+ vuint32m1_t rk9, rk10, rk11, rk12, rk13, rk_last;
+
+#define PRELOAD_ROUND_KEYS(rk, nrounds, vl) \
+ do { \
+ rk0 = __riscv_vle32_v_u32m1(rk + 0 * 4, vl); \
+ rk1 = __riscv_vle32_v_u32m1(rk + 1 * 4, vl); \
+ rk2 = __riscv_vle32_v_u32m1(rk + 2 * 4, vl); \
+ rk3 = __riscv_vle32_v_u32m1(rk + 3 * 4, vl); \
+ rk4 = __riscv_vle32_v_u32m1(rk + 4 * 4, vl); \
+ rk5 = __riscv_vle32_v_u32m1(rk + 5 * 4, vl); \
+ rk6 = __riscv_vle32_v_u32m1(rk + 6 * 4, vl); \
+ rk7 = __riscv_vle32_v_u32m1(rk + 7 * 4, vl); \
+ rk8 = __riscv_vle32_v_u32m1(rk + 8 * 4, vl); \
+ rk9 = __riscv_vle32_v_u32m1(rk + 9 * 4, vl); \
+ if (UNLIKELY(nrounds >= 12)) \
+ { \
+ rk10 = __riscv_vle32_v_u32m1(rk + 10 * 4, vl); \
+ rk11 = __riscv_vle32_v_u32m1(rk + 11 * 4, vl); \
+ if (LIKELY(nrounds > 12)) \
+ { \
+ rk12 = __riscv_vle32_v_u32m1(rk + 12 * 4, vl); \
+ rk13 = __riscv_vle32_v_u32m1(rk + 13 * 4, vl); \
+ } \
+ else \
+ { \
+ rk12 = __riscv_vundefined_u32m1(); \
+ rk13 = __riscv_vundefined_u32m1(); \
+ } \
+ } \
+ else \
+ { \
+ rk10 = __riscv_vundefined_u32m1(); \
+ rk11 = __riscv_vundefined_u32m1(); \
+ rk12 = __riscv_vundefined_u32m1(); \
+ rk13 = __riscv_vundefined_u32m1(); \
+ } \
+ rk_last = __riscv_vle32_v_u32m1(rk + nrounds * 4, vl); \
+ } while (0)
+
+#ifdef HAVE_BROKEN_VAES_VS_INTRINSIC
+#define AES_CRYPT(e_d, mx, nrounds, blk, vlen) \
+ asm ( "vsetvli zero,%[vl],e32,"#mx",ta,ma;\n\t" \
+ "vaesz.vs %[block],%[rk0];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk1];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk2];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk3];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk4];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk5];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk6];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk7];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk8];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk9];\n\t" \
+ "blt %[rounds],%[num12],.Lcryptlast%=;\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk10];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk11];\n\t" \
+ "beq %[rounds],%[num12],.Lcryptlast%=;\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk12];\n\t" \
+ "vaes"#e_d"m.vs %[block],%[rk13];\n\t" \
+ ".Lcryptlast%=:\n\t" \
+ "vaes"#e_d"f.vs %[block],%[rk_last];\n\t" \
+ : [block] "+vr" (blk) \
+ : [vl] "r" (vlen), [rounds] "r" (nrounds), [num12] "r" (12), \
+ [rk0] "vr" (rk0), [rk1] "vr" (rk1), [rk2] "vr" (rk2), \
+ [rk3] "vr" (rk3), [rk4] "vr" (rk4), [rk5] "vr" (rk5), \
+ [rk6] "vr" (rk6), [rk7] "vr" (rk7), [rk8] "vr" (rk8), \
+ [rk9] "vr" (rk9), [rk10] "vr" (rk10), [rk11] "vr" (rk11), \
+ [rk12] "vr" (rk12), [rk13] "vr" (rk13), \
+ [rk_last] "vr" (rk_last) \
+ : "vl")
+#else
+#define AES_CRYPT(e_d, mx, rounds, block, vl) \
+ ({ \
+ (block) = __riscv_vaesz_vs_u32m1_u32##mx((block), rk0, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk1, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk2, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk3, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk4, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk5, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk6, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk7, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk8, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk9, (vl)); \
+ if (UNLIKELY((rounds) >= 12)) \
+ { \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk10, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk11, (vl)); \
+ if (LIKELY((rounds) > 12)) \
+ { \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk12, (vl)); \
+ (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk13, (vl)); \
+ } \
+ } \
+ (block) = __riscv_vaes##e_d##f_vs_u32m1_u32##mx((block), rk_last, (vl)); \
+ })
+#endif
+
+unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *out,
+ const unsigned char *in)
+{
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ vuint32m1_t block;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ block = unaligned_load_u32m1(in, vl);
+
+ AES_CRYPT(e, m1, rounds, block, vl);
+
+ unaligned_store_u32m1(out, block, vl);
+
+ clear_vec_regs();
+
+ return 0; /* does not use stack */
+}
+
+unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *out,
+ const unsigned char *in)
+{
+ const u32 *rk = ctx->keyschdec32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ vuint32m1_t block;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ block = unaligned_load_u32m1(in, vl);
+
+ AES_CRYPT(d, m1, rounds, block, vl);
+
+ unaligned_store_u32m1(out, block, vl);
+
+ clear_vec_regs();
+
+ return 0; /* does not use stack */
+}
+
+static ASM_FUNC_ATTR_INLINE void
+aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = encrypt ? ctx->keyschenc32[0] : ctx->keyschdec32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ ROUND_KEY_VARIABLES;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ do_prepare_decryption(ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint32m4_t blocks;
+
+ blocks = unaligned_load_u32m4(inbuf, vl * 4);
+
+ if (encrypt)
+ AES_CRYPT(e, m4, rounds, blocks, vl * 4);
+ else
+ AES_CRYPT(d, m4, rounds, blocks, vl * 4);
+
+ unaligned_store_u32m4(outbuf, blocks, vl * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t block;
+
+ block = unaligned_load_u32m1(inbuf, vl);
+
+ if (encrypt)
+ AES_CRYPT(e, m1, rounds, block, vl);
+ else
+ AES_CRYPT(d, m1, rounds, block, vl);
+
+ unaligned_store_u32m1(outbuf, block, vl);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ clear_vec_regs();
+}
+
+static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+aes_riscv_zvkned_ecb_enc (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 1);
+}
+
+static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+aes_riscv_zvkned_ecb_dec (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 0);
+}
+
+void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ if (encrypt)
+ aes_riscv_zvkned_ecb_enc (context, outbuf_arg, inbuf_arg, nblocks);
+ else
+ aes_riscv_zvkned_ecb_dec (context, outbuf_arg, inbuf_arg, nblocks);
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ vuint32m1_t iv;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ iv = unaligned_load_u32m1(iv_arg, vl);
+
+ for (; nblocks; nblocks--)
+ {
+ vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
+
+ AES_CRYPT(e, m1, rounds, iv, vl);
+
+ data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(iv), data, vl_bytes);
+ __riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
+ iv = cast_u8m1_u32m1(data);
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(iv_arg, iv, vl);
+
+ clear_vec_regs();
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ size_t outbuf_add = (!cbc_mac) * BLOCKSIZE;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ vuint32m1_t iv;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ iv = unaligned_load_u32m1(iv_arg, vl);
+
+ for (; nblocks; nblocks--)
+ {
+ vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
+ iv = cast_u8m1_u32m1(
+ __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes));
+
+ AES_CRYPT(e, m1, rounds, iv, vl);
+
+ __riscv_vse8_v_u8m1(outbuf, cast_u32m1_u8m1(iv), vl_bytes);
+
+ inbuf += BLOCKSIZE;
+ outbuf += outbuf_add;
+ }
+
+ unaligned_store_u32m1(iv_arg, iv, vl);
+
+ clear_vec_regs();
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ static const byte add_u8_array[4][16] =
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 }
+ };
+ static const u64 carry_add[2] = { 1, 1 };
+ static const u64 nocarry_add[2] = { 1, 0 };
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ u64 ctrlow;
+ vuint32m1_t ctr;
+ vuint8m1_t add1;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ add1 = __riscv_vle8_v_u8m1(add_u8_array[0], vl_bytes);
+ ctr = unaligned_load_u32m1(ctr_arg, vl);
+ ctrlow = __riscv_vmv_x_s_u64m1_u64(cast_u32m1_u64m1(bswap128_u32m1(ctr, vl)));
+
+ memory_barrier_with_vec(add1);
+
+ if (nblocks >= 4)
+ {
+ vuint8m1_t add2 = __riscv_vle8_v_u8m1(add_u8_array[1], vl_bytes);
+ vuint8m1_t add3 = __riscv_vle8_v_u8m1(add_u8_array[2], vl_bytes);
+ vuint8m1_t add4 = __riscv_vle8_v_u8m1(add_u8_array[3], vl_bytes);
+
+ memory_barrier_with_vec(add2);
+ memory_barrier_with_vec(add3);
+ memory_barrier_with_vec(add4);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint8m4_t data4blks;
+ vuint32m4_t ctr4blks;
+
+ /* detect if 8-bit carry handling is needed */
+ if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3))
+ {
+ static const u64 *adders[5][4] =
+ {
+ { nocarry_add, nocarry_add, nocarry_add, carry_add },
+ { nocarry_add, nocarry_add, carry_add, nocarry_add },
+ { nocarry_add, carry_add, nocarry_add, nocarry_add },
+ { carry_add, nocarry_add, nocarry_add, nocarry_add },
+ { nocarry_add, nocarry_add, nocarry_add, nocarry_add }
+ };
+ unsigned int idx = ctrlow <= 3 ? ctrlow : 4;
+ vuint64m1_t ctr_u64;
+ vuint32m1_t ctr_u32_1;
+ vuint32m1_t ctr_u32_2;
+ vuint32m1_t ctr_u32_3;
+ vuint32m1_t ctr_u32_4;
+ vuint64m1_t add_u64;
+
+ /* Byte swap counter */
+ ctr_u64 = cast_u32m1_u64m1(bswap128_u32m1(ctr, vl));
+
+ /* Addition with carry handling */
+ add_u64 = __riscv_vle64_v_u64m1(adders[idx][0], vl / 2);
+ ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
+ ctr_u32_1 = cast_u64m1_u32m1(ctr_u64);
+
+ add_u64 = __riscv_vle64_v_u64m1(adders[idx][1], vl / 2);
+ ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
+ ctr_u32_2 = cast_u64m1_u32m1(ctr_u64);
+
+ add_u64 = __riscv_vle64_v_u64m1(adders[idx][2], vl / 2);
+ ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
+ ctr_u32_3 = cast_u64m1_u32m1(ctr_u64);
+
+ add_u64 = __riscv_vle64_v_u64m1(adders[idx][3], vl / 2);
+ ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2);
+ ctr_u32_4 = cast_u64m1_u32m1(ctr_u64);
+
+ /* Byte swap counters */
+ ctr_u32_1 = bswap128_u32m1(ctr_u32_1, vl);
+ ctr_u32_2 = bswap128_u32m1(ctr_u32_2, vl);
+ ctr_u32_3 = bswap128_u32m1(ctr_u32_3, vl);
+ ctr_u32_4 = bswap128_u32m1(ctr_u32_4, vl);
+
+ ctr4blks = __riscv_vundefined_u32m4();
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr_u32_1);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr_u32_2);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr_u32_3);
+ ctr = ctr_u32_4;
+ }
+ else
+ {
+ /* Fast path addition without carry handling */
+ vuint8m1_t ctr_u8 = cast_u32m1_u8m1(ctr);
+ vuint8m1_t ctr1 = __riscv_vadd_vv_u8m1(ctr_u8, add1, vl_bytes);
+ vuint8m1_t ctr2 = __riscv_vadd_vv_u8m1(ctr_u8, add2, vl_bytes);
+ vuint8m1_t ctr3 = __riscv_vadd_vv_u8m1(ctr_u8, add3, vl_bytes);
+ vuint8m4_t ctr0123_u8 = __riscv_vundefined_u8m4();
+
+ ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(ctr_u8, add4,
+ vl_bytes));
+
+ ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 0, ctr_u8);
+ ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 1, ctr1);
+ ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 2, ctr2);
+ ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 3, ctr3);
+
+ ctr4blks = cast_u8m4_u32m4(ctr0123_u8);
+ }
+
+ data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4);
+
+ AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4);
+
+ data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks,
+ vl_bytes * 4);
+ __riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t block = ctr;
+ vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
+
+ /* detect if 8-bit carry handling is needed */
+ if (UNLIKELY((++ctrlow & 0xff) == 0))
+ {
+ const u64 *add_arr = UNLIKELY(ctrlow == 0) ? carry_add : nocarry_add;
+ vuint64m1_t add_val = __riscv_vle64_v_u64m1(add_arr, vl / 2);
+
+ /* Byte swap counter */
+ ctr = bswap128_u32m1(ctr, vl);
+
+ /* Addition with carry handling */
+ ctr = cast_u64m1_u32m1(__riscv_vadd_vv_u64m1(cast_u32m1_u64m1(ctr),
+ add_val, vl / 2));
+
+ /* Byte swap counter */
+ ctr = bswap128_u32m1(ctr, vl);
+ }
+ else
+ {
+ /* Fast path addition without carry handling */
+ ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(cast_u32m1_u8m1(ctr),
+ add1, vl_bytes));
+ }
+
+ AES_CRYPT(e, m1, rounds, block, vl);
+
+ data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes);
+ __riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(ctr_arg, ctr, vl);
+
+ clear_vec_regs();
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ static const u32 add_u32_array[4][16] =
+ {
+ { 1, }, { 2, }, { 3, }, { 4, }
+ };
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ vuint32m1_t ctr;
+ vuint32m1_t add1;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ add1 = __riscv_vle32_v_u32m1(add_u32_array[0], vl);
+ ctr = unaligned_load_u32m1(ctr_arg, vl);
+
+ memory_barrier_with_vec(add1);
+
+ if (nblocks >= 4)
+ {
+ vuint32m1_t add2 = __riscv_vle32_v_u32m1(add_u32_array[1], vl);
+ vuint32m1_t add3 = __riscv_vle32_v_u32m1(add_u32_array[2], vl);
+ vuint32m1_t add4 = __riscv_vle32_v_u32m1(add_u32_array[3], vl);
+
+ memory_barrier_with_vec(add2);
+ memory_barrier_with_vec(add3);
+ memory_barrier_with_vec(add4);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint32m1_t ctr1 = __riscv_vadd_vv_u32m1(ctr, add1, vl);
+ vuint32m1_t ctr2 = __riscv_vadd_vv_u32m1(ctr, add2, vl);
+ vuint32m1_t ctr3 = __riscv_vadd_vv_u32m1(ctr, add3, vl);
+ vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
+ vuint8m4_t data4blks;
+
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr1);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr2);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr3);
+ ctr = __riscv_vadd_vv_u32m1(ctr, add4, vl);
+
+ data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4);
+
+ AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4);
+
+ data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks,
+ vl_bytes * 4);
+ __riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t block = ctr;
+ vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
+
+ ctr = __riscv_vadd_vv_u32m1(ctr, add1, vl);
+
+ AES_CRYPT(e, m1, rounds, block, vl);
+
+ data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes);
+ __riscv_vse8_v_u8m1(outbuf, data, vl_bytes);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(ctr_arg, ctr, vl);
+
+ clear_vec_regs();
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ vuint32m1_t iv;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ iv = unaligned_load_u32m1(iv_arg, vl);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
+ vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0);
+ vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1);
+ vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2);
+ vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3);
+ vuint32m4_t iv4blks = __riscv_vundefined_u32m4();
+
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv);
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1);
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2);
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3);
+ iv = iv4;
+
+ AES_CRYPT(e, m4, rounds, iv4blks, vl * 4);
+
+ data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4);
+ unaligned_store_u32m4(outbuf, data4blks, vl * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
+ vuint32m1_t new_iv = data;
+
+ AES_CRYPT(e, m1, rounds, iv, vl);
+
+ data = vxor_u8_u32m1(iv, data, vl);
+ unaligned_store_u32m1(outbuf, data, vl);
+ iv = new_iv;
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(iv_arg, iv, vl);
+
+ clear_vec_regs();
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschdec32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ vuint32m1_t iv;
+ ROUND_KEY_VARIABLES;
+
+ if (!ctx->decryption_prepared)
+ {
+ do_prepare_decryption(ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ iv = unaligned_load_u32m1(iv_arg, vl);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
+ vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0);
+ vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1);
+ vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2);
+ vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3);
+ vuint32m4_t iv4blks = __riscv_vundefined_u32m4();
+
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv);
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1);
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2);
+ iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3);
+
+ AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
+
+ data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4);
+ unaligned_store_u32m4(outbuf, data4blks, vl * 4);
+ iv = iv4;
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
+ vuint32m1_t new_iv = data;
+
+ AES_CRYPT(d, m1, rounds, data, vl);
+
+ data = vxor_u8_u32m1(iv, data, vl);
+ unaligned_store_u32m1(outbuf, data, vl);
+ iv = new_iv;
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(iv_arg, iv, vl);
+
+ clear_vec_regs();
+}
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t
+aes_riscv_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ vuint32m1_t iv;
+ vuint32m1_t ctr;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ /* Preload Offset and Checksum */
+ iv = unaligned_load_u32m1(c->u_iv.iv, vl);
+ ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl);
+
+ if (nblocks >= 4)
+ {
+ vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
+ vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
+
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ const unsigned char *l;
+ vuint8m1_t l_ntzi;
+ vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
+ vuint32m4_t offsets = __riscv_vundefined_u32m4();
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
+
+ data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
+
+ AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
+
+ data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
+
+ unaligned_store_u32m4(outbuf, data4blks, vl * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
+ __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
+ ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
+ ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ const unsigned char *l;
+ vuint8m1_t l_ntzi;
+ vuint32m1_t data;
+
+ data = unaligned_load_u32m1(inbuf, vl);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr = vxor_u8_u32m1(ctr, data, vl);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+
+ data = vxor_u8_u32m1(data, iv, vl);
+
+ AES_CRYPT(e, m1, rounds, data, vl);
+
+ data = vxor_u8_u32m1(iv, data, vl);
+ unaligned_store_u32m1(outbuf, data, vl);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+
+ unaligned_store_u32m1(c->u_iv.iv, iv, vl);
+ unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl);
+
+ clear_vec_regs();
+
+ return 0;
+}
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t
+aes_riscv_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ u64 n = c->u_mode.ocb.data_nblocks;
+ const u32 *rk = ctx->keyschdec32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ vuint32m1_t iv;
+ vuint32m1_t ctr;
+ ROUND_KEY_VARIABLES;
+
+ if (!ctx->decryption_prepared)
+ {
+ do_prepare_decryption(ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ /* Preload Offset and Checksum */
+ iv = unaligned_load_u32m1(c->u_iv.iv, vl);
+ ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl);
+
+ if (nblocks >= 4)
+ {
+ vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
+ vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
+
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ const unsigned char *l;
+ vuint8m1_t l_ntzi;
+ vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
+ vuint32m4_t offsets = __riscv_vundefined_u32m4();
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
+
+ data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
+
+ AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
+
+ data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
+
+ unaligned_store_u32m4(outbuf, data4blks, vl * 4);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
+ __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
+ ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
+ ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ const unsigned char *l;
+ vuint8m1_t l_ntzi;
+ vuint8m1_t data;
+ vuint32m1_t block;
+
+ l = ocb_get_l(c, ++n);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ data = __riscv_vle8_v_u8m1(inbuf, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ data = __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes);
+ block = cast_u8m1_u32m1(data);
+
+ AES_CRYPT(d, m1, rounds, block, vl);
+
+ block = vxor_u8_u32m1(iv, block, vl);
+ unaligned_store_u32m1(outbuf, block, vl);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ ctr = vxor_u8_u32m1(ctr, block, vl);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.data_nblocks = n;
+
+ unaligned_store_u32m1(c->u_iv.iv, iv, vl);
+ unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl);
+
+ clear_vec_regs();
+
+ return 0;
+}
+
+size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt)
+{
+ if (encrypt)
+ return aes_riscv_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+ else
+ return aes_riscv_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
+}
+
+size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2
+_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ u64 n = c->u_mode.ocb.aad_nblocks;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ size_t vl_bytes = vl * 4;
+ vuint32m1_t iv;
+ vuint32m1_t ctr;
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ /* Preload Offset and Sum */
+ iv = unaligned_load_u32m1(c->u_mode.ocb.aad_offset, vl);
+ ctr = unaligned_load_u32m1(c->u_mode.ocb.aad_sum, vl);
+
+ if (nblocks >= 4)
+ {
+ vuint32m4_t ctr4blks = __riscv_vundefined_u32m4();
+ vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl);
+
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero);
+ ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ const unsigned char *l;
+ vuint8m1_t l_ntzi;
+ vuint32m4_t data4blks = unaligned_load_u32m4(abuf, vl * 4);
+ vuint32m4_t offsets = __riscv_vundefined_u32m4();
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv);
+
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+ offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv);
+
+ data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4);
+
+ AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
+
+ ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4);
+
+ abuf += 4 * BLOCKSIZE;
+ }
+
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0),
+ __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl);
+ ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl);
+ ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl);
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ const unsigned char *l;
+ vuint8m1_t l_ntzi;
+ vuint32m1_t data;
+
+ data = unaligned_load_u32m1(abuf, vl);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes);
+ iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl);
+
+ data = vxor_u8_u32m1(data, iv, vl);
+
+ AES_CRYPT(e, m1, rounds, data, vl);
+
+ ctr = vxor_u8_u32m1(ctr, data, vl);
+
+ abuf += BLOCKSIZE;
+ }
+
+ c->u_mode.ocb.aad_nblocks = n;
+
+ unaligned_store_u32m1(c->u_mode.ocb.aad_offset, iv, vl);
+ unaligned_store_u32m1(c->u_mode.ocb.aad_sum, ctr, vl);
+
+ clear_vec_regs();
+
+ return 0;
+}
+
+static const u64 xts_gfmul_const[2] = { 0x87, 0x01 };
+static const u64 xts_swap64_const[2] = { 1, 0 };
+
+static ASM_FUNC_ATTR_INLINE vuint32m1_t
+xts_gfmul_byA (vuint32m1_t vec_in, vuint64m1_t xts_gfmul,
+ vuint64m1_t xts_swap64, size_t vl)
+{
+ vuint64m1_t in_u64 = cast_u32m1_u64m1(vec_in);
+ vuint64m1_t tmp1;
+
+ tmp1 =
+ __riscv_vrgather_vv_u64m1(cast_u32m1_u64m1(vec_in), xts_swap64, vl / 2);
+ tmp1 = cast_i64m1_u64m1(
+ __riscv_vsra_vx_i64m1(cast_u64m1_i64m1(tmp1), 63, vl / 2));
+ in_u64 = __riscv_vadd_vv_u64m1(in_u64, in_u64, vl / 2);
+ tmp1 = __riscv_vand_vv_u64m1(tmp1, xts_gfmul, vl / 2);
+
+ return cast_u64m1_u32m1(__riscv_vxor_vv_u64m1(in_u64, tmp1, vl / 2));
+}
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+aes_riscv_xts_enc (void *context, unsigned char *tweak_arg, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschenc32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ vuint32m1_t tweak;
+ vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2);
+ vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2);
+ ROUND_KEY_VARIABLES;
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ /* Preload tweak */
+ tweak = unaligned_load_u32m1(tweak_arg, vl);
+
+ memory_barrier_with_vec(xts_gfmul);
+ memory_barrier_with_vec(xts_swap64);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
+ vuint32m4_t tweaks = __riscv_vundefined_u32m4();
+
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+
+ data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
+
+ AES_CRYPT(e, m4, rounds, data4blks, vl * 4);
+
+ data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
+
+ unaligned_store_u32m4(outbuf, data4blks, vl * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
+ vuint32m1_t tweak0 = tweak;
+
+ data = vxor_u8_u32m1(data, tweak0, vl);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+
+ AES_CRYPT(e, m1, rounds, data, vl);
+
+ data = vxor_u8_u32m1(data, tweak0, vl);
+ unaligned_store_u32m1(outbuf, data, vl);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(tweak_arg, tweak, vl);
+
+ clear_vec_regs();
+}
+
+static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+aes_riscv_xts_dec (void *context, unsigned char *tweak_arg, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ const u32 *rk = ctx->keyschdec32[0];
+ int rounds = ctx->rounds;
+ size_t vl = 4;
+ vuint32m1_t tweak;
+ vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2);
+ vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2);
+ ROUND_KEY_VARIABLES;
+
+ if (!ctx->decryption_prepared)
+ {
+ do_prepare_decryption(ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rk, rounds, vl);
+
+ /* Preload tweak */
+ tweak = unaligned_load_u32m1(tweak_arg, vl);
+
+ memory_barrier_with_vec(xts_gfmul);
+ memory_barrier_with_vec(xts_swap64);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4);
+ vuint32m4_t tweaks = __riscv_vundefined_u32m4();
+
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+ tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+
+ data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
+
+ AES_CRYPT(d, m4, rounds, data4blks, vl * 4);
+
+ data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4);
+
+ unaligned_store_u32m4(outbuf, data4blks, vl * 4);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ vuint32m1_t data = unaligned_load_u32m1(inbuf, vl);
+ vuint32m1_t tweak0 = tweak;
+
+ data = vxor_u8_u32m1(data, tweak0, vl);
+ tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl);
+
+ AES_CRYPT(d, m1, rounds, data, vl);
+
+ data = vxor_u8_u32m1(data, tweak0, vl);
+ unaligned_store_u32m1(outbuf, data, vl);
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ unaligned_store_u32m1(tweak_arg, tweak, vl);
+
+ clear_vec_regs();
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void
+_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak_arg,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ if (encrypt)
+ aes_riscv_xts_enc(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks);
+ else
+ aes_riscv_xts_dec(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks);
+}
+
+#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 0c48793b..52500e59 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -280,6 +280,63 @@ extern void _gcry_aes_vp_riscv_xts_crypt (void *context, unsigned char *tweak,
size_t nblocks, int encrypt);
#endif
+#ifdef USE_RISCV_V_CRYPTO
+/* RISC-V vector cryptography extension implementation of AES */
+extern int
+_gcry_aes_riscv_zvkned_setup_acceleration (RIJNDAEL_context *ctx);
+
+extern void
+_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key);
+extern void
+_gcry_aes_riscv_zvkned_prepare_decryption (RIJNDAEL_context *ctx);
+
+extern unsigned int
+_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int
+_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src);
+extern void
+_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void
+_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac);
+extern void
+_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void
+_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void
+_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern void
+_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
+extern size_t
+_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern size_t
+_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
+extern void
+_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+extern void
+_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+#endif
+
#ifdef USE_PADLOCK
extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx,
@@ -774,9 +831,36 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->xts_crypt = _gcry_aes_vp_aarch64_xts_crypt;
}
#endif
+#ifdef USE_RISCV_V_CRYPTO
+ else if ((hwfeatures & HWF_RISCV_IMAFDC)
+ && (hwfeatures & HWF_RISCV_V)
+ && (hwfeatures & HWF_RISCV_ZVKNED)
+ && _gcry_aes_riscv_zvkned_setup_acceleration(ctx))
+ {
+ hw_setkey = _gcry_aes_riscv_zvkned_setkey;
+ ctx->encrypt_fn = _gcry_aes_riscv_zvkned_encrypt;
+ ctx->decrypt_fn = _gcry_aes_riscv_zvkned_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_riscv_zvkned_prepare_decryption;
+
+ /* Setup RISC-V vector cryptography bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_riscv_zvkned_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_riscv_zvkned_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_riscv_zvkned_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_riscv_zvkned_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_riscv_zvkned_ctr_enc;
+ bulk_ops->ctr32le_enc = _gcry_aes_riscv_zvkned_ctr32le_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_riscv_zvkned_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_riscv_zvkned_ocb_auth;
+ bulk_ops->ecb_crypt = _gcry_aes_riscv_zvkned_ecb_crypt;
+ bulk_ops->xts_crypt = _gcry_aes_riscv_zvkned_xts_crypt;
+ }
+#endif
#ifdef USE_VP_RISCV
- else if ((hwfeatures & HWF_RISCV_IMAFDC) && (hwfeatures & HWF_RISCV_V) &&
- _gcry_aes_vp_riscv_setup_acceleration(ctx))
+ else if ((hwfeatures & HWF_RISCV_IMAFDC)
+ && (hwfeatures & HWF_RISCV_V)
+ && _gcry_aes_vp_riscv_setup_acceleration(ctx))
{
hw_setkey = _gcry_aes_vp_riscv_do_setkey;
ctx->encrypt_fn = _gcry_aes_vp_riscv_encrypt;
@@ -785,7 +869,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
ctx->prefetch_dec_fn = NULL;
ctx->prepare_decryption = _gcry_aes_vp_riscv_prepare_decryption;
- /* Setup vector permute AArch64 bulk encryption routines. */
+ /* Setup vector permute RISC-V bulk encryption routines. */
bulk_ops->cfb_enc = _gcry_aes_vp_riscv_cfb_enc;
bulk_ops->cfb_dec = _gcry_aes_vp_riscv_cfb_dec;
bulk_ops->cbc_enc = _gcry_aes_vp_riscv_cbc_enc;
diff --git a/configure.ac b/configure.ac
index d45ea851..45fe5143 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2812,6 +2812,32 @@ m4_define([GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST],
__riscv_vse32_v_u32m1(ptr + 4 * vl, w2, vl);
__riscv_vse32_v_u32m1(ptr + 5 * vl, w3, vl);
}
+ void test_aes_key(unsigned int *ptr)
+ {
+ int vl = __riscv_vsetvl_e32m1 (4);
+ vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl);
+ vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl);
+ vuint32m1_t c = __riscv_vaeskf1_vi_u32m1(a, 1, vl);
+ vuint32m1_t d = __riscv_vaeskf2_vi_u32m1(a, b, 2, vl);
+ __riscv_vse32_v_u32m1(ptr + 0 * vl, c, vl);
+ __riscv_vse32_v_u32m1(ptr + 1 * vl, d, vl);
+ }
+ void test_aes_crypt(unsigned int *ptr)
+ {
+ int vl = __riscv_vsetvl_e32m1 (4);
+ vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl);
+ vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl);
+ vuint32m1_t c = __riscv_vaesz_vs_u32m1_u32m1(a, b, vl);
+ vuint32m1_t d = __riscv_vaesem_vs_u32m1_u32m1(a, b, vl);
+ vuint32m1_t e = __riscv_vaesef_vs_u32m1_u32m1(a, b, vl);
+ vuint32m1_t f = __riscv_vaesdm_vs_u32m1_u32m1(a, b, vl);
+ vuint32m1_t g = __riscv_vaesdf_vs_u32m1_u32m1(a, b, vl);
+ __riscv_vse32_v_u32m1(ptr + 0 * vl, c, vl);
+ __riscv_vse32_v_u32m1(ptr + 1 * vl, d, vl);
+ __riscv_vse32_v_u32m1(ptr + 2 * vl, e, vl);
+ __riscv_vse32_v_u32m1(ptr + 3 * vl, f, vl);
+ __riscv_vse32_v_u32m1(ptr + 4 * vl, g, vl);
+ }
void test_inline_vec_asm(unsigned int *ptr)
{
int vl = __riscv_vsetvl_e32m1 (4);
@@ -2918,6 +2944,53 @@ EOF
CFLAGS=$_gcc_cflags_save;
fi
+#
+# Check whether compiler has working RISC-V vector __riscv_vaes*_vs intrinsics
+#
+# Some GCC versions generate a 'vsetvli' instruction with an incorrect 'm1'
+# LMUL instead of the expected 'mX' for the __riscv_vaes*_vs_u32m1_u32mX
+# intrinsics.
+#
+if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" ||
+ test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then
+
+ # Setup flags for test.
+ _gcc_cflags_save=$CFLAGS
+ if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then
+ CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align"
+ else
+ CFLAGS="$CFLAGS -O2"
+ fi
+
+ AC_CACHE_CHECK([whether compiler has working RISC-V __riscv_vaes*_vs intrinsics],
+ [gcry_cv_riscv_vaes_vs_intrinsics_work],
+ [gcry_cv_riscv_vaes_vs_intrinsics_work=no
+ cat > conftest.c <<EOF
+#include <riscv_vector.h>
+vuint32m4_t test_fn(vuint32m4_t a, vuint32m1_t b, int vl)
+{
+ /* This intrinsic should result in a 'vsetvli' with m4 */
+ return __riscv_vaesem_vs_u32m1_u32m4(a, b, vl);
+}
+EOF
+
+ if $CC $CFLAGS -S conftest.c -o conftest.s >&5 2>&5; then
+ if grep -E 'vsetvli.*,[[[:space:]]]*m4[[[:space:]]]*,' conftest.s >/dev/null 2>&1; then
+ gcry_cv_riscv_vaes_vs_intrinsics_work=yes
+ fi
+ fi
+ rm -f conftest.*
+ ])
+
+ if test "$gcry_cv_riscv_vaes_vs_intrinsics_work" = "no"; then
+ AC_DEFINE([HAVE_BROKEN_VAES_VS_INTRINSIC], [1],
+ [Define to 1 if __riscv_vaes*_vs intrinsics are broken])
+ fi
+
+ # Restore flags.
+ CFLAGS=$_gcc_cflags_save;
+fi
+
#######################################
#### Checks for library functions. ####
@@ -3390,6 +3463,9 @@ if test "$found" = "1" ; then
riscv64-*-*)
# Build with the vector permute SIMD128 implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vp-riscv.lo"
+
+ # Build with the RISC-V vector cryptography implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-riscv-zvkned.lo"
;;
s390x-*-*)
# Big-Endian.
diff --git a/src/g10lib.h b/src/g10lib.h
index 4fa91ba9..991ec3ea 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -281,8 +281,9 @@ char **_gcry_strtokenize (const char *string, const char *delim);
#define HWF_RISCV_ZBB (1 << 2)
#define HWF_RISCV_ZBC (1 << 3)
#define HWF_RISCV_ZVKB (1 << 4)
-#define HWF_RISCV_ZVKNHA (1 << 5)
-#define HWF_RISCV_ZVKNHB (1 << 6)
+#define HWF_RISCV_ZVKNED (1 << 5)
+#define HWF_RISCV_ZVKNHA (1 << 6)
+#define HWF_RISCV_ZVKNHB (1 << 7)
#endif
diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c
index 925284a1..c37fd8dc 100644
--- a/src/hwf-riscv.c
+++ b/src/hwf-riscv.c
@@ -191,6 +191,7 @@ detect_riscv_at_hwcap(void)
#define HWF_RISCV_HWPROBE_EXT_ZBS (1U << 5)
#define HWF_RISCV_HWPROBE_EXT_ZBC (1U << 7)
#define HWF_RISCV_HWPROBE_EXT_ZVKB (1U << 19)
+#define HWF_RISCV_HWPROBE_EXT_ZVKNED (1U << 21)
#define HWF_RISCV_HWPROBE_EXT_ZVKNHA (1U << 22)
#define HWF_RISCV_HWPROBE_EXT_ZVKNHB (1U << 23)
#define HWF_RISCV_HWPROBE_EXT_ZICOND (U64_C(1) << 35)
@@ -215,6 +216,7 @@ static const struct hwprobe_feature_map_s hwprobe_features[] =
{ HWF_RISCV_HWPROBE_EXT_ZBB, HWF_RISCV_ZBB },
{ HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC },
{ HWF_RISCV_HWPROBE_EXT_ZVKB, HWF_RISCV_ZVKB },
+ { HWF_RISCV_HWPROBE_EXT_ZVKNED, HWF_RISCV_ZVKNED },
{ HWF_RISCV_HWPROBE_EXT_ZVKNHA, HWF_RISCV_ZVKNHA },
{ HWF_RISCV_HWPROBE_EXT_ZVKNHB, HWF_RISCV_ZVKNHB },
};
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index df2aaf17..0752d787 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -99,6 +99,7 @@ static struct
{ HWF_RISCV_ZVKB, "riscv-zvkb" },
{ HWF_RISCV_ZVKNHA, "riscv-zvknha" },
{ HWF_RISCV_ZVKNHB, "riscv-zvknhb" },
+ { HWF_RISCV_ZVKNED, "riscv-zvkned" },
#endif
};
--
2.48.1