From jussi.kivilinna at iki.fi Mon Jan 6 16:08:52 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Jan 2025 17:08:52 +0200 Subject: [PATCH 5/6] chacha20: add RISC-V vector intrinsics implementation In-Reply-To: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> References: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> Message-ID: <20250106150853.1779326-5-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'chacha20-riscv-v.c' and add ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS handling for 'chacha20-riscv-v.o' and 'chacha20-riscv-v.lo'. * cipher/chacha20-riscv-v.c: New. * cipher/chacha20.c (USE_RISCV_V): New. (CHACHA20_context_s): Add 'use_riscv_v'. [USE_RISCV_V] (_gcry_chacha20_riscv_v_blocks) (_gcry_chacha20_riscv_v_check_hw): New. (chacha20_blocks) [USE_RISCV_V]: Add RISC-V vector code path. (chacha20_do_setkey) [USE_RISCV_V]: Add HW feature detection for RISC-V vector implementation. * configure.ac: Add 'chacha20-riscv-v.lo'. -- Patch adds RISC-V vector extension implementation. Variable length vector implementation is used for large inputs (4 blocks or more blocks) and fixed width 128-bit vector implementation is used for shorter input. Benchmark on SpacemiT K1 (1600 Mhz): Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 10.67 ns/B 89.37 MiB/s 17.07 c/B After (3x faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 3.41 ns/B 279.9 MiB/s 5.45 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 10 +- cipher/chacha20-riscv-v.c | 565 ++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 29 ++ configure.ac | 4 + 4 files changed, 606 insertions(+), 2 deletions(-) create mode 100644 cipher/chacha20-riscv-v.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index a0a4d7d8..d871d38d 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -87,8 +87,8 @@ EXTRA_libcipher_la_SOURCES = \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \ chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \ - chacha20-ppc.c chacha20-s390x.S \ - chacha20-p10le-8x.s \ + chacha20-ppc.c chacha20-s390x.S chacha20-p10le-8x.s \ + chacha20-riscv-v.c \ cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \ cipher-gcm-aarch64-simd.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ @@ -359,6 +359,12 @@ else riscv_vector_cflags = endif +chacha20-riscv-v.o: $(srcdir)/chacha20-riscv-v.c Makefile + `echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` + +chacha20-riscv-v.lo: $(srcdir)/chacha20-riscv-v.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` + rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile `echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/chacha20-riscv-v.c b/cipher/chacha20-riscv-v.c new file mode 100644 index 00000000..1304a333 --- /dev/null +++ b/cipher/chacha20-riscv-v.c @@ -0,0 +1,565 @@ +/* chacha20-riscv-v.c - RISC-V vector implementation of ChaCha20 + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(USE_CHACHA20) + +#include "simd-common-riscv.h" +#include +#include "bufhelp.h" + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + + +/********************************************************************** + RISC-V vector extension chacha20 + **********************************************************************/ + +#define ROTATE16(v) __riscv_vreinterpret_v_u16m1_u32m1( \ + __riscv_vrgather_vv_u16m1( \ + __riscv_vreinterpret_v_u32m1_u16m1(v), \ + rot16, vl * 2)) +#define ROTATE8(v) __riscv_vreinterpret_v_u8m1_u32m1( \ + __riscv_vrgather_vv_u8m1( \ + __riscv_vreinterpret_v_u32m1_u8m1(v), \ + rot8, vl * 4)) +#define ROTATE(v, c) __riscv_vadd_vv_u32m1( \ + __riscv_vsll_vx_u32m1((v), (c), vl), \ + __riscv_vsrl_vx_u32m1((v), 32 - (c), vl), vl) +#define XOR(v, w) __riscv_vxor_vv_u32m1((v), (w), vl) +#define PLUS(v, w) __riscv_vadd_vv_u32m1((v), (w), vl) +#define WORD_ROL(v, c) __riscv_vrgather_vv_u32m1((v), (rol##c), vl) + +#define QUARTERROUND_4(a0, b0, c0, d0, a1, b1, c1, d1, \ + a2, b2, c2, d2, a3, b3, c3, d3) \ + a0 = PLUS(a0, b0); a1 = PLUS(a1, b1); \ + a2 = PLUS(a2, b2); a3 = PLUS(a3, b3); \ + d0 = XOR(d0, a0); d1 = XOR(d1, a1); \ + d2 = XOR(d2, a2); d3 = XOR(d3, a3); \ + d0 = ROTATE16(d0); d1 = ROTATE16(d1); \ + d2 = ROTATE16(d2); d3 = ROTATE16(d3); \ + c0 = PLUS(c0, d0); c1 = PLUS(c1, d1); \ + c2 = PLUS(c2, d2); c3 = PLUS(c3, d3); \ + b0 = XOR(b0, c0); b1 = XOR(b1, c1); \ + b2 = XOR(b2, c2); b3 = XOR(b3, c3); \ + b0 = ROTATE(b0, 12); b1 = ROTATE(b1, 12); \ + b2 = ROTATE(b2, 12); b3 = ROTATE(b3, 12); \ + a0 = PLUS(a0, b0); a1 = PLUS(a1, b1); \ + a2 = PLUS(a2, b2); a3 = PLUS(a3, b3); \ + d0 = XOR(d0, a0); d1 = XOR(d1, a1); \ + d2 = XOR(d2, a2); d3 = XOR(d3, a3); \ + d0 = ROTATE8(d0); d1 = ROTATE8(d1); \ + d2 = ROTATE8(d2); d3 = ROTATE8(d3); \ + c0 = PLUS(c0, d0); c1 = PLUS(c1, d1); \ + c2 = PLUS(c2, d2); c3 = PLUS(c3, d3); \ + b0 = XOR(b0, c0); b1 = XOR(b1, c1); \ + b2 = XOR(b2, c2); b3 = XOR(b3, c3); \ + b0 = ROTATE(b0, 7); b1 = ROTATE(b1, 7); \ + b2 = ROTATE(b2, 7); b3 = ROTATE(b3, 7); + +#define QUARTERROUND4_2(x0, x1, x2, x3, y0, y1, y2, y3, rol_x1, rol_x2, rol_x3) \ + x0 = PLUS(x0, x1); y0 = PLUS(y0, y1); \ + x3 = XOR(x3, x0); y3 = XOR(y3, y0); \ + x3 = ROTATE16(x3); y3 = ROTATE16(y3); \ + x2 = PLUS(x2, x3); y2 = PLUS(y2, y3); \ + x1 = XOR(x1, x2); y1 = XOR(y1, y2); \ + x1 = ROTATE(x1, 12); y1 = ROTATE(y1, 12); \ + x0 = PLUS(x0, x1); y0 = PLUS(y0, y1); \ + x3 = XOR(x3, x0); y3 = XOR(y3, y0); \ + x3 = ROTATE8(x3); y3 = ROTATE8(y3); \ + x2 = PLUS(x2, x3); y2 = PLUS(y2, y3); \ + x3 = WORD_ROL(x3, rol_x3); y3 = WORD_ROL(y3, rol_x3);\ + x1 = XOR(x1, x2); y1 = XOR(y1, y2); \ + x2 = WORD_ROL(x2, rol_x2); y2 = WORD_ROL(y2, rol_x2); \ + x1 = ROTATE(x1, 7); y1 = ROTATE(y1, 7); \ + x1 = WORD_ROL(x1, rol_x1); y1 = 
WORD_ROL(y1, rol_x1); + +#define QUARTERROUND4(x0, x1, x2, x3, rol_x1, rol_x2, rol_x3) \ + x0 = PLUS(x0, x1); x3 = XOR(x3, x0); x3 = ROTATE16(x3); \ + x2 = PLUS(x2, x3); x1 = XOR(x1, x2); x1 = ROTATE(x1, 12); \ + x0 = PLUS(x0, x1); x3 = XOR(x3, x0); x3 = ROTATE8(x3); \ + x2 = PLUS(x2, x3); \ + x3 = WORD_ROL(x3, rol_x3); \ + x1 = XOR(x1, x2); \ + x2 = WORD_ROL(x2, rol_x2); \ + x1= ROTATE(x1, 7); \ + x1 = WORD_ROL(x1, rol_x1); + +#define ADD_U64(a, b) __riscv_vreinterpret_v_u64m1_u32m1( \ + __riscv_vadd_vv_u64m1( \ + __riscv_vreinterpret_v_u32m1_u64m1(a), \ + __riscv_vreinterpret_v_u32m1_u64m1(b), vl / 2)) + +#define vxor_v_u32m1_u32m1x8(data, idx, vs, vl) \ + __riscv_vset_v_u32m1_u32m1x8((data), (idx), \ + __riscv_vxor_vv_u32m1( \ + __riscv_vget_v_u32m1x8_u32m1((data), (idx)), (vs), (vl))) + +static ASM_FUNC_ATTR_INLINE vuint16m1_t +gen_rot16(size_t vl) +{ + return __riscv_vxor_vx_u16m1(__riscv_vid_v_u16m1(vl * 2), 1, vl * 2); +} + +static ASM_FUNC_ATTR_INLINE vuint8m1_t +gen_rot8(size_t vl) +{ + vuint8m1_t rot8, rot8_hi; + + rot8 = __riscv_vid_v_u8m1(vl * 4); + rot8_hi = __riscv_vand_vx_u8m1(rot8, ~3, vl * 4); + rot8 = __riscv_vadd_vx_u8m1(rot8, 3, vl * 4); + rot8 = __riscv_vand_vx_u8m1(rot8, 3, vl * 4); + rot8 = __riscv_vadd_vv_u8m1(rot8, rot8_hi, vl * 4); + + return rot8; +} + +static ASM_FUNC_ATTR_INLINE vuint16m2_t +gen_indexes(size_t vl, size_t stride) +{ + vuint16m2_t idx = __riscv_vid_v_u16m2(vl * 4); + vuint16m2_t idx_lo = __riscv_vand_vx_u16m2(idx, 3, vl * 4); + vuint16m2_t idx_hi = __riscv_vsrl_vx_u16m2(idx, 2, vl * 4); + idx_hi = __riscv_vmul_vx_u16m2(idx_hi, stride, vl * 4); + return __riscv_vadd_vv_u16m2(idx_hi, idx_lo, vl * 4); +} + +static ASM_FUNC_ATTR_INLINE vuint32m1x8_t +unaligned_vlsseg8e32_v_u32m1x8(const void *src, size_t vl) +{ + const byte *bsrc = src; + vuint16m2_t indexes; + vuint8m1_t b0, b1, b2, b3, b4, b5, b6, b7; + vuint32m1x8_t data; + + if (LIKELY(((uintptr_t)src & 3) == 0)) + { + /* Fast path for 32-bit aligned loads. 
*/ + return __riscv_vlsseg8e32_v_u32m1x8(src, 64, vl); + } + + indexes = gen_indexes(4 * vl, 64); + + b0 = __riscv_vluxei16_v_u8m1(bsrc + 0 * 4, indexes, vl * 4); + b1 = __riscv_vluxei16_v_u8m1(bsrc + 1 * 4, indexes, vl * 4); + b2 = __riscv_vluxei16_v_u8m1(bsrc + 2 * 4, indexes, vl * 4); + b3 = __riscv_vluxei16_v_u8m1(bsrc + 3 * 4, indexes, vl * 4); + b4 = __riscv_vluxei16_v_u8m1(bsrc + 4 * 4, indexes, vl * 4); + b5 = __riscv_vluxei16_v_u8m1(bsrc + 5 * 4, indexes, vl * 4); + b6 = __riscv_vluxei16_v_u8m1(bsrc + 6 * 4, indexes, vl * 4); + b7 = __riscv_vluxei16_v_u8m1(bsrc + 7 * 4, indexes, vl * 4); + + data = __riscv_vundefined_u32m1x8(); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 0, __riscv_vreinterpret_v_u8m1_u32m1(b0)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 1, __riscv_vreinterpret_v_u8m1_u32m1(b1)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 2, __riscv_vreinterpret_v_u8m1_u32m1(b2)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 3, __riscv_vreinterpret_v_u8m1_u32m1(b3)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 4, __riscv_vreinterpret_v_u8m1_u32m1(b4)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 5, __riscv_vreinterpret_v_u8m1_u32m1(b5)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 6, __riscv_vreinterpret_v_u8m1_u32m1(b6)); + data = __riscv_vset_v_u32m1_u32m1x8( + data, 7, __riscv_vreinterpret_v_u8m1_u32m1(b7)); + + return data; +} + +static ASM_FUNC_ATTR_INLINE void +unaligned_vssseg8e32_v_u32m1x8(void *dst, vuint32m1x8_t data, size_t vl) +{ + byte *bdst = dst; + vuint16m2_t indexes; + vuint8m1_t b0, b1, b2, b3, b4, b5, b6, b7; + + if (LIKELY(((uintptr_t)dst & 3) == 0)) + { + /* Fast path for 32-bit aligned stores. */ + __riscv_vssseg8e32_v_u32m1x8(dst, 64, data, vl); + return; + } + + indexes = gen_indexes(4 * vl, 64); + + b0 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 0)); + b1 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 1)); + b2 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 2)); + b3 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 3)); + b4 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 4)); + b5 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 5)); + b6 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 6)); + b7 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 7)); + + __riscv_vsuxei16_v_u8m1(bdst + 0 * 4, indexes, b0, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 1 * 4, indexes, b1, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 2 * 4, indexes, b2, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 3 * 4, indexes, b3, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 4 * 4, indexes, b4, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 5 * 4, indexes, b5, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 6 * 4, indexes, b6, vl * 4); + __riscv_vsuxei16_v_u8m1(bdst + 7 * 4, indexes, b7, vl * 4); +} + +static ASM_FUNC_ATTR_INLINE unsigned int +chacha20_rvv_blocks(u32 *input, byte *dst, const byte *src, size_t nblks) +{ + unsigned int i; + + if (nblks == 0) + return 0; + + /* Try use vector implementation when there is 4 or more blocks. */ + if (nblks >= 4) + { + size_t vl = __riscv_vsetvl_e32m1(nblks) < 4 + ? 
__riscv_vsetvl_e32m1(4) : __riscv_vsetvl_e32m1(nblks); + vuint32m1_t x0, x1, x2, x3, x4, x5, x6, x7; + vuint32m1_t x8, x9, x10, x11, x12, x13, x14, x15; + u32 s0, s1, s2, s3, s4, s5, s6, s7; + u32 s8, s9, s10, s11, s12, s13, s14, s15; + vuint16m1_t rot16 = gen_rot16(vl); + vuint8m1_t rot8 = gen_rot8(vl); + + s0 = input[0]; + s1 = input[1]; + s2 = input[2]; + s3 = input[3]; + s4 = input[4]; + s5 = input[5]; + s6 = input[6]; + s7 = input[7]; + s8 = input[8]; + s9 = input[9]; + s10 = input[10]; + s11 = input[11]; + s12 = input[12]; + s13 = input[13]; + s14 = input[14]; + s15 = input[15]; + + while (nblks >= 4) + { + vuint32m1_t ctr; + vbool32_t carry; + vuint32m1x8_t data; + + if (vl < 4) + break; + + x0 = __riscv_vmv_v_x_u32m1(s0, vl); + x1 = __riscv_vmv_v_x_u32m1(s1, vl); + x2 = __riscv_vmv_v_x_u32m1(s2, vl); + x3 = __riscv_vmv_v_x_u32m1(s3, vl); + x4 = __riscv_vmv_v_x_u32m1(s4, vl); + x5 = __riscv_vmv_v_x_u32m1(s5, vl); + x6 = __riscv_vmv_v_x_u32m1(s6, vl); + x7 = __riscv_vmv_v_x_u32m1(s7, vl); + x8 = __riscv_vmv_v_x_u32m1(s8, vl); + x9 = __riscv_vmv_v_x_u32m1(s9, vl); + x10 = __riscv_vmv_v_x_u32m1(s10, vl); + x11 = __riscv_vmv_v_x_u32m1(s11, vl); + x13 = __riscv_vmv_v_x_u32m1(s13, vl); + x14 = __riscv_vmv_v_x_u32m1(s14, vl); + x15 = __riscv_vmv_v_x_u32m1(s15, vl); + + ctr = __riscv_vid_v_u32m1(vl); + carry = __riscv_vmadc_vx_u32m1_b32(ctr, s12, vl); + ctr = __riscv_vadd_vx_u32m1(ctr, s12, vl); + x12 = ctr; + x13 = __riscv_vadc_vxm_u32m1(x13, 0, carry, vl); + + for (i = 20; i > 0; i -= 2) + { + QUARTERROUND_4(x0, x4, x8, x12, + x1, x5, x9, x13, + x2, x6, x10, x14, + x3, x7, x11, x15); + QUARTERROUND_4(x0, x5, x10, x15, + x1, x6, x11, x12, + x2, x7, x8, x13, + x3, x4, x9, x14); + } + + x0 = __riscv_vadd_vx_u32m1(x0, s0, vl); + x1 = __riscv_vadd_vx_u32m1(x1, s1, vl); + x2 = __riscv_vadd_vx_u32m1(x2, s2, vl); + x3 = __riscv_vadd_vx_u32m1(x3, s3, vl); + x4 = __riscv_vadd_vx_u32m1(x4, s4, vl); + x5 = __riscv_vadd_vx_u32m1(x5, s5, vl); + x6 = __riscv_vadd_vx_u32m1(x6, s6, vl); + x7 = __riscv_vadd_vx_u32m1(x7, s7, vl); + x8 = __riscv_vadd_vx_u32m1(x8, s8, vl); + x9 = __riscv_vadd_vx_u32m1(x9, s9, vl); + x10 = __riscv_vadd_vx_u32m1(x10, s10, vl); + x11 = __riscv_vadd_vx_u32m1(x11, s11, vl); + x12 = __riscv_vadd_vv_u32m1(x12, ctr, vl); + x13 = __riscv_vadc_vxm_u32m1(x13, s13, carry, vl); + x14 = __riscv_vadd_vx_u32m1(x14, s14, vl); + x15 = __riscv_vadd_vx_u32m1(x15, s15, vl); + + s12 += vl; + s13 += s12 < vl; + + data = unaligned_vlsseg8e32_v_u32m1x8((const void *)src, vl); + + data = vxor_v_u32m1_u32m1x8(data, 0, x0, vl); + data = vxor_v_u32m1_u32m1x8(data, 1, x1, vl); + data = vxor_v_u32m1_u32m1x8(data, 2, x2, vl); + data = vxor_v_u32m1_u32m1x8(data, 3, x3, vl); + data = vxor_v_u32m1_u32m1x8(data, 4, x4, vl); + data = vxor_v_u32m1_u32m1x8(data, 5, x5, vl); + data = vxor_v_u32m1_u32m1x8(data, 6, x6, vl); + data = vxor_v_u32m1_u32m1x8(data, 7, x7, vl); + + unaligned_vssseg8e32_v_u32m1x8((void *)dst, data, vl); + + data = unaligned_vlsseg8e32_v_u32m1x8((const void *)(src + 32), vl); + + data = vxor_v_u32m1_u32m1x8(data, 0, x8, vl); + data = vxor_v_u32m1_u32m1x8(data, 1, x9, vl); + data = vxor_v_u32m1_u32m1x8(data, 2, x10, vl); + data = vxor_v_u32m1_u32m1x8(data, 3, x11, vl); + data = vxor_v_u32m1_u32m1x8(data, 4, x12, vl); + data = vxor_v_u32m1_u32m1x8(data, 5, x13, vl); + data = vxor_v_u32m1_u32m1x8(data, 6, x14, vl); + data = vxor_v_u32m1_u32m1x8(data, 7, x15, vl); + + unaligned_vssseg8e32_v_u32m1x8((void *)(dst + 32), data, vl); + + src += vl * 64; + dst += vl * 64; + nblks -= vl; + vl = 
__riscv_vsetvl_e32m1(nblks) < 4 + ? __riscv_vsetvl_e32m1(4) : __riscv_vsetvl_e32m1(nblks); + } + + input[12] = s12; + input[13] = s13; + } + + /* Use SIMD implementation for remaining blocks. */ + if (nblks > 0) + { + static const u32 rol_const[3][4] = + { + { 1, 2, 3, 0 }, + { 2, 3, 0, 1 }, + { 3, 0, 1, 2 } + }; + static const u32 one_u64_const[4] = { 1, 0, 0, 0 }; + size_t vl = 4; + vuint32m1_t rol1, rol2, rol3; + vuint32m1_t one_u64; + vuint32m1_t v0, v1, v2, v3; + vuint32m1_t v4, v5, v6, v7; + vuint32m1_t state0, state1, state2, state3; + vuint8m1_t i0, i1, i2, i3; + vuint8m1_t i4, i5, i6, i7; + vuint16m1_t rot16 = gen_rot16(vl); + vuint8m1_t rot8 = gen_rot8(vl); + + rol1 = __riscv_vle32_v_u32m1(rol_const[0], vl); + rol2 = __riscv_vle32_v_u32m1(rol_const[1], vl); + rol3 = __riscv_vle32_v_u32m1(rol_const[2], vl); + one_u64 = __riscv_vle32_v_u32m1(one_u64_const, vl); + + state0 = __riscv_vle32_v_u32m1(&input[0], vl); + state1 = __riscv_vle32_v_u32m1(&input[4], vl); + state2 = __riscv_vle32_v_u32m1(&input[8], vl); + state3 = __riscv_vle32_v_u32m1(&input[12], vl); + + input[12] += nblks; + input[13] += input[12] < nblks; + + /* SIMD 2x block implementation */ + while (nblks >= 2) + { + v0 = state0; + v1 = state1; + v2 = state2; + v3 = state3; + + v4 = state0; + v5 = state1; + v6 = state2; + v7 = state3; + v7 = ADD_U64(v7, one_u64); + + i0 = __riscv_vle8_v_u8m1(src + 0 * 16, vl * 4); + i1 = __riscv_vle8_v_u8m1(src + 1 * 16, vl * 4); + i2 = __riscv_vle8_v_u8m1(src + 2 * 16, vl * 4); + i3 = __riscv_vle8_v_u8m1(src + 3 * 16, vl * 4); + + for (i = 20; i > 0; i -= 2) + { + QUARTERROUND4_2(v0, v1, v2, v3, v4, v5, v6, v7, 1, 2, 3); + QUARTERROUND4_2(v0, v1, v2, v3, v4, v5, v6, v7, 3, 2, 1); + } + + v0 = __riscv_vadd_vv_u32m1(v0, state0, vl); + v1 = __riscv_vadd_vv_u32m1(v1, state1, vl); + v2 = __riscv_vadd_vv_u32m1(v2, state2, vl); + v3 = __riscv_vadd_vv_u32m1(v3, state3, vl); + state3 = ADD_U64(state3, one_u64); + + v0 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i0), + v0, vl); + v1 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i1), + v1, vl); + v2 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i2), + v2, vl); + v3 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i3), + v3, vl); + + v4 = __riscv_vadd_vv_u32m1(v4, state0, vl); + v5 = __riscv_vadd_vv_u32m1(v5, state1, vl); + v6 = __riscv_vadd_vv_u32m1(v6, state2, vl); + v7 = __riscv_vadd_vv_u32m1(v7, state3, vl); + state3 = ADD_U64(state3, one_u64); + + i4 = __riscv_vle8_v_u8m1(src + 4 * 16, vl * 4); + i5 = __riscv_vle8_v_u8m1(src + 5 * 16, vl * 4); + i6 = __riscv_vle8_v_u8m1(src + 6 * 16, vl * 4); + i7 = __riscv_vle8_v_u8m1(src + 7 * 16, vl * 4); + + __riscv_vse8_v_u8m1(dst + 0 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v0), vl * 4); + __riscv_vse8_v_u8m1(dst + 1 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v1), vl * 4); + __riscv_vse8_v_u8m1(dst + 2 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v2), vl * 4); + __riscv_vse8_v_u8m1(dst + 3 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v3), vl * 4); + + v4 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i4), + v4, vl); + v5 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i5), + v5, vl); + v6 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i6), + v6, vl); + v7 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i7), + v7, vl); + + __riscv_vse8_v_u8m1(dst + 4 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v4), vl * 4); + __riscv_vse8_v_u8m1(dst + 5 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v5), vl * 4); + __riscv_vse8_v_u8m1(dst + 6 * 
16, + __riscv_vreinterpret_v_u32m1_u8m1(v6), vl * 4); + __riscv_vse8_v_u8m1(dst + 7 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v7), vl * 4); + + src += 2 * 64; + dst += 2 * 64; + + nblks -= 2; + } + + /* 1x block implementation */ + while (nblks) + { + v0 = state0; + v1 = state1; + v2 = state2; + v3 = state3; + + i0 = __riscv_vle8_v_u8m1(src + 0 * 16, vl * 4); + i1 = __riscv_vle8_v_u8m1(src + 1 * 16, vl * 4); + i2 = __riscv_vle8_v_u8m1(src + 2 * 16, vl * 4); + i3 = __riscv_vle8_v_u8m1(src + 3 * 16, vl * 4); + + for (i = 20; i > 0; i -= 2) + { + QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3); + QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1); + } + + v0 = __riscv_vadd_vv_u32m1(v0, state0, vl); + v1 = __riscv_vadd_vv_u32m1(v1, state1, vl); + v2 = __riscv_vadd_vv_u32m1(v2, state2, vl); + v3 = __riscv_vadd_vv_u32m1(v3, state3, vl); + + state3 = ADD_U64(state3, one_u64); + + v0 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i0), + v0, vl); + v1 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i1), + v1, vl); + v2 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i2), + v2, vl); + v3 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i3), + v3, vl); + __riscv_vse8_v_u8m1(dst + 0 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v0), vl * 4); + __riscv_vse8_v_u8m1(dst + 1 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v1), vl * 4); + __riscv_vse8_v_u8m1(dst + 2 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v2), vl * 4); + __riscv_vse8_v_u8m1(dst + 3 * 16, + __riscv_vreinterpret_v_u32m1_u8m1(v3), vl * 4); + src += 64; + dst += 64; + + nblks--; + } + } + + clear_vec_regs(); + + return 0; +} + + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2 +_gcry_chacha20_riscv_v_blocks(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_rvv_blocks(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2 +_gcry_chacha20_riscv_v_check_hw(void) +{ + return (__riscv_vsetvl_e8m1(16) == 16); +} + +#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index ca8176f4..8b547db3 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -113,6 +113,12 @@ # endif /* USE_S390X_VX */ #endif +/* USE_RISCV_V indicates whether to enable RISC-V vector extension code. */ +#undef USE_RISCV_V +#if defined (__riscv) && defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) +# define USE_RISCV_V 1 +#endif + /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. 
*/ #undef ASM_FUNC_ABI @@ -137,6 +143,7 @@ typedef struct CHACHA20_context_s unsigned int use_p9:1; unsigned int use_p10:1; unsigned int use_s390x:1; + unsigned int use_riscv_v:1; } CHACHA20_context_t; @@ -259,6 +266,16 @@ unsigned int _gcry_chacha20_poly1305_aarch64_blocks4( #endif /* USE_AARCH64_SIMD */ +#ifdef USE_RISCV_V + +unsigned int _gcry_chacha20_riscv_v_blocks(u32 *state, byte *dst, + const byte *src, + size_t nblks); + +unsigned int _gcry_chacha20_riscv_v_check_hw(void); + +#endif /* USE_RISCV_V */ + static const char *selftest (void); @@ -396,6 +413,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, } #endif +#ifdef USE_RISCV_V + if (ctx->use_riscv_v) + { + return _gcry_chacha20_riscv_v_blocks(ctx->input, dst, src, nblks); + } +#endif + return do_chacha20_blocks (ctx->input, dst, src, nblks); } @@ -538,6 +562,11 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #ifdef USE_S390X_VX ctx->use_s390x = (features & HWF_S390X_VX) != 0; #endif +#ifdef USE_RISCV_V + ctx->use_riscv_v = (features & HWF_RISCV_IMAFDC) + && (features & HWF_RISCV_V) + && _gcry_chacha20_riscv_v_check_hw(); +#endif (void)features; diff --git a/configure.ac b/configure.ac index fbe82695..4e9f1754 100644 --- a/configure.ac +++ b/configure.ac @@ -3510,6 +3510,10 @@ if test "$found" = "1" ; then # Build with the s390x/zSeries vector implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo" ;; + riscv64-*-*) + # Build with the RISC-V vector implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-riscv-v.lo" + ;; esac fi -- 2.45.2 From jussi.kivilinna at iki.fi Mon Jan 6 16:08:48 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Jan 2025 17:08:48 +0200 Subject: [PATCH 1/6] hwf: add detection of RISC-V (64-bit) hardware features Message-ID: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> * configure.ac (gcry_cv_gcc_inline_asm_riscv, gcry_cv_gcc_inline_asm_riscv_v) (HAVE_GCC_INLINE_ASM_RISCV_V, HAVE_CPU_ARCH_RISCV): Add RISC-V detection support. * mpi/config.links: Add setup for RISC-V links. * src/Makefile.am: Add 'hwf-riscv.c'. * src/g10lib.h (HWF_RISCV_IMAFDC, HWF_RISCV_V, HWF_RISCV_B) (HWF_RISCV_ZBC): New. * src/hwf_common.h (_gcry_hwf_detect_riscv): New. * src/hwf-riscv.c: New. * src/hwfeatures.c: Add "riscv-imafdc", "riscv-v", "riscv-b" and "riscv-zbc". 
-- Signed-off-by: Jussi Kivilinna --- configure.ac | 90 +++++++++++ mpi/config.links | 7 + src/Makefile.am | 4 +- src/g10lib.h | 7 + src/hwf-common.h | 1 + src/hwf-riscv.c | 386 +++++++++++++++++++++++++++++++++++++++++++++++ src/hwfeatures.c | 9 ++ 7 files changed, 503 insertions(+), 1 deletion(-) create mode 100644 src/hwf-riscv.c diff --git a/configure.ac b/configure.ac index d708f89a..f20d654d 100644 --- a/configure.ac +++ b/configure.ac @@ -2619,6 +2619,92 @@ if test "$gcry_cv_gcc_inline_asm_s390x_vx" = "yes" ; then fi +# +# Check whether GCC inline assembler supports RISC-V instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports RISC-V instructions], + [gcry_cv_gcc_inline_asm_riscv], + [if test "$mpi_cpu_arch" != "riscv64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_gcc_inline_asm_riscv="n/a" + else + gcry_cv_gcc_inline_asm_riscv=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[unsigned int testfunc(unsigned int x) + { + unsigned int y; + asm volatile ("add %0, %1, %2" : + "=r" (y) : "r" (1), "r" (x) : "a5"); + asm volatile (".option push;\n\t" + ".option arch, +zba;\n\t" + "sh3add %0, %1, %1;\n\t" + ".option pop;\n\t" + : "=r" (y) + : "r" (y)); + asm volatile (".option push;\n\t" + ".option arch, +zbb;\n\t" + "cpop %0, %1;\n\t" + ".option pop;\n\t" + : "=r" (y) + : "r" (y)); + asm volatile (".option push;\n\t" + ".option arch, +zbs;\n\t" + "bexti %0, %1, 1;\n\t" + ".option pop;\n\t" + : "=r" (y) + : "r" (y)); + asm volatile (".option push;\n\t" + ".option arch, +zbc;\n\t" + "clmulr %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (y) + : "r" (y), "r" (y)); + return y; + } + ]] , [ testfunc(0); ])], + [gcry_cv_gcc_inline_asm_riscv=yes]) + fi]) +if test "$gcry_cv_gcc_inline_asm_riscv" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_RISCV,1, + [Defined if inline assembler supports RISC-V instructions]) +fi + + +# +# Check whether GCC inline assembler supports RISC-V vector instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports RISC-V vector instructions], + [gcry_cv_gcc_inline_asm_riscv_v], + [if test "$mpi_cpu_arch" != "riscv64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_gcc_inline_asm_riscv_v="n/a" + else + gcry_cv_gcc_inline_asm_riscv_v=no + if test "$gcry_cv_gcc_inline_asm_riscv" = "yes" ; then + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[unsigned int testfunc(void) + { + unsigned int vlmax; + asm volatile (".option push;\n\t" + ".option arch, +v;\n\t" + "vsetvli %0, %1, e8, m1, ta, ma;\n\t" + "vxor.vv v1, v1, v1;\n\t" + ".option pop;\n\t" + : "=r" (vlmax) + : "r" (~0) + : "vl", "vtype", "v1"); + return vlmax; + } + ]], [ testfunc(); ])], + [gcry_cv_gcc_inline_asm_riscv_v=yes]) + fi + fi]) +if test "$gcry_cv_gcc_inline_asm_riscv_v" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_RISCV_V,1, + [Defined if inline assembler supports RISC-V vector instructions]) +fi + + ####################################### #### Checks for library functions. 
#### ####################################### @@ -3798,6 +3884,10 @@ case "$mpi_cpu_arch" in AC_DEFINE(HAVE_CPU_ARCH_S390X, 1, [Defined for s390x/zSeries platforms]) GCRYPT_HWF_MODULES="libgcrypt_la-hwf-s390x.lo" ;; + riscv64) + AC_DEFINE(HAVE_CPU_ARCH_RISCV, 1, [Defined for RISC-V platforms]) + GCRYPT_HWF_MODULES="libgcrypt_la-hwf-riscv.lo" + ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) diff --git a/mpi/config.links b/mpi/config.links index 94b42e53..eefe8680 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -333,6 +333,13 @@ case "${host}" in path="powerpc32" mpi_cpu_arch="ppc" ;; + + riscv64-*-*) + echo '/* No working assembler modules available */' >>./mpi/asm-syntax.h + path="" + mpi_cpu_arch="riscv64" + ;; + *) echo '/* Platform not known */' >>./mpi/asm-syntax.h path="" diff --git a/src/Makefile.am b/src/Makefile.am index f6191bc8..6177171f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -66,7 +66,9 @@ libgcrypt_la_SOURCES = \ context.c context.h const-time.h const-time.c \ ec-context.h -EXTRA_libgcrypt_la_SOURCES = hwf-x86.c hwf-arm.c hwf-ppc.c hwf-s390x.c +EXTRA_libgcrypt_la_SOURCES = \ + hwf-x86.c hwf-arm.c hwf-ppc.c hwf-s390x.c hwf-riscv.c + gcrypt_hwf_modules = @GCRYPT_HWF_MODULES@ diff --git a/src/g10lib.h b/src/g10lib.h index fcf291b8..0a3ac127 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -274,6 +274,13 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_S390X_MSA_9 (1 << 3) #define HWF_S390X_VX (1 << 4) +#elif defined(HAVE_CPU_ARCH_RISCV) + +#define HWF_RISCV_IMAFDC (1 << 0) +#define HWF_RISCV_V (1 << 1) +#define HWF_RISCV_B (1 << 2) +#define HWF_RISCV_ZBC (1 << 3) + #endif gpg_err_code_t _gcry_disable_hw_feature (const char *name); diff --git a/src/hwf-common.h b/src/hwf-common.h index ebd045c5..749ff040 100644 --- a/src/hwf-common.h +++ b/src/hwf-common.h @@ -24,5 +24,6 @@ unsigned int _gcry_hwf_detect_x86 (void); unsigned int _gcry_hwf_detect_arm (void); unsigned int _gcry_hwf_detect_ppc (void); unsigned int _gcry_hwf_detect_s390x (void); +unsigned int _gcry_hwf_detect_riscv (void); #endif /*HWF_COMMON_H*/ diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c new file mode 100644 index 00000000..39333154 --- /dev/null +++ b/src/hwf-riscv.c @@ -0,0 +1,386 @@ +/* hwf-riscv.c - Detect hardware features - RISC-V part + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#if defined(HAVE_SYS_AUXV_H) && (defined(HAVE_GETAUXVAL) || \ + defined(HAVE_ELF_AUX_INFO)) +#include +#endif +#if defined(__linux__) && defined(HAVE_SYSCALL) +# include +#endif + +#include "g10lib.h" +#include "hwf-common.h" + +#if !defined (__riscv) +# error Module build for wrong CPU. 
+#endif + + +#if defined(HAVE_SYS_AUXV_H) && defined(HAVE_ELF_AUX_INFO) && \ + !defined(HAVE_GETAUXVAL) && defined(AT_HWCAP) +#define HAVE_GETAUXVAL +static unsigned long getauxval(unsigned long type) +{ + unsigned long auxval = 0; + int err; + + /* FreeBSD provides 'elf_aux_info' function that does the same as + * 'getauxval' on Linux. */ + + err = elf_aux_info (type, &auxval, sizeof(auxval)); + if (err) + { + errno = err; + auxval = 0; + } + + return auxval; +} +#endif + + +#undef HAS_SYS_AT_HWCAP +#if defined(__linux__) || \ + (defined(HAVE_SYS_AUXV_H) && defined(HAVE_GETAUXVAL)) +#define HAS_SYS_AT_HWCAP 1 + +struct hwcap_feature_map_s { + unsigned int hwcap_flag; + unsigned int hwf_flag; +}; + +/* Note: These macros have same values on Linux and FreeBSD. */ +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +#endif + +#define HWCAP_ISA(l) (1U << (unsigned int)(l - 'a')) +#define HWCAP_ISA_IMAFDC (HWCAP_ISA('i') | HWCAP_ISA('m') | \ + HWCAP_ISA('a') | HWCAP_ISA('f') | \ + HWCAP_ISA('d') | HWCAP_ISA('c')) + +static const struct hwcap_feature_map_s hwcap_features[] = + { + { HWCAP_ISA_IMAFDC, HWF_RISCV_IMAFDC }, + { HWCAP_ISA('v'), HWF_RISCV_V }, + { HWCAP_ISA('b'), HWF_RISCV_B }, + }; + +static int +get_hwcap(unsigned int *hwcap) +{ + struct { unsigned long a_type; unsigned long a_val; } auxv; + FILE *f; + int err = -1; + static int hwcap_initialized = 0; + static unsigned int stored_hwcap = 0; + + if (hwcap_initialized) + { + *hwcap = stored_hwcap; + return 0; + } + +#if defined(HAVE_SYS_AUXV_H) && defined(HAVE_GETAUXVAL) + errno = 0; + auxv.a_val = getauxval (AT_HWCAP); + if (errno == 0) + { + stored_hwcap |= auxv.a_val; + hwcap_initialized = 1; + } + + if (hwcap_initialized && stored_hwcap) + { + *hwcap = stored_hwcap; + return 0; + } +#endif + + f = fopen("/proc/self/auxv", "r"); + if (!f) + { + *hwcap = stored_hwcap; + return -1; + } + + while (fread(&auxv, sizeof(auxv), 1, f) > 0) + { + if (auxv.a_type == AT_HWCAP) + { + stored_hwcap |= auxv.a_val; + hwcap_initialized = 1; + } + } + + if (hwcap_initialized) + err = 0; + + fclose(f); + *hwcap = stored_hwcap; + return err; +} + +static unsigned int +detect_riscv_at_hwcap(void) +{ + unsigned int hwcap; + unsigned int features = 0; + unsigned int i; + + if (get_hwcap(&hwcap) < 0) + return features; + + for (i = 0; i < DIM(hwcap_features); i++) + { + unsigned int hwcap_flag = hwcap_features[i].hwcap_flag; + if ((hwcap & hwcap_flag) == hwcap_flag) + features |= hwcap_features[i].hwf_flag; + } + + return features; +} + +#endif /* HAS_SYS_AT_HWCAP */ + + +#undef HAS_SYS_HWPROBE +#if defined(__linux__) && defined(HAVE_SYSCALL) +#define HAS_SYS_HWPROBE 1 + +#ifndef __NR_riscv_hwprobe +#define __NR_riscv_hwprobe 258 +#endif + +#define HWF_RISCV_HWPROBE_KEY_BASE_BEHAVIOR 3 +#define HWF_RISCV_HWPROBE_BASE_BEHAVIOR_IMA (1U << 0) + +#define HWF_RISCV_HWPROBE_KEY_IMA_EXT_0 4 +#define HWF_RISCV_HWPROBE_IMA_FD (1U << 0) +#define HWF_RISCV_HWPROBE_IMA_C (1U << 1) +#define HWF_RISCV_HWPROBE_IMA_V (1U << 2) +#define HWF_RISCV_HWPROBE_EXT_ZBA (1U << 3) +#define HWF_RISCV_HWPROBE_EXT_ZBB (1U << 4) +#define HWF_RISCV_HWPROBE_EXT_ZBS (1U << 5) +#define HWF_RISCV_HWPROBE_EXT_ZBC (1U << 7) +#define HWF_RISCV_HWPROBE_EXT_ZICOND (U64_C(1) << 35) + +#define HWF_RISCV_HWPROBE_IMA_FDC (HWF_RISCV_HWPROBE_IMA_FD \ + | HWF_RISCV_HWPROBE_IMA_C) + +#define HWF_RISCV_HWPROBE_IMA_B (HWF_RISCV_HWPROBE_EXT_ZBA \ + | HWF_RISCV_HWPROBE_EXT_ZBB \ + | HWF_RISCV_HWPROBE_EXT_ZBS) + +struct hwf_riscv_hwprobe_s { + u64 key; + u64 value; +}; 
+ +struct hwprobe_feature_map_s { + unsigned int ima_ext_0_flag; + unsigned int hwf_flag; +}; + +static const struct hwprobe_feature_map_s hwprobe_features[] = + { + { HWF_RISCV_HWPROBE_IMA_FDC, HWF_RISCV_IMAFDC }, + { HWF_RISCV_HWPROBE_IMA_V, HWF_RISCV_V }, + { HWF_RISCV_HWPROBE_IMA_B, HWF_RISCV_B }, + { HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC }, + }; + +static int +hwf_riscv_hwprobe(struct hwf_riscv_hwprobe_s *pairs, size_t pair_count, + size_t cpu_count, unsigned long *cpus, unsigned int flags) +{ + return syscall(__NR_riscv_hwprobe, pairs, pair_count, cpu_count, cpus, flags); +} + +static unsigned int +detect_riscv_hwprobe(void) +{ + const int base_behavior_idx = 0; + const int ima_ext_0_idx = base_behavior_idx + 1; + struct hwf_riscv_hwprobe_s reqs[ima_ext_0_idx + 1]; + unsigned int features = 0; + unsigned int i; + int ret; + + memset(reqs, 0, sizeof(reqs)); + reqs[base_behavior_idx].key = HWF_RISCV_HWPROBE_KEY_BASE_BEHAVIOR; + reqs[ima_ext_0_idx].key = HWF_RISCV_HWPROBE_KEY_IMA_EXT_0; + + ret = hwf_riscv_hwprobe(reqs, DIM(reqs), 0, NULL, 0); + if (ret < 0) + return 0; + + for (i = 0; i < DIM(hwprobe_features); i++) + { + unsigned int ima_ext_0_flag = hwprobe_features[i].ima_ext_0_flag; + if ((reqs[base_behavior_idx].value & HWF_RISCV_HWPROBE_BASE_BEHAVIOR_IMA) + && (reqs[ima_ext_0_idx].value & ima_ext_0_flag) == ima_ext_0_flag) + features |= hwprobe_features[i].hwf_flag; + } + + return features; +} + +#endif /* HAS_SYS_HWPROBE */ + + +static unsigned int +detect_riscv_hwf_by_toolchain (void) +{ + unsigned int features = 0; + + /* Detect CPU features required by toolchain. */ + +#if defined(__riscv_i) && __riscv_i >= 1000000 && \ + defined(__riscv_m) && __riscv_m >= 1000000 && \ + defined(__riscv_a) && __riscv_a >= 1000000 && \ + defined(__riscv_f) && __riscv_f >= 1000000 && \ + defined(__riscv_d) && __riscv_d >= 1000000 && \ + defined(__riscv_c) && __riscv_c >= 1000000 + features |= HWF_RISCV_IMAFDC; +#endif + +#if defined(__riscv_zba) && __riscv_zba >= 1000000 && \ + defined(__riscv_zbb) && __riscv_zbb >= 1000000 && \ + defined(__riscv_zbs) && __riscv_zbs >= 1000000 && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) + { + unsigned int tmp = 0; + + /* Early test for Zba/Zbb/Zbs instructions to detect faulty toolchain + * configuration. */ + asm volatile (".option push;\n\t" + ".option arch, +zba;\n\t" + "sh3add %0, %1, %1;\n\t" + ".option pop;\n\t" + : "=r" (tmp) + : "r" (123)); + asm volatile (".option push;\n\t" + ".option arch, +zbb;\n\t" + "cpop %0, %1;\n\t" + ".option pop;\n\t" + : "=r" (tmp) + : "r" (321)); + asm volatile (".option push;\n\t" + ".option arch, +zbs;\n\t" + "bexti %0, %1, 1;\n\t" + ".option pop;\n\t" + : "=r" (tmp) + : "r" (234)); + + features |= HWF_RISCV_B; + } +#endif + +#if defined(__riscv_zbc) && __riscv_zbc >= 1000000 && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) + { + unsigned int tmp = 0; + + /* Early test for Zbc instructions to detect faulty toolchain + * configuration. */ + asm volatile (".option push;\n\t" + ".option arch, +zbc;\n\t" + "clmulr %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (tmp) + : "r" (123), "r" (321)); + + features |= HWF_RISCV_ZBC; + } +#endif + +#ifdef HAVE_GCC_INLINE_ASM_RISCV_V +#if defined(__riscv_v) && __riscv_v >= 12000 + { + unsigned int vlmax = 0; + + /* Early test for RVV instructions to detect faulty toolchain + * configuration. 
*/ + asm volatile (".option push;\n\t" + ".option arch, +v;\n\t" + "vsetvli %0, %1, e8, m1, ta, ma;\n\t" + "vxor.vv v1, v1, v1;\n\t" + ".option pop;\n\t" + : "=r" (vlmax) + : "r" (~0) + : "vl", "vtype", "v1"); + + features |= HWF_RISCV_V; + } +#endif +#endif + + return features; +} + +unsigned int +_gcry_hwf_detect_riscv (void) +{ + unsigned int features = 0; + +#if defined (HAS_SYS_AT_HWCAP) + features |= detect_riscv_at_hwcap (); +#endif + +#if defined (HAS_SYS_HWPROBE) + features |= detect_riscv_hwprobe (); +#endif + + features |= detect_riscv_hwf_by_toolchain (); + + /* Require VLEN >= 128-bit for "riscv-v" HWF. */ + if (features & HWF_RISCV_V) + { + unsigned int vlmax = 0; + +#if defined(HAVE_GCC_INLINE_ASM_RISCV_V) + asm volatile (".option push;\n\t" + ".option arch, +v;\n\t" + "vsetvli %0, %1, e8, m1, ta, ma;\n\t" + ".option pop;\n\t" + : "=r" (vlmax) + : "r" (~0) + : "vl", "vtype"); +#endif + + if (vlmax < 16) + { + features &= ~HWF_RISCV_V; + } + } + + return features; +} diff --git a/src/hwfeatures.c b/src/hwfeatures.c index b11cadef..96ddfd30 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -91,6 +91,11 @@ static struct { HWF_S390X_MSA_8, "s390x-msa-8" }, { HWF_S390X_MSA_9, "s390x-msa-9" }, { HWF_S390X_VX, "s390x-vx" }, +#elif defined(HAVE_CPU_ARCH_RISCV) + { HWF_RISCV_IMAFDC, "riscv-imafdc" }, + { HWF_RISCV_V, "riscv-v" }, + { HWF_RISCV_B, "riscv-b" }, + { HWF_RISCV_ZBC, "riscv-zbc" }, #endif }; @@ -245,6 +250,10 @@ _gcry_detect_hw_features (void) { hw_features = _gcry_hwf_detect_s390x (); } +#elif defined (HAVE_CPU_ARCH_RISCV) + { + hw_features = _gcry_hwf_detect_riscv (); + } #endif hw_features &= ~disabled_hw_features; } -- 2.45.2 From jussi.kivilinna at iki.fi Mon Jan 6 16:08:51 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Jan 2025 17:08:51 +0200 Subject: [PATCH 4/6] Add GHASH RISC-V/Zbc implementation In-Reply-To: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> References: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> Message-ID: <20250106150853.1779326-4-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'cipher-gcm-riscv-b-zbc.c'. * cipher/cipher-gcm-riscv-b-zbc.c: New. * cipher/cipher-gcm.c [GCM_USE_RISCV_ZBC] (_gcry_ghash_setup_riscv_zbc) (_gcry_ghash_riscv_zbc): New. (setupM) [GCM_USE_RISCV_ZBC]: Check for HWF_RISCV_IMAFDC, HWF_RISCV_B HWF_RISCV_ZBC to enable RISC-V/Zbc implementation. * cipher/cipher-internal.h (GCM_USE_RISCV_ZBC): New. * configure.ac: Add 'cipher-gcm-riscv-b-zbc.lo'. -- Patch adds RISC-V Zbc extension accelerated GHASH implementation. 
Benchmark on SpacemiT K1 (1600 Mhz): Before: | nanosecs/byte mebibytes/sec cycles/byte GMAC_AES | 14.32 ns/B 66.60 MiB/s 22.91 c/B After (19x faster): | nanosecs/byte mebibytes/sec cycles/byte GMAC_AES | 0.744 ns/B 1281 MiB/s 1.19 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 + cipher/cipher-gcm-riscv-b-zbc.c | 276 ++++++++++++++++++++++++++++++++ cipher/cipher-gcm.c | 16 ++ cipher/cipher-internal.h | 7 + configure.ac | 3 + 5 files changed, 303 insertions(+) create mode 100644 cipher/cipher-gcm-riscv-b-zbc.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 88b2d17c..a0a4d7d8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -92,6 +92,7 @@ EXTRA_libcipher_la_SOURCES = \ cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \ cipher-gcm-aarch64-simd.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ + cipher-gcm-riscv-b-zbc.c \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ diff --git a/cipher/cipher-gcm-riscv-b-zbc.c b/cipher/cipher-gcm-riscv-b-zbc.c new file mode 100644 index 00000000..705b7462 --- /dev/null +++ b/cipher/cipher-gcm-riscv-b-zbc.c @@ -0,0 +1,276 @@ +/* cipher-gcm-irscv-b-zbc.c - RISC-V Zbc accelerated GHASH + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + +#if defined (GCM_USE_RISCV_ZBC) + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + +typedef struct { u64 val[2]; } u64x2; +typedef struct { u64x2 val[2]; } u64x2x2; + +static ASM_FUNC_ATTR_INLINE u64x2 +load_aligned_u64x2(const void *ptr) +{ + u64x2 vec; + + asm ("ld %0, 0(%1)" + : "=r" (vec.val[0]) + : "r" (ptr) + : "memory"); + asm ("ld %0, 8(%1)" + : "=r" (vec.val[1]) + : "r" (ptr) + : "memory"); + + return vec; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +load_unaligned_u64x2(const void *ptr) +{ + if (((uintptr_t)ptr & 7) == 0) + { + /* aligned load */ + return load_aligned_u64x2(ptr); + } + else + { + /* unaligned load */ + const bufhelp_u64_t *ptr_u64 = ptr; + u64x2 vec; + vec.val[0] = ptr_u64[0].a; + vec.val[1] = ptr_u64[1].a; + return vec; + } +} + +static ASM_FUNC_ATTR_INLINE void +store_aligned_u64x2(void *ptr, u64x2 vec) +{ + asm ("sd %0, 0(%1)" + : + : "r" (vec.val[0]), "r" (ptr) + : "memory"); + asm ("sd %0, 8(%1)" + : + : "r" (vec.val[1]), "r" (ptr) + : "memory"); +} + +static ASM_FUNC_ATTR_INLINE u64 +byteswap_u64(u64 x) +{ + asm (".option push;\n\t" + ".option arch, +zbb;\n\t" + "rev8 %0, %1;\n\t" + ".option pop;\n\t" + : "=r" (x) + : "r" (x)); + return x; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +byteswap_u64x2(u64x2 vec) +{ + u64 tmp = byteswap_u64(vec.val[0]); + vec.val[0] = byteswap_u64(vec.val[1]); + vec.val[1] = tmp; + return vec; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +veor_u64x2(u64x2 va, u64x2 vb) +{ + va.val[0] ^= vb.val[0]; + va.val[1] ^= vb.val[1]; + return va; +} + +/* 64x64 => 128 carry-less multiplication */ +static ASM_FUNC_ATTR_INLINE u64x2 +clmul_u64x2(u64 a, u64 b) +{ + u64x2 vec; + asm (".option push;\n\t" + ".option arch, +zbc;\n\t" + "clmul %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (vec.val[0]) + : "r" (a), "r" (b)); + asm (".option push;\n\t" + ".option arch, +zbc;\n\t" + "clmulh %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (vec.val[1]) + : "r" (a), "r" (b)); + return vec; +} + +/* GHASH functions. + * + * See "Gouv?a, C. P. L. & L?pez, J. Implementing GCM on ARMv8. Topics in + * Cryptology ? CT-RSA 2015" for details. 
+ */ +static ASM_FUNC_ATTR_INLINE u64x2x2 +pmul_128x128(u64x2 a, u64x2 b) +{ + u64 a_l = a.val[0]; + u64 a_h = a.val[1]; + u64 b_l = b.val[0]; + u64 b_h = b.val[1]; + u64 t1_h = b_l ^ b_h; + u64 t1_l = a_l ^ a_h; + u64x2 r0 = clmul_u64x2(a_l, b_l); + u64x2 r1 = clmul_u64x2(a_h, b_h); + u64x2 t2 = clmul_u64x2(t1_h, t1_l); + u64 t2_l, t2_h; + u64 r0_l, r0_h; + u64 r1_l, r1_h; + + t2 = veor_u64x2(t2, r0); + t2 = veor_u64x2(t2, r1); + + r0_l = r0.val[0]; + r0_h = r0.val[1]; + r1_l = r1.val[0]; + r1_h = r1.val[1]; + t2_l = t2.val[0]; + t2_h = t2.val[1]; + + r0_h = r0_h ^ t2_l; + r1_l = r1_l ^ t2_h; + + r0 = (const u64x2){ .val = { r0_l, r0_h } }; + r1 = (const u64x2){ .val = { r1_l, r1_h } }; + + return (const u64x2x2){ .val = { r0, r1 } }; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +reduction(u64x2x2 r0r1) +{ + static const u64 rconst = { U64_C(0xc200000000000000) }; + u64x2 r0 = r0r1.val[0]; + u64x2 r1 = r0r1.val[1]; + u64x2 t = clmul_u64x2(r0.val[0], rconst); + r0.val[1] ^= t.val[0]; + r1.val[0] ^= t.val[1]; + t = clmul_u64x2(r0.val[1], rconst); + r1 = veor_u64x2(r1, t); + return veor_u64x2(r0, r1); +} + +ASM_FUNC_ATTR_NOINLINE unsigned int +_gcry_ghash_riscv_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + u64x2 rhash; + u64x2 rh1; + u64x2 rbuf; + u64x2x2 rr0rr1; + + if (nblocks == 0) + return 0; + + rhash = load_aligned_u64x2(result); + rh1 = load_aligned_u64x2(c->u_mode.gcm.u_ghash_key.key); + + rhash = byteswap_u64x2(rhash); + + rbuf = load_unaligned_u64x2(buf); + buf += 16; + nblocks--; + + rbuf = byteswap_u64x2(rbuf); + + rhash = veor_u64x2(rhash, rbuf); + + while (nblocks) + { + rbuf = load_unaligned_u64x2(buf); + buf += 16; + nblocks--; + + rr0rr1 = pmul_128x128(rhash, rh1); + + rbuf = byteswap_u64x2(rbuf); + + rhash = reduction(rr0rr1); + + rhash = veor_u64x2(rhash, rbuf); + } + + rr0rr1 = pmul_128x128(rhash, rh1); + rhash = reduction(rr0rr1); + + rhash = byteswap_u64x2(rhash); + + store_aligned_u64x2(result, rhash); + + + return 0; +} + +static ASM_FUNC_ATTR_INLINE void +gcm_lsh_1(void *r_out, u64x2 i) +{ + static const u64 rconst = { U64_C(0xc200000000000000) }; + u64 ia = i.val[0]; + u64 ib = i.val[1]; + u64 oa, ob, ma; + u64x2 oa_ob; + + ma = (u64)-(ib >> 63); + oa = ib >> 63; + ob = ia >> 63; + ma = ma & rconst; + ib = ib << 1; + ia = ia << 1; + ob = ob | ib; + oa = oa | ia; + ob = ob ^ ma; + oa_ob = (const u64x2){ .val = { oa, ob } }; + store_aligned_u64x2(r_out, oa_ob); +} + +ASM_FUNC_ATTR_NOINLINE void +_gcry_ghash_setup_riscv_zbc(gcry_cipher_hd_t c) +{ + u64x2 rhash = load_aligned_u64x2(c->u_mode.gcm.u_ghash_key.key); + + rhash = byteswap_u64x2(rhash); + + gcm_lsh_1(c->u_mode.gcm.u_ghash_key.key, rhash); +} + +#endif /* GCM_USE_RISCV_ZBC */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 9fbdb02e..37743c30 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -102,6 +102,13 @@ ghash_armv7_neon (gcry_cipher_hd_t c, byte *result, const byte *buf, } #endif /* GCM_USE_ARM_NEON */ +#ifdef GCM_USE_RISCV_ZBC +extern void _gcry_ghash_setup_riscv_zbc(gcry_cipher_hd_t c); + +extern unsigned int _gcry_ghash_riscv_zbc(gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); +#endif /* GCM_USE_RISCV_ZBC */ + #ifdef GCM_USE_AARCH64 extern void _gcry_ghash_setup_aarch64_simd(gcry_cipher_hd_t c); @@ -621,6 +628,15 @@ setupM (gcry_cipher_hd_t c) _gcry_ghash_setup_aarch64_simd (c); } #endif +#ifdef GCM_USE_RISCV_ZBC + else if ((features & HWF_RISCV_IMAFDC) + && (features & HWF_RISCV_B) + && (features & HWF_RISCV_ZBC)) + { + 
c->u_mode.gcm.ghash_fn = _gcry_ghash_riscv_zbc; + _gcry_ghash_setup_riscv_zbc (c); + } +#endif #ifdef GCM_USE_PPC_VPMSUM else if (features & HWF_PPC_VCRYPTO) { diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 19b3eada..9f50ebc2 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -136,6 +136,13 @@ #endif #endif /* GCM_USE_PPC_VPMSUM */ +/* GCM_USE_RISCV_ZBC indicates whether to compile GCM with RISC-V Zbc code. */ +#undef GCM_USE_RISCV_ZBC +#if defined (__riscv) && (__riscv_xlen == 64) && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) +# define GCM_USE_RISCV_ZBC 1 +#endif + typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); diff --git a/configure.ac b/configure.ac index 55d15fa3..fbe82695 100644 --- a/configure.ac +++ b/configure.ac @@ -3868,6 +3868,9 @@ case "${host}" in powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-ppc.lo" ;; + riscv64-*-*) + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-riscv-b-zbc.lo" + ;; esac # Arch specific MAC implementations -- 2.45.2 From jussi.kivilinna at iki.fi Mon Jan 6 16:08:49 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Jan 2025 17:08:49 +0200 Subject: [PATCH 2/6] bithelp: add count trailing zero bits variant for RISC-V In-Reply-To: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> References: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> Message-ID: <20250106150853.1779326-2-jussi.kivilinna@iki.fi> * cipher/bithelp.h (_gcry_ctz_no_zero): New. (_gcry_ctz): Use '_gcry_ctz_no_zero'. * cipher/cipher-internal.h (ocb_get_l): Use '_gcry_ctz_no_zero'. -- Signed-off-by: Jussi Kivilinna --- cipher/bithelp.h | 33 ++++++++++++++++++++++++++++----- cipher/cipher-internal.h | 2 +- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/cipher/bithelp.h b/cipher/bithelp.h index 7793ce7c..a4faf345 100644 --- a/cipher/bithelp.h +++ b/cipher/bithelp.h @@ -19,6 +19,7 @@ #ifndef GCRYPT_BITHELP_H #define GCRYPT_BITHELP_H +#include "config.h" #include "types.h" @@ -77,13 +78,25 @@ _gcry_bswap64(u64 x) /* Count trailing zero bits in an unsigend int. We return an int - because that is what gcc's builtin does. Returns the number of - bits in X if X is 0. */ + because that is what gcc's builtin does. X must not be zero. */ static inline int -_gcry_ctz (unsigned int x) +_gcry_ctz_no_zero (unsigned int x) { -#if defined (HAVE_BUILTIN_CTZ) - return x ? __builtin_ctz (x) : 8 * sizeof (x); +#if defined(__riscv) && \ + (defined(__riscv_f) && __riscv_f >= 2002000) && \ + (!defined(__riscv_zbb) || __riscv_zbb < 2002000) && \ + defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS) + /* Use float cast approach when building for RISC-V without Zbb extension. + * Without Zbb, GCC gives us slower generic version for __builtin_ctz(). + * + * See: + * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast + */ + float f = (float)(x & -x); + typedef u32 __attribute__((may_alias)) may_alias_u32; + return ((*(const may_alias_u32 *)&f) >> 23) - 0x7f; +#elif defined (HAVE_BUILTIN_CTZ) + return __builtin_ctz (x); #else /* See * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightModLookup @@ -100,6 +113,16 @@ _gcry_ctz (unsigned int x) } +/* Count trailing zero bits in an unsigend int. We return an int + because that is what gcc's builtin does. Returns the number of + bits in X if X is 0. */ +static inline int +_gcry_ctz (unsigned int x) +{ + return x ? 
_gcry_ctz_no_zero (x) : 8 * sizeof (x); +} + + /* Count trailing zero bits in an u64. We return an int because that is what gcc's builtin does. Returns the number of bits in X if X is 0. */ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index ddf8fbb5..19b3eada 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -775,7 +775,7 @@ ocb_get_l (gcry_cipher_hd_t c, u64 n) : [low] "r" ((unsigned long)n) : "cc"); #else - ntz = _gcry_ctz (n); + ntz = _gcry_ctz_no_zero (n); #endif return c->u_mode.ocb.L[ntz]; -- 2.45.2 From jussi.kivilinna at iki.fi Mon Jan 6 16:08:53 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Jan 2025 17:08:53 +0200 Subject: [PATCH 6/6] Add SHA3 acceleration for RISC-V "B" extension In-Reply-To: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> References: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> Message-ID: <20250106150853.1779326-6-jussi.kivilinna@iki.fi> * cipher/keccak.c (USE_RISCV_B): New. [USE_RISCV_B]: (ANDN64, ROL64, keccak_riscv_b_64_ops): New. (keccak_init) [USE_RISCV_B]: Use 'keccak_riscv_b_64_ops' if HWF_RISCV_IMAFDC and HWF_RISCV_B available. -- Patch adds RISC-V "B" extension acceleration for SHA3. Benchmark on SpacemiT K1 (1600 Mhz): Before: | nanosecs/byte mebibytes/sec cycles/byte SHA3-256 | 22.98 ns/B 41.51 MiB/s 36.76 c/B After (2x faster): | nanosecs/byte mebibytes/sec cycles/byte SHA3-256 | 11.15 ns/B 85.57 MiB/s 17.83 c/B Signed-off-by: Jussi Kivilinna --- cipher/keccak.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/cipher/keccak.c b/cipher/keccak.c index 44cc9f71..4ada6b4c 100644 --- a/cipher/keccak.c +++ b/cipher/keccak.c @@ -91,6 +91,15 @@ #endif /* USE_S390X_CRYPTO */ +/* GCM_USE_RISCV_B indicates whether to compile GCM with RISC-V "B" extension + * code. */ +#undef USE_RISCV_B +#if defined (__riscv) && (__riscv_xlen == 64) && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) +# define USE_RISCV_B 1 +#endif + + /* x86-64 vector register assembly implementations use SystemV ABI, ABI * conversion needed on Win64 through function attribute. */ #undef ASM_FUNC_ABI @@ -359,7 +368,6 @@ static inline void absorb_lanes64_1(u64 *dst, const byte *in) dst[0] ^= buf_get_le64(in + 8 * 0); } - # define ANDN64(x, y) (~(x) & (y)) # define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \ ((x) >> ((64 - (unsigned int)(n)) & 63))) @@ -450,6 +458,48 @@ static const keccak_ops_t keccak_bmi2_64_ops = #endif /* USE_64BIT_BMI2 */ +/* Construct 64-bit RISC-V "B" extension implementation. 
*/ +#ifdef USE_RISCV_B + +# define ANDN64(x, y) ({ \ + u64 tmp; \ + asm (".option push;\n\t" \ + ".option arch, +zbb;\n\t" \ + "andn %0, %1, %2;\n\t" \ + ".option pop;\n\t" \ + : "=r" (tmp) \ + : "r" (y), "r" (x)); \ + tmp; }) + +# define ROL64(x, n) ({ \ + u64 tmp; \ + asm (".option push;\n\t" \ + ".option arch, +zbb;\n\t" \ + "rori %0, %1, %2;\n\t" \ + ".option pop;\n\t" \ + : "=r" (tmp) \ + : "r" (x), "I" ((64 - n) & 63)); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_riscv_b +# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_riscv_b +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME +# undef KECCAK_F1600_ABSORB_FUNC_NAME + +static const keccak_ops_t keccak_riscv_b_64_ops = +{ + .permute = keccak_f1600_state_permute64_riscv_b, + .absorb = keccak_absorb_lanes64_riscv_b, + .extract = keccak_extract64, +}; + +#endif /* USE_RISCV_B */ + + /* 64-bit Intel AVX512 implementation. */ #ifdef USE_64BIT_AVX512 @@ -1002,6 +1052,10 @@ keccak_init (int algo, void *context, unsigned int flags) else if (features & HWF_INTEL_FAST_SHLD) ctx->ops = &keccak_shld_64_ops; #endif +#ifdef USE_RISCV_B + else if ((features & HWF_RISCV_IMAFDC) && (features & HWF_RISCV_B)) + ctx->ops = &keccak_riscv_b_64_ops; +#endif /* Set input block size, in Keccak terms this is called 'rate'. */ -- 2.45.2 From jussi.kivilinna at iki.fi Mon Jan 6 16:08:50 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Jan 2025 17:08:50 +0200 Subject: [PATCH 3/6] Add RISC-V vector permute AES In-Reply-To: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> References: <20250106150853.1779326-1-jussi.kivilinna@iki.fi> Message-ID: <20250106150853.1779326-3-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'rinjdael-vp-riscv.c' and CFLAG handling for 'rijndael-vp-riscv.o' and 'rijndael-vp-riscv.lo'. (ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS): New. * cipher/rijndael-internal.h (USE_VP_RISCV): New. * cipher/rijndael-vp-simd128.h [__ARM_NEON]: Move ARM NEON macros to ... * cipher/rijndael-vp-aarch64.c: ... here. * cipher/rijndael-vp-riscv.c: New. * cipher/rijndael-vp-simd128.h: Use '__m128i_const' type for constant vector values and use *_amemld() macros to load these values to vector registers. [__x86_64__] (vpaddd128, vpaddb128): Remove. [__x86_64__] (psrl_byte_128, movdqa128_memld, pand128_amemld) (paddq128_amemld, paddd128_amemld, pshufb128_amemld): New. [HAVE_SIMD256] (aes_encrypt_core_4blks_simd256) (aes_decrypt_core_4blks_simd256): New. (FUNC_CTR_ENC, FUNC_CTR32LE_ENC, FUNC_CFB_DEC, FUNC_CBC_DEC) (aes_simd128_ocb_enc, aes_simd128_ocb_dec, FUNC_OCB_AUTH) (aes_simd128_ecb_enc, aes_simd128_ecb_dec, aes_simd128_xts_enc) (aes_simd128_xts_dec) [HAVE_SIMD256]: Add 4 block parallel code paths for HW with 256-bit wide vectors. * cipher/rijndael.c [USE_VP_RISCV] (_gcry_aes_vp_riscv_setup_acceleration, _gcry_aes_vp_riscv_do_setkey) (_gcry_aes_vp_riscv_prepare_decryption, _gcry_aes_vp_riscv_encrypt) (_gcry_aes_vp_riscv_decrypt, _gcry_aes_vp_riscv_cfb_enc) (_gcry_aes_vp_riscv_cbc_enc, _gcry_aes_vp_riscv_ctr_enc) (_gcry_aes_vp_riscv_ctr32le_enc, _gcry_aes_vp_riscv_cfb_dec) (_gcry_aes_vp_riscv_cbc_dec, _gcry_aes_vp_riscv_ocb_crypt) (_gcry_aes_vp_riscv_ocb_auth, _gcry_aes_vp_riscv_ecb_crypt) (_gcry_aes_vp_riscv_xts_crypt): New. (do_setkey) [USE_VP_RISCV]: Setup vector permute AES for RISC-V with HWF_RISCV_IMAFDC and HWF_RISCV_V. * cipher/simd-common-riscv.h: New. * configure.ac: Add 'rijndael-vp-riscv.lo'. 
(gcry_cv_cc_riscv_vector_intrinsics) (gcry_cv_cc_riscv_vector_intrinsics_cflags): New. -- Patch adds AES vector permutation implementation for RISC-V with fixed vector lengths of 128-bit and 256-bit. Benchmark on SpacemiT K1 (1600 Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 35.30 ns/B 27.02 MiB/s 56.48 c/B ECB dec | 35.51 ns/B 26.86 MiB/s 56.81 c/B CBC enc | 35.40 ns/B 26.94 MiB/s 56.63 c/B CBC dec | 36.30 ns/B 26.27 MiB/s 58.08 c/B CFB enc | 36.25 ns/B 26.31 MiB/s 58.00 c/B CFB dec | 36.25 ns/B 26.31 MiB/s 58.00 c/B OFB enc | 38.28 ns/B 24.91 MiB/s 61.25 c/B OFB dec | 38.28 ns/B 24.91 MiB/s 61.26 c/B CTR enc | 39.81 ns/B 23.96 MiB/s 63.69 c/B CTR dec | 39.81 ns/B 23.96 MiB/s 63.69 c/B XTS enc | 36.38 ns/B 26.22 MiB/s 58.20 c/B XTS dec | 36.26 ns/B 26.30 MiB/s 58.01 c/B OCB enc | 40.94 ns/B 23.29 MiB/s 65.50 c/B OCB dec | 40.71 ns/B 23.43 MiB/s 65.13 c/B OCB auth | 37.34 ns/B 25.54 MiB/s 59.75 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte speed vs old ECB enc | 16.76 ns/B 56.90 MiB/s 26.82 c/B 2.11x ECB dec | 19.94 ns/B 47.84 MiB/s 31.90 c/B 1.78x CBC enc | 31.72 ns/B 30.06 MiB/s 50.75 c/B 1.12x CBC dec | 20.24 ns/B 47.12 MiB/s 32.38 c/B 1.79x CFB enc | 31.80 ns/B 29.99 MiB/s 50.88 c/B 1.14x CFB dec | 16.87 ns/B 56.55 MiB/s 26.98 c/B 2.15x OFB enc | 38.68 ns/B 24.66 MiB/s 61.88 c/B 0.99x OFB dec | 38.65 ns/B 24.67 MiB/s 61.85 c/B 0.99x CTR enc | 16.86 ns/B 56.57 MiB/s 26.97 c/B 2.36x XTS enc | 17.49 ns/B 54.51 MiB/s 27.99 c/B 2.08x XTS dec | 20.80 ns/B 45.86 MiB/s 33.27 c/B 1.74x GCM enc | 31.16 ns/B 30.61 MiB/s 49.85 c/B 1.73x OCB enc | 17.25 ns/B 55.28 MiB/s 27.60 c/B 2.37x OCB dec | 20.64 ns/B 46.21 MiB/s 33.02 c/B 1.97x OCB auth | 17.11 ns/B 55.73 MiB/s 27.38 c/B 2.18x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 16 +- cipher/rijndael-internal.h | 8 +- cipher/rijndael-vp-aarch64.c | 60 +- cipher/rijndael-vp-riscv.c | 285 ++++++++++ cipher/rijndael-vp-simd128.h | 1044 +++++++++++++++++++++++++++------- cipher/rijndael.c | 80 +++ cipher/simd-common-riscv.h | 48 ++ configure.ac | 118 ++++ 8 files changed, 1463 insertions(+), 196 deletions(-) create mode 100644 cipher/rijndael-vp-riscv.c create mode 100644 cipher/simd-common-riscv.h diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 90415d83..88b2d17c 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -119,7 +119,8 @@ EXTRA_libcipher_la_SOURCES = \ rijndael-p10le.c rijndael-gcm-p10le.s \ rijndael-ppc-common.h rijndael-ppc-functions.h \ rijndael-s390x.c \ - rijndael-vp-aarch64.c rijndael-vp-simd128.h \ + rijndael-vp-aarch64.c rijndael-vp-riscv.c \ + rijndael-vp-simd128.h \ rmd160.c \ rsa.c \ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ @@ -349,3 +350,16 @@ serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile `echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) ` + +if ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS +# Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) +riscv_vector_cflags = -O2 -march=rv64imafdcv -mstrict-align +else +riscv_vector_cflags = +endif + +rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile + `echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` + +rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h index 69ef86af..92310fc5 100644 --- 
a/cipher/rijndael-internal.h +++ b/cipher/rijndael-internal.h @@ -124,12 +124,18 @@ # endif #endif /* ENABLE_ARM_CRYPTO_SUPPORT */ -/* USE_ARM_CE indicates whether to enable vector permute AArch64 SIMD code. */ +/* USE_VP_AARCH64 indicates whether to enable vector permute AArch64 SIMD code. */ #undef USE_VP_AARCH64 #if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) # define USE_VP_AARCH64 1 #endif +/* USE_VP_RISCV indicates whether to enable vector permute RISC-V code. */ +#undef USE_VP_RISCV +#if defined (__riscv) && defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) +# define USE_VP_RISCV 1 +#endif + /* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto * accelerated code. USE_PPC_CRYPTO_WITH_PPC9LE indicates whether to * enable POWER9 optimized variant. */ diff --git a/cipher/rijndael-vp-aarch64.c b/cipher/rijndael-vp-aarch64.c index 0532c421..9c8b852b 100644 --- a/cipher/rijndael-vp-aarch64.c +++ b/cipher/rijndael-vp-aarch64.c @@ -1,5 +1,5 @@ -/* SSSE3 vector permutation AES for Libgcrypt - * Copyright (C) 2014-2017 Jussi Kivilinna +/* AArch64 SIMD vector permutation AES for Libgcrypt + * Copyright (C) 2014-2025 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -50,6 +50,62 @@ #ifdef USE_VP_AARCH64 +/********************************************************************** + AT&T x86 asm to intrinsics conversion macros (ARM) + **********************************************************************/ + +#include "simd-common-aarch64.h" +#include + +#define __m128i uint64x2_t + +#define pand128(a, o) (o = vandq_u64(o, a)) +#define pandn128(a, o) (o = vbicq_u64(a, o)) +#define pxor128(a, o) (o = veorq_u64(o, a)) +#define paddq128(a, o) (o = vaddq_u64(o, a)) +#define paddd128(a, o) (o = (__m128i)vaddq_u32((uint32x4_t)o, (uint32x4_t)a)) +#define paddb128(a, o) (o = (__m128i)vaddq_u8((uint8x16_t)o, (uint8x16_t)a)) + +#define psrld128(s, o) (o = (__m128i)vshrq_n_u32((uint32x4_t)o, s)) +#define psraq128(s, o) (o = (__m128i)vshrq_n_s64((int64x2_t)o, s)) +#define psrldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)o, \ + (uint8x16_t)__tmp, (s) & 15);}) +#define pslldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)__tmp, \ + (uint8x16_t)o, (16 - (s)) & 15);}) +#define psrl_byte_128(s, o) (o = (__m128i)vshrq_n_u8((uint8x16_t)o, s)) + +#define pshufb128(m8, o) (o = (__m128i)vqtbl1q_u8((uint8x16_t)o, (uint8x16_t)m8)) +#define pshufd128(m32, a, o) ({ static const __m128i __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \ + __m128i __tmp2; \ + movdqa128(a, o); \ + movdqa128_memld(&__tmp1, __tmp2); \ + pshufb128(__tmp2, o); }) +#define pshufd128_0x93(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 12)) +#define pshufd128_0xFF(a, o) (o = (__m128i)vdupq_laneq_u32((uint32x4_t)a, 3)) +#define pshufd128_0xFE(a, o) pshufd128(0xFE, a, o) +#define pshufd128_0x4E(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8)) + +#define palignr128(s, a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)o, s)) + +#define movdqa128(a, o) (o = a) + +#define movdqa128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a))) + +#define pand128_amemld(m, o) pand128((__m128i)vld1q_u8((const uint8_t *)(m)), o) +#define pxor128_amemld(m, o) pxor128((__m128i)vld1q_u8((const uint8_t *)(m)), o) +#define paddq128_amemld(m, o) paddq128((__m128i)vld1q_u8((const uint8_t *)(m)), o) +#define paddd128_amemld(m, o) paddd128((__m128i)vld1q_u8((const uint8_t *)(m)), o) +#define pshufb128_amemld(m, o) 
pshufb128((__m128i)vld1q_u8((const uint8_t *)(m)), o) + +/* Following operations may have unaligned memory input */ +#define movdqu128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a))) + +/* Following operations may have unaligned memory output */ +#define movdqu128_memst(a, o) vst1q_u8((uint8_t *)(o), (uint8x16_t)a) + + #ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE # define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) #else diff --git a/cipher/rijndael-vp-riscv.c b/cipher/rijndael-vp-riscv.c new file mode 100644 index 00000000..b8c6ed13 --- /dev/null +++ b/cipher/rijndael-vp-riscv.c @@ -0,0 +1,285 @@ +/* RISC-V vector permutation AES for Libgcrypt + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * + * The code is based on the public domain library libvpaes version 0.5 + * available at http://crypto.stanford.edu/vpaes/ and which carries + * this notice: + * + * libvpaes: constant-time SSSE3 AES encryption and decryption. + * version 0.5 + * + * By Mike Hamburg, Stanford University, 2009. Public domain. + * I wrote essentially all of this code. I did not write the test + * vectors; they are the NIST known answer tests. I hereby release all + * the code and documentation here that I wrote into the public domain. 
+ * + * This is an implementation of AES following my paper, + * "Accelerating AES with Vector Permute Instructions" + * CHES 2009; http://shiftleft.org/papers/vector_aes/ + */ + +#include +#include +#include +#include /* for memcmp() */ + +#include "types.h" /* for byte and u32 typedefs */ +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "rijndael-internal.h" +#include "./cipher-internal.h" + + +#ifdef USE_VP_RISCV + + +/********************************************************************** + AT&T x86 asm to intrinsics conversion macros (RISC-V) + **********************************************************************/ + +#include +#include "simd-common-riscv.h" + +/* + * SIMD128 + */ + +typedef vuint8m1_t __m128i; + +#define cast_m128i_to_s8(a) (__riscv_vreinterpret_v_u8m1_i8m1(a)) +#define cast_m128i_to_u32(a) (__riscv_vreinterpret_v_u8m1_u32m1(a)) +#define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a)) +#define cast_m128i_to_s64(a) (__riscv_vreinterpret_v_u64m1_i64m1(cast_m128i_to_u64(a))) + +#define cast_s8_to_m128i(a) (__riscv_vreinterpret_v_i8m1_u8m1(a)) +#define cast_u32_to_m128i(a) (__riscv_vreinterpret_v_u32m1_u8m1(a)) +#define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) +#define cast_s64_to_m128i(a) (cast_u64_to_m128i(__riscv_vreinterpret_v_i64m1_u64m1(a))) + +#define pand128(a, o) (o = __riscv_vand_vv_u8m1((o), (a), 16)) +#define pandn128(a, o) (o = __riscv_vand_vv_u8m1(__riscv_vnot_v_u8m1((o), 16), (a), 16)) +#define pxor128(a, o) (o = __riscv_vxor_vv_u8m1((o), (a), 16)) +#define paddb128(a, o) (o = __riscv_vadd_vv_u8m1((o), (a), 16)) +#define paddd128(a, o) (o = cast_u32_to_m128i(__riscv_vadd_vv_u32m1( \ + cast_m128i_to_u32(o), \ + cast_m128i_to_u32(a), 4))) +#define paddq128(a, o) (o = cast_u64_to_m128i(__riscv_vadd_vv_u64m1( \ + cast_m128i_to_u64(o), \ + cast_m128i_to_u64(a), 2))) + +#define psrld128(s, o) (o = cast_u32_to_m128i(__riscv_vsrl_vx_u32m1(cast_m128i_to_u32(o), (s), 4)) +#define psraq128(s, o) (o = cast_s64_to_m128i(__riscv_vsra_vx_i64m1(cast_m128i_to_s64(o), (s), 2))) +#define psrldq128(s, o) (o = __riscv_vslidedown_vx_u8m1((o), (s), 16)) +#define pslldq128(s, o) ({ vuint8m1_t __tmp = __riscv_vmv_v_x_u8m1(0, 16); \ + o = __riscv_vslideup_vx_u8m1(__tmp, (o), (s), 16); }) +#define psrl_byte_128(s, o) (o = __riscv_vsrl_vx_u8m1((o), (s), 16)) + +#define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16)) +#define pshufd128(m32, a, o) ({ static const __m128i_const __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \ + __m128i __tmp2; \ + movdqa128(a, o); \ + movdqa128_memld(&__tmp1, __tmp2); \ + pshufb128(__tmp2, o); }) + +#define pshufd128_0x93(a, o) pshufd128(0x93, a, o) +#define pshufd128_0xFF(a, o) (o = cast_u32_to_m128i(__riscv_vrgather_vx_u32m1(cast_m128i_to_u32(a), 3, 4))) +#define pshufd128_0xFE(a, o) pshufd128(0xFE, a, o) +#define pshufd128_0x4E(a, o) pshufd128(0x4E, a, o) + +#define palignr128(s, a, o) (o = __riscv_vslideup_vx_u8m1(__riscv_vslidedown_vx_u8m1((a), (s), 16), (o), 16 - (s), 16)) + +#define movdqa128(a, o) (o = (a)) + +#define movdqa128_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 16)) + +#define pand128_amemld(m, o) pand128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o)) +#define pxor128_amemld(m, o) pxor128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o)) +#define paddq128_amemld(m, o) paddq128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o)) +#define paddd128_amemld(m, o) paddd128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o)) +#define pshufb128_amemld(m, o) 
pshufb128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o)) + +/* Following operations may have unaligned memory input */ +#define movdqu128_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 16)) + +/* Following operations may have unaligned memory output */ +#define movdqu128_memst(a, o) (__riscv_vse8_v_u8m1((void *)(o), (a), 16)) + +/* + * SIMD256 + */ + +#define PSHUFD256_MASK_TO_PSHUFB256_MASK(m32) { \ + M128I_BYTE(((((m32) >> 0) & 0x03) * 4) + 0, \ + ((((m32) >> 0) & 0x03) * 4) + 1, \ + ((((m32) >> 0) & 0x03) * 4) + 2, \ + ((((m32) >> 0) & 0x03) * 4) + 3, \ + ((((m32) >> 2) & 0x03) * 4) + 0, \ + ((((m32) >> 2) & 0x03) * 4) + 1, \ + ((((m32) >> 2) & 0x03) * 4) + 2, \ + ((((m32) >> 2) & 0x03) * 4) + 3, \ + ((((m32) >> 4) & 0x03) * 4) + 0, \ + ((((m32) >> 4) & 0x03) * 4) + 1, \ + ((((m32) >> 4) & 0x03) * 4) + 2, \ + ((((m32) >> 4) & 0x03) * 4) + 3, \ + ((((m32) >> 6) & 0x03) * 4) + 0, \ + ((((m32) >> 6) & 0x03) * 4) + 1, \ + ((((m32) >> 6) & 0x03) * 4) + 2, \ + ((((m32) >> 6) & 0x03) * 4) + 3), \ + M128I_BYTE(((((m32) >> 0) & 0x03) * 4) + 1 + 16, \ + ((((m32) >> 0) & 0x03) * 4) + 1 + 16, \ + ((((m32) >> 0) & 0x03) * 4) + 2 + 16, \ + ((((m32) >> 0) & 0x03) * 4) + 3 + 16, \ + ((((m32) >> 2) & 0x03) * 4) + 0 + 16, \ + ((((m32) >> 2) & 0x03) * 4) + 1 + 16, \ + ((((m32) >> 2) & 0x03) * 4) + 2 + 16, \ + ((((m32) >> 2) & 0x03) * 4) + 3 + 16, \ + ((((m32) >> 4) & 0x03) * 4) + 0 + 16, \ + ((((m32) >> 4) & 0x03) * 4) + 1 + 16, \ + ((((m32) >> 4) & 0x03) * 4) + 2 + 16, \ + ((((m32) >> 4) & 0x03) * 4) + 3 + 16, \ + ((((m32) >> 6) & 0x03) * 4) + 0 + 16, \ + ((((m32) >> 6) & 0x03) * 4) + 1 + 16, \ + ((((m32) >> 6) & 0x03) * 4) + 2 + 16, \ + ((((m32) >> 6) & 0x03) * 4) + 3 + 16) } + +typedef vuint8m1_t __m256i; + +#define HAVE_SIMD256 1 + +#define check_simd256_support() (__riscv_vsetvl_e8m1(32) == 32) + +#define cast_m256i_to_s8(a) cast_m128i_to_s8(a) +#define cast_m256i_to_u32(a) cast_m128i_to_u32(a) +#define cast_m256i_to_u64(a) cast_m128i_to_u64(a) +#define cast_m256i_to_s64(a) cast_m128i_to_s64(a) + +#define cast_s8_to_m256i(a) (__riscv_vreinterpret_v_i8m1_u8m1(a)) +#define cast_u32_to_m256i(a) (__riscv_vreinterpret_v_u32m1_u8m1(a)) +#define cast_u64_to_m256i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) +#define cast_s64_to_m256i(a) (cast_u64_to_m128i(__riscv_vreinterpret_v_i64m1_u64m1(a))) + +#define pand256(a, o) (o = __riscv_vand_vv_u8m1((o), (a), 32)) +#define pandn256(a, o) (o = __riscv_vand_vv_u8m1(__riscv_vnot_v_u8m1((o), 32), (a), 32)) +#define pxor256(a, o) (o = __riscv_vxor_vv_u8m1((o), (a), 32)) +#define paddb256(a, o) (o = __riscv_vadd_vv_u8m1((o), (a), 32)) +#define paddd256(a, o) (o = cast_u32_to_m256i(__riscv_vadd_vv_u32m1( \ + cast_m256i_to_u32(o), \ + cast_m256i_to_u32(a), 8))) +#define paddq256(a, o) (o = cast_u64_to_m256i(__riscv_vadd_vv_u64m1( \ + cast_m256i_to_u64(o), \ + cast_m256i_to_u64(a), 4))) + +#define psrld256(s, o) (o = cast_u32_to_m256i(__riscv_vsrl_vx_u32m1(cast_m256i_to_u32(o), (s), 8)) +#define psraq256(s, o) (o = cast_s64_to_m256i(__riscv_vsra_vx_i64m1(cast_m256i_to_s64(o), (s), 4))) +#define psrl_byte_256(s, o) (o = __riscv_vsrl_vx_u8m1((o), (s), 32)) + +/* Note: these are not PSHUFB equavalent as full 256-bit vector is used as + * 32 byte table. 256-bit PSHUFB on x86 handles 128-bit lanes separately as + * 128-bit 16 byte tables. */ + +/* tab32 variant: indexes have values 0..31. Used when 'm8' is constant and + * variable data is in 'o'. 
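The pshufb mappings in this file are what the whole vector-permute
construction hangs on: a 16-byte table lookup becomes a single vrgather.vv,
and with vl=32 the gather indexes the full 32-byte register as one table
(unlike x86 VPSHUFB, which shuffles each 128-bit lane against its own 16-byte
table -- hence the tab16/tab32 split).  A stand-alone sketch of the 16-byte
case, illustrative only and not taken from the patch; it assumes
<riscv_vector.h> and a CPU with the V extension:

  #include <stdint.h>
  #include <riscv_vector.h>

  /* out[i] = table[idx[i] & 15] for i = 0..15 -- the PSHUFB-style primitive
   * that pshufb128() maps to vrgather.vv.  (Real PSHUFB also zeroes lanes
   * whose index has bit 7 set; vpaes never relies on that, so the masking
   * here only keeps the indexes in range.) */
  static void
  tab16_lookup (uint8_t out[16], const uint8_t table[16], const uint8_t idx[16])
  {
    vuint8m1_t tab = __riscv_vle8_v_u8m1 (table, 16);
    vuint8m1_t sel = __riscv_vand_vx_u8m1 (__riscv_vle8_v_u8m1 (idx, 16), 15, 16);
    __riscv_vse8_v_u8m1 (out, __riscv_vrgather_vv_u8m1 (tab, sel, 16), 16);
  }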
*/ +#define pshufb256_tab32(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 32)) + +/* tab16 variant: indexes have values 0..16 and only low 128-bit of 'o' is + * used. Used when 'o' is constant and variable data is in 'm8'. */ +#define pshufb256_tab16(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 32)) + +/* Load 16 byte mask for 'pshufb256_tab32' usage as if 256-bit PSHUFB was to be + * used as on x86 (two separate 128-bit lanes). */ +#define load_tab32_mask(m, o) ({ __m128i __tmp_lo128; \ + __m128i __tmp_hi128; \ + movdqu128_memld(m, __tmp_lo128); \ + __tmp_hi128 = __riscv_vadd_vx_u8m1(__tmp_lo128, 16, 16); \ + o = __riscv_vslideup_vx_u8m1(__tmp_lo128, __tmp_hi128, 16, 32); }) + +#define broadcast128_256(a, o) (o = __riscv_vslideup_vx_u8m1((a), (a), 16, 32)) + +/* Load 16 byte table for 'pshufb256_tab16' usage. On x86 this would splat + * 128-bit table from memory to both 128-bit lanes of 256-bit register. + * On RISC-V this just loads memory to lower 128-bits. */ +#define load_tab16_table(m, o) movdqu128_memld(m, o) + +#define pshufd256(m32, a, o) ({ static const __m128i_const __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \ + __m256i __tmp2; \ + movdqa256(a, o); \ + load_tab32_mask(&__tmp1, __tmp2); \ + pshufb256_tab32(__tmp2, o); }) + +#define pshufd256_0x93(a, o) pshufd256(0x93, a, o) + +#define insert256_hi128(x, o) (o = __riscv_vslideup_vx_u8m1((o), (x), 16, 32)) +#define extract256_hi128(y, o) (o = __riscv_vslidedown_vx_u8m1((y), 16, 32)) + +#define movdqa256(a, o) (o = (a)) + +#define movdqa128_256(a, o) (o = (a)) +#define movdqa256_128(a, o) (o = (a)) + +#define movdqa256_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 32)) + +#define pand256_amemld(m, o) pand128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o)) +#define pxor256_amemld(m, o) pxor128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o)) +#define paddq256_amemld(m, o) paddq128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o)) +#define paddd256_amemld(m, o) paddd128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o)) +#define pshufb256_amemld(m, o) pshufb128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o)) +#define broadcast128_256_amemld(m, o) \ + broadcast128_256(__riscv_vle8_v_u8m1((const void *)(m), 32), (o)) + +/* Following operations may have unaligned memory input */ +#define movdqu256_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 32)) + +/* Following operations may have unaligned memory output */ +#define movdqu256_memst(a, o) (__riscv_vse8_v_u8m1((void *)(o), (a), 32)) + + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#define SIMD128_OPT_ATTR FUNC_ATTR_OPT + +#define FUNC_ENCRYPT _gcry_aes_vp_riscv_encrypt +#define FUNC_DECRYPT _gcry_aes_vp_riscv_decrypt +#define FUNC_CFB_ENC _gcry_aes_vp_riscv_cfb_enc +#define FUNC_CFB_DEC _gcry_aes_vp_riscv_cfb_dec +#define FUNC_CBC_ENC _gcry_aes_vp_riscv_cbc_enc +#define FUNC_CBC_DEC _gcry_aes_vp_riscv_cbc_dec +#define FUNC_CTR_ENC _gcry_aes_vp_riscv_ctr_enc +#define FUNC_CTR32LE_ENC _gcry_aes_vp_riscv_ctr32le_enc +#define FUNC_OCB_CRYPT _gcry_aes_vp_riscv_ocb_crypt +#define FUNC_OCB_AUTH _gcry_aes_vp_riscv_ocb_auth +#define FUNC_ECB_CRYPT _gcry_aes_vp_riscv_ecb_crypt +#define FUNC_XTS_CRYPT _gcry_aes_vp_riscv_xts_crypt +#define FUNC_SETKEY _gcry_aes_vp_riscv_do_setkey +#define FUNC_PREPARE_DEC _gcry_aes_vp_riscv_prepare_decryption + +#include "rijndael-vp-simd128.h" + +int +_gcry_aes_vp_riscv_setup_acceleration(RIJNDAEL_context *ctx) +{ + (void)ctx; + return 
(__riscv_vsetvl_e8m1(16) == 16); +} + +#endif /* USE_VP_RISCV */ diff --git a/cipher/rijndael-vp-simd128.h b/cipher/rijndael-vp-simd128.h index f6fc8d5e..af8ee291 100644 --- a/cipher/rijndael-vp-simd128.h +++ b/cipher/rijndael-vp-simd128.h @@ -1,5 +1,5 @@ /* SIMD128 intrinsics implementation vector permutation AES for Libgcrypt - * Copyright (C) 2024 Jussi Kivilinna + * Copyright (C) 2024-2025 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -92,55 +92,7 @@ #define M128I_U64(a0, a1) { a0, a1 } -#ifdef __ARM_NEON - -/********************************************************************** - AT&T x86 asm to intrinsics conversion macros (ARM) - **********************************************************************/ - -#include "simd-common-aarch64.h" -#include - -#define __m128i uint64x2_t - -#define pand128(a, o) (o = vandq_u64(o, a)) -#define pandn128(a, o) (o = vbicq_u64(a, o)) -#define pxor128(a, o) (o = veorq_u64(o, a)) -#define paddq128(a, o) (o = vaddq_u64(o, a)) -#define paddd128(a, o) (o = (__m128i)vaddq_u32((uint32x4_t)o, (uint32x4_t)a)) -#define paddb128(a, o) (o = (__m128i)vaddq_u8((uint8x16_t)o, (uint8x16_t)a)) - -#define psrld128(s, o) (o = (__m128i)vshrq_n_u32((uint32x4_t)o, s)) -#define psraq128(s, o) (o = (__m128i)vshrq_n_s64((int64x2_t)o, s)) -#define psrldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \ - o = (__m128i)vextq_u8((uint8x16_t)o, \ - (uint8x16_t)__tmp, (s) & 15);}) -#define pslldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \ - o = (__m128i)vextq_u8((uint8x16_t)__tmp, \ - (uint8x16_t)o, (16 - (s)) & 15);}) - -#define pshufb128(m8, o) (o = (__m128i)vqtbl1q_u8((uint8x16_t)o, (uint8x16_t)m8)) -#define pshufd128(m32, a, o) ({ static const __m128i __tmp = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \ - movdqa128(a, o); \ - pshufb128(__tmp, o); }) -#define pshufd128_0x93(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 12)) -#define pshufd128_0xFF(a, o) (o = (__m128i)vdupq_laneq_u32((uint32x4_t)a, 3)) -#define pshufd128_0xFE(a, o) pshufd128(0xFE, a, o) -#define pshufd128_0x4E(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8)) - -#define palignr128(s, a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)o, s)) - -#define movdqa128(a, o) (o = a) - -#define pxor128_amemld(m, o) pxor128(*(const __m128i *)(m), o) - -/* Following operations may have unaligned memory input */ -#define movdqu128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a))) - -/* Following operations may have unaligned memory output */ -#define movdqu128_memst(a, o) vst1q_u8((uint8_t *)(o), (uint8x16_t)a) - -#endif /* __ARM_NEON */ +typedef u64 __m128i_const[2] __attribute__ ((aligned (16))); #if defined(__x86_64__) || defined(__i386__) @@ -154,13 +106,12 @@ #define pandn128(a, o) (o = _mm_andnot_si128(o, a)) #define pxor128(a, o) (o = _mm_xor_si128(o, a)) #define paddq128(a, o) (o = _mm_add_epi64(o, a)) -#define vpaddd128(a, o) (o = _mm_add_epi32(o, a)) -#define vpaddb128(a, o) (o = _mm_add_epi8(o, a)) #define psrld128(s, o) (o = _mm_srli_epi32(o, s)) #define psraq128(s, o) (o = _mm_srai_epi64(o, s)) #define psrldq128(s, o) (o = _mm_srli_si128(o, s)) #define pslldq128(s, o) (o = _mm_slli_si128(o, s)) +#define psrl_byte_128(s, o) psrld128(o, s) #define pshufb128(m8, o) (o = _mm_shuffle_epi8(o, m8)) #define pshufd128(m32, a, o) (o = _mm_shuffle_epi32(a, m32)) @@ -173,7 +124,13 @@ #define movdqa128(a, o) (o = a) -#define pxor128_amemld(m, o) pxor128(*(const __m128i *)(m), o) +#define movdqa128_memld(a, o) (o = (__m128i)_mm_load_si128((const void *)(a))) + +#define pand128_amemld(m, o) 
pand128((__m128i)_mm_load_si128((const void *)(m)), o) +#define pxor128_amemld(m, o) pxor128((__m128i)_mm_load_si128((const void *)(m)), o) +#define paddq128_amemld(m, o) paddq128((__m128i)_mm_load_si128((const void *)(m)), o) +#define paddd128_amemld(m, o) paddd128((__m128i)_mm_load_si128((const void *)(m)), o) +#define pshufb128_amemld(m, o) pshufb128((__m128i)_mm_load_si128((const void *)(m)), o) /* Following operations may have unaligned memory input */ #define movdqu128_memld(a, o) (o = _mm_loadu_si128((const __m128i *)(a))) @@ -225,73 +182,73 @@ constant vectors **********************************************************************/ -static const __m128i k_s0F = +static const __m128i_const k_s0F = M128I_U64( 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ); -static const __m128i k_iptlo = +static const __m128i_const k_iptlo = M128I_U64( 0xC2B2E8985A2A7000, 0xCABAE09052227808 ); -static const __m128i k_ipthi = +static const __m128i_const k_ipthi = M128I_U64( 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 ); -static const __m128i k_inv = +static const __m128i_const k_inv = M128I_U64( 0x0E05060F0D080180, 0x040703090A0B0C02 ); -static const __m128i k_inva = +static const __m128i_const k_inva = M128I_U64( 0x01040A060F0B0780, 0x030D0E0C02050809 ); -static const __m128i k_sb1u = +static const __m128i_const k_sb1u = M128I_U64( 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 ); -static const __m128i k_sb1t = +static const __m128i_const k_sb1t = M128I_U64( 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF ); -static const __m128i k_sb2u = +static const __m128i_const k_sb2u = M128I_U64( 0xE27A93C60B712400, 0x5EB7E955BC982FCD ); -static const __m128i k_sb2t = +static const __m128i_const k_sb2t = M128I_U64( 0x69EB88400AE12900, 0xC2A163C8AB82234A ); -static const __m128i k_sbou = +static const __m128i_const k_sbou = M128I_U64( 0xD0D26D176FBDC700, 0x15AABF7AC502A878 ); -static const __m128i k_sbot = +static const __m128i_const k_sbot = M128I_U64( 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA ); -static const __m128i k_mc_forward[4] = +static const __m128i_const k_mc_forward[4] = { M128I_U64( 0x0407060500030201, @@ -311,7 +268,7 @@ static const __m128i k_mc_forward[4] = ) }; -static const __m128i k_mc_backward[4] = +static const __m128i_const k_mc_backward[4] = { M128I_U64( 0x0605040702010003, @@ -331,7 +288,7 @@ static const __m128i k_mc_backward[4] = ) }; -static const __m128i k_sr[4] = +static const __m128i_const k_sr[4] = { M128I_U64( 0x0706050403020100, @@ -351,19 +308,19 @@ static const __m128i k_sr[4] = ) }; -static const __m128i k_rcon = +static const __m128i_const k_rcon = M128I_U64( 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 ); -static const __m128i k_s63 = +static const __m128i_const k_s63 = M128I_U64( 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B ); -static const __m128i k_opt[2] = +static const __m128i_const k_opt[2] = { M128I_U64( 0xFF9F4929D6B66000, @@ -375,7 +332,7 @@ static const __m128i k_opt[2] = ) }; -static const __m128i k_deskew[2] = +static const __m128i_const k_deskew[2] = { M128I_U64( 0x07E4A34047A4E300, @@ -387,7 +344,7 @@ static const __m128i k_deskew[2] = ) }; -static const __m128i k_dks_1[2] = +static const __m128i_const k_dks_1[2] = { M128I_U64( 0xB6116FC87ED9A700, @@ -399,7 +356,7 @@ static const __m128i k_dks_1[2] = ) }; -static const __m128i k_dks_2[2] = +static const __m128i_const k_dks_2[2] = { M128I_U64( 0x27438FEBCCA86400, @@ -411,7 +368,7 @@ static const __m128i k_dks_2[2] = ) }; -static const __m128i k_dks_3[2] = +static const __m128i_const k_dks_3[2] = { M128I_U64( 0x03C4C50201C6C700, @@ -423,7 +380,7 @@ static 
const __m128i k_dks_3[2] = ) }; -static const __m128i k_dks_4[2] = +static const __m128i_const k_dks_4[2] = { M128I_U64( 0xE3C390B053732000, @@ -435,7 +392,7 @@ static const __m128i k_dks_4[2] = ) }; -static const __m128i k_dipt[2] = +static const __m128i_const k_dipt[2] = { M128I_U64( 0x0F505B040B545F00, @@ -447,7 +404,7 @@ static const __m128i k_dipt[2] = ) }; -static const __m128i k_dsb9[2] = +static const __m128i_const k_dsb9[2] = { M128I_U64( 0x851C03539A86D600, @@ -459,7 +416,7 @@ static const __m128i k_dsb9[2] = ) }; -static const __m128i k_dsbd[2] = +static const __m128i_const k_dsbd[2] = { M128I_U64( 0x7D57CCDFE6B1A200, @@ -471,7 +428,7 @@ static const __m128i k_dsbd[2] = ) }; -static const __m128i k_dsbb[2] = +static const __m128i_const k_dsbb[2] = { M128I_U64( 0xD022649296B44200, @@ -483,7 +440,7 @@ static const __m128i k_dsbb[2] = ) }; -static const __m128i k_dsbe[2] = +static const __m128i_const k_dsbe[2] = { M128I_U64( 0x46F2929626D4D000, @@ -495,7 +452,7 @@ static const __m128i k_dsbe[2] = ) }; -static const __m128i k_dsbo[2] = +static const __m128i_const k_dsbo[2] = { M128I_U64( 0x1387EA537EF94000, @@ -551,8 +508,8 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8, if (!low_round_only) { /* extract rcon from xmm8 */ - static const __m128i zero = { 0 }; - xmm1 = zero; + static const __m128i_const zero = { 0 }; + movdqa128_memld(&zero, xmm1); palignr128(15, xmm8, xmm1); palignr128(15, xmm8, xmm8); pxor128(xmm1, xmm7); @@ -569,12 +526,12 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8, movdqa128(xmm7, xmm1); pslldq128(8, xmm7); pxor128(xmm1, xmm7); - pxor128(k_s63, xmm7); + pxor128_amemld(&k_s63, xmm7); /* subbytes */ movdqa128(xmm9, xmm1); pandn128(xmm0, xmm1); - psrld128(4, xmm1); /* 1 = i */ + psrl_byte_128(4, xmm1); /* 1 = i */ pand128(xmm9, xmm0); /* 0 = k */ movdqa128(xmm11, xmm2); /* 2 : a/k */ pshufb128(xmm0, xmm2); /* 2 = a/k */ @@ -591,9 +548,9 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8, movdqa128(xmm10, xmm3); /* 3 : 1/jak */ pshufb128(xmm4, xmm3); /* 3 = 1/jak */ pxor128(xmm1, xmm3); /* 3 = jo */ - movdqa128(k_sb1u, xmm4); /* 4 : sbou */ + movdqa128_memld(&k_sb1u, xmm4); /* 4 : sbou */ pshufb128(xmm2, xmm4); /* 4 = sbou */ - movdqa128(k_sb1t, xmm0); /* 0 : sbot */ + movdqa128_memld(&k_sb1t, xmm0); /* 0 : sbot */ pshufb128(xmm3, xmm0); /* 0 = sb1t */ pxor128(xmm4, xmm0); /* 0 = sbox output */ @@ -608,7 +565,8 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8, static ASM_FUNC_ATTR_INLINE __m128i aes_schedule_transform(__m128i xmm0, const __m128i xmm9, - const __m128i tablelo, const __m128i tablehi) + const __m128i_const *tablelo, + const __m128i_const *tablehi) { /* aes_schedule_transform * @@ -622,11 +580,11 @@ aes_schedule_transform(__m128i xmm0, const __m128i xmm9, movdqa128(xmm9, xmm1); pandn128(xmm0, xmm1); - psrld128(4, xmm1); + psrl_byte_128(4, xmm1); pand128(xmm9, xmm0); - movdqa128(tablelo, xmm2); + movdqa128_memld(tablelo, xmm2); pshufb128(xmm0, xmm2); - movdqa128(tablehi, xmm0); + movdqa128_memld(tablehi, xmm0); pshufb128(xmm1, xmm0); pxor128(xmm2, xmm0); @@ -662,12 +620,12 @@ aes_schedule_mangle(__m128i xmm0, struct vp_aes_config_s *pconfig, int decrypt, unsigned int rotoffs = *protoffs; movdqa128(xmm0, xmm4); - movdqa128(k_mc_forward[0], xmm5); + movdqa128_memld(&k_mc_forward[0], xmm5); if (!decrypt) { keysched += 16; - pxor128(k_s63, xmm4); + pxor128_amemld(&k_s63, xmm4); pshufb128(xmm5, xmm4); movdqa128(xmm4, xmm3); pshufb128(xmm5, xmm4); @@ -678,29 +636,29 @@ 
aes_schedule_mangle(__m128i xmm0, struct vp_aes_config_s *pconfig, int decrypt, else { /* first table: *9 */ - xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_1[0], k_dks_1[1]); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_1[0], &k_dks_1[1]); movdqa128(xmm0, xmm3); pshufb128(xmm5, xmm3); /* next table: *B */ - xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_2[0], k_dks_2[1]); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_2[0], &k_dks_2[1]); pxor128(xmm0, xmm3); pshufb128(xmm5, xmm3); /* next table: *D */ - xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_3[0], k_dks_3[1]); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_3[0], &k_dks_3[1]); pxor128(xmm0, xmm3); pshufb128(xmm5, xmm3); /* next table: *E */ - xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_4[0], k_dks_4[1]); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_4[0], &k_dks_4[1]); pxor128(xmm0, xmm3); pshufb128(xmm5, xmm3); keysched -= 16; } - pshufb128(k_sr[rotoffs], xmm3); + pshufb128_amemld(&k_sr[rotoffs], xmm3); rotoffs -= 16 / 16; rotoffs &= 48 / 16; movdqu128_memst(xmm3, keysched); @@ -725,16 +683,16 @@ aes_schedule_mangle_last(__m128i xmm0, struct vp_aes_config_s config, if (!decrypt) { - pshufb128(k_sr[rotoffs], xmm0); /* output permute */ + pshufb128_amemld(&k_sr[rotoffs], xmm0); /* output permute */ config.keysched += 16; - pxor128(k_s63, xmm0); - xmm0 = aes_schedule_transform(xmm0, xmm9, k_opt[0], k_opt[1]); + pxor128_amemld(&k_s63, xmm0); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_opt[0], &k_opt[1]); } else { config.keysched -= 16; - pxor128(k_s63, xmm0); - xmm0 = aes_schedule_transform(xmm0, xmm9, k_deskew[0], k_deskew[1]); + pxor128_amemld(&k_s63, xmm0); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_deskew[0], &k_deskew[1]); } movdqu128_memst(xmm0, config.keysched); /* save last key */ @@ -825,7 +783,7 @@ aes_schedule_192(const byte *key, struct vp_aes_config_s config, int decrypt, int r = 4; movdqu128_memld(key + 8, xmm0); /* load key part 2 (very unaligned) */ - xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); /* input transform */ + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); /* input transform */ movdqa128(xmm0, xmm6); psrldq128(8, xmm6); pslldq128(8, xmm6); /* clobber low side with zeros */ @@ -867,7 +825,7 @@ aes_schedule_256(const byte *key, struct vp_aes_config_s config, int decrypt, int r = 7; movdqu128_memld(key + 16, xmm0); /* load key part 2 (unaligned) */ - xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); /* input transform */ + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); /* input transform */ while (1) { @@ -900,16 +858,16 @@ aes_schedule_core(const byte *key, struct vp_aes_config_s config, unsigned int keybits = (config.nround - 10) * 32 + 128; __m128i xmm0, xmm3, xmm7, xmm8, xmm9, xmm10, xmm11; - movdqa128(k_s0F, xmm9); - movdqa128(k_inv, xmm10); - movdqa128(k_inva, xmm11); - movdqa128(k_rcon, xmm8); + movdqa128_memld(&k_s0F, xmm9); + movdqa128_memld(&k_inv, xmm10); + movdqa128_memld(&k_inva, xmm11); + movdqa128_memld(&k_rcon, xmm8); movdqu128_memld(key, xmm0); /* input transform */ movdqa128(xmm0, xmm3); - xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); + xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); movdqa128(xmm0, xmm7); if (!decrypt) @@ -920,7 +878,7 @@ aes_schedule_core(const byte *key, struct vp_aes_config_s config, else { /* decrypting, output zeroth round key after shiftrows */ - pshufb128(k_sr[rotoffs], xmm3); + pshufb128_amemld(&k_sr[rotoffs], xmm3); movdqu128_memst(xmm3, 
config.keysched); rotoffs ^= 48 / 16; } @@ -998,23 +956,23 @@ FUNC_PREPARE_DEC (RIJNDAEL_context *ctx) } #define enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15) \ - movdqa128(k_s0F, xmm9); \ - movdqa128(k_inv, xmm10); \ - movdqa128(k_inva, xmm11); \ - movdqa128(k_sb1u, xmm13); \ - movdqa128(k_sb1t, xmm12); \ - movdqa128(k_sb2u, xmm15); \ - movdqa128(k_sb2t, xmm14); + movdqa128_memld(&k_s0F, xmm9); \ + movdqa128_memld(&k_inv, xmm10); \ + movdqa128_memld(&k_inva, xmm11); \ + movdqa128_memld(&k_sb1u, xmm13); \ + movdqa128_memld(&k_sb1t, xmm12); \ + movdqa128_memld(&k_sb2u, xmm15); \ + movdqa128_memld(&k_sb2t, xmm14); #define dec_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm8) \ - movdqa128(k_s0F, xmm9); \ - movdqa128(k_inv, xmm10); \ - movdqa128(k_inva, xmm11); \ - movdqa128(k_dsb9[0], xmm13); \ - movdqa128(k_dsb9[1], xmm12); \ - movdqa128(k_dsbd[0], xmm15); \ - movdqa128(k_dsbb[0], xmm14); \ - movdqa128(k_dsbe[0], xmm8); + movdqa128_memld(&k_s0F, xmm9); \ + movdqa128_memld(&k_inv, xmm10); \ + movdqa128_memld(&k_inva, xmm11); \ + movdqa128_memld(&k_dsb9[0], xmm13); \ + movdqa128_memld(&k_dsb9[1], xmm12); \ + movdqa128_memld(&k_dsbd[0], xmm15); \ + movdqa128_memld(&k_dsbb[0], xmm14); \ + movdqa128_memld(&k_dsbe[0], xmm8); static ASM_FUNC_ATTR_INLINE __m128i aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config, @@ -1025,13 +983,13 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config, const byte *end_keys = config.sched_keys + 16 * config.nround; unsigned int mc_pos = 1; - movdqa128(k_iptlo, xmm2); + movdqa128_memld(&k_iptlo, xmm2); movdqa128(xmm9, xmm1); pandn128(xmm0, xmm1); - psrld128(4, xmm1); + psrl_byte_128(4, xmm1); pand128(xmm9, xmm0); pshufb128(xmm0, xmm2); - movdqa128(k_ipthi, xmm0); + movdqa128_memld(&k_ipthi, xmm0); pshufb128(xmm1, xmm0); pxor128_amemld(config.sched_keys, xmm2); @@ -1044,7 +1002,7 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config, /* top of round */ movdqa128(xmm9, xmm1); /* 1 : i */ pandn128(xmm0, xmm1); /* 1 = i<<4 */ - psrld128(4, xmm1); /* 1 = i */ + psrl_byte_128(4, xmm1); /* 1 = i */ pand128(xmm9, xmm0); /* 0 = k */ movdqa128(xmm11, xmm2); /* 2 : a/k */ pshufb128(xmm0, xmm2); /* 2 = a/k */ @@ -1074,14 +1032,14 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config, pxor128(xmm4, xmm0); /* 0 = A */ movdqa128(xmm15, xmm4); /* 4 : sb2u */ pshufb128(xmm2, xmm4); /* 4 = sb2u */ - movdqa128(k_mc_forward[mc_pos], xmm1); + movdqa128_memld(&k_mc_forward[mc_pos], xmm1); movdqa128(xmm14, xmm2); /* 2 : sb2t */ pshufb128(xmm3, xmm2); /* 2 = sb2t */ pxor128(xmm4, xmm2); /* 2 = 2A */ movdqa128(xmm0, xmm3); /* 3 = A */ pshufb128(xmm1, xmm0); /* 0 = B */ pxor128(xmm2, xmm0); /* 0 = 2A+B */ - pshufb128(k_mc_backward[mc_pos], xmm3); /* 3 = D */ + pshufb128_amemld(&k_mc_backward[mc_pos], xmm3); /* 3 = D */ pxor128(xmm0, xmm3); /* 3 = 2A+B+D */ pshufb128(xmm1, xmm0); /* 0 = 2B+C */ pxor128(xmm3, xmm0); /* 0 = 2A+3B+C+D */ @@ -1091,13 +1049,13 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config, } /* middle of last round */ - movdqa128(k_sbou, xmm4); /* 3 : sbou */ + movdqa128_memld(&k_sbou, xmm4); /* 3 : sbou */ pshufb128(xmm2, xmm4); /* 4 = sbou */ pxor128_amemld(config.sched_keys, xmm4); /* 4 = sb1u + k */ - movdqa128(k_sbot, xmm0); /* 0 : sbot */ + movdqa128_memld(&k_sbot, xmm0); /* 0 : sbot */ pshufb128(xmm3, xmm0); /* 0 = sb1t */ pxor128(xmm4, xmm0); /* 0 = A */ - pshufb128(k_sr[mc_pos], xmm0); + pshufb128_amemld(&k_sr[mc_pos], xmm0); return xmm0; } @@ -1112,20 +1070,20 @@ 
aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, __m128i xmm0_a, xmm0_b; __m128i xmm1_a, xmm2_a, xmm3_a, xmm4_a; __m128i xmm1_b, xmm2_b, xmm3_b, xmm4_b; - __m128i xmm5; + __m128i xmm5, xmm6; const byte *end_keys = config.sched_keys + 16 * config.nround; unsigned int mc_pos = 1; xmm0_a = *pxmm0_a; xmm0_b = *pxmm0_b; - movdqa128(k_iptlo, xmm2_a); movdqa128(k_iptlo, xmm2_b); + movdqa128_memld(&k_iptlo, xmm2_a); movdqa128(xmm2_a, xmm2_b); movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b); pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b); - psrld128(4, xmm1_a); psrld128(4, xmm1_b); + psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b); pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b); pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b); - movdqa128(k_ipthi, xmm0_a); movdqa128(k_ipthi, xmm0_b); + movdqa128_memld(&k_ipthi, xmm0_a); movdqa128(xmm0_a, xmm0_b); pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b); movdqu128_memld(config.sched_keys, xmm5); @@ -1139,7 +1097,7 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, /* top of round */ movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b); pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b); - psrld128(4, xmm1_a); psrld128(4, xmm1_b); + psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b); pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b); movdqa128(xmm11, xmm2_a); movdqa128(xmm11, xmm2_b); pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b); @@ -1170,18 +1128,17 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b); movdqa128(xmm15, xmm4_a); movdqa128(xmm15, xmm4_b); pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b); - movdqa128(k_mc_forward[mc_pos], xmm1_a); - movdqa128(k_mc_forward[mc_pos], xmm1_b); + movdqa128_memld(&k_mc_forward[mc_pos], xmm6); movdqa128(xmm14, xmm2_a); movdqa128(xmm14, xmm2_b); pshufb128(xmm3_a, xmm2_a); pshufb128(xmm3_b, xmm2_b); pxor128(xmm4_a, xmm2_a); pxor128(xmm4_b, xmm2_b); movdqa128(xmm0_a, xmm3_a); movdqa128(xmm0_b, xmm3_b); - pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b); + pshufb128(xmm6, xmm0_a); pshufb128(xmm6, xmm0_b); pxor128(xmm2_a, xmm0_a); pxor128(xmm2_b, xmm0_b); - pshufb128(k_mc_backward[mc_pos], xmm3_a); - pshufb128(k_mc_backward[mc_pos], xmm3_b); + movdqa128_memld(&k_mc_backward[mc_pos], xmm5); + pshufb128(xmm5, xmm3_a); pshufb128(xmm5, xmm3_b); pxor128(xmm0_a, xmm3_a); pxor128(xmm0_b, xmm3_b); - pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b); + pshufb128(xmm6, xmm0_a); pshufb128(xmm6, xmm0_b); pxor128(xmm3_a, xmm0_a); pxor128(xmm3_b, xmm0_b); config.sched_keys += 16; @@ -1189,20 +1146,133 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, } /* middle of last round */ - movdqa128(k_sbou, xmm4_a); movdqa128(k_sbou, xmm4_b); + movdqa128_memld(&k_sbou, xmm4_a); movdqa128_memld(&k_sbou, xmm4_b); pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b); movdqu128_memld(config.sched_keys, xmm5); pxor128(xmm5, xmm4_a); pxor128(xmm5, xmm4_b); - movdqa128(k_sbot, xmm0_a); movdqa128(k_sbot, xmm0_b); + movdqa128_memld(&k_sbot, xmm0_a); movdqa128_memld(&k_sbot, xmm0_b); pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b); pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b); - pshufb128(k_sr[mc_pos], xmm0_a); - pshufb128(k_sr[mc_pos], xmm0_b); + movdqa128_memld(&k_sr[mc_pos], xmm5); + pshufb128(xmm5, xmm0_a); pshufb128(xmm5, xmm0_b); *pxmm0_a = xmm0_a; *pxmm0_b = xmm0_b; } +#ifdef HAVE_SIMD256 + +static ASM_FUNC_ATTR_INLINE void +aes_encrypt_core_4blks_simd256(__m256i *pymm0_a, __m256i *pymm0_b, + struct vp_aes_config_s config, + 
__m128i xmm9, __m128i xmm10, __m128i xmm11, + __m128i xmm12, __m128i xmm13, __m128i xmm14, + __m128i xmm15) +{ + __m256i ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15; + __m256i ymm0_a, ymm0_b; + __m256i ymm1_a, ymm2_a, ymm3_a, ymm4_a; + __m256i ymm1_b, ymm2_b, ymm3_b, ymm4_b; + __m256i ymm5, ymm6; + const byte *end_keys = config.sched_keys + 16 * config.nround; + unsigned int mc_pos = 1; + + broadcast128_256(xmm9, ymm9); + movdqa128_256(xmm10, ymm10); + movdqa128_256(xmm11, ymm11); + movdqa128_256(xmm12, ymm12); + movdqa128_256(xmm13, ymm13); + movdqa128_256(xmm14, ymm14); + movdqa128_256(xmm15, ymm15); + + ymm0_a = *pymm0_a; + ymm0_b = *pymm0_b; + + load_tab16_table(&k_iptlo, ymm2_a); movdqa256(ymm2_a, ymm2_b); + movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b); + pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b); + psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b); + pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b); + pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b); + load_tab16_table(&k_ipthi, ymm0_a); movdqa256(ymm0_a, ymm0_b); + + pshufb256_tab16(ymm1_a, ymm0_a); pshufb256_tab16(ymm1_b, ymm0_b); + broadcast128_256_amemld(config.sched_keys, ymm5); + pxor256(ymm5, ymm2_a); pxor256(ymm5, ymm2_b); + pxor256(ymm2_a, ymm0_a); pxor256(ymm2_b, ymm0_b); + + config.sched_keys += 16; + + while (1) + { + /* top of round */ + movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b); + pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b); + psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b); + pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b); + movdqa256(ymm11, ymm2_a); movdqa256(ymm11, ymm2_b); + pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b); + pxor256(ymm1_a, ymm0_a); pxor256(ymm1_b, ymm0_b); + movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b); + pshufb256_tab16(ymm1_a, ymm3_a); pshufb256_tab16(ymm1_b, ymm3_b); + pxor256(ymm2_a, ymm3_a); pxor256(ymm2_b, ymm3_b); + movdqa256(ymm10, ymm4_a); movdqa256(ymm10, ymm4_b); + pshufb256_tab16(ymm0_a, ymm4_a); pshufb256_tab16(ymm0_b, ymm4_b); + pxor256(ymm2_a, ymm4_a); pxor256(ymm2_b, ymm4_b); + movdqa256(ymm10, ymm2_a); movdqa256(ymm10, ymm2_b); + pshufb256_tab16(ymm3_a, ymm2_a); pshufb256_tab16(ymm3_b, ymm2_b); + pxor256(ymm0_a, ymm2_a); pxor256(ymm0_b, ymm2_b); + movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b); + pshufb256_tab16(ymm4_a, ymm3_a); pshufb256_tab16(ymm4_b, ymm3_b); + pxor256(ymm1_a, ymm3_a); pxor256(ymm1_b, ymm3_b); + + if (config.sched_keys == end_keys) + break; + + /* middle of middle round */ + movdqa256(ymm13, ymm4_a); movdqa256(ymm13, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + broadcast128_256_amemld(config.sched_keys, ymm5); + pxor256(ymm5, ymm4_a); pxor256(ymm5, ymm4_b); + movdqa256(ymm12, ymm0_a); movdqa256(ymm12, ymm0_b); + pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b); + pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b); + movdqa256(ymm15, ymm4_a); movdqa256(ymm15, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + load_tab32_mask(&k_mc_forward[mc_pos], ymm6); + movdqa256(ymm14, ymm2_a); movdqa256(ymm14, ymm2_b); + pshufb256_tab16(ymm3_a, ymm2_a); pshufb256_tab16(ymm3_b, ymm2_b); + pxor256(ymm4_a, ymm2_a); pxor256(ymm4_b, ymm2_b); + movdqa256(ymm0_a, ymm3_a); movdqa256(ymm0_b, ymm3_b); + pshufb256_tab32(ymm6, ymm0_a); pshufb256_tab32(ymm6, ymm0_b); + pxor256(ymm2_a, ymm0_a); pxor256(ymm2_b, ymm0_b); + load_tab32_mask(&k_mc_backward[mc_pos], ymm5); + pshufb256_tab32(ymm5, ymm3_a); pshufb256_tab32(ymm5, ymm3_b); + 
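        /* Same flow as in aes_encrypt_core(): at this point ymm3_* hold D
         * (mc_backward rotation) and ymm0_* hold 2A+B; the xor below plus one
         * more mc_forward rotation give the MixColumns result 2A+3B+C+D for
         * both two-block halves. */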
pxor256(ymm0_a, ymm3_a); pxor256(ymm0_b, ymm3_b); + pshufb256_tab32(ymm6, ymm0_a); pshufb256_tab32(ymm6, ymm0_b); + pxor256(ymm3_a, ymm0_a); pxor256(ymm3_b, ymm0_b); + + config.sched_keys += 16; + mc_pos = (mc_pos + 1) % 4; /* next mc mod 4 */ + } + + /* middle of last round */ + movdqa256_memld(&k_sbou, ymm4_a); movdqa256_memld(&k_sbou, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + broadcast128_256_amemld(config.sched_keys, ymm5); + pxor256(ymm5, ymm4_a); pxor256(ymm5, ymm4_b); + movdqa256_memld(&k_sbot, ymm0_a); movdqa256_memld(&k_sbot, ymm0_b); + pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b); + pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b); + load_tab32_mask(&k_sr[mc_pos], ymm5); + pshufb256_tab32(ymm5, ymm0_a); pshufb256_tab32(ymm5, ymm0_b); + + *pymm0_a = ymm0_a; + *pymm0_b = ymm0_b; +} + +#endif /* HAVE_SIMD256 */ + static ASM_FUNC_ATTR_INLINE __m128i aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, __m128i xmm9, __m128i xmm10, __m128i xmm11, __m128i xmm12, @@ -1212,17 +1282,17 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, const byte *end_keys = config.sched_keys + 16 * config.nround; unsigned int mc_pos = config.nround % 4; - movdqa128(k_dipt[0], xmm2); + movdqa128_memld(&k_dipt[0], xmm2); movdqa128(xmm9, xmm1); pandn128(xmm0, xmm1); - psrld128(4, xmm1); + psrl_byte_128(4, xmm1); pand128(xmm9, xmm0); pshufb128(xmm0, xmm2); - movdqa128(k_dipt[1], xmm0); + movdqa128_memld(&k_dipt[1], xmm0); pshufb128(xmm1, xmm0); pxor128_amemld(config.sched_keys, xmm2); pxor128(xmm2, xmm0); - movdqa128(k_mc_forward[3], xmm5); + movdqa128_memld(&k_mc_forward[3], xmm5); config.sched_keys += 16; @@ -1231,7 +1301,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, /* top of round */ movdqa128(xmm9, xmm1); /* 1 : i */ pandn128(xmm0, xmm1); /* 1 = i<<4 */ - psrld128(4, xmm1); /* 1 = i */ + psrl_byte_128(4, xmm1); /* 1 = i */ pand128(xmm9, xmm0); /* 0 = k */ movdqa128(xmm11, xmm2); /* 2 : a/k */ pshufb128(xmm0, xmm2); /* 2 = a/k */ @@ -1258,7 +1328,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, pxor128_amemld(config.sched_keys, xmm4); movdqa128(xmm12, xmm0); /* 0 : sb9t */ pshufb128(xmm3, xmm0); /* 0 = sb9t */ - movdqa128(k_dsbd[1], xmm1); /* 1 : sbdt */ + movdqa128_memld(&k_dsbd[1], xmm1); /* 1 : sbdt */ pxor128(xmm4, xmm0); /* 0 = ch */ pshufb128(xmm5, xmm0); /* MC ch */ @@ -1272,7 +1342,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, movdqa128(xmm14, xmm4); /* 4 : sbbu */ pshufb128(xmm2, xmm4); /* 4 = sbbu */ pxor128(xmm1, xmm4); /* 4 = ch */ - movdqa128(k_dsbb[1], xmm0); /* 0 : sbbt */ + movdqa128_memld(&k_dsbb[1], xmm0); /* 0 : sbbt */ pshufb128(xmm3, xmm0); /* 0 = sbbt */ pxor128(xmm4, xmm0); /* 0 = ch */ @@ -1281,7 +1351,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, pshufb128(xmm2, xmm4); /* 4 = sbeu */ pshufd128_0x93(xmm5, xmm5); pxor128(xmm0, xmm4); /* 4 = ch */ - movdqa128(k_dsbe[1], xmm0); /* 0 : sbet */ + movdqa128_memld(&k_dsbe[1], xmm0); /* 0 : sbet */ pshufb128(xmm3, xmm0); /* 0 = sbet */ pxor128(xmm4, xmm0); /* 0 = ch */ @@ -1289,13 +1359,13 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config, } /* middle of last round */ - movdqa128(k_dsbo[0], xmm4); /* 3 : sbou */ + movdqa128_memld(&k_dsbo[0], xmm4);/* 3 : sbou */ pshufb128(xmm2, xmm4); /* 4 = sbou */ pxor128_amemld(config.sched_keys, xmm4); /* 4 = sb1u + k */ - movdqa128(k_dsbo[1], xmm0); /* 0 : sbot */ + movdqa128_memld(&k_dsbo[1], xmm0);/* 0 : sbot */ pshufb128(xmm3, 
xmm0); /* 0 = sb1t */ pxor128(xmm4, xmm0); /* 0 = A */ - pshufb128(k_sr[mc_pos], xmm0); + pshufb128_amemld(&k_sr[mc_pos], xmm0); return xmm0; } @@ -1317,18 +1387,18 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, xmm0_a = *pxmm0_a; xmm0_b = *pxmm0_b; - movdqa128(k_dipt[0], xmm2_a); movdqa128(k_dipt[0], xmm2_b); + movdqa128_memld(&k_dipt[0], xmm2_a); movdqa128(xmm2_a, xmm2_b); movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b); pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b); - psrld128(4, xmm1_a); psrld128(4, xmm1_b); + psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b); pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b); pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b); - movdqa128(k_dipt[1], xmm0_a); movdqa128(k_dipt[1], xmm0_b); + movdqa128_memld(&k_dipt[1], xmm0_a); movdqa128(xmm0_a, xmm0_b); pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b); movdqu128_memld(config.sched_keys, xmm6); pxor128(xmm6, xmm2_a); pxor128(xmm6, xmm2_b); pxor128(xmm2_a, xmm0_a); pxor128(xmm2_b, xmm0_b); - movdqa128(k_mc_forward[3], xmm5); + movdqa128_memld(&k_mc_forward[3], xmm5); config.sched_keys += 16; @@ -1337,7 +1407,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, /* top of round */ movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b); pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b); - psrld128(4, xmm1_a); psrld128(4, xmm1_b); + psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b); pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b); movdqa128(xmm11, xmm2_a); movdqa128(xmm11, xmm2_b); pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b); @@ -1365,7 +1435,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, pxor128(xmm6, xmm4_a); pxor128(xmm6, xmm4_b); movdqa128(xmm12, xmm0_a); movdqa128(xmm12, xmm0_b); pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b); - movdqa128(k_dsbd[1], xmm1_a); movdqa128(k_dsbd[1], xmm1_b); + movdqa128_memld(&k_dsbd[1], xmm1_a); movdqa128(xmm1_a, xmm1_b); pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b); pshufb128(xmm5, xmm0_a); pshufb128(xmm5, xmm0_b); @@ -1379,7 +1449,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, movdqa128(xmm14, xmm4_a); movdqa128(xmm14, xmm4_b); pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b); pxor128(xmm1_a, xmm4_a); pxor128(xmm1_b, xmm4_b); - movdqa128(k_dsbb[1], xmm0_a); movdqa128(k_dsbb[1], xmm0_b); + movdqa128_memld(&k_dsbb[1], xmm0_a); movdqa128(xmm0_a, xmm0_b); pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b); pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b); @@ -1388,7 +1458,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b); pshufd128_0x93(xmm5, xmm5); pxor128(xmm0_a, xmm4_a); pxor128(xmm0_b, xmm4_b); - movdqa128(k_dsbe[1], xmm0_a); movdqa128(k_dsbe[1], xmm0_b); + movdqa128_memld(&k_dsbe[1], xmm0_a); movdqa128(xmm0_a, xmm0_b); pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b); pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b); @@ -1396,20 +1466,144 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b, } /* middle of last round */ - movdqa128(k_dsbo[0], xmm4_a); movdqa128(k_dsbo[0], xmm4_b); + movdqa128_memld(&k_dsbo[0], xmm4_a); movdqa128(xmm4_a, xmm4_b); pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b); movdqu128_memld(config.sched_keys, xmm6); pxor128(xmm6, xmm4_a); pxor128(xmm6, xmm4_b); - movdqa128(k_dsbo[1], xmm0_a); movdqa128(k_dsbo[1], xmm0_b); + movdqa128_memld(&k_dsbo[1], xmm0_a); movdqa128(xmm0_a, xmm0_b); pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b); pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, 
xmm0_b); - pshufb128(k_sr[mc_pos], xmm0_a); - pshufb128(k_sr[mc_pos], xmm0_b); + movdqa128_memld(&k_sr[mc_pos], xmm5); + pshufb128(xmm5, xmm0_a); pshufb128(xmm5, xmm0_b); *pxmm0_a = xmm0_a; *pxmm0_b = xmm0_b; } +#ifdef HAVE_SIMD256 + +static ASM_FUNC_ATTR_INLINE void +aes_decrypt_core_4blks_simd256(__m256i *pymm0_a, __m256i *pymm0_b, + struct vp_aes_config_s config, + __m128i xmm9, __m128i xmm10, __m128i xmm11, + __m128i xmm12, __m128i xmm13, __m128i xmm14, + __m128i xmm15, __m128i xmm8) +{ + __m256i ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm8; + __m256i ymm0_a, ymm0_b; + __m256i ymm1_a, ymm2_a, ymm3_a, ymm4_a; + __m256i ymm1_b, ymm2_b, ymm3_b, ymm4_b; + __m256i ymm5, ymm6; + const byte *end_keys = config.sched_keys + 16 * config.nround; + unsigned int mc_pos = config.nround % 4; + + broadcast128_256(xmm9, ymm9); + movdqa128_256(xmm10, ymm10); + movdqa128_256(xmm11, ymm11); + movdqa128_256(xmm12, ymm12); + movdqa128_256(xmm13, ymm13); + movdqa128_256(xmm14, ymm14); + movdqa128_256(xmm15, ymm15); + movdqa128_256(xmm8, ymm8); + + ymm0_a = *pymm0_a; + ymm0_b = *pymm0_b; + + load_tab16_table(&k_dipt[0], ymm2_a); movdqa256(ymm2_a, ymm2_b); + movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b); + pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b); + psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b); + pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b); + pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b); + load_tab16_table(&k_dipt[1], ymm0_a); movdqa256(ymm0_a, ymm0_b); + pshufb256_tab16(ymm1_a, ymm0_a); pshufb256_tab16(ymm1_b, ymm0_b); + broadcast128_256_amemld(config.sched_keys, ymm6); + pxor256(ymm6, ymm2_a); pxor256(ymm6, ymm2_b); + pxor256(ymm2_a, ymm0_a); pxor256(ymm2_b, ymm0_b); + load_tab32_mask(&k_mc_forward[3], ymm5); + + config.sched_keys += 16; + + while (1) + { + /* top of round */ + movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b); + pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b); + psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b); + pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b); + movdqa256(ymm11, ymm2_a); movdqa256(ymm11, ymm2_b); + pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b); + pxor256(ymm1_a, ymm0_a); pxor256(ymm1_b, ymm0_b); + movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b); + pshufb256_tab16(ymm1_a, ymm3_a); pshufb256_tab16(ymm1_b, ymm3_b); + pxor256(ymm2_a, ymm3_a); pxor256(ymm2_b, ymm3_b); + movdqa256(ymm10, ymm4_a); movdqa256(ymm10, ymm4_b); + pshufb256_tab16(ymm0_a, ymm4_a); pshufb256_tab16(ymm0_b, ymm4_b); + pxor256(ymm2_a, ymm4_a); pxor256(ymm2_b, ymm4_b); + movdqa256(ymm10, ymm2_a); movdqa256(ymm10, ymm2_b); + pshufb256_tab16(ymm3_a, ymm2_a); pshufb256_tab16(ymm3_b, ymm2_b); + pxor256(ymm0_a, ymm2_a); pxor256(ymm0_b, ymm2_b); + movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b); + pshufb256_tab16(ymm4_a, ymm3_a); pshufb256_tab16(ymm4_b, ymm3_b); + pxor256(ymm1_a, ymm3_a); pxor256(ymm1_b, ymm3_b); + + if (config.sched_keys == end_keys) + break; + + /* Inverse mix columns */ + movdqa256(ymm13, ymm4_a); movdqa256(ymm13, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + broadcast128_256_amemld(config.sched_keys, ymm6); + pxor256(ymm6, ymm4_a); pxor256(ymm6, ymm4_b); + movdqa256(ymm12, ymm0_a); movdqa256(ymm12, ymm0_b); + pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b); + load_tab16_table(&k_dsbd[1], ymm1_a); movdqa256(ymm1_a, ymm1_b); + pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b); + + pshufb256_tab32(ymm5, ymm0_a); pshufb256_tab32(ymm5, ymm0_b); + movdqa256(ymm15, ymm4_a); 
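        /* Inverse MixColumns, as in aes_decrypt_core(): ymm5 (k_mc_forward[3])
         * keeps rotating the accumulator between the dsb9/dsbd/dsbb/dsbe
         * table lookups, and is itself rotated once per round with
         * pshufd256_0x93(). */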
movdqa256(ymm15, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + pxor256(ymm0_a, ymm4_a); pxor256(ymm0_b, ymm4_b); + pshufb256_tab16(ymm3_a, ymm1_a); pshufb256_tab16(ymm3_b, ymm1_b); + pxor256(ymm4_a, ymm1_a); pxor256(ymm4_b, ymm1_b); + + pshufb256_tab32(ymm5, ymm1_a); pshufb256_tab32(ymm5, ymm1_b); + movdqa256(ymm14, ymm4_a); movdqa256(ymm14, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + pxor256(ymm1_a, ymm4_a); pxor256(ymm1_b, ymm4_b); + load_tab16_table(&k_dsbb[1], ymm0_a); movdqa256(ymm0_a, ymm0_b); + pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b); + pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b); + + pshufb256_tab32(ymm5, ymm0_a); pshufb256_tab32(ymm5, ymm0_b); + movdqa256(ymm8, ymm4_a); movdqa256(ymm8, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + pshufd256_0x93(ymm5, ymm5); + pxor256(ymm0_a, ymm4_a); pxor256(ymm0_b, ymm4_b); + load_tab16_table(&k_dsbe[1], ymm0_a); movdqa256(ymm0_a, ymm0_b); + pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b); + pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b); + + config.sched_keys += 16; + } + + /* middle of last round */ + load_tab16_table(&k_dsbo[0], ymm4_a); movdqa256(ymm4_a, ymm4_b); + pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b); + broadcast128_256_amemld(config.sched_keys, ymm6); + pxor256(ymm6, ymm4_a); pxor256(ymm6, ymm4_b); + load_tab16_table(&k_dsbo[1], ymm0_a); movdqa256(ymm0_a, ymm0_b); + pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b); + pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b); + load_tab32_mask(&k_sr[mc_pos], ymm5); + pshufb256_tab16(ymm5, ymm0_a); pshufb256_tab16(ymm5, ymm0_b); + + *pymm0_a = ymm0_a; + *pymm0_b = ymm0_b; +} + +#endif /* HAVE_SIMD256 */ + ASM_FUNC_ATTR_NOINLINE unsigned int FUNC_ENCRYPT (const RIJNDAEL_context *ctx, unsigned char *dst, const unsigned char *src) @@ -1534,12 +1728,12 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr, { __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7, xmm8; __m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static const __m128i be_mask = + static const __m128i_const be_mask = M128I_BYTE(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - static const __m128i bigendian_add = + static const __m128i_const bigendian_add = M128I_BYTE(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - static const __m128i carry_add = M128I_U64(1, 1); - static const __m128i nocarry_add = M128I_U64(1, 0); + static const __m128i_const carry_add = M128I_U64(1, 1); + static const __m128i_const nocarry_add = M128I_U64(1, 0); u64 ctrlow = buf_get_be64(ctr + 8); struct vp_aes_config_s config; @@ -1548,9 +1742,77 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr, enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - movdqa128(bigendian_add, xmm8); /* Preload byte add */ + movdqa128_memld(&bigendian_add, xmm8); /* Preload byte add */ movdqu128_memld(ctr, xmm7); /* Preload CTR */ - movdqa128(be_mask, xmm6); /* Preload mask */ + movdqa128_memld(&be_mask, xmm6); /* Preload mask */ + +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1, ymm2, ymm3; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqa128_256(xmm7, ymm0); + + /* detect if 8-bit carry handling is needed */ + if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3)) + { + static const __m128i_const *adders[5][4] = + { + { &nocarry_add, &nocarry_add, &nocarry_add, &carry_add }, + { &nocarry_add, &nocarry_add, &carry_add, &nocarry_add }, + { 
&nocarry_add, &carry_add, &nocarry_add, &nocarry_add }, + { &carry_add, &nocarry_add, &nocarry_add, &nocarry_add }, + { &nocarry_add, &nocarry_add, &nocarry_add, &nocarry_add } + }; + unsigned int idx = ctrlow <= 3 ? ctrlow : 4; + + pshufb128(xmm6, xmm7); + + paddq128_amemld(adders[idx][0], xmm7); + movdqa128(xmm7, xmm2); + pshufb128(xmm6, xmm2); + insert256_hi128(xmm2, ymm0); + paddq128_amemld(adders[idx][1], xmm7); + movdqa128(xmm7, xmm2); + pshufb128(xmm6, xmm2); + movdqa128_256(xmm2, ymm1); + paddq128_amemld(adders[idx][2], xmm7); + movdqa128(xmm7, xmm2); + pshufb128(xmm6, xmm2); + insert256_hi128(xmm2, ymm1); + paddq128_amemld(adders[idx][3], xmm7); + + pshufb128(xmm6, xmm7); + } + else + { + paddb128(xmm8, xmm7); + insert256_hi128(xmm7, ymm0); + paddb128(xmm8, xmm7); + movdqa128_256(xmm7, ymm1); + paddb128(xmm8, xmm7); + insert256_hi128(xmm7, ymm1); + paddb128(xmm8, xmm7); + } + + aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, + xmm15); + + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3); + pxor256(ymm2, ymm0); + pxor256(ymm3, ymm1); + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + outbuf += 4 * BLOCKSIZE; + inbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ for (; nblocks >= 2; nblocks -= 2) { @@ -1564,24 +1826,24 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr, /* detect if 64-bit carry handling is needed */ if (UNLIKELY(ctrlow == 1)) { - paddq128(carry_add, xmm7); + paddq128_amemld(&carry_add, xmm7); movdqa128(xmm7, xmm1); pshufb128(xmm6, xmm1); - paddq128(nocarry_add, xmm7); + paddq128_amemld(&nocarry_add, xmm7); } else if (UNLIKELY(ctrlow == 0)) { - paddq128(nocarry_add, xmm7); + paddq128_amemld(&nocarry_add, xmm7); movdqa128(xmm7, xmm1); pshufb128(xmm6, xmm1); - paddq128(carry_add, xmm7); + paddq128_amemld(&carry_add, xmm7); } else { - paddq128(nocarry_add, xmm7); + paddq128_amemld(&nocarry_add, xmm7); movdqa128(xmm7, xmm1); pshufb128(xmm6, xmm1); - paddq128(nocarry_add, xmm7); + paddq128_amemld(&nocarry_add, xmm7); } pshufb128(xmm6, xmm7); @@ -1617,7 +1879,7 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr, pshufb128(xmm6, xmm7); /* detect if 64-bit carry handling is needed */ - paddq128(UNLIKELY(ctrlow == 0) ? carry_add : nocarry_add, xmm7); + paddq128_amemld(UNLIKELY(ctrlow == 0) ? 
&carry_add : &nocarry_add, xmm7); pshufb128(xmm6, xmm7); } @@ -1649,8 +1911,8 @@ FUNC_CTR32LE_ENC (RIJNDAEL_context *ctx, unsigned char *ctr, { __m128i xmm0, xmm1, xmm2, xmm3, xmm7, xmm8; __m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static const __m128i add_one = M128I_U64(1, 0); - static const __m128i add_two = M128I_U64(2, 0); + static const __m128i_const add_one = M128I_U64(1, 0); + static const __m128i_const add_two = M128I_U64(2, 0); struct vp_aes_config_s config; config.nround = ctx->rounds; @@ -1658,15 +1920,53 @@ FUNC_CTR32LE_ENC (RIJNDAEL_context *ctx, unsigned char *ctr, enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); - movdqa128(add_one, xmm8); /* Preload byte add */ + movdqa128_memld(&add_one, xmm8); /* Preload byte add */ movdqu128_memld(ctr, xmm7); /* Preload CTR */ +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1, ymm2, ymm3; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqa128(xmm7, xmm0); + movdqa128(xmm7, xmm1); + paddd128(xmm8, xmm1); + paddd128_amemld(&add_two, xmm7); + movdqa128_256(xmm0, ymm0); + insert256_hi128(xmm1, ymm0); + + movdqa128(xmm7, xmm1); + movdqa128(xmm7, xmm2); + paddd128(xmm8, xmm2); + paddd128_amemld(&add_two, xmm7); + movdqa128_256(xmm1, ymm1); + insert256_hi128(xmm2, ymm1); + + aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15); + + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3); + pxor256(ymm2, ymm0); + pxor256(ymm3, ymm1); + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + outbuf += 4 * BLOCKSIZE; + inbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqa128(xmm7, xmm0); movdqa128(xmm7, xmm1); paddd128(xmm8, xmm1); - paddd128(add_two, xmm7); + paddd128_amemld(&add_two, xmm7); aes_encrypt_core_2blks(&xmm0, &xmm1, config, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); @@ -1719,6 +2019,36 @@ FUNC_CFB_DEC (RIJNDAEL_context *ctx, unsigned char *iv, movdqu128_memld(iv, xmm0); +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm6, ymm1, ymm2, ymm3; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqa128_256(xmm0, ymm6); + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2); + movdqa256_128(ymm2, xmm2); + insert256_hi128(xmm2, ymm6); + movdqu256_memld(inbuf + 1 * BLOCKSIZE, ymm1); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3); + extract256_hi128(ymm3, xmm0); + + aes_encrypt_core_4blks_simd256(&ymm6, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15); + + pxor256(ymm2, ymm6); + pxor256(ymm3, ymm1); + movdqu256_memst(ymm6, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + outbuf += 4 * BLOCKSIZE; + inbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqa128(xmm0, xmm1); @@ -1779,6 +2109,36 @@ FUNC_CBC_DEC (RIJNDAEL_context *ctx, unsigned char *iv, movdqu128_memld(iv, xmm7); +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1, ymm2, ymm3; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + movdqa256_128(ymm0, xmm0); + movdqa128_256(xmm7, ymm2); + insert256_hi128(xmm0, ymm2); + movdqu256_memld(inbuf + 1 * BLOCKSIZE, ymm3); + extract256_hi128(ymm1, xmm7); + + aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15, xmm8); + + pxor256(ymm2, ymm0); + 
pxor256(ymm3, ymm1); + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + outbuf += 4 * BLOCKSIZE; + inbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqu128_memld(inbuf, xmm0); @@ -1843,6 +2203,68 @@ aes_simd128_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, movdqu128_memld(c->u_iv.iv, xmm7); movdqu128_memld(c->u_ctr.ctr, xmm6); +#ifdef HAVE_SIMD256 + if (check_simd256_support() && nblocks >= 4) + { + __m256i ymm0, ymm1, ymm3, ymm6, ymm8; + + movdqa128_256(xmm6, ymm6); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + movdqa128_256(xmm7, ymm3); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + insert256_hi128(xmm7, ymm3); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + movdqa128_256(xmm7, ymm8); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + insert256_hi128(xmm7, ymm8); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + pxor256(ymm0, ymm6); + pxor256(ymm1, ymm6); + + pxor256(ymm3, ymm0); + pxor256(ymm8, ymm1); + + aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15); + + pxor256(ymm3, ymm0); + pxor256(ymm8, ymm1); + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + extract256_hi128(ymm6, xmm0); + movdqa256_128(ymm6, xmm6); + pxor128(xmm0, xmm6); + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { const unsigned char *l; @@ -1942,6 +2364,69 @@ aes_simd128_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, movdqu128_memld(c->u_iv.iv, xmm7); movdqu128_memld(c->u_ctr.ctr, xmm6); +#ifdef HAVE_SIMD256 + if (check_simd256_support() && nblocks >= 4) + { + __m256i ymm0, ymm1, ymm3, ymm6, ymm8; + + movdqa128_256(xmm6, ymm6); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + movdqa128_256(xmm7, ymm3); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + insert256_hi128(xmm7, ymm3); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + movdqa128_256(xmm7, ymm8); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + insert256_hi128(xmm7, ymm8); + + pxor256(ymm3, ymm0); + pxor256(ymm8, ymm1); + + aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15, xmm8); + + pxor256(ymm3, ymm0); + pxor256(ymm8, ymm1); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + pxor256(ymm0, ymm6); + pxor256(ymm1, ymm6); + + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + extract256_hi128(ymm6, xmm0); + movdqa256_128(ymm6, xmm6); + pxor128(xmm0, xmm6); + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { const unsigned char 
*l; @@ -2044,6 +2529,61 @@ FUNC_OCB_AUTH(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) movdqu128_memld(c->u_mode.ocb.aad_offset, xmm7); movdqu128_memld(c->u_mode.ocb.aad_sum, xmm6); +#ifdef HAVE_SIMD256 + if (check_simd256_support() && nblocks >= 4) + { + __m256i ymm0, ymm1, ymm3, ymm6, ymm8; + + movdqa128_256(xmm6, ymm6); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + + movdqu256_memld(abuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(abuf + 2 * BLOCKSIZE, ymm1); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + movdqa128_256(xmm7, ymm3); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + insert256_hi128(xmm7, ymm3); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + movdqa128_256(xmm7, ymm8); + + l = ocb_get_l(c, ++n); + movdqu128_memld(l, xmm2); + pxor128(xmm2, xmm7); + insert256_hi128(xmm7, ymm8); + + pxor256(ymm3, ymm0); + pxor256(ymm8, ymm1); + + aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15); + + pxor256(ymm0, ymm6); + pxor256(ymm1, ymm6); + + abuf += 4 * BLOCKSIZE; + } + + extract256_hi128(ymm6, xmm0); + movdqa256_128(ymm6, xmm6); + pxor128(xmm0, xmm6); + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { const unsigned char *l; @@ -2117,6 +2657,29 @@ aes_simd128_ecb_enc (void *context, void *outbuf_arg, const void *inbuf_arg, enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + + aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15); + + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqu128_memld(inbuf + 0 * BLOCKSIZE, xmm0); @@ -2171,6 +2734,29 @@ aes_simd128_ecb_dec (void *context, void *outbuf_arg, const void *inbuf_arg, dec_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm8); +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + + aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15, xmm8); + + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqu128_memld(inbuf + 0 * BLOCKSIZE, xmm0); @@ -2216,13 +2802,13 @@ FUNC_ECB_CRYPT (void *context, void *outbuf_arg, const void *inbuf_arg, static ASM_FUNC_ATTR_INLINE __m128i xts_gfmul_byA (__m128i xmm5) { - static const __m128i xts_gfmul_const = M128I_U64(0x87, 0x01); + static const __m128i_const xts_gfmul_const = M128I_U64(0x87, 0x01); __m128i xmm1; pshufd128_0x4E(xmm5, xmm1); psraq128(63, xmm1); paddq128(xmm5, xmm5); - pand128(xts_gfmul_const, xmm1); + pand128_amemld(&xts_gfmul_const, xmm1); pxor128(xmm1, xmm5); return xmm5; @@ -2246,6 +2832,43 @@ aes_simd128_xts_enc (void *context, unsigned 
char *tweak, void *outbuf_arg, movdqu128_memld(tweak, xmm7); /* Preload tweak */ +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1, ymm2, ymm3; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + + movdqa128_256(xmm7, ymm2); + xmm7 = xts_gfmul_byA(xmm7); + insert256_hi128(xmm7, ymm2); + xmm7 = xts_gfmul_byA(xmm7); + movdqa128_256(xmm7, ymm3); + xmm7 = xts_gfmul_byA(xmm7); + insert256_hi128(xmm7, ymm3); + xmm7 = xts_gfmul_byA(xmm7); + + pxor256(ymm2, ymm0); + pxor256(ymm3, ymm1); + + aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15); + + pxor256(ymm2, ymm0); + pxor256(ymm3, ymm1); + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + outbuf += 4 * BLOCKSIZE; + inbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqu128_memld(inbuf, xmm0); @@ -2315,6 +2938,43 @@ aes_simd128_xts_dec (void *context, unsigned char *tweak, void *outbuf_arg, movdqu128_memld(tweak, xmm7); /* Preload tweak */ +#ifdef HAVE_SIMD256 + if (check_simd256_support()) + { + __m256i ymm0, ymm1, ymm2, ymm3; + + for (; nblocks >= 4; nblocks -= 4) + { + movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0); + movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1); + + movdqa128_256(xmm7, ymm2); + xmm7 = xts_gfmul_byA(xmm7); + insert256_hi128(xmm7, ymm2); + xmm7 = xts_gfmul_byA(xmm7); + movdqa128_256(xmm7, ymm3); + xmm7 = xts_gfmul_byA(xmm7); + insert256_hi128(xmm7, ymm3); + xmm7 = xts_gfmul_byA(xmm7); + + pxor256(ymm2, ymm0); + pxor256(ymm3, ymm1); + + aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config, + xmm9, xmm10, xmm11, xmm12, xmm13, + xmm14, xmm15, xmm8); + + pxor256(ymm2, ymm0); + pxor256(ymm3, ymm1); + movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE); + movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE); + + outbuf += 4 * BLOCKSIZE; + inbuf += 4 * BLOCKSIZE; + } + } +#endif /* HAVE_SIMD256 */ + for (; nblocks >= 2; nblocks -= 2) { movdqu128_memld(inbuf, xmm0); diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 12c27319..0c48793b 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -224,6 +224,62 @@ extern void _gcry_aes_vp_aarch64_xts_crypt (void *context, unsigned char *tweak, size_t nblocks, int encrypt); #endif +#ifdef USE_VP_RISCV +/* RISC-V vector permutation implementation of AES */ +extern int _gcry_aes_vp_riscv_setup_acceleration(RIJNDAEL_context *ctx); + +extern void _gcry_aes_vp_riscv_do_setkey(RIJNDAEL_context *ctx, + const byte *key); +extern void _gcry_aes_vp_riscv_prepare_decryption(RIJNDAEL_context *ctx); + +extern unsigned int _gcry_aes_vp_riscv_encrypt (const RIJNDAEL_context *ctx, + unsigned char *dst, + const unsigned char *src); +extern unsigned int _gcry_aes_vp_riscv_decrypt (const RIJNDAEL_context *ctx, + unsigned char *dst, + const unsigned char *src); +extern void _gcry_aes_vp_riscv_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_vp_riscv_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + int cbc_mac); +extern void _gcry_aes_vp_riscv_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_vp_riscv_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); +extern void 
_gcry_aes_vp_riscv_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); +extern void _gcry_aes_vp_riscv_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); +extern size_t _gcry_aes_vp_riscv_ocb_crypt (gcry_cipher_hd_t c, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, + int encrypt); +extern size_t _gcry_aes_vp_riscv_ocb_auth (gcry_cipher_hd_t c, + const void *abuf_arg, + size_t nblocks); +extern void _gcry_aes_vp_riscv_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, int encrypt); +extern void _gcry_aes_vp_riscv_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks, int encrypt); +#endif + #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, @@ -718,6 +774,30 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->xts_crypt = _gcry_aes_vp_aarch64_xts_crypt; } #endif +#ifdef USE_VP_RISCV + else if ((hwfeatures & HWF_RISCV_IMAFDC) && (hwfeatures & HWF_RISCV_V) && + _gcry_aes_vp_riscv_setup_acceleration(ctx)) + { + hw_setkey = _gcry_aes_vp_riscv_do_setkey; + ctx->encrypt_fn = _gcry_aes_vp_riscv_encrypt; + ctx->decrypt_fn = _gcry_aes_vp_riscv_decrypt; + ctx->prefetch_enc_fn = NULL; + ctx->prefetch_dec_fn = NULL; + ctx->prepare_decryption = _gcry_aes_vp_riscv_prepare_decryption; + + /* Setup vector permute AArch64 bulk encryption routines. */ + bulk_ops->cfb_enc = _gcry_aes_vp_riscv_cfb_enc; + bulk_ops->cfb_dec = _gcry_aes_vp_riscv_cfb_dec; + bulk_ops->cbc_enc = _gcry_aes_vp_riscv_cbc_enc; + bulk_ops->cbc_dec = _gcry_aes_vp_riscv_cbc_dec; + bulk_ops->ctr_enc = _gcry_aes_vp_riscv_ctr_enc; + bulk_ops->ctr32le_enc = _gcry_aes_vp_riscv_ctr32le_enc; + bulk_ops->ocb_crypt = _gcry_aes_vp_riscv_ocb_crypt; + bulk_ops->ocb_auth = _gcry_aes_vp_riscv_ocb_auth; + bulk_ops->ecb_crypt = _gcry_aes_vp_riscv_ecb_crypt; + bulk_ops->xts_crypt = _gcry_aes_vp_riscv_xts_crypt; + } +#endif #ifdef USE_PPC_CRYPTO_WITH_PPC9LE else if ((hwfeatures & HWF_PPC_VCRYPTO) && (hwfeatures & HWF_PPC_ARCH_3_00)) { diff --git a/cipher/simd-common-riscv.h b/cipher/simd-common-riscv.h new file mode 100644 index 00000000..8381000f --- /dev/null +++ b/cipher/simd-common-riscv.h @@ -0,0 +1,48 @@ +/* simd-common-riscv.h - Common macros for RISC-V vector code + * + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifndef GCRY_SIMD_COMMON_RISCV_H +#define GCRY_SIMD_COMMON_RISCV_H + +#include + +#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") + +#define clear_vec_regs() __asm__ volatile("vsetvli zero, %0, e8, m1, ta, ma;\n" \ + "vmv.v.i v0, 0;\n" \ + "vmv.v.i v1, 0;\n" \ + "vmv2r.v v2, v0;\n" \ + "vmv4r.v v4, v0;\n" \ + "vmv8r.v v8, v0;\n" \ + "vmv8r.v v16, v0;\n" \ + "vmv8r.v v24, v0;\n" \ + : \ + : "r" (~0) \ + : "memory", "vl", "vtype", \ + "v0", "v1", "v2", "v3", \ + "v4", "v5", "v6", "v7", \ + "v8", "v9", "v10", "v11", \ + "v12", "v13", "v14", "v15", \ + "v16", "v17", "v18", "v19", \ + "v20", "v21", "v22", "v23", \ + "v24", "v25", "v26", "v27", \ + "v28", "v29", "v30", "v31") + +#endif /* GCRY_SIMD_COMMON_RISCV_H */ diff --git a/configure.ac b/configure.ac index f20d654d..55d15fa3 100644 --- a/configure.ac +++ b/configure.ac @@ -2705,6 +2705,120 @@ if test "$gcry_cv_gcc_inline_asm_riscv_v" = "yes" ; then fi +# +# Check whether compiler supports RISC-V vector intrinsics +# +AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics], + [gcry_cv_cc_riscv_vector_intrinsics], + [if test "$mpi_cpu_arch" != "riscv64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_cc_riscv_vector_intrinsics="n/a" + else + gcry_cv_cc_riscv_vector_intrinsics=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000) + #error __riscv_v_intrinsic not defined or too old version + #endif + #include + typedef vuint8m1_t __m128i; + #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a)) + #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) + #define paddq128(a, o) (o = cast_u64_to_m128i( \ + __riscv_vadd_vv_u64m1( \ + cast_m128i_to_u64(o), \ + cast_m128i_to_u64(a), 2))) + #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16)) + #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") + #define clear_vec_reg_v0() \ + __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \ + "vmv.v.x v0, zero;\n" \ + ::: "memory", "vtype", "vl", "v0") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + paddq128(a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + pshufb128(in, x); + memory_barrier_with_vec(in); + clear_vec_reg_v0(); + return in; + } + ]])], + [gcry_cv_cc_riscv_vector_intrinsics=yes]) + fi]) +if test "$gcry_cv_cc_riscv_vector_intrinsics" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1, + [Defined if underlying compiler supports RISC-V vector intrinsics]) +fi + +_gcc_cflags_save=$CFLAGS +CFLAGS="$CFLAGS -O2 -march=rv64imafdcv -mstrict-align" + +if test "$gcry_cv_cc_riscv_vector_intrinsics" = "no" && + test "$mpi_cpu_arch" = "riscv64" && + test "$try_asm_modules" = "yes" ; then + AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics with extra GCC flags], + [gcry_cv_cc_riscv_vector_intrinsics_cflags], + [gcry_cv_cc_riscv_vector_intrinsics_cflags=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000) + #error __riscv_v_intrinsic not defined or too old version + #endif + #include + typedef vuint8m1_t __m128i; + #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a)) + #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) + #define paddq128(a, o) (o = cast_u64_to_m128i( \ + __riscv_vadd_vv_u64m1( \ + cast_m128i_to_u64(o), \ + cast_m128i_to_u64(a), 
2))) + #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16)) + #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") + #define clear_vec_reg_v0() \ + __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \ + "vmv.v.x v0, zero;\n" \ + ::: "memory", "vl", "v0") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + paddq128(a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + pshufb128(in, x); + memory_barrier_with_vec(in); + clear_vec_reg_v0(); + return in; + } + ]])], + [gcry_cv_cc_riscv_vector_intrinsics_cflags=yes])]) + if test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1, + [Defined if underlying compiler supports RISC-V vector intrinsics]) + AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS_WITH_CFLAGS,1, + [Defined if underlying compiler supports RISC-V vector intrinsics with extra GCC flags]) + fi +fi + +AM_CONDITIONAL(ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS, + test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes") + +# Restore flags. +CFLAGS=$_gcc_cflags_save; + + ####################################### #### Checks for library functions. #### ####################################### @@ -3183,6 +3297,10 @@ if test "$found" = "1" ; then # Build with the crypto extension implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo" ;; + riscv64-*-*) + # Build with the vector permute SIMD128 implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vp-riscv.lo" + ;; s390x-*-*) # Big-Endian. # Build with the crypto extension implementation -- 2.45.2 From lucas.mulling at suse.com Wed Jan 8 13:31:57 2025 From: lucas.mulling at suse.com (Lucas Mulling) Date: Wed, 8 Jan 2025 09:31:57 -0300 Subject: FIPS SLI revamp: __thread check is sensitive to compiler flags Message-ID: Hi, We were backporting FIPS SLI changes to 1.10.3, in our target environment `-Wall` and -`Werror=unused-variable` are set, making the check for `__thread` fail. FYI if anyone else has this problem, fix attached. Thanks, Lucas M?lling -------------- next part -------------- An HTML attachment was scrubbed... URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: fix-__thread-check.patch Type: text/x-patch Size: 668 bytes Desc: not available URL: From gniibe at fsij.org Thu Jan 9 02:32:20 2025 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 09 Jan 2025 10:32:20 +0900 Subject: FIPS SLI revamp: __thread check is sensitive to compiler flags In-Reply-To: References: Message-ID: <87r05csrff.fsf@akagi.fsij.org> Lucas Mulling wrote: > We were backporting FIPS SLI changes to 1.10.3, in our target environment > `-Wall` and -`Werror=unused-variable` are set, making the check for > `__thread` fail. FYI if anyone else has this problem, fix attached. Thank you for your feedback. Fixed in master by the commit: 42e8858566e32080aaf818b168f34c698a9ef084 -- From josch at mister-muffin.de Tue Jan 14 16:09:36 2025 From: josch at mister-muffin.de (Johannes Schauer Marin Rodrigues) Date: Tue, 14 Jan 2025 16:09:36 +0100 Subject: Python binding: import gpg -> ImportError: cannot import name 'gpgme' from partially initialized module 'gpg' Message-ID: <173686737613.15909.5406636995937562954@localhost> Hi, steps to reproduce: python3 -m venv .venv . 
.venv/bin/activate python3 -m pip install --upgrade pip python3 -m pip install gpg python3 -c "import gpg" This will output: Successfully built gpg Installing collected packages: gpg Successfully installed gpg-1.10.0 Traceback (most recent call last): File "", line 1, in File "/.venv/lib/python3.10/site-packages/gpg/__init__.py", line 101, in from . import core File "/.venv/lib/python3.10/site-packages/gpg/core.py", line 34, in from . import gpgme ImportError: cannot import name 'gpgme' from partially initialized module 'gpg' (most likely due to a circular import) (/.venv/lib/python3.10/site-packages/gpg/__init__.py) This is on Ubuntu Jammy. You can reproduce this in a clean chroot environment with just python3-venv build-essential and libgpgme-dev installed on top of the default package set. I hope this is the right list for this issue and I hope maybe somebody can shed some light on this situation. :) Thanks! cheers, josch -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 833 bytes Desc: signature URL: From gniibe at fsij.org Thu Jan 16 07:33:14 2025 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 16 Jan 2025 15:33:14 +0900 Subject: [PATCH] Remove WindowsCE support. Message-ID: <87r053nu8l.fsf@akagi.fsij.org> Hello, For GnuPG and other libraries for GnuPG, WindowsCE support has been removed. This is a patch for libgcrypt. Tested (make check) on amd64 and Windows. -- -------------- next part -------------- A non-text attachment was scrubbed... Name: 0001-Remove-WindowsCE-support.patch Type: text/x-diff Size: 17788 bytes Desc: not available URL: From lucas.mulling at suse.com Fri Jan 24 14:19:31 2025 From: lucas.mulling at suse.com (Lucas Mulling) Date: Fri, 24 Jan 2025 10:19:31 -0300 Subject: [PATCH] cipher: Check and mark non-compliant cipher modes in the SLI Message-ID: Hello, Attached patch. Best, Lucas M?lling -------------- next part -------------- An HTML attachment was scrubbed... URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: 0001-cipher-Check-and-mark-non-compliant-cipher-modes-in-the-SLI.patch Type: text/x-patch Size: 7822 bytes Desc: not available URL: From gniibe at fsij.org Mon Jan 27 01:09:33 2025 From: gniibe at fsij.org (NIIBE Yutaka) Date: Mon, 27 Jan 2025 09:09:33 +0900 Subject: [PATCH] cipher: Check and mark non-compliant cipher modes in the SLI In-Reply-To: References: Message-ID: <87r04pqfr6.fsf@akagi.fsij.org> Hello, Lucas Mulling wrote: > Attached patch. Thank you. Applied and pushed to master under the ticket 7338. -- From wk at gnupg.org Tue Jan 28 17:40:17 2025 From: wk at gnupg.org (Werner Koch) Date: Tue, 28 Jan 2025 17:40:17 +0100 Subject: [PATCH] cipher: Check and mark non-compliant cipher modes in the SLI In-Reply-To: (Lucas Mulling via Gcrypt-devel's message of "Fri, 24 Jan 2025 10:19:31 -0300") References: Message-ID: <87lduunb7y.fsf@jacob.g10code.de> Hi! On Fri, 24 Jan 2025 10:19, Lucas Mulling said: > +int > +_gcry_cipher_is_mode_fips_compliant(int mode) Given that this function returns an error code it should also be declared as to do this. However, the name of the function indicates that this returns a boolean status and one would expect true for FIPS comliance. But the logic is invers. This is fine but the function should then for example be named _gcry_cipher_mode_fips_compliance. MODE is not an int but enum gcry_cipher_modes and thus it is better to use that. 
Also put all modes into the switch so that the compiler can check its completeness and we do not miss to check whether new modes may be FIPS compliant. > @@ -1988,6 +1988,7 @@ char *gcry_get_config (int mode, const char *what); > #define GCRY_FIPS_FLAG_REJECT_PK (1 << 5) > #define GCRY_FIPS_FLAG_REJECT_PK_MD (1 << 6) > #define GCRY_FIPS_FLAG_REJECT_PK_GOST_SM2 (1 << 7) > +#define GCRY_FIPS_FLAG_REJECT_CIPHER_MODE (1 << 8) Do we already have a documentation for these new constants? In any case it should be put into the NEWS file. Shalom-Salam, Werner -- The pioneers of a warless world are the youth that refuse military service. - A. Einstein -------------- next part -------------- A non-text attachment was scrubbed... Name: openpgp-digital-signature.asc Type: application/pgp-signature Size: 247 bytes Desc: not available URL: From lucas.mulling at suse.com Tue Jan 28 20:12:15 2025 From: lucas.mulling at suse.com (Lucas Mulling) Date: Tue, 28 Jan 2025 16:12:15 -0300 Subject: [PATCH] cipher: Check and mark non-compliant cipher modes in the SLI In-Reply-To: <87lduunb7y.fsf@jacob.g10code.de> References: <87lduunb7y.fsf@jacob.g10code.de> Message-ID: Hi, Fixed in the attached patch (included a small typo fix from the other patch, sorry about that). > MODE is not an int but enum gcry_cipher_modes and thus it is better to > use that. Also put all modes into the switch so that the compiler can > check its completeness and we do not miss to check whether new modes may > be FIPS compliant. Not sure if _gcry_cipher_open_internal should also use gcry_cipher_modes. Let me know if this is something you want changed since it checks mode as an int as well. Best, Lucas M?lling On Tue, Jan 28, 2025 at 1:39?PM Werner Koch wrote: > Hi! > > On Fri, 24 Jan 2025 10:19, Lucas Mulling said: > > > +int > > +_gcry_cipher_is_mode_fips_compliant(int mode) > > Given that this function returns an error code it should also be > declared as to do this. However, the name of the function indicates > that this returns a boolean status and one would expect true for FIPS > comliance. But the logic is invers. This is fine but the function > should then for example be named _gcry_cipher_mode_fips_compliance. > > MODE is not an int but enum gcry_cipher_modes and thus it is better to > use that. Also put all modes into the switch so that the compiler can > check its completeness and we do not miss to check whether new modes may > be FIPS compliant. > > > @@ -1988,6 +1988,7 @@ char *gcry_get_config (int mode, const char *what); > > #define GCRY_FIPS_FLAG_REJECT_PK (1 << 5) > > #define GCRY_FIPS_FLAG_REJECT_PK_MD (1 << 6) > > #define GCRY_FIPS_FLAG_REJECT_PK_GOST_SM2 (1 << 7) > > +#define GCRY_FIPS_FLAG_REJECT_CIPHER_MODE (1 << 8) > > Do we already have a documentation for these new constants? In any case > it should be put into the NEWS file. > > > > Shalom-Salam, > > Werner > > > -- > The pioneers of a warless world are the youth that > refuse military service. - A. Einstein > -------------- next part -------------- An HTML attachment was scrubbed... URL: -------------- next part -------------- A non-text attachment was scrubbed... 
Name: 0001-chiper-Rename-_gcry_cipher_is_mode_fips_compliant.patch Type: text/x-patch Size: 1992 bytes Desc: not available URL: From gniibe at fsij.org Fri Jan 31 07:25:39 2025 From: gniibe at fsij.org (NIIBE Yutaka) Date: Fri, 31 Jan 2025 15:25:39 +0900 Subject: [PATCH] MPI helper of multiplication, Least Leak Intended Message-ID: <877c6b8pp8.fsf@akagi.fsij.org> Hello, This month, I created the ticket for improvement of modular exponentiation implementation with a branch named gniibe/t7490: https://dev.gnupg.org/T7490 It somehow works for me now. I'd like to merge the branch manually in steps. Here is a first patch for MPI multiplication. The purpose is to have constant-time property. But, for these kinds of internal intermediate routines, I felt that naming "constant-time" sounds too much. Honestly speaking, it's "Least Leak Intended", and I couldn't declare it constant-time. So, I use the suffix of _lli. It's basecase multiplication, no Karatsuba. diff --git a/mpi/mpi-internal.h b/mpi/mpi-internal.h index 935bf3e1..f04f1dbd 100644 --- a/mpi/mpi-internal.h +++ b/mpi/mpi-internal.h @@ -230,6 +230,8 @@ void _gcry_mpih_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size); mpi_limb_t _gcry_mpih_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, mpi_ptr_t vp, mpi_size_t vsize); +mpi_limb_t _gcry_mpih_mul_lli(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize); void _gcry_mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size ); void _gcry_mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace); diff --git a/mpi/mpih-mul.c b/mpi/mpih-mul.c index 6c51533f..1e6bfcb2 100644 --- a/mpi/mpih-mul.c +++ b/mpi/mpih-mul.c @@ -527,3 +527,32 @@ _gcry_mpih_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, _gcry_mpih_release_karatsuba_ctx( &ctx ); return *prod_endp; } + + +/* Do same calculation as _gcry_mpih_mul does, but Least Leak Intended. */ +mpi_limb_t +_gcry_mpih_mul_lli( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize ) +{ + mpi_limb_t cy; + mpi_size_t i; + mpi_limb_t v_limb; + + if( !vsize ) + return 0; + + v_limb = vp[0]; + cy = _gcry_mpih_mul_1( prodp, up, usize, v_limb ); + + prodp[usize] = cy; + prodp++; + + for( i = 1; i < vsize; i++ ) { + v_limb = vp[i]; + cy = _gcry_mpih_addmul_1(prodp, up, usize, v_limb); + prodp[usize] = cy; + prodp++; + } + + return cy; +} --
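To illustrate the structure of _gcry_mpih_mul_lli above: it is plain basecase (schoolbook) multiplication — one _gcry_mpih_mul_1 pass for the first limb of vp, then _gcry_mpih_addmul_1 passes for the remaining limbs — so the sequence of operations and memory accesses depends only on usize and vsize, never on the limb values. The generic _gcry_mpih_mul, by contrast, may switch to Karatsuba (as noted above) and take other shortcuts. The sketch below shows the same loop structure with ordinary 64-bit limbs; it assumes a compiler that provides unsigned __int128 (GCC or Clang on a 64-bit target) and is only an illustration of the technique, not libgcrypt code.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;

/* Multiply u[0..usize-1] by v[0..vsize-1] into prod[0..usize+vsize-1],
 * schoolbook style, and return the most significant limb.  The control
 * flow depends only on usize and vsize, not on the limb values.  */
static limb_t
basecase_mul (limb_t *prod, const limb_t *u, size_t usize,
              const limb_t *v, size_t vsize)
{
  limb_t cy = 0;
  size_t i, j;

  if (!vsize)
    return 0;

  /* First pass overwrites: prod[0..usize] = u * v[0].  */
  for (i = 0; i < usize; i++)
    {
      unsigned __int128 t = (unsigned __int128) u[i] * v[0] + cy;
      prod[i] = (limb_t) t;
      cy = (limb_t) (t >> 64);
    }
  prod[usize] = cy;

  /* Remaining passes accumulate: prod[j..j+usize] += u * v[j].  */
  for (j = 1; j < vsize; j++)
    {
      cy = 0;
      for (i = 0; i < usize; i++)
        {
          unsigned __int128 t = (unsigned __int128) u[i] * v[j]
                                + prod[i + j] + cy;
          prod[i + j] = (limb_t) t;
          cy = (limb_t) (t >> 64);
        }
      prod[j + usize] = cy;
    }

  return cy;
}

A caller would allocate usize + vsize limbs for prod, which appears to be the same contract as the helper in the patch (each pass writes one limb past the usize-limb window it updates).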