From pikolasikolatest2 at gmail.com Mon Aug 4 12:18:10 2025
From: pikolasikolatest2 at gmail.com (walid falcon)
Date: Mon, 4 Aug 2025 11:18:10 +0100
Subject: Paternity certificate in Morocco
Message-ID:

[Arabic-language message about obtaining a paternity certificate in Morocco; the body text did not survive the archive's character encoding. Only the link below is recoverable.]

https://www.targir.com/2025/04/blog-post_14.html
-------------- next part --------------
An HTML attachment was scrubbed...
URL:

From jussi.kivilinna at iki.fi Thu Aug 7 14:45:47 2025
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Thu, 7 Aug 2025 15:45:47 +0300
Subject: [PATCH] crc-intel-pclmul: add AVX2 and AVX512 code paths
Message-ID: <20250807124548.644871-1-jussi.kivilinna@iki.fi>

* cipher/crc-intel-pclmul.c (crc32_consts_s, crc32_consts)
(crc24rfc2440_consts): Add k_ymm and k_zmm.
(crc32_reflected_bulk, crc32_bulk): Add VPCLMUL+AVX2 and
VAES_VPCLMUL+AVX512 code paths; Add 'hwfeatures' parameter.
(_gcry_crc32_intel_pclmul, _gcry_crc24rfc2440_intel_pclmul): Add
'hwfeatures' parameter.
* cipher/crc.c (CRC_CONTEXT) [USE_INTEL_PCLMUL]: Add 'hwfeatures'.
(_gcry_crc32_intel_pclmul, _gcry_crc24rfc2440_intel_pclmul): Add
'hwfeatures' parameter.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init)
[USE_INTEL_PCLMUL]: Store HW features to context.
--

Benchmark on Zen4:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 CRC32          |     0.046 ns/B     20861 MiB/s     0.248 c/B    5421±1
 CRC32RFC1510   |     0.046 ns/B     20809 MiB/s     0.250 c/B   5463±14
 CRC24RFC2440   |     0.046 ns/B     20934 MiB/s     0.251 c/B    5504±2

After AVX2:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 CRC32          |     0.023 ns/B     42277 MiB/s     0.123 c/B    5440±6
 CRC32RFC1510   |     0.022 ns/B     42949 MiB/s     0.121 c/B   5454±16
 CRC24RFC2440   |     0.023 ns/B     41955 MiB/s     0.124 c/B   5439±13

After AVX512:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 CRC32          |     0.011 ns/B     85877 MiB/s     0.061 c/B      5500
 CRC32RFC1510   |     0.011 ns/B     83898 MiB/s     0.063 c/B      5500
 CRC24RFC2440   |     0.012 ns/B     80590 MiB/s     0.065 c/B      5500

Signed-off-by: Jussi Kivilinna
---
 cipher/crc-intel-pclmul.c | 500 ++++++++++++++++++++++++++++++++++----
 cipher/crc.c              |  13 +-
 2 files changed, 459 insertions(+), 54 deletions(-)

diff --git a/cipher/crc-intel-pclmul.c b/cipher/crc-intel-pclmul.c
index 825dee2a..8209fc34 100644
--- a/cipher/crc-intel-pclmul.c
+++ b/cipher/crc-intel-pclmul.c
@@ -68,6 +68,10 @@ struct crc32_consts_s
   u64 k[6];
   /* my_p: { floor(x^64 / P(x)), P(x) } */
   u64 my_p[2];
+  /* k_ymm: { x^(32*33), x^(32*31) } mod P(x) */
+  u64 k_ymm[2];
+  /* k_zmm: { x^(32*65), x^(32*63) } mod P(x) */
+  u64 k_zmm[2];
 };


@@ -81,6 +85,12 @@ static const struct crc32_consts_s crc32_consts ALIGNED_16 =
   },
   { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
     U64_C(0x1f7011641), U64_C(0x1db710641)
+  },
+  { /* k_ymm[2] */
+    U64_C(0x1e88ef372), U64_C(0x14a7fe880) /* y = { 33, 31 } */,
+  },
+  { /* k_zmm[2] */
+    U64_C(0x11542778a), U64_C(0x1322d1430) /* y = { 65, 63 } */
   }
 };


@@ -94,6 +104,12 @@ static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
   },
   { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
     U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+  },
+  { /* k_ymm[2] */
+    U64_C(0xaee5d500) << 32, U64_C(0x1a43ea00) << 32 /* y = { 33, 31 } */
+  },
+  { /* k_zmm[2] */
+
U64_C(0x21342700) << 32, U64_C(0x5d2b6300) << 32 /* y = { 65, 63 } */ } }; @@ -144,31 +160,216 @@ static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 = /* PCLMUL functions for reflected CRC32. */ static ASM_FUNC_ATTR_INLINE void crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, - const struct crc32_consts_s *consts) + const struct crc32_consts_s *consts, u32 hwfeatures) { if (inlen >= 8 * 16) { - asm volatile ("movd %[crc], %%xmm4\n\t" - "movdqu %[inbuf_0], %%xmm0\n\t" - "movdqu %[inbuf_1], %%xmm1\n\t" - "movdqu %[inbuf_2], %%xmm2\n\t" - "movdqu %[inbuf_3], %%xmm3\n\t" - "pxor %%xmm4, %%xmm0\n\t" - : - : [inbuf_0] "m" (inbuf[0 * 16]), - [inbuf_1] "m" (inbuf[1 * 16]), - [inbuf_2] "m" (inbuf[2 * 16]), - [inbuf_3] "m" (inbuf[3 * 16]), - [crc] "m" (*pcrc) - ); + if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) + && (hwfeatures & HWF_INTEL_AVX2) + && inlen >= 8 * 32) + { + if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) + && (hwfeatures & HWF_INTEL_AVX512) + && inlen >= 8 * 64) + { + asm volatile("vmovd %[crc], %%xmm4\n\t" + "vpopcntb %%xmm4, %%xmm0\n\t" /* spec stop for old AVX512 CPUs */ + "vmovdqu64 %[inbuf_0], %%zmm0\n\t" + "vmovdqu64 %[inbuf_1], %%zmm1\n\t" + "vmovdqu64 %[inbuf_2], %%zmm2\n\t" + "vmovdqu64 %[inbuf_3], %%zmm3\n\t" + "vpxorq %%zmm4, %%zmm0, %%zmm0\n\t" + : + : [crc] "m" (*pcrc), + [inbuf_0] "m" (inbuf[0 * 64]), + [inbuf_1] "m" (inbuf[1 * 64]), + [inbuf_2] "m" (inbuf[2 * 64]), + [inbuf_3] "m" (inbuf[3 * 64]), + [k_zmm] "m" (consts->k_zmm[0]) + ); + + inbuf += 4 * 64; + inlen -= 4 * 64; + + asm volatile("vbroadcasti32x4 %[k_zmm], %%zmm4\n\t" + : + : [k_zmm] "m" (consts->k_zmm[0]) + ); + + /* Fold by 16. */ + while (inlen >= 4 * 64) + { + asm volatile ("vmovdqu64 %[inbuf_0], %%zmm5\n\t" + "vmovdqa64 %%zmm0, %%zmm6\n\t" + "vpclmulqdq $0x00, %%zmm4, %%zmm0, %%zmm0\n\t" + "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm0\n\t" + + "vmovdqu64 %[inbuf_1], %%zmm5\n\t" + "vmovdqa64 %%zmm1, %%zmm6\n\t" + "vpclmulqdq $0x00, %%zmm4, %%zmm1, %%zmm1\n\t" + "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm1\n\t" + + "vmovdqu64 %[inbuf_2], %%zmm5\n\t" + "vmovdqa64 %%zmm2, %%zmm6\n\t" + "vpclmulqdq $0x00, %%zmm4, %%zmm2, %%zmm2\n\t" + "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm2\n\t" + + "vmovdqu64 %[inbuf_3], %%zmm5\n\t" + "vmovdqa64 %%zmm3, %%zmm6\n\t" + "vpclmulqdq $0x00, %%zmm4, %%zmm3, %%zmm3\n\t" + "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm3\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 64]), + [inbuf_1] "m" (inbuf[1 * 64]), + [inbuf_2] "m" (inbuf[2 * 64]), + [inbuf_3] "m" (inbuf[3 * 64]) + ); + + inbuf += 4 * 64; + inlen -= 4 * 64; + } + + /* Fold 16 to 8. */ + asm volatile("vbroadcasti32x4 %[k_ymm], %%zmm4\n\t" + /* Fold zmm2 into zmm0. */ + "vmovdqa64 %%zmm0, %%zmm5\n\t" + "vpclmulqdq $0x00, %%zmm4, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x11, %%zmm4, %%zmm0, %%zmm0\n\t" + "vpternlogq $0x96, %%zmm2, %%zmm5, %%zmm0\n\t" + /* Fold zmm3 into zmm1. 
*/ + "vmovdqa64 %%zmm1, %%zmm5\n\t" + "vpclmulqdq $0x00, %%zmm4, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x11, %%zmm4, %%zmm1, %%zmm1\n\t" + "vpternlogq $0x96, %%zmm3, %%zmm5, %%zmm1\n\t" + : + : [k_ymm] "m" (consts->k_ymm[0])); + + asm volatile("vextracti64x4 $1, %%zmm1, %%ymm3\n\t" + "vmovdqa %%ymm1, %%ymm2\n\t" + "vextracti64x4 $1, %%zmm0, %%ymm1\n\t" + : + : ); + } + else + { + asm volatile ("vmovd %[crc], %%xmm4\n\t" + "vmovdqu %[inbuf_0], %%ymm0\n\t" + "vmovdqu %[inbuf_1], %%ymm1\n\t" + "vmovdqu %[inbuf_2], %%ymm2\n\t" + "vmovdqu %[inbuf_3], %%ymm3\n\t" + "vpxor %%ymm4, %%ymm0, %%ymm0\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 32]), + [inbuf_1] "m" (inbuf[1 * 32]), + [inbuf_2] "m" (inbuf[2 * 32]), + [inbuf_3] "m" (inbuf[3 * 32]), + [crc] "m" (*pcrc) + ); - inbuf += 4 * 16; - inlen -= 4 * 16; + inbuf += 4 * 32; + inlen -= 4 * 32; - asm volatile ("movdqa %[k1k2], %%xmm4\n\t" - : - : [k1k2] "m" (consts->k[1 - 1]) - ); + asm volatile ("vbroadcasti128 %[k_ymm], %%ymm4\n\t" + : + : [k_ymm] "m" (consts->k_ymm[0]) + ); + } + + /* Fold by 8. */ + while (inlen >= 4 * 32) + { + asm volatile ("vmovdqu %[inbuf_0], %%ymm5\n\t" + "vmovdqa %%ymm0, %%ymm6\n\t" + "vpclmulqdq $0x00, %%ymm4, %%ymm0, %%ymm0\n\t" + "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm0, %%ymm0\n\t" + "vpxor %%ymm6, %%ymm0, %%ymm0\n\t" + + "vmovdqu %[inbuf_1], %%ymm5\n\t" + "vmovdqa %%ymm1, %%ymm6\n\t" + "vpclmulqdq $0x00, %%ymm4, %%ymm1, %%ymm1\n\t" + "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" + "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" + + "vmovdqu %[inbuf_2], %%ymm5\n\t" + "vmovdqa %%ymm2, %%ymm6\n\t" + "vpclmulqdq $0x00, %%ymm4, %%ymm2, %%ymm2\n\t" + "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm2, %%ymm2\n\t" + "vpxor %%ymm6, %%ymm2, %%ymm2\n\t" + + "vmovdqu %[inbuf_3], %%ymm5\n\t" + "vmovdqa %%ymm3, %%ymm6\n\t" + "vpclmulqdq $0x00, %%ymm4, %%ymm3, %%ymm3\n\t" + "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 32]), + [inbuf_1] "m" (inbuf[1 * 32]), + [inbuf_2] "m" (inbuf[2 * 32]), + [inbuf_3] "m" (inbuf[3 * 32]) + ); + + inbuf += 4 * 32; + inlen -= 4 * 32; + } + + /* Fold 8 to 4. */ + asm volatile("vbroadcasti128 %[k1k2], %%ymm4\n\t" + + /* Fold ymm2 into ymm0. */ + "vmovdqa %%ymm0, %%ymm5\n\t" + "vpclmulqdq $0x00, %%ymm4, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x11, %%ymm4, %%ymm0, %%ymm0\n\t" + "vpxor %%ymm2, %%ymm5, %%ymm5\n\t" + "vpxor %%ymm5, %%ymm0, %%ymm0\n\t" + + /* Fold ymm3 into ymm1. */ + "vmovdqa %%ymm1, %%ymm5\n\t" + "vpclmulqdq $0x00, %%ymm4, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x11, %%ymm4, %%ymm1, %%ymm1\n\t" + "vpxor %%ymm3, %%ymm5, %%ymm5\n\t" + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" + + "vextracti128 $1, %%ymm1, %%xmm3\n\t" + "vmovdqa %%xmm1, %%xmm2\n\t" + "vextracti128 $1, %%ymm0, %%xmm1\n\t" + + "vzeroupper\n\t" + : + : [k1k2] "m" (consts->k[1 - 1]) + ); + } + else + { + asm volatile ("movd %[crc], %%xmm4\n\t" + "movdqu %[inbuf_0], %%xmm0\n\t" + "movdqu %[inbuf_1], %%xmm1\n\t" + "movdqu %[inbuf_2], %%xmm2\n\t" + "movdqu %[inbuf_3], %%xmm3\n\t" + "pxor %%xmm4, %%xmm0\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 16]), + [inbuf_1] "m" (inbuf[1 * 16]), + [inbuf_2] "m" (inbuf[2 * 16]), + [inbuf_3] "m" (inbuf[3 * 16]), + [crc] "m" (*pcrc) + ); + + inbuf += 4 * 16; + inlen -= 4 * 16; + + asm volatile ("movdqa %[k1k2], %%xmm4\n\t" + : + : [k1k2] "m" (consts->k[1 - 1]) + ); + } /* Fold by 4. 
*/ while (inlen >= 4 * 16) @@ -219,7 +420,6 @@ crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, ); /* Fold 4 to 1. */ - asm volatile ("movdqa %%xmm0, %%xmm4\n\t" "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t" "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t" @@ -489,7 +689,7 @@ crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, /* PCLMUL functions for non-reflected CRC32. */ static ASM_FUNC_ATTR_INLINE void crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, - const struct crc32_consts_s *consts) + const struct crc32_consts_s *consts, u32 hwfeatures) { asm volatile ("movdqa %[bswap], %%xmm7\n\t" : @@ -498,31 +698,230 @@ crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, if (inlen >= 8 * 16) { - asm volatile ("movd %[crc], %%xmm4\n\t" - "movdqu %[inbuf_0], %%xmm0\n\t" - "movdqu %[inbuf_1], %%xmm1\n\t" - "movdqu %[inbuf_2], %%xmm2\n\t" - "pxor %%xmm4, %%xmm0\n\t" - "movdqu %[inbuf_3], %%xmm3\n\t" - "pshufb %%xmm7, %%xmm0\n\t" - "pshufb %%xmm7, %%xmm1\n\t" - "pshufb %%xmm7, %%xmm2\n\t" - "pshufb %%xmm7, %%xmm3\n\t" - : - : [inbuf_0] "m" (inbuf[0 * 16]), - [inbuf_1] "m" (inbuf[1 * 16]), - [inbuf_2] "m" (inbuf[2 * 16]), - [inbuf_3] "m" (inbuf[3 * 16]), - [crc] "m" (*pcrc) - ); + if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) + && (hwfeatures & HWF_INTEL_AVX2) + && inlen >= 8 * 32) + { + if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) + && (hwfeatures & HWF_INTEL_AVX512) + && inlen >= 8 * 64) + { + asm volatile("vpopcntb %%xmm7, %%xmm0\n\t" /* spec stop for old AVX512 CPUs */ + "vshufi32x4 $0x00, %%zmm7, %%zmm7, %%zmm7\n\t" + "vmovd %[crc], %%xmm4\n\t" + "vmovdqu64 %[inbuf_0], %%zmm0\n\t" + "vmovdqu64 %[inbuf_1], %%zmm1\n\t" + "vmovdqu64 %[inbuf_2], %%zmm2\n\t" + "vmovdqu64 %[inbuf_3], %%zmm3\n\t" + "vpxorq %%zmm4, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm7, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm7, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm7, %%zmm3, %%zmm3\n\t" + : + : [crc] "m" (*pcrc), + [inbuf_0] "m" (inbuf[0 * 64]), + [inbuf_1] "m" (inbuf[1 * 64]), + [inbuf_2] "m" (inbuf[2 * 64]), + [inbuf_3] "m" (inbuf[3 * 64]) + ); + + inbuf += 4 * 64; + inlen -= 4 * 64; + + asm volatile ("vbroadcasti32x4 %[k_zmm], %%zmm4\n\t" + : + : [k_zmm] "m" (consts->k_zmm[0]) + ); - inbuf += 4 * 16; - inlen -= 4 * 16; + /* Fold by 16. 
*/ + while (inlen >= 4 * 64) + { + asm volatile ("vmovdqu64 %[inbuf_0], %%zmm5\n\t" + "vmovdqa64 %%zmm0, %%zmm6\n\t" + "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x01, %%zmm4, %%zmm0, %%zmm0\n\t" + "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm0\n\t" + + "vmovdqu64 %[inbuf_1], %%zmm5\n\t" + "vmovdqa64 %%zmm1, %%zmm6\n\t" + "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x01, %%zmm4, %%zmm1, %%zmm1\n\t" + "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm1\n\t" + + "vmovdqu64 %[inbuf_2], %%zmm5\n\t" + "vmovdqa64 %%zmm2, %%zmm6\n\t" + "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x01, %%zmm4, %%zmm2, %%zmm2\n\t" + "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm2\n\t" + + "vmovdqu64 %[inbuf_3], %%zmm5\n\t" + "vmovdqa64 %%zmm3, %%zmm6\n\t" + "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x01, %%zmm4, %%zmm3, %%zmm3\n\t" + "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t" + "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm3\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 64]), + [inbuf_1] "m" (inbuf[1 * 64]), + [inbuf_2] "m" (inbuf[2 * 64]), + [inbuf_3] "m" (inbuf[3 * 64]) + ); + + inbuf += 4 * 64; + inlen -= 4 * 64; + } + + asm volatile("vbroadcasti32x4 %[k_ymm], %%zmm4\n\t" + /* Fold zmm2 into zmm0. */ + "vmovdqa64 %%zmm0, %%zmm5\n\t" + "vpclmulqdq $0x01, %%zmm4, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x10, %%zmm4, %%zmm0, %%zmm0\n\t" + "vpternlogq $0x96, %%zmm2, %%zmm5, %%zmm0\n\t" + /* Fold zmm3 into zmm1. */ + "vmovdqa64 %%zmm1, %%zmm5\n\t" + "vpclmulqdq $0x01, %%zmm4, %%zmm5, %%zmm5\n\t" + "vpclmulqdq $0x10, %%zmm4, %%zmm1, %%zmm1\n\t" + "vpternlogq $0x96, %%zmm3, %%zmm5, %%zmm1\n\t" + : + : [k_ymm] "m" (consts->k_ymm[0]) + ); + + asm volatile("vextracti64x4 $1, %%zmm1, %%ymm3\n\t" + "vmovdqa %%ymm1, %%ymm2\n\t" + "vextracti64x4 $1, %%zmm0, %%ymm1\n\t" + : + : + ); + } + else + { + asm volatile("vinserti128 $1, %%xmm7, %%ymm7, %%ymm7\n\t" + "vmovd %[crc], %%xmm4\n\t" + "vmovdqu %[inbuf_0], %%ymm0\n\t" + "vmovdqu %[inbuf_1], %%ymm1\n\t" + "vmovdqu %[inbuf_2], %%ymm2\n\t" + "vmovdqu %[inbuf_3], %%ymm3\n\t" + "vpxor %%ymm4, %%ymm0, %%ymm0\n\t" + "vpshufb %%ymm7, %%ymm0, %%ymm0\n\t" + "vpshufb %%ymm7, %%ymm1, %%ymm1\n\t" + "vpshufb %%ymm7, %%ymm2, %%ymm2\n\t" + "vpshufb %%ymm7, %%ymm3, %%ymm3\n\t" + : + : [crc] "m" (*pcrc), + [inbuf_0] "m" (inbuf[0 * 32]), + [inbuf_1] "m" (inbuf[1 * 32]), + [inbuf_2] "m" (inbuf[2 * 32]), + [inbuf_3] "m" (inbuf[3 * 32]) + ); + + inbuf += 4 * 32; + inlen -= 4 * 32; + + asm volatile ("vbroadcasti128 %[k_ymm], %%ymm4\n\t" + : : [k_ymm] "m" (consts->k_ymm[0])); + } - asm volatile ("movdqa %[k1k2], %%xmm4\n\t" - : - : [k1k2] "m" (consts->k[1 - 1]) - ); + /* Fold by 8. 
*/ + while (inlen >= 4 * 32) + { + asm volatile ("vmovdqu %[inbuf_0], %%ymm5\n\t" + "vmovdqa %%ymm0, %%ymm6\n\t" + "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x01, %%ymm4, %%ymm0, %%ymm0\n\t" + "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm0, %%ymm0\n\t" + "vpxor %%ymm6, %%ymm0, %%ymm0\n\t" + + "vmovdqu %[inbuf_1], %%ymm5\n\t" + "vmovdqa %%ymm1, %%ymm6\n\t" + "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x01, %%ymm4, %%ymm1, %%ymm1\n\t" + "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" + "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" + + "vmovdqu %[inbuf_2], %%ymm5\n\t" + "vmovdqa %%ymm2, %%ymm6\n\t" + "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x01, %%ymm4, %%ymm2, %%ymm2\n\t" + "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm2, %%ymm2\n\t" + "vpxor %%ymm6, %%ymm2, %%ymm2\n\t" + + "vmovdqu %[inbuf_3], %%ymm5\n\t" + "vmovdqa %%ymm3, %%ymm6\n\t" + "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x01, %%ymm4, %%ymm3, %%ymm3\n\t" + "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t" + "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" + "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 32]), + [inbuf_1] "m" (inbuf[1 * 32]), + [inbuf_2] "m" (inbuf[2 * 32]), + [inbuf_3] "m" (inbuf[3 * 32]) + ); + + inbuf += 4 * 32; + inlen -= 4 * 32; + } + + asm volatile("vbroadcasti128 %[k1k2], %%ymm4\n\t" + + /* Fold ymm2 into ymm0. */ + "vmovdqa %%ymm0, %%ymm5\n\t" + "vpclmulqdq $0x01, %%ymm4, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x10, %%ymm4, %%ymm0, %%ymm0\n\t" + "vpxor %%ymm2, %%ymm5, %%ymm5\n\t" + "vpxor %%ymm5, %%ymm0, %%ymm0\n\t" + + /* Fold ymm3 into ymm1. */ + "vmovdqa %%ymm1, %%ymm5\n\t" + "vpclmulqdq $0x01, %%ymm4, %%ymm5, %%ymm5\n\t" + "vpclmulqdq $0x10, %%ymm4, %%ymm1, %%ymm1\n\t" + "vpxor %%ymm3, %%ymm5, %%ymm5\n\t" + "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" + + "vextracti128 $1, %%ymm1, %%xmm3\n\t" + "vmovdqa %%xmm1, %%xmm2\n\t" + "vextracti128 $1, %%ymm0, %%xmm1\n\t" + "vzeroupper\n\t" + : + : [k1k2] "m" (consts->k[1 - 1]) + ); + } + else + { + asm volatile ("movd %[crc], %%xmm4\n\t" + "movdqu %[inbuf_0], %%xmm0\n\t" + "movdqu %[inbuf_1], %%xmm1\n\t" + "movdqu %[inbuf_2], %%xmm2\n\t" + "pxor %%xmm4, %%xmm0\n\t" + "movdqu %[inbuf_3], %%xmm3\n\t" + "pshufb %%xmm7, %%xmm0\n\t" + "pshufb %%xmm7, %%xmm1\n\t" + "pshufb %%xmm7, %%xmm2\n\t" + "pshufb %%xmm7, %%xmm3\n\t" + : + : [inbuf_0] "m" (inbuf[0 * 16]), + [inbuf_1] "m" (inbuf[1 * 16]), + [inbuf_2] "m" (inbuf[2 * 16]), + [inbuf_3] "m" (inbuf[3 * 16]), + [crc] "m" (*pcrc) + ); + + inbuf += 4 * 16; + inlen -= 4 * 16; + + asm volatile ("movdqa %[k1k2], %%xmm4\n\t" + : + : [k1k2] "m" (consts->k[1 - 1]) + ); + } /* Fold by 4. */ while (inlen >= 4 * 16) @@ -577,7 +976,6 @@ crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, ); /* Fold 4 to 1. 
*/ - asm volatile ("movdqa %%xmm0, %%xmm4\n\t" "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t" "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t" @@ -865,7 +1263,8 @@ crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, } void ASM_FUNC_ATTR -_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) +_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen, + u32 hwfeatures) { const struct crc32_consts_s *consts = &crc32_consts; #if defined(__x86_64__) && defined(__WIN64__) @@ -883,7 +1282,7 @@ _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) return; if (inlen >= 16) - crc32_reflected_bulk(pcrc, inbuf, inlen, consts); + crc32_reflected_bulk(pcrc, inbuf, inlen, consts, hwfeatures); else crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts); @@ -898,7 +1297,8 @@ _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) } void ASM_FUNC_ATTR -_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) +_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen, + u32 hwfeatures) { const struct crc32_consts_s *consts = &crc24rfc2440_consts; #if defined(__x86_64__) && defined(__WIN64__) @@ -918,7 +1318,7 @@ _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen) /* Note: *pcrc in input endian. */ if (inlen >= 16) - crc32_bulk(pcrc, inbuf, inlen, consts); + crc32_bulk(pcrc, inbuf, inlen, consts, hwfeatures); else crc32_less_than_16(pcrc, inbuf, inlen, consts); diff --git a/cipher/crc.c b/cipher/crc.c index cdff0648..21ab8523 100644 --- a/cipher/crc.c +++ b/cipher/crc.c @@ -70,6 +70,7 @@ typedef struct u32 CRC; #ifdef USE_INTEL_PCLMUL unsigned int use_pclmul:1; /* Intel PCLMUL shall be used. */ + u32 hwfeatures; #endif #ifdef USE_ARM_PMULL unsigned int use_pmull:1; /* ARMv8 PMULL shall be used. 
*/ @@ -84,9 +85,10 @@ CRC_CONTEXT; #ifdef USE_INTEL_PCLMUL /*-- crc-intel-pclmul.c --*/ -void _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen); +void _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen, + u32 hwfeatures); void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, - size_t inlen); + size_t inlen, u32 hwfeatures); #endif #ifdef USE_ARM_PMULL @@ -407,6 +409,7 @@ crc32_init (void *context, unsigned int flags) #ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); + ctx->hwfeatures = hwf; #endif #ifdef USE_ARM_PMULL ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); @@ -431,7 +434,7 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen) #ifdef USE_INTEL_PCLMUL if (ctx->use_pclmul) { - _gcry_crc32_intel_pclmul(&ctx->CRC, inbuf, inlen); + _gcry_crc32_intel_pclmul(&ctx->CRC, inbuf, inlen, ctx->hwfeatures); return; } #endif @@ -506,6 +509,7 @@ crc32rfc1510_init (void *context, unsigned int flags) #ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); + ctx->hwfeatures = hwf; #endif #ifdef USE_ARM_PMULL ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); @@ -843,6 +847,7 @@ crc24rfc2440_init (void *context, unsigned int flags) #ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); + ctx->hwfeatures = hwf; #endif #ifdef USE_ARM_PMULL ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); @@ -867,7 +872,7 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen) #ifdef USE_INTEL_PCLMUL if (ctx->use_pclmul) { - _gcry_crc24rfc2440_intel_pclmul(&ctx->CRC, inbuf, inlen); + _gcry_crc24rfc2440_intel_pclmul(&ctx->CRC, inbuf, inlen, ctx->hwfeatures); return; } #endif -- 2.48.1 From jussi.kivilinna at iki.fi Thu Aug 7 15:28:50 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 7 Aug 2025 16:28:50 +0300 Subject: [PATCH 1/6] configure.ac: deduplicate intrinsics test code Message-ID: <20250807132855.878167-1-jussi.kivilinna@iki.fi> * configure.ac (gcry_cv_cc_aarch64_neon_intrinsics) (gcry_cv_cc_aarch64_neon_intrinsics_cflags): Move test source code to new macro GCRY_AARCH64_NEON_INTRINSICS_TEST. (gcry_cv_cc_ppc_altivec, gcry_cv_cc_ppc_altivec_cflags): Move test source code to new macro GCRY_POWERPC_VECTOR_INTRINSICS_TEST. 
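
The shape of the change, reduced to a skeleton (the macro and cache-variable names below are placeholders for illustration only, not the ones used in the patch; the real definitions appear in the diff that follows): the test program is defined once with m4_define and then reused by both compile checks, first with the default CFLAGS and then with the extra -march flags.

    # Define the test program once.
    m4_define([GCRY_EXAMPLE_INTRINSICS_TEST],
      [AC_LANG_SOURCE([[int fn (int x) { return x + 1; }]])])

    # First probe: default compiler flags.
    AC_COMPILE_IFELSE([GCRY_EXAMPLE_INTRINSICS_TEST],
                      [gcry_cv_example=yes])

    # Second probe: retry with extra flags, reusing the same source.
    _cflags_save=$CFLAGS
    CFLAGS="$CFLAGS -O2 -march=..."   # placeholder arch flags
    AC_COMPILE_IFELSE([GCRY_EXAMPLE_INTRINSICS_TEST],
                      [gcry_cv_example_cflags=yes])
    CFLAGS=$_cflags_save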
-- Signed-off-by: Jussi Kivilinna --- configure.ac | 164 ++++++++++++++++++++++----------------------------- 1 file changed, 70 insertions(+), 94 deletions(-) diff --git a/configure.ac b/configure.ac index c8d9b4a3..3ce405e9 100644 --- a/configure.ac +++ b/configure.ac @@ -2235,6 +2235,37 @@ fi # # Check whether compiler supports AArch64/NEON/crypto intrinsics # +m4_define([GCRY_AARCH64_NEON_INTRINSICS_TEST], + [AC_LANG_SOURCE( + [[#include + #define __m128i uint64x2_t + #define vpsrldq128(s, a, o) \ + ({ uint64x2_t __tmp = { 0, 0 }; \ + o = (__m128i)vextq_u8((uint8x16_t)a, \ + (uint8x16_t)__tmp, (s) & 15); }) + #define vaesenclast128(a, b, o) \ + (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) + #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + vpsrldq128(2, a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + vaesenclast128(in, x, in); + memory_barrier_with_vec(in); + return in; + } + ]] + )] +) + AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics], [gcry_cv_cc_aarch64_neon_intrinsics], [if test "$mpi_cpu_arch" != "aarch64" || @@ -2242,34 +2273,9 @@ AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics], gcry_cv_cc_aarch64_neon_intrinsics="n/a" else gcry_cv_cc_aarch64_neon_intrinsics=no - AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[#include - #define __m128i uint64x2_t - #define vpsrldq128(s, a, o) \ - ({ uint64x2_t __tmp = { 0, 0 }; \ - o = (__m128i)vextq_u8((uint8x16_t)a, \ - (uint8x16_t)__tmp, (s) & 15); }) - #define vaesenclast128(a, b, o) \ - (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) - #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") - static inline __attribute__((always_inline)) __m128i - fn2(__m128i a) - { - vpsrldq128(2, a, a); - return a; - } - __m128i fn(__m128i in) - { - __m128i x; - memory_barrier_with_vec(in); - x = fn2(in); - memory_barrier_with_vec(x); - vaesenclast128(in, x, in); - memory_barrier_with_vec(in); - return in; - } - ]])], - [gcry_cv_cc_aarch64_neon_intrinsics=yes]) + AC_COMPILE_IFELSE( + [GCRY_AARCH64_NEON_INTRINSICS_TEST], + [gcry_cv_cc_aarch64_neon_intrinsics=yes]) fi]) if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1, @@ -2284,35 +2290,12 @@ if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" && test "$try_asm_modules" = "yes" ; then AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags], [gcry_cv_cc_aarch64_neon_intrinsics_cflags], - [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no - AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[#include - #define __m128i uint64x2_t - #define vpsrldq128(s, a, o) \ - ({ uint64x2_t __tmp = { 0, 0 }; \ - o = (__m128i)vextq_u8((uint8x16_t)a, \ - (uint8x16_t)__tmp, (s) & 15); }) - #define vaesenclast128(a, b, o) \ - (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a)) - #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory") - static inline __attribute__((always_inline)) __m128i - fn2(__m128i a) - { - vpsrldq128(2, a, a); - return a; - } - __m128i fn(__m128i in) - { - __m128i x; - memory_barrier_with_vec(in); - x = fn2(in); - memory_barrier_with_vec(x); - vaesenclast128(in, x, in); - memory_barrier_with_vec(in); - return in; - } - ]])], - [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])]) + [ + gcry_cv_cc_aarch64_neon_intrinsics_cflags=no + 
AC_COMPILE_IFELSE( + [GCRY_AARCH64_NEON_INTRINSICS_TEST], + [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes]) + ]) if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1, [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics]) @@ -2331,6 +2314,27 @@ CFLAGS=$_gcc_cflags_save; # # Check whether compiler supports PowerPC AltiVec/VSX intrinsics # +m4_define([GCRY_POWERPC_VECTOR_INTRINSICS_TEST], + [AC_LANG_SOURCE( + [[#include + typedef vector unsigned char block; + typedef vector unsigned int vecu32; + static inline __attribute__((always_inline)) vecu32 + vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) + { + return vec_sld (a, b, (4 * idx) & 15); + } + block fn(block in) + { + block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); + vecu32 y = vec_vsx_ld (0, (unsigned int*)0); + y = vec_sld_u32 (y, y, 3); + return vec_cipher_be (t, in) ^ (block)y; + } + ]] + )] +) + AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics], [gcry_cv_cc_ppc_altivec], [if test "$mpi_cpu_arch" != "ppc" || @@ -2338,24 +2342,9 @@ AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics] gcry_cv_cc_ppc_altivec="n/a" else gcry_cv_cc_ppc_altivec=no - AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[#include - typedef vector unsigned char block; - typedef vector unsigned int vecu32; - static inline __attribute__((always_inline)) vecu32 - vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) - { - return vec_sld (a, b, (4 * idx) & 15); - } - block fn(block in) - { - block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); - vecu32 y = vec_vsx_ld (0, (unsigned int*)0); - y = vec_sld_u32 (y, y, 3); - return vec_cipher_be (t, in) ^ (block)y; - } - ]])], - [gcry_cv_cc_ppc_altivec=yes]) + AC_COMPILE_IFELSE( + [GCRY_POWERPC_VECTOR_INTRINSICS_TEST], + [gcry_cv_cc_ppc_altivec=yes]) fi]) if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, @@ -2370,25 +2359,12 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" && test "$try_asm_modules" = "yes" ; then AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags], [gcry_cv_cc_ppc_altivec_cflags], - [gcry_cv_cc_ppc_altivec_cflags=no - AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[#include - typedef vector unsigned char block; - typedef vector unsigned int vecu32; - static inline __attribute__((always_inline)) vecu32 - vec_sld_u32(vecu32 a, vecu32 b, unsigned int idx) - { - return vec_sld (a, b, (4 * idx) & 15); - } - block fn(block in) - { - block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0)); - vecu32 y = vec_vsx_ld (0, (unsigned int*)0); - y = vec_sld_u32 (y, y, 3); - return vec_cipher_be (t, in) ^ (block)y; - } - ]])], - [gcry_cv_cc_ppc_altivec_cflags=yes])]) + [ + gcry_cv_cc_ppc_altivec_cflags=no + AC_COMPILE_IFELSE( + [GCRY_POWERPC_VECTOR_INTRINSICS_TEST], + [gcry_cv_cc_ppc_altivec_cflags=yes]) + ]) if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1, [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics]) -- 2.48.1 From jussi.kivilinna at iki.fi Thu Aug 7 15:28:53 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 7 Aug 2025 16:28:53 +0300 Subject: [PATCH 4/6] Add RISC-V vector cryptography implementations of SHA256 and SHA512 In-Reply-To: <20250807132855.878167-1-jussi.kivilinna@iki.fi> References: <20250807132855.878167-1-jussi.kivilinna@iki.fi> Message-ID: 
<20250807132855.878167-4-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'sha256-riscv-zvknha-zvkb.c' and 'sha512-riscv-zvknhb-zvkb.c'. * cipher/sha256-riscv-zvknha-zvkb.c: New. * cipher/sha256.c (USE_RISCV_V_CRYPTO): New. [USE_RISCV_V_CRYPTO] (_gcry_sha256_riscv_v_check_hw) (_gcry_sha256_transform_riscv_zvknha_zvkb) (do_sha256_transform_riscv_zvknha): New. (sha256_common_init) [sha256_common_init]: Enable new implementation if supported by HW. * cipher/sha512-riscv-zvknhb-zvkb.c: New. * cipher/sha512.c (USE_RISCV_V_CRYPTO): New. [USE_RISCV_V_CRYPTO] (_gcry_sha512_riscv_v_check_hw) (_gcry_sha512_transform_riscv_zvknhb_zvkb) (do_sha512_transform_riscv_zvknhb): New. (sha512_common_init) [sha512_common_init]: Enable new implementation if supported by HW. * configure.ac: Add 'sha256-riscv-zvknha-zvkb.lo' and 'sha512-riscv-zvknhb-zvkb.lo'. (GCRY_RISCV_VECTOR_INTRINSICS_TEST): New. (gcry_cv_cc_riscv_vector_intrinsics) (gcry_cv_cc_riscv_vector_intrinsics_cflags): Move test code to new macro GCRY_RISCV_VECTOR_INTRINSICS_TEST. (GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST) (gcry_cv_cc_riscv_vector_crypto_intrinsics) (gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags) (gcry_cv_riscv_vsha2cl_intrinsics_work): New. * src/g10lib.h [HAVE_CPU_ARCH_RISCV] (HWF_RISCV_ZVKB, HWF_RISCV_ZVKNHA) (HWF_RISCV_ZVKNHB): New. * src/hwf-riscv.c (HWF_RISCV_HWPROBE_EXT_ZVKB) (HWF_RISCV_HWPROBE_EXT_ZVKNHA, HWF_RISCV_HWPROBE_EXT_ZVKNHB): New. * src/hwfeatures.c (hwflist) [HAVE_CPU_ARCH_RISCV]: Add "riscv-zvkb", "riscv-zvknha" and "riscv-zvknhb". -- Implementations have been tested against QEMU emulator as there is no actual HW available with these instructions yet. Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 23 ++- cipher/sha256-riscv-zvknha-zvkb.c | 197 +++++++++++++++++++++ cipher/sha256.c | 34 ++++ cipher/sha512-riscv-zvknhb-zvkb.c | 190 +++++++++++++++++++++ cipher/sha512.c | 37 ++++ configure.ac | 274 ++++++++++++++++++++++-------- src/g10lib.h | 3 + src/hwf-riscv.c | 6 + src/hwfeatures.c | 3 + 9 files changed, 690 insertions(+), 77 deletions(-) create mode 100644 cipher/sha256-riscv-zvknha-zvkb.c create mode 100644 cipher/sha512-riscv-zvknhb-zvkb.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index dfffefb5..3375ea38 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -143,11 +143,11 @@ EXTRA_libcipher_la_SOURCES = \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \ sha256-avx2-bmi2-amd64.S \ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \ - sha256-intel-shaext.c sha256-ppc.c \ + sha256-intel-shaext.c sha256-ppc.c sha256-riscv-zvknha-zvkb.c \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \ sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \ sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \ - sha512-ppc.c sha512-ssse3-i386.c \ + sha512-ppc.c sha512-riscv-zvknhb-zvkb.c sha512-ssse3-i386.c \ sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \ keccak.c keccak_permute_32.h keccak_permute_64.h \ keccak-armv7-neon.S keccak-amd64-avx512.S \ @@ -373,3 +373,22 @@ rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile `echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` + +if ENABLE_RISCV_VECTOR_CRYPTO_INTRINSICS_EXTRA_CFLAGS +# Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) +riscv_vector_crypto_cflags = -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align +else +riscv_vector_crypto_cflags = +endif + +sha256-riscv-zvknha-zvkb.o: 
$(srcdir)/sha256-riscv-zvknha-zvkb.c Makefile + `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + +sha256-riscv-zvknha-zvkb.lo: $(srcdir)/sha256-riscv-zvknha-zvkb.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + +sha512-riscv-zvknhb-zvkb.o: $(srcdir)/sha512-riscv-zvknhb-zvkb.c Makefile + `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + +sha512-riscv-zvknhb-zvkb.lo: $(srcdir)/sha512-riscv-zvknhb-zvkb.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/sha256-riscv-zvknha-zvkb.c b/cipher/sha256-riscv-zvknha-zvkb.c new file mode 100644 index 00000000..6375f9aa --- /dev/null +++ b/cipher/sha256-riscv-zvknha-zvkb.c @@ -0,0 +1,197 @@ +/* sha256-riscv-zvknha-zvkb.c - RISC-V vector crypto implementation of SHA-256 + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) + +#include "g10lib.h" +#include "simd-common-riscv.h" +#include + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +working_vsha2cl_vv_u32m1(vuint32m1_t hgcd, vuint32m1_t feba, + vuint32m1_t kw, size_t vl) +{ +#ifdef HAVE_BROKEN_VSHA2CL_INTRINSIC + asm ( + "vsetvli zero,%3,e32,m1,ta,ma;\n\t" + "vsha2cl.vv %0,%1,%2;\n\t" + : "+vr" (hgcd) + : "vr" (feba), "vr" (kw), "r" (vl) + : "vl", "vtype" + ); + return hgcd; +#else + return __riscv_vsha2cl_vv_u32m1(hgcd, feba, kw, vl); +#endif +} + + +/* Quad-round with message expansion (rounds 0-47) */ +#define QUAD_ROUND_W_SCHED(w0, w1, w2, w3) \ + v_k = __riscv_vle32_v_u32m1(k, vl); \ + k += 4; \ + v_kw = __riscv_vadd_vv_u32m1(v_k, w0, vl); \ + v_hgcd_work = working_vsha2cl_vv_u32m1(v_hgcd_work, v_feba_work, v_kw, vl); \ + v_feba_work = __riscv_vsha2ch_vv_u32m1(v_feba_work, v_hgcd_work, v_kw, vl); \ + v_w_merged = __riscv_vmerge_vvm_u32m1(w2, w1, merge_mask, vl); \ + w0 = __riscv_vsha2ms_vv_u32m1(w0, v_w_merged, w3, vl); + +/* Quad-round without message expansion (rounds 48-63) */ +#define QUAD_ROUND_NO_SCHED(w0) \ + v_k = __riscv_vle32_v_u32m1(k, vl); \ + k += 4; \ + v_kw = __riscv_vadd_vv_u32m1(v_k, w0, vl); \ + v_hgcd_work = working_vsha2cl_vv_u32m1(v_hgcd_work, v_feba_work, v_kw, vl); \ + v_feba_work = __riscv_vsha2ch_vv_u32m1(v_feba_work, v_hgcd_work, v_kw, vl); + + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +load_and_swap (const byte * p, size_t 
vl, size_t vl_bytes) +{ + vuint8m1_t temp_bytes = __riscv_vle8_v_u8m1(p, vl_bytes); + return __riscv_vrev8_v_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(temp_bytes), + vl); +} + + +static ASM_FUNC_ATTR_INLINE void +sha256_transform_zvknha_zvkb (u32 state[8], const uint8_t * data, + size_t nblocks) +{ + static const u32 k_const[64] = + { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + static const u32 feba_hgcd_indices[4] = { 20, 16, 4, 0 }; + static const int feba_offset = 0; + static const int hgcd_offset = 8 / sizeof(u32); + size_t vl; + size_t vl_bytes; + vuint32m1_t idx; + vuint32m1_t v_feba_work, v_feba; + vuint32m1_t v_hgcd_work, v_hgcd; + vuint32m1_t w0, w1, w2, w3; + vuint32m1_t v_k, v_kw, v_w_merged; + vbool32_t merge_mask; + vuint32m1_t v_feba_hgcd_idx; + + vl = 4; + vl_bytes = vl * 4; + idx = __riscv_vid_v_u32m1(vl); + merge_mask = __riscv_vmseq_vx_u32m1_b32(idx, 0, vl); + + v_feba_hgcd_idx = __riscv_vle32_v_u32m1(feba_hgcd_indices, vl); + + v_feba = __riscv_vluxei32_v_u32m1(state + feba_offset, v_feba_hgcd_idx, vl); + v_hgcd = __riscv_vluxei32_v_u32m1(state + hgcd_offset, v_feba_hgcd_idx, vl); + + while (nblocks > 0) + { + const u32 *k = k_const; + + v_feba_work = v_feba; + v_hgcd_work = v_hgcd; + + w0 = load_and_swap(data + 0, vl, vl_bytes); + w1 = load_and_swap(data + 16, vl, vl_bytes); + w2 = load_and_swap(data + 32, vl, vl_bytes); + w3 = load_and_swap(data + 48, vl, vl_bytes); + + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + QUAD_ROUND_NO_SCHED(w0); + QUAD_ROUND_NO_SCHED(w1); + QUAD_ROUND_NO_SCHED(w2); + QUAD_ROUND_NO_SCHED(w3); + + v_feba = __riscv_vadd_vv_u32m1(v_feba, v_feba_work, vl); + v_hgcd = __riscv_vadd_vv_u32m1(v_hgcd, v_hgcd_work, vl); + + data += 64; + nblocks--; + } + + __riscv_vsuxei32_v_u32m1(state + feba_offset, v_feba_hgcd_idx, v_feba, vl); + __riscv_vsuxei32_v_u32m1(state + hgcd_offset, v_feba_hgcd_idx, v_hgcd, vl); + + clear_vec_regs(); +} + + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2 +_gcry_sha256_transform_riscv_zvknha_zvkb(u32 state[8], + const unsigned char *input_data, + size_t num_blks) +{ + sha256_transform_zvknha_zvkb(state, input_data, num_blks); + return 0; +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2 +_gcry_sha256_riscv_v_check_hw(void) +{ + return 
(__riscv_vsetvl_e32m1(4) == 4); +} + +#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */ diff --git a/cipher/sha256.c b/cipher/sha256.c index 24cab566..27d4b1d4 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -110,6 +110,15 @@ # endif #endif +/* USE_RISCV_V_CRYPTO indicates whether to enable RISC-V vector cryptography + * extension code. */ +#undef USE_RISCV_V_CRYPTO +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) +# define USE_RISCV_V_CRYPTO 1 +#endif + /* USE_S390X_CRYPTO indicates whether to enable zSeries code. */ #undef USE_S390X_CRYPTO #if defined(HAVE_GCC_INLINE_ASM_S390X) @@ -242,6 +251,23 @@ do_sha256_transform_ppc9(void *ctx, const unsigned char *data, size_t nblks) } #endif +#ifdef USE_RISCV_V_CRYPTO +unsigned int _gcry_sha256_riscv_v_check_hw(void); + +unsigned int +_gcry_sha256_transform_riscv_zvknha_zvkb(u32 state[8], + const unsigned char *input_data, + size_t num_blks); + +static unsigned int +do_sha256_transform_riscv_zvknha(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA256_CONTEXT *hd = ctx; + return _gcry_sha256_transform_riscv_zvknha_zvkb (hd->h, data, nblks); +} +#endif + #ifdef USE_S390X_CRYPTO #include "asm-inline-s390x.h" @@ -324,6 +350,14 @@ sha256_common_init (SHA256_CONTEXT *hd) if ((features & HWF_PPC_VCRYPTO) != 0 && (features & HWF_PPC_ARCH_3_00) != 0) hd->bctx.bwrite = do_sha256_transform_ppc9; #endif +#ifdef USE_RISCV_V_CRYPTO + if ((features & HWF_RISCV_IMAFDC) + && (features & HWF_RISCV_V) + && (features & HWF_RISCV_ZVKB) + && ((features & HWF_RISCV_ZVKNHA) || (features & HWF_RISCV_ZVKNHB)) + && _gcry_sha256_riscv_v_check_hw()) + hd->bctx.bwrite = do_sha256_transform_riscv_zvknha; +#endif #ifdef USE_S390X_CRYPTO hd->use_s390x_crypto = 0; if ((features & HWF_S390X_MSA) != 0) diff --git a/cipher/sha512-riscv-zvknhb-zvkb.c b/cipher/sha512-riscv-zvknhb-zvkb.c new file mode 100644 index 00000000..5f5d483a --- /dev/null +++ b/cipher/sha512-riscv-zvknhb-zvkb.c @@ -0,0 +1,190 @@ +/* sha512-riscv-zvknhb-zvkb.c - RISC-V vector crypto implementation of SHA-512 + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) && \ + defined(USE_SHA512) + +#include "g10lib.h" +#include "simd-common-riscv.h" +#include + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + + +static ASM_FUNC_ATTR_INLINE vuint64m2_t +working_vsha2cl_vv_u64m2(vuint64m2_t hgcd, vuint64m2_t feba, + vuint64m2_t kw, size_t vl) +{ +#ifdef HAVE_BROKEN_VSHA2CL_INTRINSIC + asm ( + "vsetvli zero,%3,e64,m2,ta,ma;\n\t" + "vsha2cl.vv %0,%1,%2;\n\t" + : "+vr" (hgcd) + : "vr" (feba), "vr" (kw), "r" (vl) + : "vl", "vtype" + ); + return hgcd; +#else + return __riscv_vsha2cl_vv_u64m2(hgcd, feba, kw, vl); +#endif +} + + +/* Quad-round with message expansion (rounds 0-63) */ +#define QUAD_ROUND_W_SCHED(w0, w1, w2, w3) \ + k_tmp = k; \ + asm ("" : "+r" (k_tmp) :: "memory"); \ + v_k = __riscv_vle64_v_u64m2(k_tmp, vl); \ + k += 4; \ + v_kw = __riscv_vadd_vv_u64m2(v_k, w0, vl); \ + v_hgcd_work = working_vsha2cl_vv_u64m2(v_hgcd_work, v_feba_work, v_kw, vl); \ + v_feba_work = __riscv_vsha2ch_vv_u64m2(v_feba_work, v_hgcd_work, v_kw, vl); \ + v_w_merged = __riscv_vmerge_vvm_u64m2(w2, w1, merge_mask, vl); \ + w0 = __riscv_vsha2ms_vv_u64m2(w0, v_w_merged, w3, vl); + +/* Quad-round without message expansion (rounds 64-79) */ +#define QUAD_ROUND_NO_SCHED(w0) \ + k_tmp = k; \ + asm ("" : "+r" (k_tmp) :: "memory"); \ + v_k = __riscv_vle64_v_u64m2(k_tmp, vl); \ + k += 4; \ + v_kw = __riscv_vadd_vv_u64m2(v_k, w0, vl); \ + v_hgcd_work = working_vsha2cl_vv_u64m2(v_hgcd_work, v_feba_work, v_kw, vl); \ + v_feba_work = __riscv_vsha2ch_vv_u64m2(v_feba_work, v_hgcd_work, v_kw, vl); + + +static ASM_FUNC_ATTR_INLINE vuint64m2_t +load_and_swap(const byte *p, size_t vl, size_t vl_bytes) +{ + vuint8m2_t temp_bytes = __riscv_vle8_v_u8m2(p, vl_bytes); + return __riscv_vrev8_v_u64m2(__riscv_vreinterpret_v_u8m2_u64m2(temp_bytes), + vl); +} + + +static ASM_FUNC_ATTR_INLINE void +sha512_transform_zvknhb_zvkb (u64 state[8], const byte *data, + size_t nblocks, const u64 k_const[80]) +{ + static const u64 feba_hgcd_indices[4] = { 40, 32, 8, 0 }; + static const int feba_offset = 0; + static const int hgcd_offset = 16 / sizeof(u64); + size_t vl; + size_t vl_bytes; + vuint64m2_t idx; + vuint64m2_t v_feba_work, v_feba; + vuint64m2_t v_hgcd_work, v_hgcd; + vuint64m2_t w0, w1, w2, w3; + vuint64m2_t v_k, v_kw, v_w_merged; + vbool32_t merge_mask; + vuint64m2_t v_feba_hgcd_idx; + + vl = 4; + vl_bytes = vl * 8; + idx = __riscv_vid_v_u64m2(vl); + merge_mask = __riscv_vmseq_vx_u64m2_b32(idx, 0, vl); + + v_feba_hgcd_idx = __riscv_vle64_v_u64m2(feba_hgcd_indices, vl); + + v_feba = __riscv_vluxei64_v_u64m2(state + feba_offset, v_feba_hgcd_idx, vl); + v_hgcd = __riscv_vluxei64_v_u64m2(state + hgcd_offset, v_feba_hgcd_idx, vl); + + while (nblocks > 0) + { + const u64 *k = k_const; + const u64 *k_tmp; + + v_feba_work = v_feba; + v_hgcd_work = v_hgcd; + + w0 = load_and_swap(data + 0, vl, vl_bytes); + w1 = load_and_swap(data + 32, vl, vl_bytes); + w2 = load_and_swap(data + 64, vl, vl_bytes); + w3 = load_and_swap(data + 96, vl, vl_bytes); + + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + 
QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + QUAD_ROUND_W_SCHED(w0, w1, w2, w3); + QUAD_ROUND_W_SCHED(w1, w2, w3, w0); + QUAD_ROUND_W_SCHED(w2, w3, w0, w1); + QUAD_ROUND_W_SCHED(w3, w0, w1, w2); + + QUAD_ROUND_NO_SCHED(w0); + QUAD_ROUND_NO_SCHED(w1); + QUAD_ROUND_NO_SCHED(w2); + QUAD_ROUND_NO_SCHED(w3); + + v_feba = __riscv_vadd_vv_u64m2(v_feba, v_feba_work, vl); + v_hgcd = __riscv_vadd_vv_u64m2(v_hgcd, v_hgcd_work, vl); + + data += 128; + nblocks--; + } + + __riscv_vsuxei64_v_u64m2(state + feba_offset, v_feba_hgcd_idx, v_feba, vl); + __riscv_vsuxei64_v_u64m2(state + hgcd_offset, v_feba_hgcd_idx, v_hgcd, vl); + + clear_vec_regs(); +} + + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2 +_gcry_sha512_transform_riscv_zvknhb_zvkb(u64 state[8], + const unsigned char *input_data, + size_t num_blks, + const u64 k[80]) +{ + sha512_transform_zvknhb_zvkb(state, input_data, num_blks, k); + return 0; +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2 +_gcry_sha512_riscv_v_check_hw(void) +{ + return (__riscv_vsetvl_e64m2(4) == 4); +} + +#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */ diff --git a/cipher/sha512.c b/cipher/sha512.c index bf3f3ff2..a0c0bf1c 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -154,6 +154,16 @@ #endif +/* USE_RISCV_V_CRYPTO indicates whether to enable RISC-V vector cryptography + * extension code. */ +#undef USE_RISCV_V_CRYPTO +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) +# define USE_RISCV_V_CRYPTO 1 +#endif + + /* USE_S390X_CRYPTO indicates whether to enable zSeries code. 
*/ #undef USE_S390X_CRYPTO #if defined(HAVE_GCC_INLINE_ASM_S390X) @@ -392,6 +402,25 @@ do_sha512_transform_ppc9(void *ctx, const unsigned char *data, size_t nblks) #endif +#ifdef USE_RISCV_V_CRYPTO +unsigned int _gcry_sha512_riscv_v_check_hw(void); + +unsigned int +_gcry_sha512_transform_riscv_zvknhb_zvkb(u64 state[8], + const unsigned char *input_data, + size_t num_blks, + const u64 k[80]); + +static unsigned int +do_sha512_transform_riscv_zvknhb(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_riscv_zvknhb_zvkb (hd->state.h, data, nblks, k); +} +#endif + + #ifdef USE_S390X_CRYPTO #include "asm-inline-s390x.h" @@ -479,6 +508,14 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) if ((features & HWF_INTEL_SSSE3) != 0) ctx->bctx.bwrite = do_sha512_transform_i386_ssse3; #endif +#ifdef USE_RISCV_V_CRYPTO + if ((features & HWF_RISCV_IMAFDC) + && (features & HWF_RISCV_V) + && (features & HWF_RISCV_ZVKB) + && (features & HWF_RISCV_ZVKNHB) + && _gcry_sha512_riscv_v_check_hw()) + ctx->bctx.bwrite = do_sha512_transform_riscv_zvknhb; +#endif #ifdef USE_S390X_CRYPTO ctx->use_s390x_crypto = 0; if ((features & HWF_S390X_MSA) != 0) diff --git a/configure.ac b/configure.ac index 3ce405e9..63bdfbe7 100644 --- a/configure.ac +++ b/configure.ac @@ -2669,6 +2669,46 @@ fi # # Check whether compiler supports RISC-V vector intrinsics # +m4_define([GCRY_RISCV_VECTOR_INTRINSICS_TEST], + [AC_LANG_SOURCE( + [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000) + #error __riscv_v_intrinsic not defined or too old version + #endif + #include + typedef vuint8m1_t __m128i; + #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a)) + #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) + #define paddq128(a, o) (o = cast_u64_to_m128i( \ + __riscv_vadd_vv_u64m1( \ + cast_m128i_to_u64(o), \ + cast_m128i_to_u64(a), 2))) + #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16)) + #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") + #define clear_vec_reg_v0() \ + __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \ + "vmv.v.x v0, zero;\n" \ + ::: "memory", "vtype", "vl", "v0") + static inline __attribute__((always_inline)) __m128i + fn2(__m128i a) + { + paddq128(a, a); + return a; + } + __m128i fn(__m128i in) + { + __m128i x; + memory_barrier_with_vec(in); + x = fn2(in); + memory_barrier_with_vec(x); + pshufb128(in, x); + memory_barrier_with_vec(in); + clear_vec_reg_v0(); + return in; + } + ]] + )] +) + AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics], [gcry_cv_cc_riscv_vector_intrinsics], [if test "$mpi_cpu_arch" != "riscv64" || @@ -2676,43 +2716,9 @@ AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics], gcry_cv_cc_riscv_vector_intrinsics="n/a" else gcry_cv_cc_riscv_vector_intrinsics=no - AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000) - #error __riscv_v_intrinsic not defined or too old version - #endif - #include - typedef vuint8m1_t __m128i; - #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a)) - #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) - #define paddq128(a, o) (o = cast_u64_to_m128i( \ - __riscv_vadd_vv_u64m1( \ - cast_m128i_to_u64(o), \ - cast_m128i_to_u64(a), 2))) - #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16)) - #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") - #define 
clear_vec_reg_v0() \ - __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \ - "vmv.v.x v0, zero;\n" \ - ::: "memory", "vtype", "vl", "v0") - static inline __attribute__((always_inline)) __m128i - fn2(__m128i a) - { - paddq128(a, a); - return a; - } - __m128i fn(__m128i in) - { - __m128i x; - memory_barrier_with_vec(in); - x = fn2(in); - memory_barrier_with_vec(x); - pshufb128(in, x); - memory_barrier_with_vec(in); - clear_vec_reg_v0(); - return in; - } - ]])], - [gcry_cv_cc_riscv_vector_intrinsics=yes]) + AC_COMPILE_IFELSE( + [GCRY_RISCV_VECTOR_INTRINSICS_TEST], + [gcry_cv_cc_riscv_vector_intrinsics=yes]) fi]) if test "$gcry_cv_cc_riscv_vector_intrinsics" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1, @@ -2720,6 +2726,7 @@ if test "$gcry_cv_cc_riscv_vector_intrinsics" = "yes" ; then fi _gcc_cflags_save=$CFLAGS +# Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) CFLAGS="$CFLAGS -O2 -march=rv64imafdcv -mstrict-align" if test "$gcry_cv_cc_riscv_vector_intrinsics" = "no" && @@ -2727,44 +2734,12 @@ if test "$gcry_cv_cc_riscv_vector_intrinsics" = "no" && test "$try_asm_modules" = "yes" ; then AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics with extra GCC flags], [gcry_cv_cc_riscv_vector_intrinsics_cflags], - [gcry_cv_cc_riscv_vector_intrinsics_cflags=no - AC_COMPILE_IFELSE([AC_LANG_SOURCE( - [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000) - #error __riscv_v_intrinsic not defined or too old version - #endif - #include - typedef vuint8m1_t __m128i; - #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a)) - #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a)) - #define paddq128(a, o) (o = cast_u64_to_m128i( \ - __riscv_vadd_vv_u64m1( \ - cast_m128i_to_u64(o), \ - cast_m128i_to_u64(a), 2))) - #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16)) - #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") - #define clear_vec_reg_v0() \ - __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \ - "vmv.v.x v0, zero;\n" \ - ::: "memory", "vl", "v0") - static inline __attribute__((always_inline)) __m128i - fn2(__m128i a) - { - paddq128(a, a); - return a; - } - __m128i fn(__m128i in) - { - __m128i x; - memory_barrier_with_vec(in); - x = fn2(in); - memory_barrier_with_vec(x); - pshufb128(in, x); - memory_barrier_with_vec(in); - clear_vec_reg_v0(); - return in; - } - ]])], - [gcry_cv_cc_riscv_vector_intrinsics_cflags=yes])]) + [ + gcry_cv_cc_riscv_vector_intrinsics_cflags=no + AC_COMPILE_IFELSE( + [GCRY_RISCV_VECTOR_INTRINSICS_TEST], + [gcry_cv_cc_riscv_vector_intrinsics_cflags=yes]) + ]) if test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1, [Defined if underlying compiler supports RISC-V vector intrinsics]) @@ -2780,6 +2755,145 @@ AM_CONDITIONAL(ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS, CFLAGS=$_gcc_cflags_save; +# +# Check whether compiler supports RISC-V vector cryptography intrinsics +# +m4_define([GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST], + [AC_LANG_SOURCE( + [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000) + #error __riscv_v_intrinsic not defined or too old version + #endif + #include + void test_sha2(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + vuint32m1_t w0 = __riscv_vle32_v_u32m1(ptr + 2 * vl, vl); + vuint32m1_t w1 = 
__riscv_vle32_v_u32m1(ptr + 3 * vl, vl); + vuint32m1_t w2 = __riscv_vle32_v_u32m1(ptr + 4 * vl, vl); + vuint32m1_t w3 = __riscv_vle32_v_u32m1(ptr + 5 * vl, vl); + vuint32m1_t m; + vuint32m1_t idx = __riscv_vid_v_u32m1 (vl); + vbool32_t merge_mask = __riscv_vmseq_vx_u32m1_b32 (idx, 0, vl); + a = __riscv_vsha2cl_vv_u32m1(a, b, w0, vl); + b = __riscv_vsha2ch_vv_u32m1(a, b, w0, vl); + m = __riscv_vmerge_vvm_u32m1(w2, w1, merge_mask, vl); + w0 = __riscv_vsha2ms_vv_u32m1(w0, m, w3, vl); + __riscv_vse32_v_u32m1(ptr + 0 * vl, a, vl); + __riscv_vse32_v_u32m1(ptr + 1 * vl, b, vl); + __riscv_vse32_v_u32m1(ptr + 2 * vl, w0, vl); + __riscv_vse32_v_u32m1(ptr + 3 * vl, w1, vl); + __riscv_vse32_v_u32m1(ptr + 4 * vl, w2, vl); + __riscv_vse32_v_u32m1(ptr + 5 * vl, w3, vl); + } + void test_inline_vec_asm(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + asm ( + "vsetvli zero,%1,e32,m1,ta,ma;\n\t" + "vsha2ms.vv %0,%2,%2;\n\t" + : "+vr" (a) + : "r" (vl), "vr" (b) + : "vl", "vtype" + ); + __riscv_vse32_v_u32m1(ptr + 0 * vl, a, vl); + } + ]] + )] +) + +AC_CACHE_CHECK([whether compiler supports RISC-V vector cryptography intrinsics], + [gcry_cv_cc_riscv_vector_crypto_intrinsics], + [if test "$mpi_cpu_arch" != "riscv64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_cc_riscv_vector_crypto_intrinsics="n/a" + else + gcry_cv_cc_riscv_vector_crypto_intrinsics=no + AC_COMPILE_IFELSE( + [GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST], + [gcry_cv_cc_riscv_vector_crypto_intrinsics=yes]) + fi]) +if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS,1, + [Defined if underlying compiler supports RISC-V vector cryptography intrinsics]) +fi + +_gcc_cflags_save=$CFLAGS +# Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) +CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" + +if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "no" && + test "$mpi_cpu_arch" = "riscv64" && + test "$try_asm_modules" = "yes" ; then + AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics with extra GCC flags], + [gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags], + [gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags=no + AC_COMPILE_IFELSE( + [GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST], + [gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags=yes])]) + if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes" ; then + AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS,1, + [Defined if underlying compiler supports RISC-V vector cryptography intrinsics]) + AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS_WITH_CFLAGS,1, + [Defined if underlying compiler supports RISC-V vector cryptography intrinsics with extra GCC flags]) + fi +fi + +AM_CONDITIONAL(ENABLE_RISCV_VECTOR_CRYPTO_INTRINSICS_EXTRA_CFLAGS, + test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes") + +# Restore flags. +CFLAGS=$_gcc_cflags_save; + + +# +# Check whether compiler has working RISC-V vector __riscv_vsha2cl intrinsics +# +# LLVM has broken __riscv_vsha2cl_* intrinsics where they emit 'vsha2ch.vv' +# instructions instead of expected 'vsha2cl.vv': +# https://github.com/llvm/llvm-project/issues/151814 +# +if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" || + test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then + + # Setup flags for test if needed. 
+ _gcc_cflags_save=$CFLAGS + if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then + CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" + fi + + AC_CACHE_CHECK([whether compiler has working RISC-V __riscv_vsha2cl intrinsics], + [gcry_cv_riscv_vsha2cl_intrinsics_work], + [gcry_cv_riscv_vsha2cl_intrinsics_work=no + cat > conftest.c < +vuint32m1_t test_fn(vuint32m1_t a, vuint32m1_t b, vuint32m1_t c, int vl) +{ + return __riscv_vsha2cl_vv_u32m1(a, b, c, vl); +} +EOF + + if $CC $CFLAGS -S conftest.c -o conftest.s >&5 2>&5; then + if grep 'vsha2cl' conftest.s >/dev/null 2>&1; then + gcry_cv_riscv_vsha2cl_intrinsics_work=yes + fi + fi + rm -f conftest.* + ]) + + if test "$gcry_cv_riscv_vsha2cl_intrinsics_work" = "no"; then + AC_DEFINE([HAVE_BROKEN_VSHA2CL_INTRINSIC], [1], + [Define to 1 if __riscv_vsha2cl intrinsics are broken]) + fi + + # Restore flags. + CFLAGS=$_gcc_cflags_save; +fi + + ####################################### #### Checks for library functions. #### ####################################### @@ -3656,6 +3770,11 @@ if test "$found" = "1" ; then # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-ppc.lo" + ;; + riscv64-*-*) + # Build with the RISC-V vector cryptography implementation + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha256-riscv-zvknha-zvkb.lo" + ;; esac case "$mpi_cpu_arch" in @@ -3709,6 +3828,11 @@ if test "$found" = "1" ; then # Big-Endian. # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" + ;; + riscv64-*-*) + # Build with the RISC-V vector cryptography implementation + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-riscv-zvknhb-zvkb.lo" + ;; esac fi diff --git a/src/g10lib.h b/src/g10lib.h index 84ec4713..4fa91ba9 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -280,6 +280,9 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_RISCV_V (1 << 1) #define HWF_RISCV_ZBB (1 << 2) #define HWF_RISCV_ZBC (1 << 3) +#define HWF_RISCV_ZVKB (1 << 4) +#define HWF_RISCV_ZVKNHA (1 << 5) +#define HWF_RISCV_ZVKNHB (1 << 6) #endif diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c index 13ca4879..925284a1 100644 --- a/src/hwf-riscv.c +++ b/src/hwf-riscv.c @@ -190,6 +190,9 @@ detect_riscv_at_hwcap(void) #define HWF_RISCV_HWPROBE_EXT_ZBB (1U << 4) #define HWF_RISCV_HWPROBE_EXT_ZBS (1U << 5) #define HWF_RISCV_HWPROBE_EXT_ZBC (1U << 7) +#define HWF_RISCV_HWPROBE_EXT_ZVKB (1U << 19) +#define HWF_RISCV_HWPROBE_EXT_ZVKNHA (1U << 22) +#define HWF_RISCV_HWPROBE_EXT_ZVKNHB (1U << 23) #define HWF_RISCV_HWPROBE_EXT_ZICOND (U64_C(1) << 35) #define HWF_RISCV_HWPROBE_IMA_FDC (HWF_RISCV_HWPROBE_IMA_FD \ @@ -211,6 +214,9 @@ static const struct hwprobe_feature_map_s hwprobe_features[] = { HWF_RISCV_HWPROBE_IMA_V, HWF_RISCV_V }, { HWF_RISCV_HWPROBE_EXT_ZBB, HWF_RISCV_ZBB }, { HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC }, + { HWF_RISCV_HWPROBE_EXT_ZVKB, HWF_RISCV_ZVKB }, + { HWF_RISCV_HWPROBE_EXT_ZVKNHA, HWF_RISCV_ZVKNHA }, + { HWF_RISCV_HWPROBE_EXT_ZVKNHB, HWF_RISCV_ZVKNHB }, }; static int diff --git a/src/hwfeatures.c b/src/hwfeatures.c index afcaa00d..df2aaf17 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -96,6 +96,9 @@ static struct { HWF_RISCV_V, "riscv-v" }, { HWF_RISCV_ZBB, "riscv-zbb" }, { HWF_RISCV_ZBC, "riscv-zbc" }, + { HWF_RISCV_ZVKB, "riscv-zvkb" }, + { HWF_RISCV_ZVKNHA, "riscv-zvknha" }, + { HWF_RISCV_ZVKNHB, "riscv-zvknhb" }, #endif }; -- 2.48.1 From jussi.kivilinna at iki.fi Thu Aug 7 15:28:51 2025 From: 
jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 7 Aug 2025 16:28:51 +0300 Subject: [PATCH 2/6] rijndael-vp: clean-ups In-Reply-To: <20250807132855.878167-1-jussi.kivilinna@iki.fi> References: <20250807132855.878167-1-jussi.kivilinna@iki.fi> Message-ID: <20250807132855.878167-2-jussi.kivilinna@iki.fi> * cipher/rijndael-vp-riscv.c: Remove "./" from "cipher-internal.h" include. * cipher/rijndael-vp-simd128.h (aes_simd128_xts_enc, aes_simd128_xts_dec): Make these functions static. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-vp-riscv.c | 2 +- cipher/rijndael-vp-simd128.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cipher/rijndael-vp-riscv.c b/cipher/rijndael-vp-riscv.c index b8c6ed13..24ae3a7c 100644 --- a/cipher/rijndael-vp-riscv.c +++ b/cipher/rijndael-vp-riscv.c @@ -44,7 +44,7 @@ #include "cipher.h" #include "bufhelp.h" #include "rijndael-internal.h" -#include "./cipher-internal.h" +#include "cipher-internal.h" #ifdef USE_VP_RISCV diff --git a/cipher/rijndael-vp-simd128.h b/cipher/rijndael-vp-simd128.h index af8ee291..805faa6f 100644 --- a/cipher/rijndael-vp-simd128.h +++ b/cipher/rijndael-vp-simd128.h @@ -2814,7 +2814,7 @@ static ASM_FUNC_ATTR_INLINE __m128i xts_gfmul_byA (__m128i xmm5) return xmm5; } -ASM_FUNC_ATTR_NOINLINE void +static ASM_FUNC_ATTR_NOINLINE void aes_simd128_xts_enc (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { @@ -2914,7 +2914,7 @@ aes_simd128_xts_enc (void *context, unsigned char *tweak, void *outbuf_arg, clear_vec_regs(); } -ASM_FUNC_ATTR_NOINLINE void +static ASM_FUNC_ATTR_NOINLINE void aes_simd128_xts_dec (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks) { -- 2.48.1 From jussi.kivilinna at iki.fi Thu Aug 7 15:28:54 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 7 Aug 2025 16:28:54 +0300 Subject: [PATCH 5/6] riscv: always use -mstrict-align for vector intrinsic implementations In-Reply-To: <20250807132855.878167-1-jussi.kivilinna@iki.fi> References: <20250807132855.878167-1-jussi.kivilinna@iki.fi> Message-ID: <20250807132855.878167-5-jussi.kivilinna@iki.fi> * cipher/Makefile.am (riscv_vector_cflags, riscv_vector_crypto_cflags): Use "-mstrict-align" if SUPPORT_CC_RISCV_MSTRICT_ALIGN is set. * configure.ac (SUPPORT_CC_RISCV_MSTRICT_ALIGN): Add check for "-mstrict-align" compiler flag support. 
-- Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 12 ++++++++++-- configure.ac | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3375ea38..ea91b7b8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -355,12 +355,16 @@ serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile `echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) ` -if ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS # Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) +if ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS riscv_vector_cflags = -O2 -march=rv64imafdcv -mstrict-align else +if SUPPORT_CC_RISCV_MSTRICT_ALIGN +riscv_vector_cflags = -O2 -mstrict-align +else riscv_vector_cflags = endif +endif chacha20-riscv-v.o: $(srcdir)/chacha20-riscv-v.c Makefile `echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` @@ -374,12 +378,16 @@ rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile `echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) ` -if ENABLE_RISCV_VECTOR_CRYPTO_INTRINSICS_EXTRA_CFLAGS # Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) +if ENABLE_RISCV_VECTOR_CRYPTO_INTRINSICS_EXTRA_CFLAGS riscv_vector_crypto_cflags = -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align else +if SUPPORT_CC_RISCV_MSTRICT_ALIGN +riscv_vector_crypto_cflags = -O2 -mstrict-align +else riscv_vector_crypto_cflags = endif +endif sha256-riscv-zvknha-zvkb.o: $(srcdir)/sha256-riscv-zvknha-zvkb.c Makefile `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/configure.ac b/configure.ac index 63bdfbe7..d45ea851 100644 --- a/configure.ac +++ b/configure.ac @@ -2666,6 +2666,31 @@ if test "$gcry_cv_gcc_inline_asm_riscv_v" = "yes" ; then fi +# +# Check whether compiler supports RISC-V -mstrict-align flag +# +_gcc_cflags_save=$CFLAGS +# Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) +CFLAGS="$CFLAGS -mstrict-align" + +AC_CACHE_CHECK([whether compiler supports RISC-V -mstrict-align flag], + [gcry_cv_cc_riscv_mstrict_align], + [if test "$mpi_cpu_arch" != "riscv64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_cc_riscv_mstrict_align="n/a" + else + gcry_cv_cc_riscv_mstrict_align=no + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE([[void testfn(void) { }]])], + [gcry_cv_cc_riscv_mstrict_align=yes]) + fi]) +AM_CONDITIONAL(SUPPORT_CC_RISCV_MSTRICT_ALIGN, + test "$gcry_cv_cc_riscv_mstrict_align" = "yes") + +# Restore flags. +CFLAGS=$_gcc_cflags_save; + + # # Check whether compiler supports RISC-V vector intrinsics # -- 2.48.1 From jussi.kivilinna at iki.fi Thu Aug 7 15:28:52 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 7 Aug 2025 16:28:52 +0300 Subject: [PATCH 3/6] bithelp: fix __riscv_zbb check for _gcry_ctz_no_zero In-Reply-To: <20250807132855.878167-1-jussi.kivilinna@iki.fi> References: <20250807132855.878167-1-jussi.kivilinna@iki.fi> Message-ID: <20250807132855.878167-3-jussi.kivilinna@iki.fi> * cipher/bithelp.h (_gcry_ctz_no_zero): Fix __riscv_zbb version check. 
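
A note on the magic numbers in this check, since they are easy to misread: the
RISC-V C API defines each extension test macro to the extension's version,
encoded as major * 1000000 + minor * 1000, so F 2.2 reports 2002000 while
Zbb 1.0 reports 1000000. With the old bound of 2002000 a Zbb 1.0 toolchain was
treated as "no Zbb" and the float-cast workaround kept being used even when the
ctz/ctzw instructions were available; comparing against 1000000 fixes that.
The following standalone sketch only illustrates the corrected convention; the
function name is invented for the example and this is not code from the patch:

#include <limits.h>

/* Illustration only: prefer the builtin when the compiler advertises
 * Zbb >= 1.0 (encoded as 1000000), otherwise use a portable fallback. */
static unsigned int
ctz_example (unsigned int x)
{
#if defined(__riscv_zbb) && __riscv_zbb >= 1000000
  /* Zbb present: __builtin_ctz() maps to a single ctz/ctzw instruction. */
  return __builtin_ctz (x);
#else
  /* No Zbb reported: count trailing zero bits by shifting. */
  unsigned int n = 0;
  while (n < sizeof (x) * CHAR_BIT && (x & 1) == 0)
    {
      x >>= 1;
      n++;
    }
  return n;
#endif
}
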
-- Signed-off-by: Jussi Kivilinna --- cipher/bithelp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/bithelp.h b/cipher/bithelp.h index a4faf345..50731650 100644 --- a/cipher/bithelp.h +++ b/cipher/bithelp.h @@ -84,7 +84,7 @@ _gcry_ctz_no_zero (unsigned int x) { #if defined(__riscv) && \ (defined(__riscv_f) && __riscv_f >= 2002000) && \ - (!defined(__riscv_zbb) || __riscv_zbb < 2002000) && \ + (!defined(__riscv_zbb) || __riscv_zbb < 1000000) && \ defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS) /* Use float cast approach when building for RISC-V without Zbb extension. * Without Zbb, GCC gives us slower generic version for __builtin_ctz(). -- 2.48.1 From jussi.kivilinna at iki.fi Thu Aug 7 15:28:55 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 7 Aug 2025 16:28:55 +0300 Subject: [PATCH 6/6] Add RISC-V vector cryptography implementation of AES In-Reply-To: <20250807132855.878167-1-jussi.kivilinna@iki.fi> References: <20250807132855.878167-1-jussi.kivilinna@iki.fi> Message-ID: <20250807132855.878167-6-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'rijndael-riscv-zvkned.c'. * cipher/rijndael-internal.h (USE_RISCV_V_CRYPTO): New. * cipher/rijndael-riscv-zvkned.c: New. * cipher/rijndael.c [USE_RISCV_V_CRYPTO] (_gcry_aes_riscv_zvkned_setup_acceleration, _gcry_aes_riscv_zvkned_setkey) (_gcry_aes_riscv_zvkned_prepare_decryption) (_gcry_aes_riscv_zvkned_encrypt, _gcry_aes_riscv_zvkned_decrypt) (_gcry_aes_riscv_zvkned_cfb_enc, _gcry_aes_riscv_zvkned_cbc_enc) (_gcry_aes_riscv_zvkned_ctr_enc, _gcry_aes_riscv_zvkned_ctr32le_enc) (_gcry_aes_riscv_zvkned_cfb_dec, _gcry_aes_riscv_zvkned_cbc_dec) (_gcry_aes_riscv_zvkned_ocb_crypt, _gcry_aes_riscv_zvkned_ocb_auth) (_gcry_aes_riscv_zvkned_ecb_crypt, _gcry_aes_riscv_zvkned_xts_crypt): New. (do_setkey) [USE_RISCV_V_CRYPTO]: Add setup for RISC-V vector cryptography extension implementation. * configure.ac: Add 'rijndael-riscv-zvkned.lo'. (GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST): Add AES intrinsics. (gcry_cv_riscv_vaes_vs_intrinsics_work, HAVE_BROKEN_VAES_VS_INTRINSIC): New. * src/g10lib.h (HWF_RISCV_ZVKNED): Insert before HWF_RISCV_ZVKNHA. * src/hwf-riscv.c (HWF_RISCV_HWPROBE_EXT_ZVKNED): New. (hwprobe_features): Add Zvkned. * src/hwfeatures.c (hwflist): Add "riscv-zvkned". -- Implementation has been tested against QEMU emulator as there is no actual HW available with these instructions yet. 
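
As a quick orientation for reviewers who have not used the Zvkned
vector-scalar instructions before: the implementation keeps the round keys in
LMUL=1 registers and applies them to one (m1) or four (m4) blocks at a time
with the .vs forms of vaesz/vaesem/vaesef (and vaesdm/vaesdf for decryption).
Condensed to a single AES-128 block, the pattern looks like the sketch below.
This is an illustration only, assuming VLEN >= 128 and a toolchain with the
Zvkned intrinsics enabled; the function name is made up and this is not code
taken from the patch:

#include <stddef.h>
#include <stdint.h>
#include <riscv_vector.h>

/* Encrypt one 16-byte block with an already expanded AES-128 key schedule
 * (11 round keys of four 32-bit words each).  Round key 0 is applied with
 * vaesz.vs (plain XOR), rounds 1..9 with vaesem.vs (full middle round) and
 * round 10 with vaesef.vs (final round without MixColumns). */
static void
aes128_encrypt_block_sketch (const uint32_t rk[11][4],
                             uint8_t out[16], const uint8_t in[16])
{
  size_t vl = 4;        /* four 32-bit elements == one 128-bit AES block */
  vuint32m1_t block =
    __riscv_vreinterpret_v_u8m1_u32m1 (__riscv_vle8_v_u8m1 (in, vl * 4));
  int r;

  block = __riscv_vaesz_vs_u32m1_u32m1 (block,
                                        __riscv_vle32_v_u32m1 (rk[0], vl), vl);
  for (r = 1; r < 10; r++)
    block = __riscv_vaesem_vs_u32m1_u32m1 (block,
                                           __riscv_vle32_v_u32m1 (rk[r], vl),
                                           vl);
  block = __riscv_vaesef_vs_u32m1_u32m1 (block,
                                         __riscv_vle32_v_u32m1 (rk[10], vl),
                                         vl);

  __riscv_vse8_v_u8m1 (out, __riscv_vreinterpret_v_u32m1_u8m1 (block), vl * 4);
}

The bulk paths below do the same thing with vuint32m4_t register groups so
that four blocks go through each vaes*.vs instruction, which is also why the
patch carries an inline-asm variant of AES_CRYPT for compilers whose vaes*.vs
intrinsics are flagged as broken by the new configure check.
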
Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 7 + cipher/rijndael-internal.h | 9 + cipher/rijndael-riscv-zvkned.c | 1608 ++++++++++++++++++++++++++++++++ cipher/rijndael.c | 90 +- configure.ac | 76 ++ src/g10lib.h | 5 +- src/hwf-riscv.c | 2 + src/hwfeatures.c | 1 + 8 files changed, 1793 insertions(+), 5 deletions(-) create mode 100644 cipher/rijndael-riscv-zvkned.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index ea91b7b8..7abbd5b3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -122,6 +122,7 @@ EXTRA_libcipher_la_SOURCES = \ rijndael-ppc.c rijndael-ppc9le.c \ rijndael-p10le.c rijndael-gcm-p10le.s \ rijndael-ppc-common.h rijndael-ppc-functions.h \ + rijndael-riscv-zvkned.c \ rijndael-s390x.c \ rijndael-vp-aarch64.c rijndael-vp-riscv.c \ rijndael-vp-simd128.h \ @@ -389,6 +390,12 @@ riscv_vector_crypto_cflags = endif endif +rijndael-riscv-zvkned.o: $(srcdir)/rijndael-riscv-zvkned.c Makefile + `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + +rijndael-riscv-zvkned.lo: $(srcdir)/rijndael-riscv-zvkned.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` + sha256-riscv-zvknha-zvkb.o: $(srcdir)/sha256-riscv-zvknha-zvkb.c Makefile `echo $(COMPILE) $(riscv_vector_crypto_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h index 92310fc5..15084a69 100644 --- a/cipher/rijndael-internal.h +++ b/cipher/rijndael-internal.h @@ -124,6 +124,15 @@ # endif #endif /* ENABLE_ARM_CRYPTO_SUPPORT */ +/* USE_RISCV_V_CRYPTO indicates whether to enable RISC-V vector cryptography + * extension code. */ +#undef USE_RISCV_V_CRYPTO +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) +# define USE_RISCV_V_CRYPTO 1 +#endif + /* USE_VP_AARCH64 indicates whether to enable vector permute AArch64 SIMD code. */ #undef USE_VP_AARCH64 #if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) diff --git a/cipher/rijndael-riscv-zvkned.c b/cipher/rijndael-riscv-zvkned.c new file mode 100644 index 00000000..e3ba6769 --- /dev/null +++ b/cipher/rijndael-riscv-zvkned.c @@ -0,0 +1,1608 @@ +/* rijndael-riscv-zvkned.c - RISC-V vector crypto implementation of AES + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) + +#include "g10lib.h" +#include "simd-common-riscv.h" +#include "rijndael-internal.h" +#include "cipher-internal.h" + +#include + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ALWAYS_INLINE ASM_FUNC_ATTR +#define ASM_FUNC_ATTR_NOINLINE NO_INLINE ASM_FUNC_ATTR + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + + +/* + * Helper macro and functions + */ + +#define cast_u8m1_u32m1(a) __riscv_vreinterpret_v_u8m1_u32m1(a) +#define cast_u8m1_u64m1(a) __riscv_vreinterpret_v_u8m1_u64m1(a) +#define cast_u32m1_u8m1(a) __riscv_vreinterpret_v_u32m1_u8m1(a) +#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a) +#define cast_u64m1_u8m1(a) __riscv_vreinterpret_v_u64m1_u8m1(a) + +#define cast_u8m2_u32m2(a) __riscv_vreinterpret_v_u8m2_u32m2(a) +#define cast_u32m2_u8m2(a) __riscv_vreinterpret_v_u32m2_u8m2(a) + +#define cast_u8m4_u32m4(a) __riscv_vreinterpret_v_u8m4_u32m4(a) +#define cast_u32m4_u8m4(a) __riscv_vreinterpret_v_u32m4_u8m4(a) + +#define cast_u64m1_u32m1(a) __riscv_vreinterpret_v_u64m1_u32m1(a) +#define cast_u32m1_u64m1(a) __riscv_vreinterpret_v_u32m1_u64m1(a) + +#define cast_u64m1_i64m1(a) __riscv_vreinterpret_v_u64m1_i64m1(a) +#define cast_i64m1_u64m1(a) __riscv_vreinterpret_v_i64m1_u64m1(a) + +#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory") + + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +bswap128_u32m1(vuint32m1_t vec, size_t vl_u32) +{ + static const byte bswap128_arr[16] = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + size_t vl_bytes = vl_u32 * 4; + vuint8m1_t bswap128 = __riscv_vle8_v_u8m1(bswap128_arr, vl_bytes); + + return cast_u8m1_u32m1( + __riscv_vrgather_vv_u8m1(cast_u32m1_u8m1(vec), bswap128, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +unaligned_load_u32m1(const void *ptr, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m1_u32m1(__riscv_vle8_v_u8m1(ptr, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE void +unaligned_store_u32m1(void *ptr, vuint32m1_t vec, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + __riscv_vse8_v_u8m1(ptr, cast_u32m1_u8m1(vec), vl_bytes); +} + +static ASM_FUNC_ATTR_INLINE vuint32m4_t +unaligned_load_u32m4(const void *ptr, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m4_u32m4(__riscv_vle8_v_u8m4(ptr, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE void +unaligned_store_u32m4(void *ptr, vuint32m4_t vec, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + __riscv_vse8_v_u8m4(ptr, cast_u32m4_u8m4(vec), vl_bytes); +} + +static vuint32m1_t +vxor_u8_u32m1(vuint32m1_t a, vuint32m1_t b, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m1_u32m1(__riscv_vxor_vv_u8m1(cast_u32m1_u8m1(a), + cast_u32m1_u8m1(b), vl_bytes)); +} + +static vuint32m4_t +vxor_u8_u32m4(vuint32m4_t a, vuint32m4_t b, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m4_u32m4(__riscv_vxor_vv_u8m4(cast_u32m4_u8m4(a), + cast_u32m4_u8m4(b), vl_bytes)); +} + + +/* + * HW support detection + */ + +int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_setup_acceleration(RIJNDAEL_context 
*ctx) +{ + (void)ctx; + return (__riscv_vsetvl_e32m1(4) == 4); +} + + +/* + * Key expansion + */ + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes128_riscv_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + size_t vl = 4; + + vuint32m1_t round_key = unaligned_load_u32m1 (key, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[0][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 1, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[1][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 2, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[2][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 3, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[3][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 4, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[4][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 5, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[5][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 6, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[6][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 7, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[7][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 8, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[8][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 9, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[9][0], round_key, vl); + + round_key = __riscv_vaeskf1_vi_u32m1 (round_key, 10, vl); + __riscv_vse32_v_u32m1 (&ctx->keyschenc32[10][0], round_key, vl); + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes192_riscv_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + size_t vl = 4; + u32 *w = &ctx->keyschenc32[0][0]; + u32 wr; + vuint32m1_t rk_0_7; + vuint32m1_t rk_4_11; + + rk_0_7 = unaligned_load_u32m1 (&key[0], vl); + rk_4_11 = unaligned_load_u32m1 (&key[8], vl); + __riscv_vse32_v_u32m1 (&w[0], rk_0_7, vl); + __riscv_vse32_v_u32m1 (&w[2], rk_4_11, vl); + +#define AES192_KF1_GEN(out, input, round192, vl) \ + ({ \ + u32 temp_array[4] = { 0, 0, 0, 0 }; \ + vuint32m1_t temp_vec; \ + temp_array[3] = (input); \ + temp_vec = __riscv_vle32_v_u32m1(temp_array, (vl)); \ + temp_vec = __riscv_vaeskf1_vi_u32m1(temp_vec, (round192), (vl)); \ + (out) = __riscv_vmv_x_s_u32m1_u32(temp_vec); \ + }) + +#define AES192_EXPAND_BLOCK(w, round192, wr, last) \ + ({ \ + (w)[(round192) * 6 + 0] = (w)[(round192) * 6 - 6] ^ (wr); \ + (w)[(round192) * 6 + 1] = (w)[(round192) * 6 - 5] ^ (w)[(round192) * 6 + 0]; \ + (w)[(round192) * 6 + 2] = (w)[(round192) * 6 - 4] ^ (w)[(round192) * 6 + 1]; \ + (w)[(round192) * 6 + 3] = (w)[(round192) * 6 - 3] ^ (w)[(round192) * 6 + 2]; \ + if (!(last)) \ + { \ + (w)[(round192) * 6 + 4] = (w)[(round192) * 6 - 2] ^ (w)[(round192) * 6 + 3]; \ + (w)[(round192) * 6 + 5] = (w)[(round192) * 6 - 1] ^ (w)[(round192) * 6 + 4]; \ + } \ + }) + + AES192_KF1_GEN(wr, w[5], 1, vl); + AES192_EXPAND_BLOCK(w, 1, wr, 0); + + AES192_KF1_GEN(wr, w[11], 2, vl); + AES192_EXPAND_BLOCK(w, 2, wr, 0); + + AES192_KF1_GEN(wr, w[17], 3, vl); + AES192_EXPAND_BLOCK(w, 3, wr, 0); + + AES192_KF1_GEN(wr, w[23], 4, vl); + AES192_EXPAND_BLOCK(w, 4, wr, 0); + + AES192_KF1_GEN(wr, w[29], 5, vl); + AES192_EXPAND_BLOCK(w, 5, wr, 0); + + AES192_KF1_GEN(wr, w[35], 6, vl); + AES192_EXPAND_BLOCK(w, 6, wr, 0); + + AES192_KF1_GEN(wr, w[41], 7, vl); + AES192_EXPAND_BLOCK(w, 7, wr, 0); + + AES192_KF1_GEN(wr, w[47], 8, vl); + AES192_EXPAND_BLOCK(w, 8, 
wr, 1); + +#undef AES192_KF1_GEN +#undef AES192_EXPAND_BLOCK + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes256_riscv_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + size_t vl = 4; + + vuint32m1_t rk_a = unaligned_load_u32m1 (&key[0], vl); + vuint32m1_t rk_b = unaligned_load_u32m1 (&key[16], vl); + + __riscv_vse32_v_u32m1(&ctx->keyschenc32[0][0], rk_a, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[1][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 2, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[2][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 3, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[3][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 4, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[4][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 5, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[5][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 6, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[6][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 7, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[7][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 8, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[8][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 9, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[9][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 10, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[10][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 11, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[11][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 12, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[12][0], rk_a, vl); + + rk_b = __riscv_vaeskf2_vi_u32m1(rk_b, rk_a, 13, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[13][0], rk_b, vl); + + rk_a = __riscv_vaeskf2_vi_u32m1(rk_a, rk_b, 14, vl); + __riscv_vse32_v_u32m1(&ctx->keyschenc32[14][0], rk_a, vl); + + clear_vec_regs(); +} + +void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + unsigned int rounds = ctx->rounds; + + if (rounds < 12) + { + aes128_riscv_setkey(ctx, key); + } + else if (rounds == 12) + { + aes192_riscv_setkey(ctx, key); + _gcry_burn_stack(64); + } + else + { + aes256_riscv_setkey(ctx, key); + } +} + +static ASM_FUNC_ATTR_INLINE void +do_prepare_decryption(RIJNDAEL_context *ctx) +{ + u32 *ekey = (u32 *)(void *)ctx->keyschenc; + u32 *dkey = (u32 *)(void *)ctx->keyschdec; + int rounds = ctx->rounds; + size_t vl = 4; + int rr; + int r; + + r = 0; + rr = rounds; + for (r = 0, rr = rounds; r <= rounds; r++, rr--) + { + __riscv_vse32_v_u32m1(dkey + r * 4, + __riscv_vle32_v_u32m1(ekey + rr * 4, vl), + vl); + } +} + +void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_prepare_decryption(RIJNDAEL_context *ctx) +{ + do_prepare_decryption(ctx); + clear_vec_regs(); +} + + +/* + * Encryption / Decryption + */ + +#define ROUND_KEY_VARIABLES \ + vuint32m1_t rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7, rk8; \ + vuint32m1_t rk9, rk10, rk11, rk12, rk13, rk_last; + +#define PRELOAD_ROUND_KEYS(rk, nrounds, vl) \ + do { \ + rk0 = __riscv_vle32_v_u32m1(rk + 0 * 4, vl); \ + rk1 = __riscv_vle32_v_u32m1(rk + 1 * 4, vl); \ + rk2 = __riscv_vle32_v_u32m1(rk + 2 * 4, vl); \ + rk3 = __riscv_vle32_v_u32m1(rk + 3 * 4, vl); \ + rk4 = __riscv_vle32_v_u32m1(rk + 4 * 4, vl); \ + rk5 = __riscv_vle32_v_u32m1(rk + 5 * 4, vl); \ + rk6 = __riscv_vle32_v_u32m1(rk + 6 * 4, vl); \ + rk7 = 
__riscv_vle32_v_u32m1(rk + 7 * 4, vl); \ + rk8 = __riscv_vle32_v_u32m1(rk + 8 * 4, vl); \ + rk9 = __riscv_vle32_v_u32m1(rk + 9 * 4, vl); \ + if (UNLIKELY(nrounds >= 12)) \ + { \ + rk10 = __riscv_vle32_v_u32m1(rk + 10 * 4, vl); \ + rk11 = __riscv_vle32_v_u32m1(rk + 11 * 4, vl); \ + if (LIKELY(nrounds > 12)) \ + { \ + rk12 = __riscv_vle32_v_u32m1(rk + 12 * 4, vl); \ + rk13 = __riscv_vle32_v_u32m1(rk + 13 * 4, vl); \ + } \ + else \ + { \ + rk12 = __riscv_vundefined_u32m1(); \ + rk13 = __riscv_vundefined_u32m1(); \ + } \ + } \ + else \ + { \ + rk10 = __riscv_vundefined_u32m1(); \ + rk11 = __riscv_vundefined_u32m1(); \ + rk12 = __riscv_vundefined_u32m1(); \ + rk13 = __riscv_vundefined_u32m1(); \ + } \ + rk_last = __riscv_vle32_v_u32m1(rk + nrounds * 4, vl); \ + } while (0) + +#ifdef HAVE_BROKEN_VAES_VS_INTRINSIC +#define AES_CRYPT(e_d, mx, nrounds, blk, vlen) \ + asm ( "vsetvli zero,%[vl],e32,"#mx",ta,ma;\n\t" \ + "vaesz.vs %[block],%[rk0];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk1];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk2];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk3];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk4];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk5];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk6];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk7];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk8];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk9];\n\t" \ + "blt %[rounds],%[num12],.Lcryptlast%=;\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk10];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk11];\n\t" \ + "beq %[rounds],%[num12],.Lcryptlast%=;\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk12];\n\t" \ + "vaes"#e_d"m.vs %[block],%[rk13];\n\t" \ + ".Lcryptlast%=:\n\t" \ + "vaes"#e_d"f.vs %[block],%[rk_last];\n\t" \ + : [block] "+vr" (blk) \ + : [vl] "r" (vlen), [rounds] "r" (nrounds), [num12] "r" (12), \ + [rk0] "vr" (rk0), [rk1] "vr" (rk1), [rk2] "vr" (rk2), \ + [rk3] "vr" (rk3), [rk4] "vr" (rk4), [rk5] "vr" (rk5), \ + [rk6] "vr" (rk6), [rk7] "vr" (rk7), [rk8] "vr" (rk8), \ + [rk9] "vr" (rk9), [rk10] "vr" (rk10), [rk11] "vr" (rk11), \ + [rk12] "vr" (rk12), [rk13] "vr" (rk13), \ + [rk_last] "vr" (rk_last) \ + : "vl") +#else +#define AES_CRYPT(e_d, mx, rounds, block, vl) \ + ({ \ + (block) = __riscv_vaesz_vs_u32m1_u32##mx((block), rk0, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk1, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk2, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk3, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk4, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk5, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk6, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk7, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk8, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk9, (vl)); \ + if (UNLIKELY((rounds) >= 12)) \ + { \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk10, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk11, (vl)); \ + if (LIKELY((rounds) > 12)) \ + { \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk12, (vl)); \ + (block) = __riscv_vaes##e_d##m_vs_u32m1_u32##mx((block), rk13, (vl)); \ + } \ + } \ + (block) = __riscv_vaes##e_d##f_vs_u32m1_u32##mx((block), rk_last, (vl)); \ + }) +#endif + +unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *out, + const unsigned char *in) +{ + const u32 *rk = ctx->keyschenc32[0]; + 
int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t block; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + block = unaligned_load_u32m1(in, vl); + + AES_CRYPT(e, m1, rounds, block, vl); + + unaligned_store_u32m1(out, block, vl); + + clear_vec_regs(); + + return 0; /* does not use stack */ +} + +unsigned int ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *out, + const unsigned char *in) +{ + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t block; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + block = unaligned_load_u32m1(in, vl); + + AES_CRYPT(d, m1, rounds, block, vl); + + unaligned_store_u32m1(out, block, vl); + + clear_vec_regs(); + + return 0; /* does not use stack */ +} + +static ASM_FUNC_ATTR_INLINE void +aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = encrypt ? ctx->keyschenc32[0] : ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + ROUND_KEY_VARIABLES; + + if (!encrypt && !ctx->decryption_prepared) + { + do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t blocks; + + blocks = unaligned_load_u32m4(inbuf, vl * 4); + + if (encrypt) + AES_CRYPT(e, m4, rounds, blocks, vl * 4); + else + AES_CRYPT(d, m4, rounds, blocks, vl * 4); + + unaligned_store_u32m4(outbuf, blocks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t block; + + block = unaligned_load_u32m1(inbuf, vl); + + if (encrypt) + AES_CRYPT(e, m1, rounds, block, vl); + else + AES_CRYPT(d, m1, rounds, block, vl); + + unaligned_store_u32m1(outbuf, block, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + clear_vec_regs(); +} + +static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +aes_riscv_zvkned_ecb_enc (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 1); +} + +static void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +aes_riscv_zvkned_ecb_dec (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + aes_riscv_zvkned_ecb_crypt (context, outbuf_arg, inbuf_arg, nblocks, 0); +} + +void ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt) +{ + if (encrypt) + aes_riscv_zvkned_ecb_enc (context, outbuf_arg, inbuf_arg, nblocks); + else + aes_riscv_zvkned_ecb_dec (context, outbuf_arg, inbuf_arg, nblocks); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks; nblocks--) + { + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + + AES_CRYPT(e, m1, rounds, iv, vl); 
+ + data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(iv), data, vl_bytes); + __riscv_vse8_v_u8m1(outbuf, data, vl_bytes); + iv = cast_u8m1_u32m1(data); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + size_t outbuf_add = (!cbc_mac) * BLOCKSIZE; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks; nblocks--) + { + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + iv = cast_u8m1_u32m1( + __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes)); + + AES_CRYPT(e, m1, rounds, iv, vl); + + __riscv_vse8_v_u8m1(outbuf, cast_u32m1_u8m1(iv), vl_bytes); + + inbuf += BLOCKSIZE; + outbuf += outbuf_add; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + static const byte add_u8_array[4][16] = + { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 } + }; + static const u64 carry_add[2] = { 1, 1 }; + static const u64 nocarry_add[2] = { 1, 0 }; + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + u64 ctrlow; + vuint32m1_t ctr; + vuint8m1_t add1; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + add1 = __riscv_vle8_v_u8m1(add_u8_array[0], vl_bytes); + ctr = unaligned_load_u32m1(ctr_arg, vl); + ctrlow = __riscv_vmv_x_s_u64m1_u64(cast_u32m1_u64m1(bswap128_u32m1(ctr, vl))); + + memory_barrier_with_vec(add1); + + if (nblocks >= 4) + { + vuint8m1_t add2 = __riscv_vle8_v_u8m1(add_u8_array[1], vl_bytes); + vuint8m1_t add3 = __riscv_vle8_v_u8m1(add_u8_array[2], vl_bytes); + vuint8m1_t add4 = __riscv_vle8_v_u8m1(add_u8_array[3], vl_bytes); + + memory_barrier_with_vec(add2); + memory_barrier_with_vec(add3); + memory_barrier_with_vec(add4); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint8m4_t data4blks; + vuint32m4_t ctr4blks; + + /* detect if 8-bit carry handling is needed */ + if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3)) + { + static const u64 *adders[5][4] = + { + { nocarry_add, nocarry_add, nocarry_add, carry_add }, + { nocarry_add, nocarry_add, carry_add, nocarry_add }, + { nocarry_add, carry_add, nocarry_add, nocarry_add }, + { carry_add, nocarry_add, nocarry_add, nocarry_add }, + { nocarry_add, nocarry_add, nocarry_add, nocarry_add } + }; + unsigned int idx = ctrlow <= 3 ? 
ctrlow : 4; + vuint64m1_t ctr_u64; + vuint32m1_t ctr_u32_1; + vuint32m1_t ctr_u32_2; + vuint32m1_t ctr_u32_3; + vuint32m1_t ctr_u32_4; + vuint64m1_t add_u64; + + /* Byte swap counter */ + ctr_u64 = cast_u32m1_u64m1(bswap128_u32m1(ctr, vl)); + + /* Addition with carry handling */ + add_u64 = __riscv_vle64_v_u64m1(adders[idx][0], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_1 = cast_u64m1_u32m1(ctr_u64); + + add_u64 = __riscv_vle64_v_u64m1(adders[idx][1], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_2 = cast_u64m1_u32m1(ctr_u64); + + add_u64 = __riscv_vle64_v_u64m1(adders[idx][2], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_3 = cast_u64m1_u32m1(ctr_u64); + + add_u64 = __riscv_vle64_v_u64m1(adders[idx][3], vl / 2); + ctr_u64 = __riscv_vadd_vv_u64m1(ctr_u64, add_u64, vl / 2); + ctr_u32_4 = cast_u64m1_u32m1(ctr_u64); + + /* Byte swap counters */ + ctr_u32_1 = bswap128_u32m1(ctr_u32_1, vl); + ctr_u32_2 = bswap128_u32m1(ctr_u32_2, vl); + ctr_u32_3 = bswap128_u32m1(ctr_u32_3, vl); + ctr_u32_4 = bswap128_u32m1(ctr_u32_4, vl); + + ctr4blks = __riscv_vundefined_u32m4(); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr_u32_1); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr_u32_2); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr_u32_3); + ctr = ctr_u32_4; + } + else + { + /* Fast path addition without carry handling */ + vuint8m1_t ctr_u8 = cast_u32m1_u8m1(ctr); + vuint8m1_t ctr1 = __riscv_vadd_vv_u8m1(ctr_u8, add1, vl_bytes); + vuint8m1_t ctr2 = __riscv_vadd_vv_u8m1(ctr_u8, add2, vl_bytes); + vuint8m1_t ctr3 = __riscv_vadd_vv_u8m1(ctr_u8, add3, vl_bytes); + vuint8m4_t ctr0123_u8 = __riscv_vundefined_u8m4(); + + ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(ctr_u8, add4, + vl_bytes)); + + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 0, ctr_u8); + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 1, ctr1); + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 2, ctr2); + ctr0123_u8 = __riscv_vset_v_u8m1_u8m4(ctr0123_u8, 3, ctr3); + + ctr4blks = cast_u8m4_u32m4(ctr0123_u8); + } + + data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4); + + AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4); + + data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks, + vl_bytes * 4); + __riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + } + + for (; nblocks; nblocks--) + { + vuint32m1_t block = ctr; + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + + /* detect if 8-bit carry handling is needed */ + if (UNLIKELY((++ctrlow & 0xff) == 0)) + { + const u64 *add_arr = UNLIKELY(ctrlow == 0) ? 
carry_add : nocarry_add; + vuint64m1_t add_val = __riscv_vle64_v_u64m1(add_arr, vl / 2); + + /* Byte swap counter */ + ctr = bswap128_u32m1(ctr, vl); + + /* Addition with carry handling */ + ctr = cast_u64m1_u32m1(__riscv_vadd_vv_u64m1(cast_u32m1_u64m1(ctr), + add_val, vl / 2)); + + /* Byte swap counter */ + ctr = bswap128_u32m1(ctr, vl); + } + else + { + /* Fast path addition without carry handling */ + ctr = cast_u8m1_u32m1(__riscv_vadd_vv_u8m1(cast_u32m1_u8m1(ctr), + add1, vl_bytes)); + } + + AES_CRYPT(e, m1, rounds, block, vl); + + data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes); + __riscv_vse8_v_u8m1(outbuf, data, vl_bytes); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(ctr_arg, ctr, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + static const u32 add_u32_array[4][16] = + { + { 1, }, { 2, }, { 3, }, { 4, } + }; + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t ctr; + vuint32m1_t add1; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + add1 = __riscv_vle32_v_u32m1(add_u32_array[0], vl); + ctr = unaligned_load_u32m1(ctr_arg, vl); + + memory_barrier_with_vec(add1); + + if (nblocks >= 4) + { + vuint32m1_t add2 = __riscv_vle32_v_u32m1(add_u32_array[1], vl); + vuint32m1_t add3 = __riscv_vle32_v_u32m1(add_u32_array[2], vl); + vuint32m1_t add4 = __riscv_vle32_v_u32m1(add_u32_array[3], vl); + + memory_barrier_with_vec(add2); + memory_barrier_with_vec(add3); + memory_barrier_with_vec(add4); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m1_t ctr1 = __riscv_vadd_vv_u32m1(ctr, add1, vl); + vuint32m1_t ctr2 = __riscv_vadd_vv_u32m1(ctr, add2, vl); + vuint32m1_t ctr3 = __riscv_vadd_vv_u32m1(ctr, add3, vl); + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint8m4_t data4blks; + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, ctr1); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, ctr2); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, ctr3); + ctr = __riscv_vadd_vv_u32m1(ctr, add4, vl); + + data4blks = __riscv_vle8_v_u8m4(inbuf, vl_bytes * 4); + + AES_CRYPT(e, m4, rounds, ctr4blks, vl * 4); + + data4blks = __riscv_vxor_vv_u8m4(cast_u32m4_u8m4(ctr4blks), data4blks, + vl_bytes * 4); + __riscv_vse8_v_u8m4(outbuf, data4blks, vl_bytes * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + } + + for (; nblocks; nblocks--) + { + vuint32m1_t block = ctr; + vuint8m1_t data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + + ctr = __riscv_vadd_vv_u32m1(ctr, add1, vl); + + AES_CRYPT(e, m1, rounds, block, vl); + + data = __riscv_vxor_vv_u8m1(cast_u32m1_u8m1(block), data, vl_bytes); + __riscv_vse8_v_u8m1(outbuf, data, vl_bytes); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(ctr_arg, ctr, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = 
ctx->rounds; + size_t vl = 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0); + vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1); + vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2); + vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3); + vuint32m4_t iv4blks = __riscv_vundefined_u32m4(); + + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3); + iv = iv4; + + AES_CRYPT(e, m4, rounds, iv4blks, vl * 4); + + data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4); + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t new_iv = data; + + AES_CRYPT(e, m1, rounds, iv, vl); + + data = vxor_u8_u32m1(iv, data, vl); + unaligned_store_u32m1(outbuf, data, vl); + iv = new_iv; + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t iv; + ROUND_KEY_VARIABLES; + + if (!ctx->decryption_prepared) + { + do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + iv = unaligned_load_u32m1(iv_arg, vl); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m1_t iv1 = __riscv_vget_v_u32m4_u32m1(data4blks, 0); + vuint32m1_t iv2 = __riscv_vget_v_u32m4_u32m1(data4blks, 1); + vuint32m1_t iv3 = __riscv_vget_v_u32m4_u32m1(data4blks, 2); + vuint32m1_t iv4 = __riscv_vget_v_u32m4_u32m1(data4blks, 3); + vuint32m4_t iv4blks = __riscv_vundefined_u32m4(); + + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 0, iv); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 1, iv1); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 2, iv2); + iv4blks = __riscv_vset_v_u32m1_u32m4(iv4blks, 3, iv3); + + AES_CRYPT(d, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(iv4blks, data4blks, vl * 4); + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + iv = iv4; + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t new_iv = data; + + AES_CRYPT(d, m1, rounds, data, vl); + + data = vxor_u8_u32m1(iv, data, vl); + unaligned_store_u32m1(outbuf, data, vl); + iv = new_iv; + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(iv_arg, iv, vl); + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t +aes_riscv_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + u64 n = 
c->u_mode.ocb.data_nblocks; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + vuint32m1_t ctr; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload Offset and Checksum */ + iv = unaligned_load_u32m1(c->u_iv.iv, vl); + ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl); + + if (nblocks >= 4) + { + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl); + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t offsets = __riscv_vundefined_u32m4(); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + AES_CRYPT(e, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0), + __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl); + } + + for (; nblocks; nblocks--) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m1_t data; + + data = unaligned_load_u32m1(inbuf, vl); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(ctr, data, vl); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + + data = vxor_u8_u32m1(data, iv, vl); + + AES_CRYPT(e, m1, rounds, data, vl); + + data = vxor_u8_u32m1(iv, data, vl); + unaligned_store_u32m1(outbuf, data, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + c->u_mode.ocb.data_nblocks = n; + + unaligned_store_u32m1(c->u_iv.iv, iv, vl); + unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl); + + clear_vec_regs(); + + return 0; +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 size_t +aes_riscv_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + unsigned char *outbuf = 
outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + u64 n = c->u_mode.ocb.data_nblocks; + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + vuint32m1_t ctr; + ROUND_KEY_VARIABLES; + + if (!ctx->decryption_prepared) + { + do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload Offset and Checksum */ + iv = unaligned_load_u32m1(c->u_iv.iv, vl); + ctr = unaligned_load_u32m1(c->u_ctr.ctr, vl); + + if (nblocks >= 4) + { + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl); + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t offsets = __riscv_vundefined_u32m4(); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + AES_CRYPT(d, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0), + __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl); + } + + for (; nblocks; nblocks--) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint8m1_t data; + vuint32m1_t block; + + l = ocb_get_l(c, ++n); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + data = __riscv_vle8_v_u8m1(inbuf, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + data = __riscv_vxor_vv_u8m1(data, cast_u32m1_u8m1(iv), vl_bytes); + block = cast_u8m1_u32m1(data); + + AES_CRYPT(d, m1, rounds, block, vl); + + block = vxor_u8_u32m1(iv, block, vl); + unaligned_store_u32m1(outbuf, block, vl); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(ctr, block, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + c->u_mode.ocb.data_nblocks = n; + + unaligned_store_u32m1(c->u_iv.iv, iv, vl); + unaligned_store_u32m1(c->u_ctr.ctr, ctr, vl); + + 
clear_vec_regs(); + + return 0; +} + +size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt) +{ + if (encrypt) + return aes_riscv_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks); + else + return aes_riscv_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks); +} + +size_t ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 +_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks) +{ + RIJNDAEL_context *ctx = (void *)&c->context.c; + const unsigned char *abuf = abuf_arg; + u64 n = c->u_mode.ocb.aad_nblocks; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + size_t vl_bytes = vl * 4; + vuint32m1_t iv; + vuint32m1_t ctr; + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload Offset and Sum */ + iv = unaligned_load_u32m1(c->u_mode.ocb.aad_offset, vl); + ctr = unaligned_load_u32m1(c->u_mode.ocb.aad_sum, vl); + + if (nblocks >= 4) + { + vuint32m4_t ctr4blks = __riscv_vundefined_u32m4(); + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, vl); + + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 0, ctr); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 1, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 2, zero); + ctr4blks = __riscv_vset_v_u32m1_u32m4(ctr4blks, 3, zero); + + for (; nblocks >= 4; nblocks -= 4) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m4_t data4blks = unaligned_load_u32m4(abuf, vl * 4); + vuint32m4_t offsets = __riscv_vundefined_u32m4(); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 0, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 1, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 2, iv); + + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + offsets = __riscv_vset_v_u32m1_u32m4(offsets, 3, iv); + + data4blks = vxor_u8_u32m4(offsets, data4blks, vl * 4); + + AES_CRYPT(e, m4, rounds, data4blks, vl * 4); + + ctr4blks = vxor_u8_u32m4(ctr4blks, data4blks, vl * 4); + + abuf += 4 * BLOCKSIZE; + } + + /* Checksum_i = Checksum_{i-1} xor P_i */ + ctr = vxor_u8_u32m1(__riscv_vget_v_u32m4_u32m1(ctr4blks, 0), + __riscv_vget_v_u32m4_u32m1(ctr4blks, 1), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 2), vl); + ctr = vxor_u8_u32m1(ctr, __riscv_vget_v_u32m4_u32m1(ctr4blks, 3), vl); + } + + for (; nblocks; nblocks--) + { + const unsigned char *l; + vuint8m1_t l_ntzi; + vuint32m1_t data; + + data = unaligned_load_u32m1(abuf, vl); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + l = ocb_get_l(c, ++n); + l_ntzi = __riscv_vle8_v_u8m1(l, vl_bytes); + iv = vxor_u8_u32m1(iv, cast_u8m1_u32m1(l_ntzi), vl); + + data = vxor_u8_u32m1(data, iv, vl); + + AES_CRYPT(e, m1, rounds, data, vl); + + ctr = vxor_u8_u32m1(ctr, data, vl); + + abuf += BLOCKSIZE; + } + + c->u_mode.ocb.aad_nblocks = n; + + unaligned_store_u32m1(c->u_mode.ocb.aad_offset, iv, vl); + 
unaligned_store_u32m1(c->u_mode.ocb.aad_sum, ctr, vl); + + clear_vec_regs(); + + return 0; +} + +static const u64 xts_gfmul_const[2] = { 0x87, 0x01 }; +static const u64 xts_swap64_const[2] = { 1, 0 }; + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +xts_gfmul_byA (vuint32m1_t vec_in, vuint64m1_t xts_gfmul, + vuint64m1_t xts_swap64, size_t vl) +{ + vuint64m1_t in_u64 = cast_u32m1_u64m1(vec_in); + vuint64m1_t tmp1; + + tmp1 = + __riscv_vrgather_vv_u64m1(cast_u32m1_u64m1(vec_in), xts_swap64, vl / 2); + tmp1 = cast_i64m1_u64m1( + __riscv_vsra_vx_i64m1(cast_u64m1_i64m1(tmp1), 63, vl / 2)); + in_u64 = __riscv_vadd_vv_u64m1(in_u64, in_u64, vl / 2); + tmp1 = __riscv_vand_vv_u64m1(tmp1, xts_gfmul, vl / 2); + + return cast_u64m1_u32m1(__riscv_vxor_vv_u64m1(in_u64, tmp1, vl / 2)); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes_riscv_xts_enc (void *context, unsigned char *tweak_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschenc32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t tweak; + vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2); + vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2); + ROUND_KEY_VARIABLES; + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload tweak */ + tweak = unaligned_load_u32m1(tweak_arg, vl); + + memory_barrier_with_vec(xts_gfmul); + memory_barrier_with_vec(xts_swap64); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t tweaks = __riscv_vundefined_u32m4(); + + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + AES_CRYPT(e, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t tweak0 = tweak; + + data = vxor_u8_u32m1(data, tweak0, vl); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + AES_CRYPT(e, m1, rounds, data, vl); + + data = vxor_u8_u32m1(data, tweak0, vl); + unaligned_store_u32m1(outbuf, data, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(tweak_arg, tweak, vl); + + clear_vec_regs(); +} + +static ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +aes_riscv_xts_dec (void *context, unsigned char *tweak_arg, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + RIJNDAEL_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + const u32 *rk = ctx->keyschdec32[0]; + int rounds = ctx->rounds; + size_t vl = 4; + vuint32m1_t tweak; + vuint64m1_t xts_gfmul = __riscv_vle64_v_u64m1(xts_gfmul_const, vl / 2); + vuint64m1_t xts_swap64 = __riscv_vle64_v_u64m1(xts_swap64_const, vl / 2); + ROUND_KEY_VARIABLES; + + if (!ctx->decryption_prepared) + { + 
do_prepare_decryption(ctx); + ctx->decryption_prepared = 1; + } + + PRELOAD_ROUND_KEYS (rk, rounds, vl); + + /* Preload tweak */ + tweak = unaligned_load_u32m1(tweak_arg, vl); + + memory_barrier_with_vec(xts_gfmul); + memory_barrier_with_vec(xts_swap64); + + for (; nblocks >= 4; nblocks -= 4) + { + vuint32m4_t data4blks = unaligned_load_u32m4(inbuf, vl * 4); + vuint32m4_t tweaks = __riscv_vundefined_u32m4(); + + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 0, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 1, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 2, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + tweaks = __riscv_vset_v_u32m1_u32m4(tweaks, 3, tweak); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + AES_CRYPT(d, m4, rounds, data4blks, vl * 4); + + data4blks = vxor_u8_u32m4(tweaks, data4blks, vl * 4); + + unaligned_store_u32m4(outbuf, data4blks, vl * 4); + + inbuf += 4 * BLOCKSIZE; + outbuf += 4 * BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + vuint32m1_t data = unaligned_load_u32m1(inbuf, vl); + vuint32m1_t tweak0 = tweak; + + data = vxor_u8_u32m1(data, tweak0, vl); + tweak = xts_gfmul_byA(tweak, xts_gfmul, xts_swap64, vl); + + AES_CRYPT(d, m1, rounds, data, vl); + + data = vxor_u8_u32m1(data, tweak0, vl); + unaligned_store_u32m1(outbuf, data, vl); + + inbuf += BLOCKSIZE; + outbuf += BLOCKSIZE; + } + + unaligned_store_u32m1(tweak_arg, tweak, vl); + + clear_vec_regs(); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_OPT_O2 void +_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak_arg, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + if (encrypt) + aes_riscv_xts_enc(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks); + else + aes_riscv_xts_dec(context, tweak_arg, outbuf_arg, inbuf_arg, nblocks); +} + +#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 0c48793b..52500e59 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -280,6 +280,63 @@ extern void _gcry_aes_vp_riscv_xts_crypt (void *context, unsigned char *tweak, size_t nblocks, int encrypt); #endif +#ifdef USE_RISCV_V_CRYPTO +/* RISC-V vector cryptography extension implementation of AES */ +extern int +_gcry_aes_riscv_zvkned_setup_acceleration (RIJNDAEL_context *ctx); + +extern void +_gcry_aes_riscv_zvkned_setkey (RIJNDAEL_context *ctx, const byte *key); +extern void +_gcry_aes_riscv_zvkned_prepare_decryption (RIJNDAEL_context *ctx); + +extern unsigned int +_gcry_aes_riscv_zvkned_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src); +extern unsigned int +_gcry_aes_riscv_zvkned_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src); +extern void +_gcry_aes_riscv_zvkned_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac); +extern void +_gcry_aes_riscv_zvkned_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void 
+_gcry_aes_riscv_zvkned_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); +extern size_t +_gcry_aes_riscv_zvkned_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern size_t +_gcry_aes_riscv_zvkned_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks); +extern void +_gcry_aes_riscv_zvkned_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +extern void +_gcry_aes_riscv_zvkned_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +#endif + #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, @@ -774,9 +831,36 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->xts_crypt = _gcry_aes_vp_aarch64_xts_crypt; } #endif +#ifdef USE_RISCV_V_CRYPTO + else if ((hwfeatures & HWF_RISCV_IMAFDC) + && (hwfeatures & HWF_RISCV_V) + && (hwfeatures & HWF_RISCV_ZVKNED) + && _gcry_aes_riscv_zvkned_setup_acceleration(ctx)) + { + hw_setkey = _gcry_aes_riscv_zvkned_setkey; + ctx->encrypt_fn = _gcry_aes_riscv_zvkned_encrypt; + ctx->decrypt_fn = _gcry_aes_riscv_zvkned_decrypt; + ctx->prefetch_enc_fn = NULL; + ctx->prefetch_dec_fn = NULL; + ctx->prepare_decryption = _gcry_aes_riscv_zvkned_prepare_decryption; + + /* Setup RISC-V vector cryptography bulk encryption routines. */ + bulk_ops->cfb_enc = _gcry_aes_riscv_zvkned_cfb_enc; + bulk_ops->cfb_dec = _gcry_aes_riscv_zvkned_cfb_dec; + bulk_ops->cbc_enc = _gcry_aes_riscv_zvkned_cbc_enc; + bulk_ops->cbc_dec = _gcry_aes_riscv_zvkned_cbc_dec; + bulk_ops->ctr_enc = _gcry_aes_riscv_zvkned_ctr_enc; + bulk_ops->ctr32le_enc = _gcry_aes_riscv_zvkned_ctr32le_enc; + bulk_ops->ocb_crypt = _gcry_aes_riscv_zvkned_ocb_crypt; + bulk_ops->ocb_auth = _gcry_aes_riscv_zvkned_ocb_auth; + bulk_ops->ecb_crypt = _gcry_aes_riscv_zvkned_ecb_crypt; + bulk_ops->xts_crypt = _gcry_aes_riscv_zvkned_xts_crypt; + } +#endif #ifdef USE_VP_RISCV - else if ((hwfeatures & HWF_RISCV_IMAFDC) && (hwfeatures & HWF_RISCV_V) && - _gcry_aes_vp_riscv_setup_acceleration(ctx)) + else if ((hwfeatures & HWF_RISCV_IMAFDC) + && (hwfeatures & HWF_RISCV_V) + && _gcry_aes_vp_riscv_setup_acceleration(ctx)) { hw_setkey = _gcry_aes_vp_riscv_do_setkey; ctx->encrypt_fn = _gcry_aes_vp_riscv_encrypt; @@ -785,7 +869,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, ctx->prefetch_dec_fn = NULL; ctx->prepare_decryption = _gcry_aes_vp_riscv_prepare_decryption; - /* Setup vector permute AArch64 bulk encryption routines. */ + /* Setup vector permute RISC-V bulk encryption routines. 
*/ bulk_ops->cfb_enc = _gcry_aes_vp_riscv_cfb_enc; bulk_ops->cfb_dec = _gcry_aes_vp_riscv_cfb_dec; bulk_ops->cbc_enc = _gcry_aes_vp_riscv_cbc_enc; diff --git a/configure.ac b/configure.ac index d45ea851..45fe5143 100644 --- a/configure.ac +++ b/configure.ac @@ -2812,6 +2812,32 @@ m4_define([GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST], __riscv_vse32_v_u32m1(ptr + 4 * vl, w2, vl); __riscv_vse32_v_u32m1(ptr + 5 * vl, w3, vl); } + void test_aes_key(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + vuint32m1_t c = __riscv_vaeskf1_vi_u32m1(a, 1, vl); + vuint32m1_t d = __riscv_vaeskf2_vi_u32m1(a, b, 2, vl); + __riscv_vse32_v_u32m1(ptr + 0 * vl, c, vl); + __riscv_vse32_v_u32m1(ptr + 1 * vl, d, vl); + } + void test_aes_crypt(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + vuint32m1_t c = __riscv_vaesz_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t d = __riscv_vaesem_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t e = __riscv_vaesef_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t f = __riscv_vaesdm_vs_u32m1_u32m1(a, b, vl); + vuint32m1_t g = __riscv_vaesdf_vs_u32m1_u32m1(a, b, vl); + __riscv_vse32_v_u32m1(ptr + 0 * vl, c, vl); + __riscv_vse32_v_u32m1(ptr + 1 * vl, d, vl); + __riscv_vse32_v_u32m1(ptr + 2 * vl, e, vl); + __riscv_vse32_v_u32m1(ptr + 3 * vl, f, vl); + __riscv_vse32_v_u32m1(ptr + 4 * vl, g, vl); + } void test_inline_vec_asm(unsigned int *ptr) { int vl = __riscv_vsetvl_e32m1 (4); @@ -2918,6 +2944,53 @@ EOF CFLAGS=$_gcc_cflags_save; fi +# +# Check whether compiler has working RISC-V vector __riscv_vaes*_vs intrinsics +# +# Some GCC versions generate a 'vsetvli' instruction with an incorrect 'm1' +# LMUL instead of the expected 'mX' for the __riscv_vaes*_vs_u32m1_u32mX +# intrinsics. +# +if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" || + test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then + + # Setup flags for test. + _gcc_cflags_save=$CFLAGS + if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then + CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" + else + CFLAGS="$CFLAGS -O2" + fi + + AC_CACHE_CHECK([whether compiler has working RISC-V __riscv_vaes*_vs intrinsics], + [gcry_cv_riscv_vaes_vs_intrinsics_work], + [gcry_cv_riscv_vaes_vs_intrinsics_work=no + cat > conftest.c < +vuint32m4_t test_fn(vuint32m4_t a, vuint32m1_t b, int vl) +{ + /* This intrinsic should result in a 'vsetvli' with m4 */ + return __riscv_vaesem_vs_u32m1_u32m4(a, b, vl); +} +EOF + + if $CC $CFLAGS -S conftest.c -o conftest.s >&5 2>&5; then + if grep -E 'vsetvli.*,[[[:space:]]]*m4[[[:space:]]]*,' conftest.s >/dev/null 2>&1; then + gcry_cv_riscv_vaes_vs_intrinsics_work=yes + fi + fi + rm -f conftest.* + ]) + + if test "$gcry_cv_riscv_vaes_vs_intrinsics_work" = "no"; then + AC_DEFINE([HAVE_BROKEN_VAES_VS_INTRINSIC], [1], + [Define to 1 if __riscv_vaes*_vs intrinsics are broken]) + fi + + # Restore flags. + CFLAGS=$_gcc_cflags_save; +fi + ####################################### #### Checks for library functions. 
#### @@ -3390,6 +3463,9 @@ if test "$found" = "1" ; then riscv64-*-*) # Build with the vector permute SIMD128 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vp-riscv.lo" + + # Build with the RISC-V vector cryptography implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-riscv-zvkned.lo" ;; s390x-*-*) # Big-Endian. diff --git a/src/g10lib.h b/src/g10lib.h index 4fa91ba9..991ec3ea 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -281,8 +281,9 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_RISCV_ZBB (1 << 2) #define HWF_RISCV_ZBC (1 << 3) #define HWF_RISCV_ZVKB (1 << 4) -#define HWF_RISCV_ZVKNHA (1 << 5) -#define HWF_RISCV_ZVKNHB (1 << 6) +#define HWF_RISCV_ZVKNED (1 << 5) +#define HWF_RISCV_ZVKNHA (1 << 6) +#define HWF_RISCV_ZVKNHB (1 << 7) #endif diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c index 925284a1..c37fd8dc 100644 --- a/src/hwf-riscv.c +++ b/src/hwf-riscv.c @@ -191,6 +191,7 @@ detect_riscv_at_hwcap(void) #define HWF_RISCV_HWPROBE_EXT_ZBS (1U << 5) #define HWF_RISCV_HWPROBE_EXT_ZBC (1U << 7) #define HWF_RISCV_HWPROBE_EXT_ZVKB (1U << 19) +#define HWF_RISCV_HWPROBE_EXT_ZVKNED (1U << 21) #define HWF_RISCV_HWPROBE_EXT_ZVKNHA (1U << 22) #define HWF_RISCV_HWPROBE_EXT_ZVKNHB (1U << 23) #define HWF_RISCV_HWPROBE_EXT_ZICOND (U64_C(1) << 35) @@ -215,6 +216,7 @@ static const struct hwprobe_feature_map_s hwprobe_features[] = { HWF_RISCV_HWPROBE_EXT_ZBB, HWF_RISCV_ZBB }, { HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC }, { HWF_RISCV_HWPROBE_EXT_ZVKB, HWF_RISCV_ZVKB }, + { HWF_RISCV_HWPROBE_EXT_ZVKNED, HWF_RISCV_ZVKNED }, { HWF_RISCV_HWPROBE_EXT_ZVKNHA, HWF_RISCV_ZVKNHA }, { HWF_RISCV_HWPROBE_EXT_ZVKNHB, HWF_RISCV_ZVKNHB }, }; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index df2aaf17..0752d787 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -99,6 +99,7 @@ static struct { HWF_RISCV_ZVKB, "riscv-zvkb" }, { HWF_RISCV_ZVKNHA, "riscv-zvknha" }, { HWF_RISCV_ZVKNHB, "riscv-zvknhb" }, + { HWF_RISCV_ZVKNED, "riscv-zvkned" }, #endif }; -- 2.48.1
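A note on exercising the new code path: the "riscv-zvkned" feature string registered in hwfeatures.c above can be handed to GCRYCTL_DISABLE_HWF, which makes do_setkey skip the Zvkned setup and fall back to the vector-permute implementation, so the two can be compared on the same machine. Minimal sketch (only the feature name comes from this patch; the rest is the usual libgcrypt initialization sequence):

#include <gcrypt.h>

int
main (void)
{
  /* Hardware feature detection runs during library initialization, so
     the feature has to be disabled before gcry_check_version().  */
  gcry_control (GCRYCTL_DISABLE_HWF, "riscv-zvkned", NULL);
  if (!gcry_check_version (GCRYPT_VERSION))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);
  return 0;
}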
From jussi.kivilinna at iki.fi Sun Aug 10 17:44:15 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 10 Aug 2025 18:44:15 +0300 Subject: [PATCH] Add RISC-V vector cryptography implementation of GHASH Message-ID: <20250810154415.2709812-1-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'cipher-gcm-riscv-zvkg.c'. * cipher/cipher-gcm-riscv-zbb-zbc.c: Fix cipher-internal.h include. * cipher/cipher-gcm-riscv-zvkg.c: New. * cipher/cipher-gcm.c [GCM_USE_RISCV_ZVKG] (_gcry_ghash_setup_riscv_zvkg) (_gcry_ghash_riscv_zvkg, _gcry_polyval_riscv_zvkg): New. (setupM) [GCM_USE_RISCV_ZVKG]: Add setup for Zvkg implementation. * cipher/cipher-internal.h (GCM_USE_RISCV_ZVKG): New.
* configure.ac: Add 'cipher-gcm-riscv-zvkg.lo'. (GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST): Add check for Zvkg intrinsic. * src/g10lib.h (HWF_RISCV_ZVKG): Insert before HWF_RISCV_ZVKNED. * src/hwf-riscv.c (HWF_RISCV_HWPROBE_EXT_ZVKG): New. (hwprobe_features): Add HWF_RISCV_ZVKG. * src/hwfeatures.c (hwflist) [HAVE_CPU_ARCH_RISCV]: Add "riscv-zvkg". -- Implementation has been tested against the QEMU emulator, as there is no actual HW available with these instructions yet. Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 11 ++- cipher/cipher-gcm-riscv-zbb-zbc.c | 3 +- cipher/cipher-gcm-riscv-zvkg.c | 130 ++++++++++++++++++++++++++++++ cipher/cipher-gcm.c | 20 +++++ cipher/cipher-internal.h | 9 +++ configure.ac | 10 +++ src/g10lib.h | 7 +- src/hwf-riscv.c | 2 + src/hwfeatures.c | 1 + 9 files changed, 187 insertions(+), 6 deletions(-) create mode 100644 cipher/cipher-gcm-riscv-zvkg.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 85c9c6d8..d31da411 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -93,7 +93,7 @@ EXTRA_libcipher_la_SOURCES = \ cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \ cipher-gcm-aarch64-simd.c cipher-gcm-armv7-neon.S \ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ - cipher-gcm-riscv-zbb-zbc.c \ + cipher-gcm-riscv-zbb-zbc.c cipher-gcm-riscv-zvkg.c \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ @@ -383,16 +383,25 @@ rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile if ENABLE_RISCV_VECTOR_CRYPTO_INTRINSICS_EXTRA_CFLAGS riscv_vector_crypto_aes_cflags = -O2 -march=rv64imafdcv_zvkned -mstrict-align riscv_vector_crypto_sha_cflags = -O2 -march=rv64imafdcv_zvknha_zvknhb_zvkb -mstrict-align +riscv_vector_crypto_gcm_cflags = -O2 -march=rv64imafdcv_zvkg -mstrict-align else if SUPPORT_CC_RISCV_MSTRICT_ALIGN riscv_vector_crypto_aes_cflags = -O2 -mstrict-align riscv_vector_crypto_sha_cflags = -O2 -mstrict-align +riscv_vector_crypto_gcm_cflags = -O2 -mstrict-align else riscv_vector_crypto_aes_cflags = riscv_vector_crypto_sha_cflags = +riscv_vector_crypto_gcm_cflags = endif endif +cipher-gcm-riscv-zvkg.o: $(srcdir)/cipher-gcm-riscv-zvkg.c Makefile + `echo $(COMPILE) $(riscv_vector_crypto_gcm_cflags) -c $< | $(instrumentation_munging) ` + +cipher-gcm-riscv-zvkg.lo: $(srcdir)/cipher-gcm-riscv-zvkg.c Makefile + `echo $(LTCOMPILE) $(riscv_vector_crypto_gcm_cflags) -c $< | $(instrumentation_munging) ` + rijndael-riscv-zvkned.o: $(srcdir)/rijndael-riscv-zvkned.c Makefile `echo $(COMPILE) $(riscv_vector_crypto_aes_cflags) -c $< | $(instrumentation_munging) ` diff --git a/cipher/cipher-gcm-riscv-zbb-zbc.c b/cipher/cipher-gcm-riscv-zbb-zbc.c index 1a1f1484..61539274 100644 --- a/cipher/cipher-gcm-riscv-zbb-zbc.c +++ b/cipher/cipher-gcm-riscv-zbb-zbc.c @@ -23,7 +23,7 @@ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" -#include "./cipher-internal.h" +#include "cipher-internal.h" #ifdef GCM_USE_RISCV_ZBB_ZBC @@ -237,7 +237,6 @@ _gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf, store_aligned_u64x2(result, rhash); - return 0; } diff --git a/cipher/cipher-gcm-riscv-zvkg.c b/cipher/cipher-gcm-riscv-zvkg.c new file mode 100644 index 00000000..c459a6fb --- /dev/null +++ b/cipher/cipher-gcm-riscv-zvkg.c @@ -0,0 +1,130 @@ +/* cipher-gcm-riscv-zvkg.c - RISC-V vector cryptography Zvkg accelerated GHASH + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt.
+ * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#include "g10lib.h" +#include "simd-common-riscv.h" +#include "cipher-internal.h" + +#ifdef GCM_USE_RISCV_ZVKG + +#include + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + + +#define cast_u8m1_u32m1(a) __riscv_vreinterpret_v_u8m1_u32m1(a) +#define cast_u32m1_u8m1(a) __riscv_vreinterpret_v_u32m1_u8m1(a) + + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +unaligned_load_u32m1(const void *ptr, size_t vl_u32) +{ + size_t vl_bytes = vl_u32 * 4; + + return cast_u8m1_u32m1(__riscv_vle8_v_u8m1(ptr, vl_bytes)); +} + +static ASM_FUNC_ATTR_INLINE vuint32m1_t +bswap128_u32m1(vuint32m1_t vec, size_t vl_u32) +{ + static const byte bswap128_arr[16] = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + size_t vl_bytes = vl_u32 * 4; + vuint8m1_t bswap128 = __riscv_vle8_v_u8m1(bswap128_arr, vl_bytes); + + return cast_u8m1_u32m1( + __riscv_vrgather_vv_u8m1(cast_u32m1_u8m1(vec), bswap128, vl_bytes)); +} + + +ASM_FUNC_ATTR_NOINLINE int +_gcry_ghash_setup_riscv_zvkg(gcry_cipher_hd_t c) +{ + (void)c; + + if (__riscv_vsetvl_e32m1(4) != 4) + { + return 0; // VLEN=128 not supported. 
+ } + + return 1; +} + +ASM_FUNC_ATTR_NOINLINE unsigned int +_gcry_ghash_riscv_zvkg(gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + u32 *result_u32 = (void *)result; + const u32 *key_u32 = (void *)c->u_mode.gcm.u_ghash_key.key; + size_t vl = 4; + vuint32m1_t rhash = __riscv_vle32_v_u32m1(result_u32, vl); + vuint32m1_t rh1 = __riscv_vle32_v_u32m1(key_u32, vl); + + while (nblocks) + { + vuint32m1_t data = unaligned_load_u32m1(buf, vl); + buf += 16; + nblocks--; + + rhash = __riscv_vghsh_vv_u32m1(rhash, rh1, data, vl); + } + + __riscv_vse32_v_u32m1(result_u32, rhash, vl); + + clear_vec_regs(); + + return 0; +} + +ASM_FUNC_ATTR_NOINLINE unsigned int +_gcry_polyval_riscv_zvkg(gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + u32 *result_u32 = (void *)result; + const u32 *key_u32 = (void *)c->u_mode.gcm.u_ghash_key.key; + size_t vl = 4; + vuint32m1_t rhash = __riscv_vle32_v_u32m1(result_u32, vl); + vuint32m1_t rh1 = __riscv_vle32_v_u32m1(key_u32, vl); + + while (nblocks) + { + vuint32m1_t data = bswap128_u32m1(unaligned_load_u32m1(buf, vl), vl); + buf += 16; + nblocks--; + + rhash = __riscv_vghsh_vv_u32m1(rhash, rh1, data, vl); + } + + __riscv_vse32_v_u32m1(result_u32, rhash, vl); + + clear_vec_regs(); + + return 0; +} + +#endif /* GCM_USE_RISCV_V_ZVKG */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 5bb98015..143ae52a 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -109,6 +109,16 @@ extern unsigned int _gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); #endif /* GCM_USE_RISCV_ZBB_ZBC */ +#ifdef GCM_USE_RISCV_ZVKG +extern int _gcry_ghash_setup_riscv_zvkg(gcry_cipher_hd_t c); + +extern unsigned int _gcry_ghash_riscv_zvkg(gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); + +extern unsigned int _gcry_polyval_riscv_zvkg(gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); +#endif /* GCM_USE_RISCV_ZVKG */ + #ifdef GCM_USE_AARCH64 extern void _gcry_ghash_setup_aarch64_simd(gcry_cipher_hd_t c); @@ -628,6 +638,16 @@ setupM (gcry_cipher_hd_t c) _gcry_ghash_setup_aarch64_simd (c); } #endif +#ifdef GCM_USE_RISCV_ZVKG + else if ((features & HWF_RISCV_IMAFDC) + && (features & HWF_RISCV_V) + && (features & HWF_RISCV_ZVKG) + && _gcry_ghash_setup_riscv_zvkg (c)) + { + c->u_mode.gcm.ghash_fn = _gcry_ghash_riscv_zvkg; + c->u_mode.gcm.polyval_fn = _gcry_polyval_riscv_zvkg; + } +#endif #ifdef GCM_USE_RISCV_ZBB_ZBC else if ((features & HWF_RISCV_IMAFDC) && (features & HWF_RISCV_ZBB) diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index f2a2099a..dc4878bb 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -144,6 +144,15 @@ # define GCM_USE_RISCV_ZBB_ZBC 1 #endif +/* GCM_USE_RISCV_V_ZVKG indicates whether to enable RISC-V vector Zvkg + * code. 
*/ +#undef GCM_USE_RISCV_ZVKG +#if defined (__riscv) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \ + defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_CRYPTO_INTRINSICS) +# define GCM_USE_RISCV_ZVKG 1 +#endif + typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); diff --git a/configure.ac b/configure.ac index 36256df0..1b7d79f3 100644 --- a/configure.ac +++ b/configure.ac @@ -2838,6 +2838,15 @@ m4_define([GCRY_RISCV_VECTOR_CRYPTO_INTRINSICS_TEST], __riscv_vse32_v_u32m1(ptr + 3 * vl, f, vl); __riscv_vse32_v_u32m1(ptr + 4 * vl, g, vl); } + void test_ghash(unsigned int *ptr) + { + int vl = __riscv_vsetvl_e32m1 (4); + vuint32m1_t a = __riscv_vle32_v_u32m1(ptr + 0 * vl, vl); + vuint32m1_t b = __riscv_vle32_v_u32m1(ptr + 1 * vl, vl); + vuint32m1_t c = __riscv_vle32_v_u32m1(ptr + 2 * vl, vl); + vuint32m1_t d = __riscv_vghsh_vv_u32m1(a, b, c, vl); + __riscv_vse32_v_u32m1(ptr + 0 * vl, d, vl); + } void test_inline_vec_asm(unsigned int *ptr) { int vl = __riscv_vsetvl_e32m1 (4); @@ -4059,6 +4068,7 @@ case "${host}" in ;; riscv64-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-riscv-zbb-zbc.lo" + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS cipher-gcm-riscv-zvkg.lo" ;; esac diff --git a/src/g10lib.h b/src/g10lib.h index 991ec3ea..6a4b9313 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -281,9 +281,10 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_RISCV_ZBB (1 << 2) #define HWF_RISCV_ZBC (1 << 3) #define HWF_RISCV_ZVKB (1 << 4) -#define HWF_RISCV_ZVKNED (1 << 5) -#define HWF_RISCV_ZVKNHA (1 << 6) -#define HWF_RISCV_ZVKNHB (1 << 7) +#define HWF_RISCV_ZVKG (1 << 5) +#define HWF_RISCV_ZVKNED (1 << 6) +#define HWF_RISCV_ZVKNHA (1 << 7) +#define HWF_RISCV_ZVKNHB (1 << 8) #endif diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c index c37fd8dc..5a7cf777 100644 --- a/src/hwf-riscv.c +++ b/src/hwf-riscv.c @@ -191,6 +191,7 @@ detect_riscv_at_hwcap(void) #define HWF_RISCV_HWPROBE_EXT_ZBS (1U << 5) #define HWF_RISCV_HWPROBE_EXT_ZBC (1U << 7) #define HWF_RISCV_HWPROBE_EXT_ZVKB (1U << 19) +#define HWF_RISCV_HWPROBE_EXT_ZVKG (1U << 20) #define HWF_RISCV_HWPROBE_EXT_ZVKNED (1U << 21) #define HWF_RISCV_HWPROBE_EXT_ZVKNHA (1U << 22) #define HWF_RISCV_HWPROBE_EXT_ZVKNHB (1U << 23) @@ -216,6 +217,7 @@ static const struct hwprobe_feature_map_s hwprobe_features[] = { HWF_RISCV_HWPROBE_EXT_ZBB, HWF_RISCV_ZBB }, { HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC }, { HWF_RISCV_HWPROBE_EXT_ZVKB, HWF_RISCV_ZVKB }, + { HWF_RISCV_HWPROBE_EXT_ZVKG, HWF_RISCV_ZVKG }, { HWF_RISCV_HWPROBE_EXT_ZVKNED, HWF_RISCV_ZVKNED }, { HWF_RISCV_HWPROBE_EXT_ZVKNHA, HWF_RISCV_ZVKNHA }, { HWF_RISCV_HWPROBE_EXT_ZVKNHB, HWF_RISCV_ZVKNHB }, diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 08b33090..aae9fdd3 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -97,6 +97,7 @@ static struct { HWF_RISCV_ZBB, "riscv-zbb" }, { HWF_RISCV_ZBC, "riscv-zbc" }, { HWF_RISCV_ZVKB, "riscv-zvkb" }, + { HWF_RISCV_ZVKG, "riscv-zvkg" }, { HWF_RISCV_ZVKNED, "riscv-zvkned" }, { HWF_RISCV_ZVKNHA, "riscv-zvknha" }, { HWF_RISCV_ZVKNHB, "riscv-zvknhb" }, -- 2.48.1 From gniibe at fsij.org Tue Aug 12 09:29:21 2025 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 12 Aug 2025 16:29:21 +0900 Subject: [PATCH] mpi: Fix redefinition of types. Message-ID: <85927387741fad2aba4f8cc01e3b10ff4ff4c57f.1754983732.git.gniibe@fsij.org> * mpi/mpi-internal.h (mpi_ptr_t): Remove, as it moved to mpi.h. (mpi_size_t): Likewise.
-- Fixes-commit: 88ae76d069c331ad947ecab8419df9a00f979b0e GnuPG-bug-id: 7775 Signed-off-by: NIIBE Yutaka --- mpi/mpi-internal.h | 3 --- 1 file changed, 3 deletions(-) -------------- next part -------------- A non-text attachment was scrubbed... Name: 0001-mpi-Fix-redefinition-of-types.patch Type: text/x-patch Size: 403 bytes Desc: not available URL: From jussi.kivilinna at iki.fi Sat Aug 16 20:36:28 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 16 Aug 2025 21:36:28 +0300 Subject: [PATCH 1/2] Add RISC-V Zbb+Zbc implementation of CRC Message-ID: <20250816183631.1760692-1-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'crc-riscv-zbb-zbc.c'. * cipher/crc-riscv-zbb-zbc.c: New. * cipher/crc.c (USE_RISCV_ZBB_ZBC): New. (CRC_CONTEXT) [USE_RISCV_ZBB_ZBC]: Add 'use_riscv_zbc'. [USE_RISCV_ZBB_ZBC] (_gcry_crc32_riscv_zbb_zbc) (_gcry_crc24rfc2440_riscv_zbb_zbc): New. (crc32_init): Rename to ... (generic_crc32_init): ... this; Add 'init_value' parameter. (generic_crc32_init) [USE_RISCV_ZBB_ZBC]: Add HW feature check for RISC-V Zbb+Zbc implementation. (crc24rfc2440_init) [USE_RISCV_ZBB_ZBC]: Likewise. (crc32_init, crc32rfc1510_init): Use 'generic_crc32_init'. (crc32_write) [USE_RISCV_ZBB_ZBC]: Add RISC-V Zbb+Zbc implementation. (crc24rfc2440_write) [USE_RISCV_ZBB_ZBC]: Likewise. * configure.ac: Add 'crc-riscv-zbb-zbc.lo'. -- Benchmark on SpacemiT K1: Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CRC32 | 3.01 ns/B 316.6 MiB/s 4.82 c/B 1600 CRC24RFC2440 | 3.11 ns/B 306.9 MiB/s 4.97 c/B 1600 After: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CRC32 | 0.275 ns/B 3472 MiB/s 0.439 c/B 1600 CRC24RFC2440 | 0.394 ns/B 2418 MiB/s 0.631 c/B 1600 Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 + cipher/crc-riscv-zbb-zbc.c | 514 +++++++++++++++++++++++++++++++++++++ cipher/crc.c | 70 +++-- configure.ac | 4 + 4 files changed, 569 insertions(+), 20 deletions(-) create mode 100644 cipher/crc-riscv-zbb-zbc.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index d31da411..b7a5c327 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -97,6 +97,7 @@ EXTRA_libcipher_la_SOURCES = \ crc.c crc-intel-pclmul.c crc-armv8-ce.c \ crc-armv8-aarch64-ce.S \ crc-ppc.c \ + crc-riscv-zbb-zbc.c \ des.c des-amd64.S \ dilithium.c dilithium.h pubkey-dilithium.c \ dsa.c \ diff --git a/cipher/crc-riscv-zbb-zbc.c b/cipher/crc-riscv-zbb-zbc.c new file mode 100644 index 00000000..dffd451a --- /dev/null +++ b/cipher/crc-riscv-zbb-zbc.c @@ -0,0 +1,514 @@ +/* crc-riscv-zbb-zbc.c - RISC-V Zbb+Zbc accelerated CRC implementation + * Copyright (C) 2025 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ * SPDX-License-Identifier: LGPL-2.1-or-later + * + */ + +#include +#include +#include +#include + +#include "g10lib.h" + +#include "bufhelp.h" + + +#if defined (__riscv) && \ + (__riscv_xlen == 64) && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) + + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + + +typedef struct +{ + u64 lo; + u64 hi; +} u64x2; + + +/* Constants structure for generic reflected/non-reflected CRC32 CLMUL + * functions. */ +struct crc32_consts_s +{ + /* k: { x^(32*5), x^(32*3), x^(32*2) } mod P(x) */ + u64 k[3]; + /* my_p: { floor(x^64 / P(x)), P(x) } */ + u64 my_p[2]; +}; + + +/* CLMUL constants for CRC32 and CRC32RFC1510. */ +static const struct crc32_consts_s crc32_consts = +{ + { /* k[3] = reverse_33bits( x^(32*y) mod P(x) ) */ + U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */ + U64_C(0x163cd6124) /* y = 2 */ + }, + { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */ + U64_C(0x1f7011641), U64_C(0x1db710641) + } +}; + +/* CLMUL constants for CRC24RFC2440 (polynomial multiplied with x?). */ +static const struct crc32_consts_s crc24rfc2440_consts = +{ + { /* k[3] = x^(32*y) mod P(x) << 32*/ + U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */ + U64_C(0xd9fe8c00) << 32 /* y = 2 */ + }, + { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */ + U64_C(0x1f845fe24), U64_C(0x1864cfb00) + } +}; + + +static ASM_FUNC_ATTR_INLINE u64 +clmul_low(u64 a, u64 b) +{ + u64 out; + asm (".option push;\n\t" + ".option arch, +zbc;\n\t" + "clmul %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (out) + : "r" (a), "r" (b)); + return out; +} + +static ASM_FUNC_ATTR_INLINE u64 +clmul_high(u64 a, u64 b) +{ + u64 out; + asm (".option push;\n\t" + ".option arch, +zbc;\n\t" + "clmulh %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (out) + : "r" (a), "r" (b)); + return out; +} + +static ASM_FUNC_ATTR_INLINE u64 +byteswap_u64(u64 x) +{ + asm (".option push;\n\t" + ".option arch, +zbb;\n\t" + "rev8 %0, %1;\n\t" + ".option pop;\n\t" + : "=r" (x) + : "r" (x)); + return x; +} + +static ASM_FUNC_ATTR_INLINE u64 +and_u64(u64 a, u64 b) +{ + asm ("and %0, %1, %2;\n\t" + : "=r" (a) + : "r" (a), "r" (b)); + return a; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +byteswap_u64x2(u64x2 in) +{ + u64x2 out; + out.lo = byteswap_u64(in.hi); + out.hi = byteswap_u64(in.lo); + return out; +} + +static ASM_FUNC_ATTR_INLINE u64 +byteswap_u32(u64 x) +{ + return byteswap_u64(x) >> 32; +} + +static ASM_FUNC_ATTR_INLINE u64 +load_aligned_u32(const void *ptr) +{ + u64 out; + asm ("lw %0, 0(%1)" + : "=r" (out) + : "r" (ptr) + : "memory"); + return out; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +load_aligned_u64x2(const void *ptr) +{ + u64x2 vec; + + asm ("ld %0, 0(%1)" + : "=r" (vec.lo) + : "r" (ptr) + : "memory"); + asm ("ld %0, 8(%1)" + : "=r" (vec.hi) + : "r" (ptr) + : "memory"); + + return vec; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +clmul_128(u64 a, u64 b) +{ + u64x2 res; + res.lo = clmul_low(a, b); + res.hi = clmul_high(a, b); + return res; +} + +static ASM_FUNC_ATTR_INLINE u64x2 +xor_128(u64x2 a, u64x2 b) +{ + u64x2 res; + res.lo = a.lo ^ b.lo; + res.hi = a.hi ^ b.hi; + return res; +} + +static ASM_FUNC_ATTR_INLINE u64 +crc32r_reduction_4 (u64 data, u64 crc, const struct crc32_consts_s *consts) +{ + u64 step1, 
step2; + + step1 = clmul_low(data, consts->my_p[0]); + step1 = and_u64(step1, 0xFFFFFFFFU); + step2 = clmul_low(step1, consts->my_p[1]); + + return (step2 >> 32) ^ crc; +} + +static ASM_FUNC_ATTR_INLINE void +bulk_crc32r (u32 *pcrc, const byte **inbuf, size_t *inlen, + const struct crc32_consts_s *consts) +{ + u64 crc = *pcrc; + u64 k[2] = { consts->k[0], consts->k[1] }; + u64x2 x0, x1, x2; + + x0 = load_aligned_u64x2(*inbuf); + x0.lo ^= crc; + + *inbuf += 16; + *inlen -= 16; + + /* Fold by 128 bits */ + while (*inlen >= 16) + { + x2 = load_aligned_u64x2(*inbuf); + + x1 = clmul_128(x0.lo, k[0]); + x0 = clmul_128(x0.hi, k[1]); + + x0 = xor_128(x0, x2); + x0 = xor_128(x0, x1); + + *inbuf += 16; + *inlen -= 16; + } + + /* Reduce 128 bits to 96 bits */ + x1 = clmul_128(x0.lo, k[1]); + x1.lo ^= x0.hi; + + /* Reduce 96 bits to 64 bits */ + crc = (x1.lo >> 32) ^ (x1.hi << 32); + crc ^= clmul_low(x1.lo & 0xFFFFFFFFU, consts->k[2]); + + /* Reduce 64 bits to 32 bits */ + crc = crc32r_reduction_4(crc, crc >> 32, consts); + + *pcrc = crc; +} + +static ASM_FUNC_ATTR_INLINE u64 +tail_crc32r (u64 crc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u64 data; + + switch (inlen) + { + case 0: + default: + break; + case 1: + data = inbuf[0]; + data ^= crc; + data <<= 24; + crc >>= 8; + crc = crc32r_reduction_4(data, crc, consts); + break; + case 2: + data = (u32)inbuf[0] | ((u32)inbuf[1] << 8); + data ^= crc; + data <<= 16; + crc >>= 16; + crc = crc32r_reduction_4(data, crc, consts); + break; + case 3: + data = (u32)inbuf[0] | ((u32)inbuf[1] << 8) | ((u32)inbuf[2] << 16); + data ^= crc; + data <<= 8; + crc >>= 24; + crc = crc32r_reduction_4(data, crc, consts); + break; + } + + return crc; +} + +static ASM_FUNC_ATTR_INLINE void +do_crc32r (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u64 crc = *pcrc; + u64 data; + + if ((uintptr_t)inbuf & 3) + { + /* align input */ + size_t unaligned_len = (-(uintptr_t)inbuf) & 3; + + unaligned_len = unaligned_len < inlen ? 
unaligned_len : inlen; + crc = tail_crc32r(crc, inbuf, unaligned_len, consts); + + inbuf += unaligned_len; + inlen -= unaligned_len; + } + + while (inlen >= 4) + { + data = load_aligned_u32(inbuf); + data ^= crc; + + inlen -= 4; + inbuf += 4; + + crc = crc32r_reduction_4(data, 0, consts); + } + + *pcrc = tail_crc32r(crc, inbuf, inlen, consts); +} + +void ASM_FUNC_ATTR +_gcry_crc32_riscv_zbb_zbc (u32 *pcrc, const byte *inbuf, size_t inlen) +{ + const struct crc32_consts_s *consts = &crc32_consts; + + if (!inlen) + return; + + if (inlen >= 16) + { + size_t unaligned_len = (-(uintptr_t)inbuf) & 7; + if (inlen >= 16 + unaligned_len) + { + if (unaligned_len > 0) + { + /* align input */ + do_crc32r (pcrc, inbuf, unaligned_len, consts); + inbuf += unaligned_len; + inlen -= unaligned_len; + } + + bulk_crc32r (pcrc, &inbuf, &inlen, consts); + if (!inlen) + return; + } + } + + do_crc32r (pcrc, inbuf, inlen, consts); +} + +static ASM_FUNC_ATTR_INLINE u64 +crc32_reduction_4 (u64 data, u64 crc, + const struct crc32_consts_s *consts) +{ + u64 step1, step2; + + step1 = clmul_high((u64)data << 32, consts->my_p[0]); + step2 = clmul_low(step1, consts->my_p[1]); + + return (byteswap_u64(step2) >> 32) ^ crc; +} + +static ASM_FUNC_ATTR_INLINE void +bulk_crc32 (u32 *pcrc, const byte **inbuf, size_t *inlen, + const struct crc32_consts_s *consts) +{ + u64 crc = *pcrc; + u64 k[2] = { consts->k[0], consts->k[1] }; + u64x2 x0, x1, x2; + u64 temp; + + x0 = load_aligned_u64x2(*inbuf); + x0.lo ^= crc; + x0 = byteswap_u64x2(x0); + + *inbuf += 16; + *inlen -= 16; + + while (*inlen >= 16) + { + x2 = load_aligned_u64x2(*inbuf); + + x1 = clmul_128(x0.hi, k[0]); + x2 = byteswap_u64x2(x2); + x0 = clmul_128(x0.lo, k[1]); + + x1 = xor_128(x1, x2); + x0 = xor_128(x0, x1); + + *inbuf += 16; + *inlen -= 16; + } + + /* Reduce 128 bits to 96 bits */ + x2 = clmul_128(x0.hi, k[1]); + x2.hi ^= x0.lo; + + /* Reduce 96 bits to 64 bits */ + crc = (x2.hi << 32) ^ (x2.lo >> 32); + crc ^= clmul_high(and_u64(x2.hi, ~(u64)0xFFFFFFFFU), consts->k[2]); + + /* Reduce 64 bits to 32 bits */ + temp = clmul_high(and_u64(crc, ~(u64)0xFFFFFFFFU), consts->my_p[0]); + temp = clmul_low(temp, consts->my_p[1]); + crc = temp ^ (crc & 0xFFFFFFFFU); + + crc = byteswap_u32(crc); + + *pcrc = crc; +} + +static ASM_FUNC_ATTR_INLINE u64 +tail_crc32 (u64 crc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u64 data; + + switch (inlen) + { + case 0: + default: + break; + case 1: + data = inbuf[0]; + data ^= crc; + data = data & 0xffU; + crc = crc >> 8; + crc = crc32_reduction_4(data, crc, consts); + break; + case 2: + data = (u32)inbuf[0] | ((u32)inbuf[1] << 8); + data ^= crc; + data = byteswap_u32(data << 16); + crc = crc >> 16; + crc = crc32_reduction_4(data, crc, consts); + break; + case 3: + data = (u32)inbuf[0] | ((u32)inbuf[1] << 8) | ((u32)inbuf[2] << 16); + data ^= crc; + data = byteswap_u32(data << 8); + crc = crc >> 24; + crc = crc32_reduction_4(data, crc, consts); + break; + } + + return crc; +} + +static ASM_FUNC_ATTR_INLINE void +do_crc32 (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u64 crc = *pcrc; + u64 data; + + if ((uintptr_t)inbuf & 3) + { + /* align input */ + size_t unaligned_len = (-(uintptr_t)inbuf) & 3; + + unaligned_len = unaligned_len < inlen ? 
unaligned_len : inlen; + crc = tail_crc32(crc, inbuf, unaligned_len, consts); + + inbuf += unaligned_len; + inlen -= unaligned_len; + } + + while (inlen >= 4) + { + data = load_aligned_u32(inbuf); + data ^= crc; + data = byteswap_u32(data); + + inlen -= 4; + inbuf += 4; + + crc = crc32_reduction_4(data, 0, consts); + } + + *pcrc = tail_crc32(crc, inbuf, inlen, consts); +} + +void ASM_FUNC_ATTR +_gcry_crc24rfc2440_riscv_zbb_zbc (u32 *pcrc, const byte *inbuf, size_t inlen) +{ + const struct crc32_consts_s *consts = &crc24rfc2440_consts; + + if (!inlen) + return; + + if (inlen >= 16) + { + size_t unaligned_len = (-(uintptr_t)inbuf) & 7; + if (inlen >= 16 + unaligned_len) + { + if (unaligned_len > 0) + { + /* align input */ + do_crc32 (pcrc, inbuf, unaligned_len, consts); + inbuf += unaligned_len; + inlen -= unaligned_len; + } + + bulk_crc32 (pcrc, &inbuf, &inlen, consts); + if (!inlen) + return; + } + } + + do_crc32 (pcrc, inbuf, inlen, consts); +} + +#endif diff --git a/cipher/crc.c b/cipher/crc.c index 21ab8523..2692e599 100644 --- a/cipher/crc.c +++ b/cipher/crc.c @@ -64,6 +64,13 @@ # endif #endif /* USE_PPC_VPMSUM */ +/* USE_RISCV_ZBB_ZBC indicates whether to enable RISC-V Zbb+Zbc code. */ +#undef USE_RISCV_ZBB_ZBC +#if defined (__riscv) && (__riscv_xlen == 64) && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) +# define USE_RISCV_ZBB_ZBC 1 +#endif + typedef struct { @@ -77,6 +84,9 @@ typedef struct #endif #ifdef USE_PPC_VPMSUM unsigned int use_vpmsum:1; /* POWER vpmsum shall be used. */ +#endif +#ifdef USE_RISCV_ZBB_ZBC + unsigned int use_riscv_zbc:1; /* RISC-V Zbc shall be used. */ #endif byte buf[4]; } @@ -105,6 +115,13 @@ void _gcry_crc24rfc2440_ppc8_vpmsum (u32 *pcrc, const byte *inbuf, size_t inlen); #endif +#ifdef USE_RISCV_ZBB_ZBC +/*-- crc-ppc.c --*/ +void _gcry_crc32_riscv_zbb_zbc (u32 *pcrc, const byte *inbuf, size_t inlen); +void _gcry_crc24rfc2440_riscv_zbb_zbc (u32 *pcrc, const byte *inbuf, + size_t inlen); +#endif + /* * Code generated by universal_crc by Danjel McGougan @@ -402,7 +419,7 @@ crc32_next4 (u32 crc, u32 data) } static void -crc32_init (void *context, unsigned int flags) +generic_crc32_init (void *context, unsigned int flags, u32 init_value) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; u32 hwf = _gcry_get_hw_features (); @@ -417,11 +434,22 @@ crc32_init (void *context, unsigned int flags) #ifdef USE_PPC_VPMSUM ctx->use_vpmsum = !!(hwf & HWF_PPC_ARCH_2_07); #endif +#ifdef USE_RISCV_ZBB_ZBC + ctx->use_riscv_zbc = (hwf & HWF_RISCV_IMAFDC) + && (hwf & HWF_RISCV_ZBB) + && (hwf & HWF_RISCV_ZBC); +#endif (void)flags; (void)hwf; - ctx->CRC = 0 ^ 0xffffffffL; + ctx->CRC = init_value; +} + +static void +crc32_init (void *context, unsigned int flags) +{ + generic_crc32_init(context, flags, 0xffffffffUL); } static void @@ -452,6 +480,13 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen) return; } #endif +#ifdef USE_RISCV_ZBB_ZBC + if (ctx->use_riscv_zbc) + { + _gcry_crc32_riscv_zbb_zbc(&ctx->CRC, inbuf, inlen); + return; + } +#endif if (!inbuf || !inlen) return; @@ -504,24 +539,7 @@ crc32_final (void *context) static void crc32rfc1510_init (void *context, unsigned int flags) { - CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; - u32 hwf = _gcry_get_hw_features (); - -#ifdef USE_INTEL_PCLMUL - ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); - ctx->hwfeatures = hwf; -#endif -#ifdef USE_ARM_PMULL - ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); -#endif -#ifdef USE_PPC_VPMSUM - ctx->use_vpmsum = !!(hwf & HWF_PPC_ARCH_2_07); -#endif - 
- (void)flags; - (void)hwf; - - ctx->CRC = 0; + generic_crc32_init(context, flags, 0); } static void @@ -855,6 +873,11 @@ crc24rfc2440_init (void *context, unsigned int flags) #ifdef USE_PPC_VPMSUM ctx->use_vpmsum = !!(hwf & HWF_PPC_ARCH_2_07); #endif +#ifdef USE_RISCV_ZBB_ZBC + ctx->use_riscv_zbc = (hwf & HWF_RISCV_IMAFDC) + && (hwf & HWF_RISCV_ZBB) + && (hwf & HWF_RISCV_ZBC); +#endif (void)hwf; (void)flags; @@ -890,6 +913,13 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen) return; } #endif +#ifdef USE_RISCV_ZBB_ZBC + if (ctx->use_riscv_zbc) + { + _gcry_crc24rfc2440_riscv_zbb_zbc(&ctx->CRC, inbuf, inlen); + return; + } +#endif if (!inbuf || !inlen) return; diff --git a/configure.ac b/configure.ac index 1b7d79f3..81110e00 100644 --- a/configure.ac +++ b/configure.ac @@ -3805,6 +3805,10 @@ if test "$found" = "1" ; then powerpc-*-*) GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-ppc.lo" ;; + riscv64-*-*) + # Build with the RISC-V vector implementation + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS crc-riscv-zbb-zbc.lo" + ;; esac fi -- 2.48.1 From jussi.kivilinna at iki.fi Sat Aug 16 20:36:29 2025 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 16 Aug 2025 21:36:29 +0300 Subject: [PATCH 2/2] Require RISC-V B extension for vector intrinsics implementations In-Reply-To: <20250816183631.1760692-1-jussi.kivilinna@iki.fi> References: <20250816183631.1760692-1-jussi.kivilinna@iki.fi> Message-ID: <20250816183631.1760692-2-jussi.kivilinna@iki.fi> * cipher/Makefile.am (riscv_vector_cflags, riscv_vector_crypto_aes_cflags) (riscv_vector_crypto_sha_cflags, riscv_vector_crypto_gcm_cflags): Use MARCH_RVA22U64_WITH_VEC and MARCH_RVA23U64_BASE. * cipher/chacha20.c (chacha20_do_setkey) [USE_RISCV_V]: Require HWF_RISCV_B. * cipher/cipher-gcm.c (setupM) [GCM_USE_RISCV_ZVKG]: Likewise. * cipher/rijndael.c (do_setkey) [USE_RISCV_V_CRYPTO]: Likewise. (do_setkey) [USE_VP_RISCV]: Likewise. * cipher/sha256.c (sha256_common_init) [USE_RISCV_V_CRYPTO]: Likewise. * cipher/sha512.c (sha512_init_common) [USE_RISCV_V_CRYPTO]: Likewise. * configure.ac (MARCH_RVA22U64_WITH_VEC, MARCH_RVA23U64_BASE) (MARCH_RVA23U64_WITH_VEC_CRYPTO): New. * src/g10lib.h (HWF_RISCV_B): Insert before HWF_RISCV_V. * src/hwf-riscv.c (hwcap_features, hwprobe_features): Add HWF_RISCV_B. (detect_riscv_hwf_by_toolchain) [__riscv_zba&&__riscv_zbb&&__riscv_zbs]: Add faulty toolchain check for B extension. * src/hwfeatures.c (hwflist) [HAVE_CPU_ARCH_RISCV]: Add "riscv-b". -- This patch adds the B extension as a requirement for the vector intrinsics implementations to improve code generation. The B extension is mandatory in the RVA22U64 profile. It is unlikely to encounter the V extension (optional in RVA22U64, mandatory in RVA23U64) without the B extension.
Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 8 ++++---- cipher/chacha20.c | 3 ++- cipher/cipher-gcm.c | 5 +++-- cipher/rijndael.c | 8 +++++--- cipher/sha256.c | 8 +++++--- cipher/sha512.c | 7 ++++--- configure.ac | 17 +++++++++++++---- src/g10lib.h | 17 +++++++++-------- src/hwf-riscv.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/hwfeatures.c | 1 + 10 files changed, 88 insertions(+), 28 deletions(-) diff --git a/cipher/Makefile.am b/cipher/Makefile.am index b7a5c327..bbcd518a 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -359,7 +359,7 @@ serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile # Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) if ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS -riscv_vector_cflags = -O2 -march=rv64imafdcv -mstrict-align +riscv_vector_cflags = -O2 -march=@MARCH_RVA22U64_WITH_VEC@ -mstrict-align else if SUPPORT_CC_RISCV_MSTRICT_ALIGN riscv_vector_cflags = -O2 -mstrict-align @@ -382,9 +382,9 @@ rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile # Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) if ENABLE_RISCV_VECTOR_CRYPTO_INTRINSICS_EXTRA_CFLAGS -riscv_vector_crypto_aes_cflags = -O2 -march=rv64imafdcv_zvkned -mstrict-align -riscv_vector_crypto_sha_cflags = -O2 -march=rv64imafdcv_zvknha_zvknhb_zvkb -mstrict-align -riscv_vector_crypto_gcm_cflags = -O2 -march=rv64imafdcv_zvkg -mstrict-align +riscv_vector_crypto_aes_cflags = -O2 -march=@MARCH_RVA23U64_BASE@_zvkned -mstrict-align +riscv_vector_crypto_sha_cflags = -O2 -march=@MARCH_RVA23U64_BASE@_zvknha_zvknhb_zvkb -mstrict-align +riscv_vector_crypto_gcm_cflags = -O2 -march=@MARCH_RVA23U64_BASE@_zvkg -mstrict-align else if SUPPORT_CC_RISCV_MSTRICT_ALIGN riscv_vector_crypto_aes_cflags = -O2 -mstrict-align diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 848adbe5..17b9b9f0 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -564,7 +564,8 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #endif #ifdef USE_RISCV_V ctx->use_riscv_v = (features & HWF_RISCV_IMAFDC) - && (features & HWF_RISCV_V) + && (features & HWF_RISCV_B) /* Mandatory in RVA22U64 */ + && (features & HWF_RISCV_V) /* Optional in RVA22U64 */ && _gcry_chacha20_riscv_v_check_hw(); #endif diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 143ae52a..4c9f9ff5 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -640,8 +640,9 @@ setupM (gcry_cipher_hd_t c) #endif #ifdef GCM_USE_RISCV_ZVKG else if ((features & HWF_RISCV_IMAFDC) - && (features & HWF_RISCV_V) - && (features & HWF_RISCV_ZVKG) + && (features & HWF_RISCV_B) /* Mandatory in RVA23U64 */ + && (features & HWF_RISCV_V) /* Mandatory in RVA23U64 */ + && (features & HWF_RISCV_ZVKG) /* Optional in RVA23U64 */ && _gcry_ghash_setup_riscv_zvkg (c)) { c->u_mode.gcm.ghash_fn = _gcry_ghash_riscv_zvkg; diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 52500e59..972685b4 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -833,8 +833,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, #endif #ifdef USE_RISCV_V_CRYPTO else if ((hwfeatures & HWF_RISCV_IMAFDC) - && (hwfeatures & HWF_RISCV_V) - && (hwfeatures & HWF_RISCV_ZVKNED) + && (hwfeatures & HWF_RISCV_B) /* Mandatory in RVA23U64 */ + && (hwfeatures & HWF_RISCV_V) /* Mandatory in RVA23U64 */ + && (hwfeatures & HWF_RISCV_ZVKNED) /* Optional in RVA23U64 */ && _gcry_aes_riscv_zvkned_setup_acceleration(ctx)) { hw_setkey = _gcry_aes_riscv_zvkned_setkey; @@ -859,7 +860,8 @@ do_setkey
(RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, #endif #ifdef USE_VP_RISCV else if ((hwfeatures & HWF_RISCV_IMAFDC) - && (hwfeatures & HWF_RISCV_V) + && (hwfeatures & HWF_RISCV_B) /* Mandatory in RVA22U64 */ + && (hwfeatures & HWF_RISCV_V) /* Optional in RVA22U64 */ && _gcry_aes_vp_riscv_setup_acceleration(ctx)) { hw_setkey = _gcry_aes_vp_riscv_do_setkey; diff --git a/cipher/sha256.c b/cipher/sha256.c index 27d4b1d4..abaf995d 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -352,9 +352,11 @@ sha256_common_init (SHA256_CONTEXT *hd) #endif #ifdef USE_RISCV_V_CRYPTO if ((features & HWF_RISCV_IMAFDC) - && (features & HWF_RISCV_V) - && (features & HWF_RISCV_ZVKB) - && ((features & HWF_RISCV_ZVKNHA) || (features & HWF_RISCV_ZVKNHB)) + && (features & HWF_RISCV_B) /* Mandatory in RVA23U64 */ + && (features & HWF_RISCV_V) /* Mandatory in RVA23U64 */ + && (features & HWF_RISCV_ZVKB) /* Mandatory in RVA23U64 (Zvbb) */ + && ((features & HWF_RISCV_ZVKNHA) /* Optional in RVA23U64 (Zvkng) */ + || (features & HWF_RISCV_ZVKNHB)) /* Optional in RVA23U64 (Zvkng) */ && _gcry_sha256_riscv_v_check_hw()) hd->bctx.bwrite = do_sha256_transform_riscv_zvknha; #endif diff --git a/cipher/sha512.c b/cipher/sha512.c index a0c0bf1c..51bf6641 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -510,9 +510,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) #endif #ifdef USE_RISCV_V_CRYPTO if ((features & HWF_RISCV_IMAFDC) - && (features & HWF_RISCV_V) - && (features & HWF_RISCV_ZVKB) - && (features & HWF_RISCV_ZVKNHB) + && (features & HWF_RISCV_B) /* Mandatory in RVA23U64 */ + && (features & HWF_RISCV_V) /* Mandatory in RVA23U64 */ + && (features & HWF_RISCV_ZVKB) /* Mandatory in RVA23U64 (Zvbb) */ + && (features & HWF_RISCV_ZVKNHB) /* Optional in RVA23U64 (Zvkng) */ && _gcry_sha512_riscv_v_check_hw()) ctx->bctx.bwrite = do_sha512_transform_riscv_zvknhb; #endif diff --git a/configure.ac b/configure.ac index 81110e00..80d38496 100644 --- a/configure.ac +++ b/configure.ac @@ -2751,8 +2751,12 @@ if test "$gcry_cv_cc_riscv_vector_intrinsics" = "yes" ; then fi _gcc_cflags_save=$CFLAGS +# Enable B extension (Zba+Zbb+Zbs) to align with RVA22U64 profile and for +# better code generation for RISC-V vector implementations. +MARCH_RVA22U64_WITH_VEC=rv64imafdcv_zba_zbb_zbs +AC_SUBST([MARCH_RVA22U64_WITH_VEC]) # Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) -CFLAGS="$CFLAGS -O2 -march=rv64imafdcv -mstrict-align" +CFLAGS="$CFLAGS -O2 -march=$MARCH_RVA22U64_WITH_VEC -mstrict-align" if test "$gcry_cv_cc_riscv_vector_intrinsics" = "no" && test "$mpi_cpu_arch" = "riscv64" && @@ -2882,8 +2886,13 @@ if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" ; then fi _gcc_cflags_save=$CFLAGS +# Enable B extension (Zba+Zbb+Zbs) to align with RVA23U64 profile and for +# better code generation for RISC-V vector implementations. +MARCH_RVA23U64_BASE=${MARCH_RVA22U64_WITH_VEC} +MARCH_RVA23U64_WITH_VEC_CRYPTO=${MARCH_RVA23U64_BASE}_zvbc_zvkg_zvkn_zvks +AC_SUBST([MARCH_RVA23U64_BASE]) # Note: -mstrict-align needed for GCC-14 bug (disable unaligned vector loads) -CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" +CFLAGS="$CFLAGS -O2 -march=$MARCH_RVA23U64_WITH_VEC_CRYPTO -mstrict-align" if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "no" && test "$mpi_cpu_arch" = "riscv64" && @@ -2922,7 +2931,7 @@ if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" || # Setup flags for test if needed. 
_gcc_cflags_save=$CFLAGS if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then - CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" + CFLAGS="$CFLAGS -O2 -march=$MARCH_RVA23U64_WITH_VEC_CRYPTO -mstrict-align" fi AC_CACHE_CHECK([whether compiler has working RISC-V __riscv_vsha2cl intrinsics], @@ -2967,7 +2976,7 @@ if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics" = "yes" || # Setup flags for test. _gcc_cflags_save=$CFLAGS if test "$gcry_cv_cc_riscv_vector_crypto_intrinsics_cflags" = "yes"; then - CFLAGS="$CFLAGS -O2 -march=rv64imafdcv_zvbc_zvkg_zvkn_zvks -mstrict-align" + CFLAGS="$CFLAGS -O2 -march=$MARCH_RVA23U64_WITH_VEC_CRYPTO -mstrict-align" else CFLAGS="$CFLAGS -O2" fi diff --git a/src/g10lib.h b/src/g10lib.h index 6a4b9313..68ce5405 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -277,14 +277,15 @@ char **_gcry_strtokenize (const char *string, const char *delim); #elif defined(HAVE_CPU_ARCH_RISCV) #define HWF_RISCV_IMAFDC (1 << 0) -#define HWF_RISCV_V (1 << 1) -#define HWF_RISCV_ZBB (1 << 2) -#define HWF_RISCV_ZBC (1 << 3) -#define HWF_RISCV_ZVKB (1 << 4) -#define HWF_RISCV_ZVKG (1 << 5) -#define HWF_RISCV_ZVKNED (1 << 6) -#define HWF_RISCV_ZVKNHA (1 << 7) -#define HWF_RISCV_ZVKNHB (1 << 8) +#define HWF_RISCV_B (1 << 1) +#define HWF_RISCV_V (1 << 2) +#define HWF_RISCV_ZBB (1 << 3) +#define HWF_RISCV_ZBC (1 << 4) +#define HWF_RISCV_ZVKB (1 << 5) +#define HWF_RISCV_ZVKG (1 << 6) +#define HWF_RISCV_ZVKNED (1 << 7) +#define HWF_RISCV_ZVKNHA (1 << 8) +#define HWF_RISCV_ZVKNHB (1 << 9) #endif diff --git a/src/hwf-riscv.c b/src/hwf-riscv.c index 5a7cf777..6c642a10 100644 --- a/src/hwf-riscv.c +++ b/src/hwf-riscv.c @@ -90,6 +90,7 @@ static const struct hwcap_feature_map_s hwcap_features[] = { { HWCAP_ISA_IMAFDC, HWF_RISCV_IMAFDC }, { HWCAP_ISA('v'), HWF_RISCV_V }, + { HWCAP_ISA('b'), HWF_RISCV_B }, { HWCAP_ISA('b'), HWF_RISCV_ZBB }, }; @@ -216,6 +217,9 @@ static const struct hwprobe_feature_map_s hwprobe_features[] = { HWF_RISCV_HWPROBE_IMA_V, HWF_RISCV_V }, { HWF_RISCV_HWPROBE_EXT_ZBB, HWF_RISCV_ZBB }, { HWF_RISCV_HWPROBE_EXT_ZBC, HWF_RISCV_ZBC }, + { HWF_RISCV_HWPROBE_EXT_ZBA + | HWF_RISCV_HWPROBE_EXT_ZBB + | HWF_RISCV_HWPROBE_EXT_ZBS, HWF_RISCV_B }, { HWF_RISCV_HWPROBE_EXT_ZVKB, HWF_RISCV_ZVKB }, { HWF_RISCV_HWPROBE_EXT_ZVKG, HWF_RISCV_ZVKG }, { HWF_RISCV_HWPROBE_EXT_ZVKNED, HWF_RISCV_ZVKNED }, @@ -296,6 +300,44 @@ detect_riscv_hwf_by_toolchain (void) } #endif +#if defined(__riscv_zba) && __riscv_zba >= 1000000 && \ + defined(__riscv_zbb) && __riscv_zbb >= 1000000 && \ + defined(__riscv_zbs) && __riscv_zbs >= 1000000 && \ + defined(HAVE_GCC_INLINE_ASM_RISCV) + { + unsigned int tmp = 0; + + /* Early test for Zba instructions to detect faulty toolchain + * configuration. */ + asm volatile (".option push;\n\t" + ".option arch, +zba;\n\t" + "sh2add %0, %1, %2;\n\t" + ".option pop;\n\t" + : "=r" (tmp) + : "r" (321), "r" (123)); + + /* Early test for Zbb instructions to detect faulty toolchain + * configuration. */ + asm volatile (".option push;\n\t" + ".option arch, +zbb;\n\t" + "cpop %0, %1;\n\t" + ".option pop;\n\t" + : "=r" (tmp) + : "r" (321)); + + /* Early test for Zbs instructions to detect faulty toolchain + * configuration. 
+    asm volatile (".option push;\n\t"
+                  ".option arch, +zbs;\n\t"
+                  "bclr %0, %1, %2;\n\t"
+                  ".option pop;\n\t"
+                  : "=r" (tmp)
+                  : "r" (321), "r" (15));
+
+    features |= HWF_RISCV_B;
+  }
+#endif
+
 #if defined(__riscv_zbc) && __riscv_zbc >= 1000000 && \
     defined(HAVE_GCC_INLINE_ASM_RISCV)
   {
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index aae9fdd3..06709da7 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -93,6 +93,7 @@ static struct
     { HWF_S390X_VX,            "s390x-vx" },
 #elif defined(HAVE_CPU_ARCH_RISCV)
     { HWF_RISCV_IMAFDC,        "riscv-imafdc" },
+    { HWF_RISCV_B,             "riscv-b" },
     { HWF_RISCV_V,             "riscv-v" },
     { HWF_RISCV_ZBB,           "riscv-zbb" },
     { HWF_RISCV_ZBC,           "riscv-zbc" },
-- 
2.48.1

From jussi.kivilinna at iki.fi Tue Aug 19 07:59:36 2025
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Tue, 19 Aug 2025 08:59:36 +0300
Subject: [PATCH] cipher-gcm-riscv-zbb-zbc: add POLYVAL acceleration
Message-ID: <20250819055936.23560-1-jussi.kivilinna@iki.fi>

* cipher/cipher-gcm-riscv-zbb-zbc.c (_gcry_ghash_riscv_zbb_zbc): Rename
to ...
(ghash_polyval_riscv_zbb_zbc): ... this; Add 'is_polyval' argument.
(_gcry_ghash_riscv_zbb_zbc): New.
(_gcry_polyval_riscv_zbb_zbc): New.
* cipher/cipher-gcm.c [GCM_USE_RISCV_ZBB_ZBC]
(_gcry_polyval_riscv_zbb_zbc): New.
(setupM) [GCM_USE_RISCV_ZBB_ZBC]: Add setup for
'c->u_mode.gcm.polyval_fn'.
--

Benchmark on SpacemiT K1:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
  GCM-SIV auth  |      3.65 ns/B     261.4 MiB/s      5.84 c/B      1600

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
  GCM-SIV auth  |     0.861 ns/B      1108 MiB/s      1.38 c/B      1600

Signed-off-by: Jussi Kivilinna
---
 cipher/cipher-gcm-riscv-zbb-zbc.c | 24 +++++++++++++++++++-----
 cipher/cipher-gcm.c               |  5 +++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/cipher/cipher-gcm-riscv-zbb-zbc.c b/cipher/cipher-gcm-riscv-zbb-zbc.c
index 61539274..e32bfafe 100644
--- a/cipher/cipher-gcm-riscv-zbb-zbc.c
+++ b/cipher/cipher-gcm-riscv-zbb-zbc.c
@@ -190,9 +190,9 @@ reduction(u64x2x2 r0r1)
   return veor_u64x2(r0, r1);
 }
 
-ASM_FUNC_ATTR_NOINLINE unsigned int
-_gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
-                          size_t nblocks)
+static ASM_FUNC_ATTR_INLINE unsigned int
+ghash_polyval_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
+                            size_t nblocks, int is_polyval)
 {
   u64x2 rhash;
   u64x2 rh1;
@@ -211,7 +211,7 @@ _gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
   buf += 16;
   nblocks--;
 
-  rbuf = byteswap_u64x2(rbuf);
+  rbuf = is_polyval ? rbuf : byteswap_u64x2(rbuf);
 
   rhash = veor_u64x2(rhash, rbuf);
 
@@ -223,7 +223,7 @@ _gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
 
       rr0rr1 = pmul_128x128(rhash, rh1);
 
-      rbuf = byteswap_u64x2(rbuf);
+      rbuf = is_polyval ? rbuf : byteswap_u64x2(rbuf);
 
       rhash = reduction(rr0rr1);
 
@@ -240,6 +240,20 @@ _gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
   return 0;
 }
 
+ASM_FUNC_ATTR_NOINLINE unsigned int
+_gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
+                          size_t nblocks)
+{
+  return ghash_polyval_riscv_zbb_zbc(c, result, buf, nblocks, 0);
+}
+
+ASM_FUNC_ATTR_NOINLINE unsigned int
+_gcry_polyval_riscv_zbb_zbc(gcry_cipher_hd_t c, byte *result, const byte *buf,
+                            size_t nblocks)
+{
+  return ghash_polyval_riscv_zbb_zbc(c, result, buf, nblocks, 1);
+}
+
 static ASM_FUNC_ATTR_INLINE void
 gcm_lsh_1(void *r_out, u64x2 i)
 {
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 4c9f9ff5..a9c48551 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -107,6 +107,10 @@ extern void _gcry_ghash_setup_riscv_zbb_zbc(gcry_cipher_hd_t c);
 extern unsigned int _gcry_ghash_riscv_zbb_zbc(gcry_cipher_hd_t c,
                                               byte *result, const byte *buf,
                                               size_t nblocks);
+
+extern unsigned int _gcry_polyval_riscv_zbb_zbc(gcry_cipher_hd_t c,
+                                                byte *result, const byte *buf,
+                                                size_t nblocks);
 #endif /* GCM_USE_RISCV_ZBB_ZBC */
 
 #ifdef GCM_USE_RISCV_ZVKG
@@ -655,6 +659,7 @@ setupM (gcry_cipher_hd_t c)
       && (features & HWF_RISCV_ZBC))
     {
       c->u_mode.gcm.ghash_fn = _gcry_ghash_riscv_zbb_zbc;
+      c->u_mode.gcm.polyval_fn = _gcry_polyval_riscv_zbb_zbc;
       _gcry_ghash_setup_riscv_zbb_zbc (c);
     }
 #endif
-- 
2.48.1
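
The pattern used in the patch above — one static inline worker taking an
'is_polyval' flag, wrapped by two thin noinline entry points — keeps GHASH and
POLYVAL (the little-endian hash used by GCM-SIV) on a single bulk loop; the
only per-block difference is whether the 16-byte input is byte-reversed before
the carry-less multiply. The sketch below restates that structure in plain
portable C under stated assumptions: every name is made up for illustration,
and the GF(2^128) multiply and reduction are omitted since only the byte-order
handling differs between the two modes.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative 128-bit block; the real code uses vector types and the
 * Zbc carry-less multiply primitives. */
typedef struct { uint64_t lo, hi; } block128;

static block128
byteswap_block (block128 b)
{
  /* Reverse the 16 bytes of the block (GHASH byte order). */
  uint8_t in[16], out[16];
  block128 r;
  int i;

  memcpy (in, &b, 16);
  for (i = 0; i < 16; i++)
    out[i] = in[15 - i];
  memcpy (&r, out, 16);
  return r;
}

/* Shared worker; 'is_polyval' is a compile-time constant at each call
 * site, so the compiler drops the unused branch after inlining. */
static inline unsigned int
ghash_polyval_worker (block128 *acc, const uint8_t *buf, size_t nblocks,
                      int is_polyval)
{
  while (nblocks--)
    {
      block128 in;

      memcpy (&in, buf, 16);
      buf += 16;

      /* POLYVAL consumes blocks as-is; GHASH byte-reverses them first. */
      in = is_polyval ? in : byteswap_block (in);

      /* Accumulate; a real implementation would follow this XOR with the
       * carry-less multiply by the hash key H and the reduction step. */
      acc->lo ^= in.lo;
      acc->hi ^= in.hi;
    }
  return 0;
}

/* Thin entry points, mirroring the shape of _gcry_ghash_riscv_zbb_zbc and
 * _gcry_polyval_riscv_zbb_zbc in the patch. */
unsigned int
example_ghash (block128 *acc, const uint8_t *buf, size_t nblocks)
{
  return ghash_polyval_worker (acc, buf, nblocks, 0);
}

unsigned int
example_polyval (block128 *acc, const uint8_t *buf, size_t nblocks)
{
  return ghash_polyval_worker (acc, buf, nblocks, 1);
}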