[PATCH 5/6] chacha20: add RISC-V vector intrinsics implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Jan 6 16:08:52 CET 2025
* cipher/Makefile.am: Add 'chacha20-riscv-v.c' and
add ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS handling for
'chacha20-riscv-v.o' and 'chacha20-riscv-v.lo'.
* cipher/chacha20-riscv-v.c: New.
* cipher/chacha20.c (USE_RISCV_V): New.
(CHACHA20_context_s): Add 'use_riscv_v'.
[USE_RISCV_V] (_gcry_chacha20_riscv_v_blocks)
(_gcry_chacha20_riscv_v_check_hw): New.
(chacha20_blocks) [USE_RISCV_V]: Add RISC-V vector code path.
(chacha20_do_setkey) [USE_RISCV_V]: Add HW feature detection for
RISC-V vector implementation.
* configure.ac: Add 'chacha20-riscv-v.lo'.
--
This patch adds a RISC-V vector extension (RVV) intrinsics implementation of
ChaCha20. A variable-length vector code path handles large inputs (4 or more
blocks) and a fixed-width 128-bit vector code path handles shorter inputs.
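For reference, here is a minimal scalar sketch of the ChaCha20 quarter-round
that the QUARTERROUND_4/QUARTERROUND4 macros in the new file vectorize across
32-bit lanes (illustration only, not part of the patch):

  /* One scalar ChaCha20 quarter-round (RFC 8439); the RVV code applies
   * this to many lanes at once. */
  #define ROTL32(x, c)  (((x) << (c)) | ((x) >> (32 - (c))))
  #define QR(a, b, c, d)                        \
    do {                                        \
      a += b; d ^= a; d = ROTL32(d, 16);        \
      c += d; b ^= c; b = ROTL32(b, 12);        \
      a += b; d ^= a; d = ROTL32(d, 8);         \
      c += d; b ^= c; b = ROTL32(b, 7);         \
    } while (0)

In the vector code the 16-bit and 8-bit rotations are done with vrgather
halfword/byte shuffles (ROTATE16/ROTATE8) instead of shift-and-combine, while
the 12-bit and 7-bit rotations use the generic shift/add form (ROTATE).
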
Benchmark on SpacemiT K1 (1600 MHz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     10.67 ns/B     89.37 MiB/s     17.07 c/B

After (~3.1x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.41 ns/B     279.9 MiB/s      5.45 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 10 +-
cipher/chacha20-riscv-v.c | 565 ++++++++++++++++++++++++++++++++++++++
cipher/chacha20.c | 29 ++
configure.ac | 4 +
4 files changed, 606 insertions(+), 2 deletions(-)
create mode 100644 cipher/chacha20-riscv-v.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a0a4d7d8..d871d38d 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -87,8 +87,8 @@ EXTRA_libcipher_la_SOURCES = \
cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
- chacha20-ppc.c chacha20-s390x.S \
- chacha20-p10le-8x.s \
+ chacha20-ppc.c chacha20-s390x.S chacha20-p10le-8x.s \
+ chacha20-riscv-v.c \
cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \
cipher-gcm-aarch64-simd.c cipher-gcm-armv7-neon.S \
cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
@@ -359,6 +359,12 @@ else
riscv_vector_cflags =
endif
+chacha20-riscv-v.o: $(srcdir)/chacha20-riscv-v.c Makefile
+ `echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
+
+chacha20-riscv-v.lo: $(srcdir)/chacha20-riscv-v.c Makefile
+ `echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
+
rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile
`echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/chacha20-riscv-v.c b/cipher/chacha20-riscv-v.c
new file mode 100644
index 00000000..1304a333
--- /dev/null
+++ b/cipher/chacha20-riscv-v.c
@@ -0,0 +1,565 @@
+/* chacha20-riscv-v.c - RISC-V vector implementation of ChaCha20
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined (__riscv) && \
+ defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS) && \
+ defined(USE_CHACHA20)
+
+#include "simd-common-riscv.h"
+#include <riscv_vector.h>
+#include "bufhelp.h"
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+/**********************************************************************
+ RISC-V vector extension chacha20
+ **********************************************************************/
+
+#define ROTATE16(v) __riscv_vreinterpret_v_u16m1_u32m1( \
+ __riscv_vrgather_vv_u16m1( \
+ __riscv_vreinterpret_v_u32m1_u16m1(v), \
+ rot16, vl * 2))
+#define ROTATE8(v) __riscv_vreinterpret_v_u8m1_u32m1( \
+ __riscv_vrgather_vv_u8m1( \
+ __riscv_vreinterpret_v_u32m1_u8m1(v), \
+ rot8, vl * 4))
+#define ROTATE(v, c) __riscv_vadd_vv_u32m1( \
+ __riscv_vsll_vx_u32m1((v), (c), vl), \
+ __riscv_vsrl_vx_u32m1((v), 32 - (c), vl), vl)
+#define XOR(v, w) __riscv_vxor_vv_u32m1((v), (w), vl)
+#define PLUS(v, w) __riscv_vadd_vv_u32m1((v), (w), vl)
+#define WORD_ROL(v, c) __riscv_vrgather_vv_u32m1((v), (rol##c), vl)
+
+#define QUARTERROUND_4(a0, b0, c0, d0, a1, b1, c1, d1, \
+ a2, b2, c2, d2, a3, b3, c3, d3) \
+ a0 = PLUS(a0, b0); a1 = PLUS(a1, b1); \
+ a2 = PLUS(a2, b2); a3 = PLUS(a3, b3); \
+ d0 = XOR(d0, a0); d1 = XOR(d1, a1); \
+ d2 = XOR(d2, a2); d3 = XOR(d3, a3); \
+ d0 = ROTATE16(d0); d1 = ROTATE16(d1); \
+ d2 = ROTATE16(d2); d3 = ROTATE16(d3); \
+ c0 = PLUS(c0, d0); c1 = PLUS(c1, d1); \
+ c2 = PLUS(c2, d2); c3 = PLUS(c3, d3); \
+ b0 = XOR(b0, c0); b1 = XOR(b1, c1); \
+ b2 = XOR(b2, c2); b3 = XOR(b3, c3); \
+ b0 = ROTATE(b0, 12); b1 = ROTATE(b1, 12); \
+ b2 = ROTATE(b2, 12); b3 = ROTATE(b3, 12); \
+ a0 = PLUS(a0, b0); a1 = PLUS(a1, b1); \
+ a2 = PLUS(a2, b2); a3 = PLUS(a3, b3); \
+ d0 = XOR(d0, a0); d1 = XOR(d1, a1); \
+ d2 = XOR(d2, a2); d3 = XOR(d3, a3); \
+ d0 = ROTATE8(d0); d1 = ROTATE8(d1); \
+ d2 = ROTATE8(d2); d3 = ROTATE8(d3); \
+ c0 = PLUS(c0, d0); c1 = PLUS(c1, d1); \
+ c2 = PLUS(c2, d2); c3 = PLUS(c3, d3); \
+ b0 = XOR(b0, c0); b1 = XOR(b1, c1); \
+ b2 = XOR(b2, c2); b3 = XOR(b3, c3); \
+ b0 = ROTATE(b0, 7); b1 = ROTATE(b1, 7); \
+ b2 = ROTATE(b2, 7); b3 = ROTATE(b3, 7);
+
+#define QUARTERROUND4_2(x0, x1, x2, x3, y0, y1, y2, y3, rol_x1, rol_x2, rol_x3) \
+ x0 = PLUS(x0, x1); y0 = PLUS(y0, y1); \
+ x3 = XOR(x3, x0); y3 = XOR(y3, y0); \
+ x3 = ROTATE16(x3); y3 = ROTATE16(y3); \
+ x2 = PLUS(x2, x3); y2 = PLUS(y2, y3); \
+ x1 = XOR(x1, x2); y1 = XOR(y1, y2); \
+ x1 = ROTATE(x1, 12); y1 = ROTATE(y1, 12); \
+ x0 = PLUS(x0, x1); y0 = PLUS(y0, y1); \
+ x3 = XOR(x3, x0); y3 = XOR(y3, y0); \
+ x3 = ROTATE8(x3); y3 = ROTATE8(y3); \
+ x2 = PLUS(x2, x3); y2 = PLUS(y2, y3); \
+ x3 = WORD_ROL(x3, rol_x3); y3 = WORD_ROL(y3, rol_x3);\
+ x1 = XOR(x1, x2); y1 = XOR(y1, y2); \
+ x2 = WORD_ROL(x2, rol_x2); y2 = WORD_ROL(y2, rol_x2); \
+ x1 = ROTATE(x1, 7); y1 = ROTATE(y1, 7); \
+ x1 = WORD_ROL(x1, rol_x1); y1 = WORD_ROL(y1, rol_x1);
+
+#define QUARTERROUND4(x0, x1, x2, x3, rol_x1, rol_x2, rol_x3) \
+ x0 = PLUS(x0, x1); x3 = XOR(x3, x0); x3 = ROTATE16(x3); \
+ x2 = PLUS(x2, x3); x1 = XOR(x1, x2); x1 = ROTATE(x1, 12); \
+ x0 = PLUS(x0, x1); x3 = XOR(x3, x0); x3 = ROTATE8(x3); \
+ x2 = PLUS(x2, x3); \
+ x3 = WORD_ROL(x3, rol_x3); \
+ x1 = XOR(x1, x2); \
+ x2 = WORD_ROL(x2, rol_x2); \
+ x1 = ROTATE(x1, 7); \
+ x1 = WORD_ROL(x1, rol_x1);
+
+#define ADD_U64(a, b) __riscv_vreinterpret_v_u64m1_u32m1( \
+ __riscv_vadd_vv_u64m1( \
+ __riscv_vreinterpret_v_u32m1_u64m1(a), \
+ __riscv_vreinterpret_v_u32m1_u64m1(b), vl / 2))
+
+#define vxor_v_u32m1_u32m1x8(data, idx, vs, vl) \
+ __riscv_vset_v_u32m1_u32m1x8((data), (idx), \
+ __riscv_vxor_vv_u32m1( \
+ __riscv_vget_v_u32m1x8_u32m1((data), (idx)), (vs), (vl)))
+
+static ASM_FUNC_ATTR_INLINE vuint16m1_t
+gen_rot16(size_t vl)
+{
+ return __riscv_vxor_vx_u16m1(__riscv_vid_v_u16m1(vl * 2), 1, vl * 2);
+}
+
+static ASM_FUNC_ATTR_INLINE vuint8m1_t
+gen_rot8(size_t vl)
+{
+ vuint8m1_t rot8, rot8_hi;
+
+ rot8 = __riscv_vid_v_u8m1(vl * 4);
+ rot8_hi = __riscv_vand_vx_u8m1(rot8, ~3, vl * 4);
+ rot8 = __riscv_vadd_vx_u8m1(rot8, 3, vl * 4);
+ rot8 = __riscv_vand_vx_u8m1(rot8, 3, vl * 4);
+ rot8 = __riscv_vadd_vv_u8m1(rot8, rot8_hi, vl * 4);
+
+ return rot8;
+}
+
+static ASM_FUNC_ATTR_INLINE vuint16m2_t
+gen_indexes(size_t vl, size_t stride)
+{
+ vuint16m2_t idx = __riscv_vid_v_u16m2(vl * 4);
+ vuint16m2_t idx_lo = __riscv_vand_vx_u16m2(idx, 3, vl * 4);
+ vuint16m2_t idx_hi = __riscv_vsrl_vx_u16m2(idx, 2, vl * 4);
+ idx_hi = __riscv_vmul_vx_u16m2(idx_hi, stride, vl * 4);
+ return __riscv_vadd_vv_u16m2(idx_hi, idx_lo, vl * 4);
+}
+
+static ASM_FUNC_ATTR_INLINE vuint32m1x8_t
+unaligned_vlsseg8e32_v_u32m1x8(const void *src, size_t vl)
+{
+ const byte *bsrc = src;
+ vuint16m2_t indexes;
+ vuint8m1_t b0, b1, b2, b3, b4, b5, b6, b7;
+ vuint32m1x8_t data;
+
+ if (LIKELY(((uintptr_t)src & 3) == 0))
+ {
+ /* Fast path for 32-bit aligned loads. */
+ return __riscv_vlsseg8e32_v_u32m1x8(src, 64, vl);
+ }
+
+ indexes = gen_indexes(4 * vl, 64);
+
+ b0 = __riscv_vluxei16_v_u8m1(bsrc + 0 * 4, indexes, vl * 4);
+ b1 = __riscv_vluxei16_v_u8m1(bsrc + 1 * 4, indexes, vl * 4);
+ b2 = __riscv_vluxei16_v_u8m1(bsrc + 2 * 4, indexes, vl * 4);
+ b3 = __riscv_vluxei16_v_u8m1(bsrc + 3 * 4, indexes, vl * 4);
+ b4 = __riscv_vluxei16_v_u8m1(bsrc + 4 * 4, indexes, vl * 4);
+ b5 = __riscv_vluxei16_v_u8m1(bsrc + 5 * 4, indexes, vl * 4);
+ b6 = __riscv_vluxei16_v_u8m1(bsrc + 6 * 4, indexes, vl * 4);
+ b7 = __riscv_vluxei16_v_u8m1(bsrc + 7 * 4, indexes, vl * 4);
+
+ data = __riscv_vundefined_u32m1x8();
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 0, __riscv_vreinterpret_v_u8m1_u32m1(b0));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 1, __riscv_vreinterpret_v_u8m1_u32m1(b1));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 2, __riscv_vreinterpret_v_u8m1_u32m1(b2));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 3, __riscv_vreinterpret_v_u8m1_u32m1(b3));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 4, __riscv_vreinterpret_v_u8m1_u32m1(b4));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 5, __riscv_vreinterpret_v_u8m1_u32m1(b5));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 6, __riscv_vreinterpret_v_u8m1_u32m1(b6));
+ data = __riscv_vset_v_u32m1_u32m1x8(
+ data, 7, __riscv_vreinterpret_v_u8m1_u32m1(b7));
+
+ return data;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+unaligned_vssseg8e32_v_u32m1x8(void *dst, vuint32m1x8_t data, size_t vl)
+{
+ byte *bdst = dst;
+ vuint16m2_t indexes;
+ vuint8m1_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ if (LIKELY(((uintptr_t)dst & 3) == 0))
+ {
+ /* Fast path for 32-bit aligned stores. */
+ __riscv_vssseg8e32_v_u32m1x8(dst, 64, data, vl);
+ return;
+ }
+
+ indexes = gen_indexes(4 * vl, 64);
+
+ b0 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 0));
+ b1 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 1));
+ b2 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 2));
+ b3 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 3));
+ b4 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 4));
+ b5 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 5));
+ b6 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 6));
+ b7 = __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vget_v_u32m1x8_u32m1(data, 7));
+
+ __riscv_vsuxei16_v_u8m1(bdst + 0 * 4, indexes, b0, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 1 * 4, indexes, b1, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 2 * 4, indexes, b2, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 3 * 4, indexes, b3, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 4 * 4, indexes, b4, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 5 * 4, indexes, b5, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 6 * 4, indexes, b6, vl * 4);
+ __riscv_vsuxei16_v_u8m1(bdst + 7 * 4, indexes, b7, vl * 4);
+}
+
+static ASM_FUNC_ATTR_INLINE unsigned int
+chacha20_rvv_blocks(u32 *input, byte *dst, const byte *src, size_t nblks)
+{
+ unsigned int i;
+
+ if (nblks == 0)
+ return 0;
+
+ /* Try to use the vector implementation when there are 4 or more blocks. */
+ if (nblks >= 4)
+ {
+ size_t vl = __riscv_vsetvl_e32m1(nblks) < 4
+ ? __riscv_vsetvl_e32m1(4) : __riscv_vsetvl_e32m1(nblks);
+ vuint32m1_t x0, x1, x2, x3, x4, x5, x6, x7;
+ vuint32m1_t x8, x9, x10, x11, x12, x13, x14, x15;
+ u32 s0, s1, s2, s3, s4, s5, s6, s7;
+ u32 s8, s9, s10, s11, s12, s13, s14, s15;
+ vuint16m1_t rot16 = gen_rot16(vl);
+ vuint8m1_t rot8 = gen_rot8(vl);
+
+ s0 = input[0];
+ s1 = input[1];
+ s2 = input[2];
+ s3 = input[3];
+ s4 = input[4];
+ s5 = input[5];
+ s6 = input[6];
+ s7 = input[7];
+ s8 = input[8];
+ s9 = input[9];
+ s10 = input[10];
+ s11 = input[11];
+ s12 = input[12];
+ s13 = input[13];
+ s14 = input[14];
+ s15 = input[15];
+
+ while (nblks >= 4)
+ {
+ vuint32m1_t ctr;
+ vbool32_t carry;
+ vuint32m1x8_t data;
+
+ if (vl < 4)
+ break;
+
+ x0 = __riscv_vmv_v_x_u32m1(s0, vl);
+ x1 = __riscv_vmv_v_x_u32m1(s1, vl);
+ x2 = __riscv_vmv_v_x_u32m1(s2, vl);
+ x3 = __riscv_vmv_v_x_u32m1(s3, vl);
+ x4 = __riscv_vmv_v_x_u32m1(s4, vl);
+ x5 = __riscv_vmv_v_x_u32m1(s5, vl);
+ x6 = __riscv_vmv_v_x_u32m1(s6, vl);
+ x7 = __riscv_vmv_v_x_u32m1(s7, vl);
+ x8 = __riscv_vmv_v_x_u32m1(s8, vl);
+ x9 = __riscv_vmv_v_x_u32m1(s9, vl);
+ x10 = __riscv_vmv_v_x_u32m1(s10, vl);
+ x11 = __riscv_vmv_v_x_u32m1(s11, vl);
+ x13 = __riscv_vmv_v_x_u32m1(s13, vl);
+ x14 = __riscv_vmv_v_x_u32m1(s14, vl);
+ x15 = __riscv_vmv_v_x_u32m1(s15, vl);
+
+ ctr = __riscv_vid_v_u32m1(vl);
+ carry = __riscv_vmadc_vx_u32m1_b32(ctr, s12, vl);
+ ctr = __riscv_vadd_vx_u32m1(ctr, s12, vl);
+ x12 = ctr;
+ x13 = __riscv_vadc_vxm_u32m1(x13, 0, carry, vl);
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND_4(x0, x4, x8, x12,
+ x1, x5, x9, x13,
+ x2, x6, x10, x14,
+ x3, x7, x11, x15);
+ QUARTERROUND_4(x0, x5, x10, x15,
+ x1, x6, x11, x12,
+ x2, x7, x8, x13,
+ x3, x4, x9, x14);
+ }
+
+ x0 = __riscv_vadd_vx_u32m1(x0, s0, vl);
+ x1 = __riscv_vadd_vx_u32m1(x1, s1, vl);
+ x2 = __riscv_vadd_vx_u32m1(x2, s2, vl);
+ x3 = __riscv_vadd_vx_u32m1(x3, s3, vl);
+ x4 = __riscv_vadd_vx_u32m1(x4, s4, vl);
+ x5 = __riscv_vadd_vx_u32m1(x5, s5, vl);
+ x6 = __riscv_vadd_vx_u32m1(x6, s6, vl);
+ x7 = __riscv_vadd_vx_u32m1(x7, s7, vl);
+ x8 = __riscv_vadd_vx_u32m1(x8, s8, vl);
+ x9 = __riscv_vadd_vx_u32m1(x9, s9, vl);
+ x10 = __riscv_vadd_vx_u32m1(x10, s10, vl);
+ x11 = __riscv_vadd_vx_u32m1(x11, s11, vl);
+ x12 = __riscv_vadd_vv_u32m1(x12, ctr, vl);
+ x13 = __riscv_vadc_vxm_u32m1(x13, s13, carry, vl);
+ x14 = __riscv_vadd_vx_u32m1(x14, s14, vl);
+ x15 = __riscv_vadd_vx_u32m1(x15, s15, vl);
+
+ s12 += vl;
+ s13 += s12 < vl;
+
+ data = unaligned_vlsseg8e32_v_u32m1x8((const void *)src, vl);
+
+ data = vxor_v_u32m1_u32m1x8(data, 0, x0, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 1, x1, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 2, x2, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 3, x3, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 4, x4, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 5, x5, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 6, x6, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 7, x7, vl);
+
+ unaligned_vssseg8e32_v_u32m1x8((void *)dst, data, vl);
+
+ data = unaligned_vlsseg8e32_v_u32m1x8((const void *)(src + 32), vl);
+
+ data = vxor_v_u32m1_u32m1x8(data, 0, x8, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 1, x9, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 2, x10, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 3, x11, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 4, x12, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 5, x13, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 6, x14, vl);
+ data = vxor_v_u32m1_u32m1x8(data, 7, x15, vl);
+
+ unaligned_vssseg8e32_v_u32m1x8((void *)(dst + 32), data, vl);
+
+ src += vl * 64;
+ dst += vl * 64;
+ nblks -= vl;
+ vl = __riscv_vsetvl_e32m1(nblks) < 4
+ ? __riscv_vsetvl_e32m1(4) : __riscv_vsetvl_e32m1(nblks);
+ }
+
+ input[12] = s12;
+ input[13] = s13;
+ }
+
+ /* Use SIMD implementation for remaining blocks. */
+ if (nblks > 0)
+ {
+ static const u32 rol_const[3][4] =
+ {
+ { 1, 2, 3, 0 },
+ { 2, 3, 0, 1 },
+ { 3, 0, 1, 2 }
+ };
+ static const u32 one_u64_const[4] = { 1, 0, 0, 0 };
+ size_t vl = 4;
+ vuint32m1_t rol1, rol2, rol3;
+ vuint32m1_t one_u64;
+ vuint32m1_t v0, v1, v2, v3;
+ vuint32m1_t v4, v5, v6, v7;
+ vuint32m1_t state0, state1, state2, state3;
+ vuint8m1_t i0, i1, i2, i3;
+ vuint8m1_t i4, i5, i6, i7;
+ vuint16m1_t rot16 = gen_rot16(vl);
+ vuint8m1_t rot8 = gen_rot8(vl);
+
+ rol1 = __riscv_vle32_v_u32m1(rol_const[0], vl);
+ rol2 = __riscv_vle32_v_u32m1(rol_const[1], vl);
+ rol3 = __riscv_vle32_v_u32m1(rol_const[2], vl);
+ one_u64 = __riscv_vle32_v_u32m1(one_u64_const, vl);
+
+ state0 = __riscv_vle32_v_u32m1(&input[0], vl);
+ state1 = __riscv_vle32_v_u32m1(&input[4], vl);
+ state2 = __riscv_vle32_v_u32m1(&input[8], vl);
+ state3 = __riscv_vle32_v_u32m1(&input[12], vl);
+
+ input[12] += nblks;
+ input[13] += input[12] < nblks;
+
+ /* SIMD 2x block implementation */
+ while (nblks >= 2)
+ {
+ v0 = state0;
+ v1 = state1;
+ v2 = state2;
+ v3 = state3;
+
+ v4 = state0;
+ v5 = state1;
+ v6 = state2;
+ v7 = state3;
+ v7 = ADD_U64(v7, one_u64);
+
+ i0 = __riscv_vle8_v_u8m1(src + 0 * 16, vl * 4);
+ i1 = __riscv_vle8_v_u8m1(src + 1 * 16, vl * 4);
+ i2 = __riscv_vle8_v_u8m1(src + 2 * 16, vl * 4);
+ i3 = __riscv_vle8_v_u8m1(src + 3 * 16, vl * 4);
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND4_2(v0, v1, v2, v3, v4, v5, v6, v7, 1, 2, 3);
+ QUARTERROUND4_2(v0, v1, v2, v3, v4, v5, v6, v7, 3, 2, 1);
+ }
+
+ v0 = __riscv_vadd_vv_u32m1(v0, state0, vl);
+ v1 = __riscv_vadd_vv_u32m1(v1, state1, vl);
+ v2 = __riscv_vadd_vv_u32m1(v2, state2, vl);
+ v3 = __riscv_vadd_vv_u32m1(v3, state3, vl);
+ state3 = ADD_U64(state3, one_u64);
+
+ v0 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i0),
+ v0, vl);
+ v1 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i1),
+ v1, vl);
+ v2 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i2),
+ v2, vl);
+ v3 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i3),
+ v3, vl);
+
+ v4 = __riscv_vadd_vv_u32m1(v4, state0, vl);
+ v5 = __riscv_vadd_vv_u32m1(v5, state1, vl);
+ v6 = __riscv_vadd_vv_u32m1(v6, state2, vl);
+ v7 = __riscv_vadd_vv_u32m1(v7, state3, vl);
+ state3 = ADD_U64(state3, one_u64);
+
+ i4 = __riscv_vle8_v_u8m1(src + 4 * 16, vl * 4);
+ i5 = __riscv_vle8_v_u8m1(src + 5 * 16, vl * 4);
+ i6 = __riscv_vle8_v_u8m1(src + 6 * 16, vl * 4);
+ i7 = __riscv_vle8_v_u8m1(src + 7 * 16, vl * 4);
+
+ __riscv_vse8_v_u8m1(dst + 0 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v0), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 1 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v1), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 2 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v2), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 3 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v3), vl * 4);
+
+ v4 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i4),
+ v4, vl);
+ v5 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i5),
+ v5, vl);
+ v6 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i6),
+ v6, vl);
+ v7 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i7),
+ v7, vl);
+
+ __riscv_vse8_v_u8m1(dst + 4 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v4), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 5 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v5), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 6 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v6), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 7 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v7), vl * 4);
+
+ src += 2 * 64;
+ dst += 2 * 64;
+
+ nblks -= 2;
+ }
+
+ /* 1x block implementation */
+ while (nblks)
+ {
+ v0 = state0;
+ v1 = state1;
+ v2 = state2;
+ v3 = state3;
+
+ i0 = __riscv_vle8_v_u8m1(src + 0 * 16, vl * 4);
+ i1 = __riscv_vle8_v_u8m1(src + 1 * 16, vl * 4);
+ i2 = __riscv_vle8_v_u8m1(src + 2 * 16, vl * 4);
+ i3 = __riscv_vle8_v_u8m1(src + 3 * 16, vl * 4);
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND4(v0, v1, v2, v3, 1, 2, 3);
+ QUARTERROUND4(v0, v1, v2, v3, 3, 2, 1);
+ }
+
+ v0 = __riscv_vadd_vv_u32m1(v0, state0, vl);
+ v1 = __riscv_vadd_vv_u32m1(v1, state1, vl);
+ v2 = __riscv_vadd_vv_u32m1(v2, state2, vl);
+ v3 = __riscv_vadd_vv_u32m1(v3, state3, vl);
+
+ state3 = ADD_U64(state3, one_u64);
+
+ v0 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i0),
+ v0, vl);
+ v1 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i1),
+ v1, vl);
+ v2 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i2),
+ v2, vl);
+ v3 = __riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_u8m1_u32m1(i3),
+ v3, vl);
+ __riscv_vse8_v_u8m1(dst + 0 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v0), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 1 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v1), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 2 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v2), vl * 4);
+ __riscv_vse8_v_u8m1(dst + 3 * 16,
+ __riscv_vreinterpret_v_u32m1_u8m1(v3), vl * 4);
+ src += 64;
+ dst += 64;
+
+ nblks--;
+ }
+ }
+
+ clear_vec_regs();
+
+ return 0;
+}
+
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT_O2
+#endif
+
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2
+_gcry_chacha20_riscv_v_blocks(u32 *state, byte *dst, const byte *src,
+ size_t nblks)
+{
+ return chacha20_rvv_blocks(state, dst, src, nblks);
+}
+
+unsigned int ASM_FUNC_ATTR FUNC_ATTR_OPT_O2
+_gcry_chacha20_riscv_v_check_hw(void)
+{
+ return (__riscv_vsetvl_e8m1(16) == 16);
+}
+
+#endif /* HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS */
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index ca8176f4..8b547db3 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -113,6 +113,12 @@
# endif /* USE_S390X_VX */
#endif
+/* USE_RISCV_V indicates whether to enable RISC-V vector extension code. */
+#undef USE_RISCV_V
+#if defined (__riscv) && defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS)
+# define USE_RISCV_V 1
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
@@ -137,6 +143,7 @@ typedef struct CHACHA20_context_s
unsigned int use_p9:1;
unsigned int use_p10:1;
unsigned int use_s390x:1;
+ unsigned int use_riscv_v:1;
} CHACHA20_context_t;
@@ -259,6 +266,16 @@ unsigned int _gcry_chacha20_poly1305_aarch64_blocks4(
#endif /* USE_AARCH64_SIMD */
+#ifdef USE_RISCV_V
+
+unsigned int _gcry_chacha20_riscv_v_blocks(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+unsigned int _gcry_chacha20_riscv_v_check_hw(void);
+
+#endif /* USE_RISCV_V */
+
static const char *selftest (void);
@@ -396,6 +413,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
}
#endif
+#ifdef USE_RISCV_V
+ if (ctx->use_riscv_v)
+ {
+ return _gcry_chacha20_riscv_v_blocks(ctx->input, dst, src, nblks);
+ }
+#endif
+
return do_chacha20_blocks (ctx->input, dst, src, nblks);
}
@@ -538,6 +562,11 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
#ifdef USE_S390X_VX
ctx->use_s390x = (features & HWF_S390X_VX) != 0;
#endif
+#ifdef USE_RISCV_V
+ ctx->use_riscv_v = (features & HWF_RISCV_IMAFDC)
+ && (features & HWF_RISCV_V)
+ && _gcry_chacha20_riscv_v_check_hw();
+#endif
(void)features;
diff --git a/configure.ac b/configure.ac
index fbe82695..4e9f1754 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3510,6 +3510,10 @@ if test "$found" = "1" ; then
# Build with the s390x/zSeries vector implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-s390x.lo"
;;
+ riscv64-*-*)
+ # Build with the RISC-V vector implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-riscv-v.lo"
+ ;;
esac
fi
--
2.45.2