[PATCH 3/6] Add RISC-V vector permute AES
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Jan 6 16:08:50 CET 2025
* cipher/Makefile.am: Add 'rijndael-vp-riscv.c' and
CFLAG handling for 'rijndael-vp-riscv.o' and 'rijndael-vp-riscv.lo'.
(ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS): New.
* cipher/rijndael-internal.h (USE_VP_RISCV): New.
* cipher/rijndael-vp-simd128.h [__ARM_NEON]: Move ARM NEON macros to ...
* cipher/rijndael-vp-aarch64.c: ... here.
* cipher/rijndael-vp-riscv.c: New.
* cipher/rijndael-vp-simd128.h: Use '__m128i_const' type for constant
vector values and use *_amemld() macros to load these values to vector
registers.
[__x86_64__] (vpaddd128, vpaddb128): Remove.
[__x86_64__] (psrl_byte_128, movdqa128_memld, pand128_amemld)
(paddq128_amemld, paddd128_amemld, pshufb128_amemld): New.
[HAVE_SIMD256] (aes_encrypt_core_4blks_simd256)
(aes_decrypt_core_4blks_simd256): New.
(FUNC_CTR_ENC, FUNC_CTR32LE_ENC, FUNC_CFB_DEC, FUNC_CBC_DEC)
(aes_simd128_ocb_enc, aes_simd128_ocb_dec, FUNC_OCB_AUTH)
(aes_simd128_ecb_enc, aes_simd128_ecb_dec, aes_simd128_xts_enc)
(aes_simd128_xts_dec) [HAVE_SIMD256]: Add 4 block parallel code paths
for HW with 256-bit wide vectors.
* cipher/rijndael.c [USE_VP_RISCV]
(_gcry_aes_vp_riscv_setup_acceleration, _gcry_aes_vp_riscv_do_setkey)
(_gcry_aes_vp_riscv_prepare_decryption, _gcry_aes_vp_riscv_encrypt)
(_gcry_aes_vp_riscv_decrypt, _gcry_aes_vp_riscv_cfb_enc)
(_gcry_aes_vp_riscv_cbc_enc, _gcry_aes_vp_riscv_ctr_enc)
(_gcry_aes_vp_riscv_ctr32le_enc, _gcry_aes_vp_riscv_cfb_dec)
(_gcry_aes_vp_riscv_cbc_dec, _gcry_aes_vp_riscv_ocb_crypt)
(_gcry_aes_vp_riscv_ocb_auth, _gcry_aes_vp_riscv_ecb_crypt)
(_gcry_aes_vp_riscv_xts_crypt): New.
(do_setkey) [USE_VP_RISCV]: Setup vector permute AES for RISC-V with
HWF_RISCV_IMAFDC and HWF_RISCV_V.
* cipher/simd-common-riscv.h: New.
* configure.ac: Add 'rijndael-vp-riscv.lo'.
(gcry_cv_cc_riscv_vector_intrinsics)
(gcry_cv_cc_riscv_vector_intrinsics_cflags): New.
--
Patch adds an AES vector permutation implementation for RISC-V, using
fixed vector lengths of 128 and 256 bits.
Benchmark on SpacemiT K1 (1600 MHz):
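As in the existing x86/AArch64 vpaes code, the AES S-box is computed with
16-byte table lookups. A rough sketch of how the new rijndael-vp-riscv.c
macros map the x86-style byte shuffle onto RVV intrinsics with a fixed
16-byte vector length (the helper name below is illustrative, not part of
the patch):

    #include <riscv_vector.h>

    /* Sketch: PSHUFB-style 16-byte table lookup with vl fixed to 16,
     * mirroring the pshufb128() macro added by this patch. */
    static inline vuint8m1_t
    shuffle16_bytes (vuint8m1_t table, vuint8m1_t indices)
    {
      return __riscv_vrgather_vv_u8m1 (table, indices, 16);
    }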
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 35.30 ns/B 27.02 MiB/s 56.48 c/B
ECB dec | 35.51 ns/B 26.86 MiB/s 56.81 c/B
CBC enc | 35.40 ns/B 26.94 MiB/s 56.63 c/B
CBC dec | 36.30 ns/B 26.27 MiB/s 58.08 c/B
CFB enc | 36.25 ns/B 26.31 MiB/s 58.00 c/B
CFB dec | 36.25 ns/B 26.31 MiB/s 58.00 c/B
OFB enc | 38.28 ns/B 24.91 MiB/s 61.25 c/B
OFB dec | 38.28 ns/B 24.91 MiB/s 61.26 c/B
CTR enc | 39.81 ns/B 23.96 MiB/s 63.69 c/B
CTR dec | 39.81 ns/B 23.96 MiB/s 63.69 c/B
XTS enc | 36.38 ns/B 26.22 MiB/s 58.20 c/B
XTS dec | 36.26 ns/B 26.30 MiB/s 58.01 c/B
OCB enc | 40.94 ns/B 23.29 MiB/s 65.50 c/B
OCB dec | 40.71 ns/B 23.43 MiB/s 65.13 c/B
OCB auth | 37.34 ns/B 25.54 MiB/s 59.75 c/B
After:
AES | nanosecs/byte mebibytes/sec cycles/byte speed vs old
ECB enc | 16.76 ns/B 56.90 MiB/s 26.82 c/B 2.11x
ECB dec | 19.94 ns/B 47.84 MiB/s 31.90 c/B 1.78x
CBC enc | 31.72 ns/B 30.06 MiB/s 50.75 c/B 1.12x
CBC dec | 20.24 ns/B 47.12 MiB/s 32.38 c/B 1.79x
CFB enc | 31.80 ns/B 29.99 MiB/s 50.88 c/B 1.14x
CFB dec | 16.87 ns/B 56.55 MiB/s 26.98 c/B 2.15x
OFB enc | 38.68 ns/B 24.66 MiB/s 61.88 c/B 0.99x
OFB dec | 38.65 ns/B 24.67 MiB/s 61.85 c/B 0.99x
CTR enc | 16.86 ns/B 56.57 MiB/s 26.97 c/B 2.36x
XTS enc | 17.49 ns/B 54.51 MiB/s 27.99 c/B 2.08x
XTS dec | 20.80 ns/B 45.86 MiB/s 33.27 c/B 1.74x
GCM enc | 31.16 ns/B 30.61 MiB/s 49.85 c/B 1.73x
OCB enc | 17.25 ns/B 55.28 MiB/s 27.60 c/B 2.37x
OCB dec | 20.64 ns/B 46.21 MiB/s 33.02 c/B 1.97x
OCB auth | 17.11 ns/B 55.73 MiB/s 27.38 c/B 2.18x
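The new paths are gated at run time on the available vector length, in
addition to the HWF_RISCV_IMAFDC and HWF_RISCV_V feature flags checked in
do_setkey(). A minimal sketch of the two checks, assuming the semantics of
the patch's _gcry_aes_vp_riscv_setup_acceleration() and
check_simd256_support() (the helper names below are illustrative):

    #include <riscv_vector.h>

    /* 128-bit paths: a vector register must hold one 16-byte AES block. */
    static int
    have_vp_riscv (void)
    {
      return __riscv_vsetvl_e8m1 (16) == 16;
    }

    /* 4-block SIMD256 paths: additionally need 256-bit (32-byte) vectors. */
    static int
    have_vp_riscv_simd256 (void)
    {
      return __riscv_vsetvl_e8m1 (32) == 32;
    }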
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 16 +-
cipher/rijndael-internal.h | 8 +-
cipher/rijndael-vp-aarch64.c | 60 +-
cipher/rijndael-vp-riscv.c | 285 ++++++++++
cipher/rijndael-vp-simd128.h | 1044 +++++++++++++++++++++++++++-------
cipher/rijndael.c | 80 +++
cipher/simd-common-riscv.h | 48 ++
configure.ac | 118 ++++
8 files changed, 1463 insertions(+), 196 deletions(-)
create mode 100644 cipher/rijndael-vp-riscv.c
create mode 100644 cipher/simd-common-riscv.h
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 90415d83..88b2d17c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -119,7 +119,8 @@ EXTRA_libcipher_la_SOURCES = \
rijndael-p10le.c rijndael-gcm-p10le.s \
rijndael-ppc-common.h rijndael-ppc-functions.h \
rijndael-s390x.c \
- rijndael-vp-aarch64.c rijndael-vp-simd128.h \
+ rijndael-vp-aarch64.c rijndael-vp-riscv.c \
+ rijndael-vp-simd128.h \
rmd160.c \
rsa.c \
salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
@@ -349,3 +350,16 @@ serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile
serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile
`echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
+
+if ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS
+# Note: -mstrict-align is needed to work around a GCC 14 bug (disables unaligned vector loads)
+riscv_vector_cflags = -O2 -march=rv64imafdcv -mstrict-align
+else
+riscv_vector_cflags =
+endif
+
+rijndael-vp-riscv.o: $(srcdir)/rijndael-vp-riscv.c Makefile
+ `echo $(COMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-vp-riscv.lo: $(srcdir)/rijndael-vp-riscv.c Makefile
+ `echo $(LTCOMPILE) $(riscv_vector_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 69ef86af..92310fc5 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -124,12 +124,18 @@
# endif
#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
-/* USE_ARM_CE indicates whether to enable vector permute AArch64 SIMD code. */
+/* USE_VP_AARCH64 indicates whether to enable vector permute AArch64 SIMD code. */
#undef USE_VP_AARCH64
#if defined(__AARCH64EL__) && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS)
# define USE_VP_AARCH64 1
#endif
+/* USE_VP_RISCV indicates whether to enable vector permute RISC-V code. */
+#undef USE_VP_RISCV
+#if defined (__riscv) && defined(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS)
+# define USE_VP_RISCV 1
+#endif
+
/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
* accelerated code. USE_PPC_CRYPTO_WITH_PPC9LE indicates whether to
* enable POWER9 optimized variant. */
diff --git a/cipher/rijndael-vp-aarch64.c b/cipher/rijndael-vp-aarch64.c
index 0532c421..9c8b852b 100644
--- a/cipher/rijndael-vp-aarch64.c
+++ b/cipher/rijndael-vp-aarch64.c
@@ -1,5 +1,5 @@
-/* SSSE3 vector permutation AES for Libgcrypt
- * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+/* AArch64 SIMD vector permutation AES for Libgcrypt
+ * Copyright (C) 2014-2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -50,6 +50,62 @@
#ifdef USE_VP_AARCH64
+/**********************************************************************
+ AT&T x86 asm to intrinsics conversion macros (ARM)
+ **********************************************************************/
+
+#include "simd-common-aarch64.h"
+#include <arm_neon.h>
+
+#define __m128i uint64x2_t
+
+#define pand128(a, o) (o = vandq_u64(o, a))
+#define pandn128(a, o) (o = vbicq_u64(a, o))
+#define pxor128(a, o) (o = veorq_u64(o, a))
+#define paddq128(a, o) (o = vaddq_u64(o, a))
+#define paddd128(a, o) (o = (__m128i)vaddq_u32((uint32x4_t)o, (uint32x4_t)a))
+#define paddb128(a, o) (o = (__m128i)vaddq_u8((uint8x16_t)o, (uint8x16_t)a))
+
+#define psrld128(s, o) (o = (__m128i)vshrq_n_u32((uint32x4_t)o, s))
+#define psraq128(s, o) (o = (__m128i)vshrq_n_s64((int64x2_t)o, s))
+#define psrldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \
+ o = (__m128i)vextq_u8((uint8x16_t)o, \
+ (uint8x16_t)__tmp, (s) & 15);})
+#define pslldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \
+ o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
+ (uint8x16_t)o, (16 - (s)) & 15);})
+#define psrl_byte_128(s, o) (o = (__m128i)vshrq_n_u8((uint8x16_t)o, s))
+
+#define pshufb128(m8, o) (o = (__m128i)vqtbl1q_u8((uint8x16_t)o, (uint8x16_t)m8))
+#define pshufd128(m32, a, o) ({ static const __m128i __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
+ __m128i __tmp2; \
+ movdqa128(a, o); \
+ movdqa128_memld(&__tmp1, __tmp2); \
+ pshufb128(__tmp2, o); })
+#define pshufd128_0x93(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 12))
+#define pshufd128_0xFF(a, o) (o = (__m128i)vdupq_laneq_u32((uint32x4_t)a, 3))
+#define pshufd128_0xFE(a, o) pshufd128(0xFE, a, o)
+#define pshufd128_0x4E(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8))
+
+#define palignr128(s, a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)o, s))
+
+#define movdqa128(a, o) (o = a)
+
+#define movdqa128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
+
+#define pand128_amemld(m, o) pand128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define pxor128_amemld(m, o) pxor128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define paddq128_amemld(m, o) paddq128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define paddd128_amemld(m, o) paddd128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+#define pshufb128_amemld(m, o) pshufb128((__m128i)vld1q_u8((const uint8_t *)(m)), o)
+
+/* Following operations may have unaligned memory input */
+#define movdqu128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
+
+/* Following operations may have unaligned memory output */
+#define movdqu128_memst(a, o) vst1q_u8((uint8_t *)(o), (uint8x16_t)a)
+
+
#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
#else
diff --git a/cipher/rijndael-vp-riscv.c b/cipher/rijndael-vp-riscv.c
new file mode 100644
index 00000000..b8c6ed13
--- /dev/null
+++ b/cipher/rijndael-vp-riscv.c
@@ -0,0 +1,285 @@
+/* RISC-V vector permutation AES for Libgcrypt
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The code is based on the public domain library libvpaes version 0.5
+ * available at http://crypto.stanford.edu/vpaes/ and which carries
+ * this notice:
+ *
+ * libvpaes: constant-time SSSE3 AES encryption and decryption.
+ * version 0.5
+ *
+ * By Mike Hamburg, Stanford University, 2009. Public domain.
+ * I wrote essentially all of this code. I did not write the test
+ * vectors; they are the NIST known answer tests. I hereby release all
+ * the code and documentation here that I wrote into the public domain.
+ *
+ * This is an implementation of AES following my paper,
+ * "Accelerating AES with Vector Permute Instructions"
+ * CHES 2009; http://shiftleft.org/papers/vector_aes/
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+
+#ifdef USE_VP_RISCV
+
+
+/**********************************************************************
+ AT&T x86 asm to intrinsics conversion macros (RISC-V)
+ **********************************************************************/
+
+#include <riscv_vector.h>
+#include "simd-common-riscv.h"
+
+/*
+ * SIMD128
+ */
+
+typedef vuint8m1_t __m128i;
+
+#define cast_m128i_to_s8(a) (__riscv_vreinterpret_v_u8m1_i8m1(a))
+#define cast_m128i_to_u32(a) (__riscv_vreinterpret_v_u8m1_u32m1(a))
+#define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a))
+#define cast_m128i_to_s64(a) (__riscv_vreinterpret_v_u64m1_i64m1(cast_m128i_to_u64(a)))
+
+#define cast_s8_to_m128i(a) (__riscv_vreinterpret_v_i8m1_u8m1(a))
+#define cast_u32_to_m128i(a) (__riscv_vreinterpret_v_u32m1_u8m1(a))
+#define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a))
+#define cast_s64_to_m128i(a) (cast_u64_to_m128i(__riscv_vreinterpret_v_i64m1_u64m1(a)))
+
+#define pand128(a, o) (o = __riscv_vand_vv_u8m1((o), (a), 16))
+#define pandn128(a, o) (o = __riscv_vand_vv_u8m1(__riscv_vnot_v_u8m1((o), 16), (a), 16))
+#define pxor128(a, o) (o = __riscv_vxor_vv_u8m1((o), (a), 16))
+#define paddb128(a, o) (o = __riscv_vadd_vv_u8m1((o), (a), 16))
+#define paddd128(a, o) (o = cast_u32_to_m128i(__riscv_vadd_vv_u32m1( \
+ cast_m128i_to_u32(o), \
+ cast_m128i_to_u32(a), 4)))
+#define paddq128(a, o) (o = cast_u64_to_m128i(__riscv_vadd_vv_u64m1( \
+ cast_m128i_to_u64(o), \
+ cast_m128i_to_u64(a), 2)))
+
+#define psrld128(s, o)  (o = cast_u32_to_m128i(__riscv_vsrl_vx_u32m1(cast_m128i_to_u32(o), (s), 4)))
+#define psraq128(s, o) (o = cast_s64_to_m128i(__riscv_vsra_vx_i64m1(cast_m128i_to_s64(o), (s), 2)))
+#define psrldq128(s, o) (o = __riscv_vslidedown_vx_u8m1((o), (s), 16))
+#define pslldq128(s, o) ({ vuint8m1_t __tmp = __riscv_vmv_v_x_u8m1(0, 16); \
+ o = __riscv_vslideup_vx_u8m1(__tmp, (o), (s), 16); })
+#define psrl_byte_128(s, o) (o = __riscv_vsrl_vx_u8m1((o), (s), 16))
+
+#define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16))
+#define pshufd128(m32, a, o) ({ static const __m128i_const __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
+ __m128i __tmp2; \
+ movdqa128(a, o); \
+ movdqa128_memld(&__tmp1, __tmp2); \
+ pshufb128(__tmp2, o); })
+
+#define pshufd128_0x93(a, o) pshufd128(0x93, a, o)
+#define pshufd128_0xFF(a, o) (o = cast_u32_to_m128i(__riscv_vrgather_vx_u32m1(cast_m128i_to_u32(a), 3, 4)))
+#define pshufd128_0xFE(a, o) pshufd128(0xFE, a, o)
+#define pshufd128_0x4E(a, o) pshufd128(0x4E, a, o)
+
+#define palignr128(s, a, o) (o = __riscv_vslideup_vx_u8m1(__riscv_vslidedown_vx_u8m1((a), (s), 16), (o), 16 - (s), 16))
+
+#define movdqa128(a, o) (o = (a))
+
+#define movdqa128_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 16))
+
+#define pand128_amemld(m, o) pand128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define pxor128_amemld(m, o) pxor128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define paddq128_amemld(m, o) paddq128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define paddd128_amemld(m, o) paddd128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+#define pshufb128_amemld(m, o) pshufb128(__riscv_vle8_v_u8m1((const void *)(m), 16), (o))
+
+/* Following operations may have unaligned memory input */
+#define movdqu128_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 16))
+
+/* Following operations may have unaligned memory output */
+#define movdqu128_memst(a, o) (__riscv_vse8_v_u8m1((void *)(o), (a), 16))
+
+/*
+ * SIMD256
+ */
+
+#define PSHUFD256_MASK_TO_PSHUFB256_MASK(m32) { \
+ M128I_BYTE(((((m32) >> 0) & 0x03) * 4) + 0, \
+ ((((m32) >> 0) & 0x03) * 4) + 1, \
+ ((((m32) >> 0) & 0x03) * 4) + 2, \
+ ((((m32) >> 0) & 0x03) * 4) + 3, \
+ ((((m32) >> 2) & 0x03) * 4) + 0, \
+ ((((m32) >> 2) & 0x03) * 4) + 1, \
+ ((((m32) >> 2) & 0x03) * 4) + 2, \
+ ((((m32) >> 2) & 0x03) * 4) + 3, \
+ ((((m32) >> 4) & 0x03) * 4) + 0, \
+ ((((m32) >> 4) & 0x03) * 4) + 1, \
+ ((((m32) >> 4) & 0x03) * 4) + 2, \
+ ((((m32) >> 4) & 0x03) * 4) + 3, \
+ ((((m32) >> 6) & 0x03) * 4) + 0, \
+ ((((m32) >> 6) & 0x03) * 4) + 1, \
+ ((((m32) >> 6) & 0x03) * 4) + 2, \
+ ((((m32) >> 6) & 0x03) * 4) + 3), \
+            M128I_BYTE(((((m32) >> 0) & 0x03) * 4) + 0 + 16, \
+ ((((m32) >> 0) & 0x03) * 4) + 1 + 16, \
+ ((((m32) >> 0) & 0x03) * 4) + 2 + 16, \
+ ((((m32) >> 0) & 0x03) * 4) + 3 + 16, \
+ ((((m32) >> 2) & 0x03) * 4) + 0 + 16, \
+ ((((m32) >> 2) & 0x03) * 4) + 1 + 16, \
+ ((((m32) >> 2) & 0x03) * 4) + 2 + 16, \
+ ((((m32) >> 2) & 0x03) * 4) + 3 + 16, \
+ ((((m32) >> 4) & 0x03) * 4) + 0 + 16, \
+ ((((m32) >> 4) & 0x03) * 4) + 1 + 16, \
+ ((((m32) >> 4) & 0x03) * 4) + 2 + 16, \
+ ((((m32) >> 4) & 0x03) * 4) + 3 + 16, \
+ ((((m32) >> 6) & 0x03) * 4) + 0 + 16, \
+ ((((m32) >> 6) & 0x03) * 4) + 1 + 16, \
+ ((((m32) >> 6) & 0x03) * 4) + 2 + 16, \
+ ((((m32) >> 6) & 0x03) * 4) + 3 + 16) }
+
+typedef vuint8m1_t __m256i;
+
+#define HAVE_SIMD256 1
+
+#define check_simd256_support() (__riscv_vsetvl_e8m1(32) == 32)
+
+#define cast_m256i_to_s8(a) cast_m128i_to_s8(a)
+#define cast_m256i_to_u32(a) cast_m128i_to_u32(a)
+#define cast_m256i_to_u64(a) cast_m128i_to_u64(a)
+#define cast_m256i_to_s64(a) cast_m128i_to_s64(a)
+
+#define cast_s8_to_m256i(a) (__riscv_vreinterpret_v_i8m1_u8m1(a))
+#define cast_u32_to_m256i(a) (__riscv_vreinterpret_v_u32m1_u8m1(a))
+#define cast_u64_to_m256i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a))
+#define cast_s64_to_m256i(a) (cast_u64_to_m128i(__riscv_vreinterpret_v_i64m1_u64m1(a)))
+
+#define pand256(a, o) (o = __riscv_vand_vv_u8m1((o), (a), 32))
+#define pandn256(a, o) (o = __riscv_vand_vv_u8m1(__riscv_vnot_v_u8m1((o), 32), (a), 32))
+#define pxor256(a, o) (o = __riscv_vxor_vv_u8m1((o), (a), 32))
+#define paddb256(a, o) (o = __riscv_vadd_vv_u8m1((o), (a), 32))
+#define paddd256(a, o) (o = cast_u32_to_m256i(__riscv_vadd_vv_u32m1( \
+ cast_m256i_to_u32(o), \
+ cast_m256i_to_u32(a), 8)))
+#define paddq256(a, o) (o = cast_u64_to_m256i(__riscv_vadd_vv_u64m1( \
+ cast_m256i_to_u64(o), \
+ cast_m256i_to_u64(a), 4)))
+
+#define psrld256(s, o)  (o = cast_u32_to_m256i(__riscv_vsrl_vx_u32m1(cast_m256i_to_u32(o), (s), 8)))
+#define psraq256(s, o) (o = cast_s64_to_m256i(__riscv_vsra_vx_i64m1(cast_m256i_to_s64(o), (s), 4)))
+#define psrl_byte_256(s, o) (o = __riscv_vsrl_vx_u8m1((o), (s), 32))
+
+/* Note: these are not PSHUFB equivalents, as the full 256-bit vector is used
+ * as a 32-byte table. 256-bit PSHUFB on x86 handles the 128-bit lanes
+ * separately, as two 16-byte tables. */
+
+/* tab32 variant: indexes have values 0..31. Used when 'm8' is constant and
+ * variable data is in 'o'. */
+#define pshufb256_tab32(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 32))
+
+/* tab16 variant: indexes have values 0..15 and only the low 128 bits of 'o'
+ * are used. Used when 'o' is constant and variable data is in 'm8'. */
+#define pshufb256_tab16(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 32))
+
+/* Load 16 byte mask for 'pshufb256_tab32' usage as if 256-bit PSHUFB was to be
+ * used as on x86 (two separate 128-bit lanes). */
+#define load_tab32_mask(m, o) ({ __m128i __tmp_lo128; \
+ __m128i __tmp_hi128; \
+ movdqu128_memld(m, __tmp_lo128); \
+ __tmp_hi128 = __riscv_vadd_vx_u8m1(__tmp_lo128, 16, 16); \
+ o = __riscv_vslideup_vx_u8m1(__tmp_lo128, __tmp_hi128, 16, 32); })
+
+#define broadcast128_256(a, o) (o = __riscv_vslideup_vx_u8m1((a), (a), 16, 32))
+
+/* Load 16 byte table for 'pshufb256_tab16' usage. On x86 this would splat
+ * a 128-bit table from memory to both 128-bit lanes of a 256-bit register.
+ * On RISC-V this just loads the table into the lower 128 bits. */
+#define load_tab16_table(m, o) movdqu128_memld(m, o)
+
+#define pshufd256(m32, a, o) ({ static const __m128i_const __tmp1 = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
+ __m256i __tmp2; \
+ movdqa256(a, o); \
+ load_tab32_mask(&__tmp1, __tmp2); \
+ pshufb256_tab32(__tmp2, o); })
+
+#define pshufd256_0x93(a, o) pshufd256(0x93, a, o)
+
+#define insert256_hi128(x, o) (o = __riscv_vslideup_vx_u8m1((o), (x), 16, 32))
+#define extract256_hi128(y, o) (o = __riscv_vslidedown_vx_u8m1((y), 16, 32))
+
+#define movdqa256(a, o) (o = (a))
+
+#define movdqa128_256(a, o) (o = (a))
+#define movdqa256_128(a, o) (o = (a))
+
+#define movdqa256_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 32))
+
+#define pand256_amemld(m, o) pand128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define pxor256_amemld(m, o) pxor128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define paddq256_amemld(m, o) paddq128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define paddd256_amemld(m, o) paddd128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define pshufb256_amemld(m, o) pshufb128(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+#define broadcast128_256_amemld(m, o) \
+ broadcast128_256(__riscv_vle8_v_u8m1((const void *)(m), 32), (o))
+
+/* Following operations may have unaligned memory input */
+#define movdqu256_memld(a, o) (o = __riscv_vle8_v_u8m1((const void *)(a), 32))
+
+/* Following operations may have unaligned memory output */
+#define movdqu256_memst(a, o) (__riscv_vse8_v_u8m1((void *)(o), (a), 32))
+
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+
+#define FUNC_ENCRYPT _gcry_aes_vp_riscv_encrypt
+#define FUNC_DECRYPT _gcry_aes_vp_riscv_decrypt
+#define FUNC_CFB_ENC _gcry_aes_vp_riscv_cfb_enc
+#define FUNC_CFB_DEC _gcry_aes_vp_riscv_cfb_dec
+#define FUNC_CBC_ENC _gcry_aes_vp_riscv_cbc_enc
+#define FUNC_CBC_DEC _gcry_aes_vp_riscv_cbc_dec
+#define FUNC_CTR_ENC _gcry_aes_vp_riscv_ctr_enc
+#define FUNC_CTR32LE_ENC _gcry_aes_vp_riscv_ctr32le_enc
+#define FUNC_OCB_CRYPT _gcry_aes_vp_riscv_ocb_crypt
+#define FUNC_OCB_AUTH _gcry_aes_vp_riscv_ocb_auth
+#define FUNC_ECB_CRYPT _gcry_aes_vp_riscv_ecb_crypt
+#define FUNC_XTS_CRYPT _gcry_aes_vp_riscv_xts_crypt
+#define FUNC_SETKEY _gcry_aes_vp_riscv_do_setkey
+#define FUNC_PREPARE_DEC _gcry_aes_vp_riscv_prepare_decryption
+
+#include "rijndael-vp-simd128.h"
+
+int
+_gcry_aes_vp_riscv_setup_acceleration(RIJNDAEL_context *ctx)
+{
+ (void)ctx;
+ return (__riscv_vsetvl_e8m1(16) == 16);
+}
+
+#endif /* USE_VP_RISCV */
diff --git a/cipher/rijndael-vp-simd128.h b/cipher/rijndael-vp-simd128.h
index f6fc8d5e..af8ee291 100644
--- a/cipher/rijndael-vp-simd128.h
+++ b/cipher/rijndael-vp-simd128.h
@@ -1,5 +1,5 @@
/* SIMD128 intrinsics implementation vector permutation AES for Libgcrypt
- * Copyright (C) 2024 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2024-2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -92,55 +92,7 @@
#define M128I_U64(a0, a1) { a0, a1 }
-#ifdef __ARM_NEON
-
-/**********************************************************************
- AT&T x86 asm to intrinsics conversion macros (ARM)
- **********************************************************************/
-
-#include "simd-common-aarch64.h"
-#include <arm_neon.h>
-
-#define __m128i uint64x2_t
-
-#define pand128(a, o) (o = vandq_u64(o, a))
-#define pandn128(a, o) (o = vbicq_u64(a, o))
-#define pxor128(a, o) (o = veorq_u64(o, a))
-#define paddq128(a, o) (o = vaddq_u64(o, a))
-#define paddd128(a, o) (o = (__m128i)vaddq_u32((uint32x4_t)o, (uint32x4_t)a))
-#define paddb128(a, o) (o = (__m128i)vaddq_u8((uint8x16_t)o, (uint8x16_t)a))
-
-#define psrld128(s, o) (o = (__m128i)vshrq_n_u32((uint32x4_t)o, s))
-#define psraq128(s, o) (o = (__m128i)vshrq_n_s64((int64x2_t)o, s))
-#define psrldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \
- o = (__m128i)vextq_u8((uint8x16_t)o, \
- (uint8x16_t)__tmp, (s) & 15);})
-#define pslldq128(s, o) ({ uint64x2_t __tmp = { 0, 0 }; \
- o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
- (uint8x16_t)o, (16 - (s)) & 15);})
-
-#define pshufb128(m8, o) (o = (__m128i)vqtbl1q_u8((uint8x16_t)o, (uint8x16_t)m8))
-#define pshufd128(m32, a, o) ({ static const __m128i __tmp = PSHUFD_MASK_TO_PSHUFB_MASK(m32); \
- movdqa128(a, o); \
- pshufb128(__tmp, o); })
-#define pshufd128_0x93(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 12))
-#define pshufd128_0xFF(a, o) (o = (__m128i)vdupq_laneq_u32((uint32x4_t)a, 3))
-#define pshufd128_0xFE(a, o) pshufd128(0xFE, a, o)
-#define pshufd128_0x4E(a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)a, 8))
-
-#define palignr128(s, a, o) (o = (__m128i)vextq_u8((uint8x16_t)a, (uint8x16_t)o, s))
-
-#define movdqa128(a, o) (o = a)
-
-#define pxor128_amemld(m, o) pxor128(*(const __m128i *)(m), o)
-
-/* Following operations may have unaligned memory input */
-#define movdqu128_memld(a, o) (o = (__m128i)vld1q_u8((const uint8_t *)(a)))
-
-/* Following operations may have unaligned memory output */
-#define movdqu128_memst(a, o) vst1q_u8((uint8_t *)(o), (uint8x16_t)a)
-
-#endif /* __ARM_NEON */
+typedef u64 __m128i_const[2] __attribute__ ((aligned (16)));
#if defined(__x86_64__) || defined(__i386__)
@@ -154,13 +106,12 @@
#define pandn128(a, o) (o = _mm_andnot_si128(o, a))
#define pxor128(a, o) (o = _mm_xor_si128(o, a))
#define paddq128(a, o) (o = _mm_add_epi64(o, a))
-#define vpaddd128(a, o) (o = _mm_add_epi32(o, a))
-#define vpaddb128(a, o) (o = _mm_add_epi8(o, a))
#define psrld128(s, o) (o = _mm_srli_epi32(o, s))
#define psraq128(s, o) (o = _mm_srai_epi64(o, s))
#define psrldq128(s, o) (o = _mm_srli_si128(o, s))
#define pslldq128(s, o) (o = _mm_slli_si128(o, s))
+#define psrl_byte_128(s, o) psrld128(s, o)
#define pshufb128(m8, o) (o = _mm_shuffle_epi8(o, m8))
#define pshufd128(m32, a, o) (o = _mm_shuffle_epi32(a, m32))
@@ -173,7 +124,13 @@
#define movdqa128(a, o) (o = a)
-#define pxor128_amemld(m, o) pxor128(*(const __m128i *)(m), o)
+#define movdqa128_memld(a, o) (o = (__m128i)_mm_load_si128((const void *)(a)))
+
+#define pand128_amemld(m, o) pand128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define pxor128_amemld(m, o) pxor128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define paddq128_amemld(m, o) paddq128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define paddd128_amemld(m, o) paddd128((__m128i)_mm_load_si128((const void *)(m)), o)
+#define pshufb128_amemld(m, o) pshufb128((__m128i)_mm_load_si128((const void *)(m)), o)
/* Following operations may have unaligned memory input */
#define movdqu128_memld(a, o) (o = _mm_loadu_si128((const __m128i *)(a)))
@@ -225,73 +182,73 @@
constant vectors
**********************************************************************/
-static const __m128i k_s0F =
+static const __m128i_const k_s0F =
M128I_U64(
0x0F0F0F0F0F0F0F0F,
0x0F0F0F0F0F0F0F0F
);
-static const __m128i k_iptlo =
+static const __m128i_const k_iptlo =
M128I_U64(
0xC2B2E8985A2A7000,
0xCABAE09052227808
);
-static const __m128i k_ipthi =
+static const __m128i_const k_ipthi =
M128I_U64(
0x4C01307D317C4D00,
0xCD80B1FCB0FDCC81
);
-static const __m128i k_inv =
+static const __m128i_const k_inv =
M128I_U64(
0x0E05060F0D080180,
0x040703090A0B0C02
);
-static const __m128i k_inva =
+static const __m128i_const k_inva =
M128I_U64(
0x01040A060F0B0780,
0x030D0E0C02050809
);
-static const __m128i k_sb1u =
+static const __m128i_const k_sb1u =
M128I_U64(
0xB19BE18FCB503E00,
0xA5DF7A6E142AF544
);
-static const __m128i k_sb1t =
+static const __m128i_const k_sb1t =
M128I_U64(
0x3618D415FAE22300,
0x3BF7CCC10D2ED9EF
);
-static const __m128i k_sb2u =
+static const __m128i_const k_sb2u =
M128I_U64(
0xE27A93C60B712400,
0x5EB7E955BC982FCD
);
-static const __m128i k_sb2t =
+static const __m128i_const k_sb2t =
M128I_U64(
0x69EB88400AE12900,
0xC2A163C8AB82234A
);
-static const __m128i k_sbou =
+static const __m128i_const k_sbou =
M128I_U64(
0xD0D26D176FBDC700,
0x15AABF7AC502A878
);
-static const __m128i k_sbot =
+static const __m128i_const k_sbot =
M128I_U64(
0xCFE474A55FBB6A00,
0x8E1E90D1412B35FA
);
-static const __m128i k_mc_forward[4] =
+static const __m128i_const k_mc_forward[4] =
{
M128I_U64(
0x0407060500030201,
@@ -311,7 +268,7 @@ static const __m128i k_mc_forward[4] =
)
};
-static const __m128i k_mc_backward[4] =
+static const __m128i_const k_mc_backward[4] =
{
M128I_U64(
0x0605040702010003,
@@ -331,7 +288,7 @@ static const __m128i k_mc_backward[4] =
)
};
-static const __m128i k_sr[4] =
+static const __m128i_const k_sr[4] =
{
M128I_U64(
0x0706050403020100,
@@ -351,19 +308,19 @@ static const __m128i k_sr[4] =
)
};
-static const __m128i k_rcon =
+static const __m128i_const k_rcon =
M128I_U64(
0x1F8391B9AF9DEEB6,
0x702A98084D7C7D81
);
-static const __m128i k_s63 =
+static const __m128i_const k_s63 =
M128I_U64(
0x5B5B5B5B5B5B5B5B,
0x5B5B5B5B5B5B5B5B
);
-static const __m128i k_opt[2] =
+static const __m128i_const k_opt[2] =
{
M128I_U64(
0xFF9F4929D6B66000,
@@ -375,7 +332,7 @@ static const __m128i k_opt[2] =
)
};
-static const __m128i k_deskew[2] =
+static const __m128i_const k_deskew[2] =
{
M128I_U64(
0x07E4A34047A4E300,
@@ -387,7 +344,7 @@ static const __m128i k_deskew[2] =
)
};
-static const __m128i k_dks_1[2] =
+static const __m128i_const k_dks_1[2] =
{
M128I_U64(
0xB6116FC87ED9A700,
@@ -399,7 +356,7 @@ static const __m128i k_dks_1[2] =
)
};
-static const __m128i k_dks_2[2] =
+static const __m128i_const k_dks_2[2] =
{
M128I_U64(
0x27438FEBCCA86400,
@@ -411,7 +368,7 @@ static const __m128i k_dks_2[2] =
)
};
-static const __m128i k_dks_3[2] =
+static const __m128i_const k_dks_3[2] =
{
M128I_U64(
0x03C4C50201C6C700,
@@ -423,7 +380,7 @@ static const __m128i k_dks_3[2] =
)
};
-static const __m128i k_dks_4[2] =
+static const __m128i_const k_dks_4[2] =
{
M128I_U64(
0xE3C390B053732000,
@@ -435,7 +392,7 @@ static const __m128i k_dks_4[2] =
)
};
-static const __m128i k_dipt[2] =
+static const __m128i_const k_dipt[2] =
{
M128I_U64(
0x0F505B040B545F00,
@@ -447,7 +404,7 @@ static const __m128i k_dipt[2] =
)
};
-static const __m128i k_dsb9[2] =
+static const __m128i_const k_dsb9[2] =
{
M128I_U64(
0x851C03539A86D600,
@@ -459,7 +416,7 @@ static const __m128i k_dsb9[2] =
)
};
-static const __m128i k_dsbd[2] =
+static const __m128i_const k_dsbd[2] =
{
M128I_U64(
0x7D57CCDFE6B1A200,
@@ -471,7 +428,7 @@ static const __m128i k_dsbd[2] =
)
};
-static const __m128i k_dsbb[2] =
+static const __m128i_const k_dsbb[2] =
{
M128I_U64(
0xD022649296B44200,
@@ -483,7 +440,7 @@ static const __m128i k_dsbb[2] =
)
};
-static const __m128i k_dsbe[2] =
+static const __m128i_const k_dsbe[2] =
{
M128I_U64(
0x46F2929626D4D000,
@@ -495,7 +452,7 @@ static const __m128i k_dsbe[2] =
)
};
-static const __m128i k_dsbo[2] =
+static const __m128i_const k_dsbo[2] =
{
M128I_U64(
0x1387EA537EF94000,
@@ -551,8 +508,8 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
if (!low_round_only)
{
/* extract rcon from xmm8 */
- static const __m128i zero = { 0 };
- xmm1 = zero;
+ static const __m128i_const zero = { 0 };
+ movdqa128_memld(&zero, xmm1);
palignr128(15, xmm8, xmm1);
palignr128(15, xmm8, xmm8);
pxor128(xmm1, xmm7);
@@ -569,12 +526,12 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
movdqa128(xmm7, xmm1);
pslldq128(8, xmm7);
pxor128(xmm1, xmm7);
- pxor128(k_s63, xmm7);
+ pxor128_amemld(&k_s63, xmm7);
/* subbytes */
movdqa128(xmm9, xmm1);
pandn128(xmm0, xmm1);
- psrld128(4, xmm1); /* 1 = i */
+ psrl_byte_128(4, xmm1); /* 1 = i */
pand128(xmm9, xmm0); /* 0 = k */
movdqa128(xmm11, xmm2); /* 2 : a/k */
pshufb128(xmm0, xmm2); /* 2 = a/k */
@@ -591,9 +548,9 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
movdqa128(xmm10, xmm3); /* 3 : 1/jak */
pshufb128(xmm4, xmm3); /* 3 = 1/jak */
pxor128(xmm1, xmm3); /* 3 = jo */
- movdqa128(k_sb1u, xmm4); /* 4 : sbou */
+ movdqa128_memld(&k_sb1u, xmm4); /* 4 : sbou */
pshufb128(xmm2, xmm4); /* 4 = sbou */
- movdqa128(k_sb1t, xmm0); /* 0 : sbot */
+ movdqa128_memld(&k_sb1t, xmm0); /* 0 : sbot */
pshufb128(xmm3, xmm0); /* 0 = sb1t */
pxor128(xmm4, xmm0); /* 0 = sbox output */
@@ -608,7 +565,8 @@ aes_schedule_round(__m128i *pxmm0, __m128i *pxmm7, __m128i *pxmm8,
static ASM_FUNC_ATTR_INLINE __m128i
aes_schedule_transform(__m128i xmm0, const __m128i xmm9,
- const __m128i tablelo, const __m128i tablehi)
+ const __m128i_const *tablelo,
+ const __m128i_const *tablehi)
{
/* aes_schedule_transform
*
@@ -622,11 +580,11 @@ aes_schedule_transform(__m128i xmm0, const __m128i xmm9,
movdqa128(xmm9, xmm1);
pandn128(xmm0, xmm1);
- psrld128(4, xmm1);
+ psrl_byte_128(4, xmm1);
pand128(xmm9, xmm0);
- movdqa128(tablelo, xmm2);
+ movdqa128_memld(tablelo, xmm2);
pshufb128(xmm0, xmm2);
- movdqa128(tablehi, xmm0);
+ movdqa128_memld(tablehi, xmm0);
pshufb128(xmm1, xmm0);
pxor128(xmm2, xmm0);
@@ -662,12 +620,12 @@ aes_schedule_mangle(__m128i xmm0, struct vp_aes_config_s *pconfig, int decrypt,
unsigned int rotoffs = *protoffs;
movdqa128(xmm0, xmm4);
- movdqa128(k_mc_forward[0], xmm5);
+ movdqa128_memld(&k_mc_forward[0], xmm5);
if (!decrypt)
{
keysched += 16;
- pxor128(k_s63, xmm4);
+ pxor128_amemld(&k_s63, xmm4);
pshufb128(xmm5, xmm4);
movdqa128(xmm4, xmm3);
pshufb128(xmm5, xmm4);
@@ -678,29 +636,29 @@ aes_schedule_mangle(__m128i xmm0, struct vp_aes_config_s *pconfig, int decrypt,
else
{
/* first table: *9 */
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_1[0], k_dks_1[1]);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_1[0], &k_dks_1[1]);
movdqa128(xmm0, xmm3);
pshufb128(xmm5, xmm3);
/* next table: *B */
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_2[0], k_dks_2[1]);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_2[0], &k_dks_2[1]);
pxor128(xmm0, xmm3);
pshufb128(xmm5, xmm3);
/* next table: *D */
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_3[0], k_dks_3[1]);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_3[0], &k_dks_3[1]);
pxor128(xmm0, xmm3);
pshufb128(xmm5, xmm3);
/* next table: *E */
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_dks_4[0], k_dks_4[1]);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_dks_4[0], &k_dks_4[1]);
pxor128(xmm0, xmm3);
pshufb128(xmm5, xmm3);
keysched -= 16;
}
- pshufb128(k_sr[rotoffs], xmm3);
+ pshufb128_amemld(&k_sr[rotoffs], xmm3);
rotoffs -= 16 / 16;
rotoffs &= 48 / 16;
movdqu128_memst(xmm3, keysched);
@@ -725,16 +683,16 @@ aes_schedule_mangle_last(__m128i xmm0, struct vp_aes_config_s config,
if (!decrypt)
{
- pshufb128(k_sr[rotoffs], xmm0); /* output permute */
+ pshufb128_amemld(&k_sr[rotoffs], xmm0); /* output permute */
config.keysched += 16;
- pxor128(k_s63, xmm0);
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_opt[0], k_opt[1]);
+ pxor128_amemld(&k_s63, xmm0);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_opt[0], &k_opt[1]);
}
else
{
config.keysched -= 16;
- pxor128(k_s63, xmm0);
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_deskew[0], k_deskew[1]);
+ pxor128_amemld(&k_s63, xmm0);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_deskew[0], &k_deskew[1]);
}
movdqu128_memst(xmm0, config.keysched); /* save last key */
@@ -825,7 +783,7 @@ aes_schedule_192(const byte *key, struct vp_aes_config_s config, int decrypt,
int r = 4;
movdqu128_memld(key + 8, xmm0); /* load key part 2 (very unaligned) */
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); /* input transform */
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); /* input transform */
movdqa128(xmm0, xmm6);
psrldq128(8, xmm6);
pslldq128(8, xmm6); /* clobber low side with zeros */
@@ -867,7 +825,7 @@ aes_schedule_256(const byte *key, struct vp_aes_config_s config, int decrypt,
int r = 7;
movdqu128_memld(key + 16, xmm0); /* load key part 2 (unaligned) */
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi); /* input transform */
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi); /* input transform */
while (1)
{
@@ -900,16 +858,16 @@ aes_schedule_core(const byte *key, struct vp_aes_config_s config,
unsigned int keybits = (config.nround - 10) * 32 + 128;
__m128i xmm0, xmm3, xmm7, xmm8, xmm9, xmm10, xmm11;
- movdqa128(k_s0F, xmm9);
- movdqa128(k_inv, xmm10);
- movdqa128(k_inva, xmm11);
- movdqa128(k_rcon, xmm8);
+ movdqa128_memld(&k_s0F, xmm9);
+ movdqa128_memld(&k_inv, xmm10);
+ movdqa128_memld(&k_inva, xmm11);
+ movdqa128_memld(&k_rcon, xmm8);
movdqu128_memld(key, xmm0);
/* input transform */
movdqa128(xmm0, xmm3);
- xmm0 = aes_schedule_transform(xmm0, xmm9, k_iptlo, k_ipthi);
+ xmm0 = aes_schedule_transform(xmm0, xmm9, &k_iptlo, &k_ipthi);
movdqa128(xmm0, xmm7);
if (!decrypt)
@@ -920,7 +878,7 @@ aes_schedule_core(const byte *key, struct vp_aes_config_s config,
else
{
/* decrypting, output zeroth round key after shiftrows */
- pshufb128(k_sr[rotoffs], xmm3);
+ pshufb128_amemld(&k_sr[rotoffs], xmm3);
movdqu128_memst(xmm3, config.keysched);
rotoffs ^= 48 / 16;
}
@@ -998,23 +956,23 @@ FUNC_PREPARE_DEC (RIJNDAEL_context *ctx)
}
#define enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15) \
- movdqa128(k_s0F, xmm9); \
- movdqa128(k_inv, xmm10); \
- movdqa128(k_inva, xmm11); \
- movdqa128(k_sb1u, xmm13); \
- movdqa128(k_sb1t, xmm12); \
- movdqa128(k_sb2u, xmm15); \
- movdqa128(k_sb2t, xmm14);
+ movdqa128_memld(&k_s0F, xmm9); \
+ movdqa128_memld(&k_inv, xmm10); \
+ movdqa128_memld(&k_inva, xmm11); \
+ movdqa128_memld(&k_sb1u, xmm13); \
+ movdqa128_memld(&k_sb1t, xmm12); \
+ movdqa128_memld(&k_sb2u, xmm15); \
+ movdqa128_memld(&k_sb2t, xmm14);
#define dec_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm8) \
- movdqa128(k_s0F, xmm9); \
- movdqa128(k_inv, xmm10); \
- movdqa128(k_inva, xmm11); \
- movdqa128(k_dsb9[0], xmm13); \
- movdqa128(k_dsb9[1], xmm12); \
- movdqa128(k_dsbd[0], xmm15); \
- movdqa128(k_dsbb[0], xmm14); \
- movdqa128(k_dsbe[0], xmm8);
+ movdqa128_memld(&k_s0F, xmm9); \
+ movdqa128_memld(&k_inv, xmm10); \
+ movdqa128_memld(&k_inva, xmm11); \
+ movdqa128_memld(&k_dsb9[0], xmm13); \
+ movdqa128_memld(&k_dsb9[1], xmm12); \
+ movdqa128_memld(&k_dsbd[0], xmm15); \
+ movdqa128_memld(&k_dsbb[0], xmm14); \
+ movdqa128_memld(&k_dsbe[0], xmm8);
static ASM_FUNC_ATTR_INLINE __m128i
aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
@@ -1025,13 +983,13 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
const byte *end_keys = config.sched_keys + 16 * config.nround;
unsigned int mc_pos = 1;
- movdqa128(k_iptlo, xmm2);
+ movdqa128_memld(&k_iptlo, xmm2);
movdqa128(xmm9, xmm1);
pandn128(xmm0, xmm1);
- psrld128(4, xmm1);
+ psrl_byte_128(4, xmm1);
pand128(xmm9, xmm0);
pshufb128(xmm0, xmm2);
- movdqa128(k_ipthi, xmm0);
+ movdqa128_memld(&k_ipthi, xmm0);
pshufb128(xmm1, xmm0);
pxor128_amemld(config.sched_keys, xmm2);
@@ -1044,7 +1002,7 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
/* top of round */
movdqa128(xmm9, xmm1); /* 1 : i */
pandn128(xmm0, xmm1); /* 1 = i<<4 */
- psrld128(4, xmm1); /* 1 = i */
+ psrl_byte_128(4, xmm1); /* 1 = i */
pand128(xmm9, xmm0); /* 0 = k */
movdqa128(xmm11, xmm2); /* 2 : a/k */
pshufb128(xmm0, xmm2); /* 2 = a/k */
@@ -1074,14 +1032,14 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
pxor128(xmm4, xmm0); /* 0 = A */
movdqa128(xmm15, xmm4); /* 4 : sb2u */
pshufb128(xmm2, xmm4); /* 4 = sb2u */
- movdqa128(k_mc_forward[mc_pos], xmm1);
+ movdqa128_memld(&k_mc_forward[mc_pos], xmm1);
movdqa128(xmm14, xmm2); /* 2 : sb2t */
pshufb128(xmm3, xmm2); /* 2 = sb2t */
pxor128(xmm4, xmm2); /* 2 = 2A */
movdqa128(xmm0, xmm3); /* 3 = A */
pshufb128(xmm1, xmm0); /* 0 = B */
pxor128(xmm2, xmm0); /* 0 = 2A+B */
- pshufb128(k_mc_backward[mc_pos], xmm3); /* 3 = D */
+ pshufb128_amemld(&k_mc_backward[mc_pos], xmm3); /* 3 = D */
pxor128(xmm0, xmm3); /* 3 = 2A+B+D */
pshufb128(xmm1, xmm0); /* 0 = 2B+C */
pxor128(xmm3, xmm0); /* 0 = 2A+3B+C+D */
@@ -1091,13 +1049,13 @@ aes_encrypt_core(__m128i xmm0, struct vp_aes_config_s config,
}
/* middle of last round */
- movdqa128(k_sbou, xmm4); /* 3 : sbou */
+ movdqa128_memld(&k_sbou, xmm4); /* 3 : sbou */
pshufb128(xmm2, xmm4); /* 4 = sbou */
pxor128_amemld(config.sched_keys, xmm4); /* 4 = sb1u + k */
- movdqa128(k_sbot, xmm0); /* 0 : sbot */
+ movdqa128_memld(&k_sbot, xmm0); /* 0 : sbot */
pshufb128(xmm3, xmm0); /* 0 = sb1t */
pxor128(xmm4, xmm0); /* 0 = A */
- pshufb128(k_sr[mc_pos], xmm0);
+ pshufb128_amemld(&k_sr[mc_pos], xmm0);
return xmm0;
}
@@ -1112,20 +1070,20 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
__m128i xmm0_a, xmm0_b;
__m128i xmm1_a, xmm2_a, xmm3_a, xmm4_a;
__m128i xmm1_b, xmm2_b, xmm3_b, xmm4_b;
- __m128i xmm5;
+ __m128i xmm5, xmm6;
const byte *end_keys = config.sched_keys + 16 * config.nround;
unsigned int mc_pos = 1;
xmm0_a = *pxmm0_a;
xmm0_b = *pxmm0_b;
- movdqa128(k_iptlo, xmm2_a); movdqa128(k_iptlo, xmm2_b);
+ movdqa128_memld(&k_iptlo, xmm2_a); movdqa128(xmm2_a, xmm2_b);
movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b);
pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b);
- psrld128(4, xmm1_a); psrld128(4, xmm1_b);
+ psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b);
pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b);
pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b);
- movdqa128(k_ipthi, xmm0_a); movdqa128(k_ipthi, xmm0_b);
+ movdqa128_memld(&k_ipthi, xmm0_a); movdqa128(xmm0_a, xmm0_b);
pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b);
movdqu128_memld(config.sched_keys, xmm5);
@@ -1139,7 +1097,7 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
/* top of round */
movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b);
pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b);
- psrld128(4, xmm1_a); psrld128(4, xmm1_b);
+ psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b);
pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b);
movdqa128(xmm11, xmm2_a); movdqa128(xmm11, xmm2_b);
pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b);
@@ -1170,18 +1128,17 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b);
movdqa128(xmm15, xmm4_a); movdqa128(xmm15, xmm4_b);
pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b);
- movdqa128(k_mc_forward[mc_pos], xmm1_a);
- movdqa128(k_mc_forward[mc_pos], xmm1_b);
+ movdqa128_memld(&k_mc_forward[mc_pos], xmm6);
movdqa128(xmm14, xmm2_a); movdqa128(xmm14, xmm2_b);
pshufb128(xmm3_a, xmm2_a); pshufb128(xmm3_b, xmm2_b);
pxor128(xmm4_a, xmm2_a); pxor128(xmm4_b, xmm2_b);
movdqa128(xmm0_a, xmm3_a); movdqa128(xmm0_b, xmm3_b);
- pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b);
+ pshufb128(xmm6, xmm0_a); pshufb128(xmm6, xmm0_b);
pxor128(xmm2_a, xmm0_a); pxor128(xmm2_b, xmm0_b);
- pshufb128(k_mc_backward[mc_pos], xmm3_a);
- pshufb128(k_mc_backward[mc_pos], xmm3_b);
+ movdqa128_memld(&k_mc_backward[mc_pos], xmm5);
+ pshufb128(xmm5, xmm3_a); pshufb128(xmm5, xmm3_b);
pxor128(xmm0_a, xmm3_a); pxor128(xmm0_b, xmm3_b);
- pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b);
+ pshufb128(xmm6, xmm0_a); pshufb128(xmm6, xmm0_b);
pxor128(xmm3_a, xmm0_a); pxor128(xmm3_b, xmm0_b);
config.sched_keys += 16;
@@ -1189,20 +1146,133 @@ aes_encrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
}
/* middle of last round */
- movdqa128(k_sbou, xmm4_a); movdqa128(k_sbou, xmm4_b);
+ movdqa128_memld(&k_sbou, xmm4_a); movdqa128_memld(&k_sbou, xmm4_b);
pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b);
movdqu128_memld(config.sched_keys, xmm5);
pxor128(xmm5, xmm4_a); pxor128(xmm5, xmm4_b);
- movdqa128(k_sbot, xmm0_a); movdqa128(k_sbot, xmm0_b);
+ movdqa128_memld(&k_sbot, xmm0_a); movdqa128_memld(&k_sbot, xmm0_b);
pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b);
pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b);
- pshufb128(k_sr[mc_pos], xmm0_a);
- pshufb128(k_sr[mc_pos], xmm0_b);
+ movdqa128_memld(&k_sr[mc_pos], xmm5);
+ pshufb128(xmm5, xmm0_a); pshufb128(xmm5, xmm0_b);
*pxmm0_a = xmm0_a;
*pxmm0_b = xmm0_b;
}
+#ifdef HAVE_SIMD256
+
+static ASM_FUNC_ATTR_INLINE void
+aes_encrypt_core_4blks_simd256(__m256i *pymm0_a, __m256i *pymm0_b,
+ struct vp_aes_config_s config,
+ __m128i xmm9, __m128i xmm10, __m128i xmm11,
+ __m128i xmm12, __m128i xmm13, __m128i xmm14,
+ __m128i xmm15)
+{
+ __m256i ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+ __m256i ymm0_a, ymm0_b;
+ __m256i ymm1_a, ymm2_a, ymm3_a, ymm4_a;
+ __m256i ymm1_b, ymm2_b, ymm3_b, ymm4_b;
+ __m256i ymm5, ymm6;
+ const byte *end_keys = config.sched_keys + 16 * config.nround;
+ unsigned int mc_pos = 1;
+
+ broadcast128_256(xmm9, ymm9);
+ movdqa128_256(xmm10, ymm10);
+ movdqa128_256(xmm11, ymm11);
+ movdqa128_256(xmm12, ymm12);
+ movdqa128_256(xmm13, ymm13);
+ movdqa128_256(xmm14, ymm14);
+ movdqa128_256(xmm15, ymm15);
+
+ ymm0_a = *pymm0_a;
+ ymm0_b = *pymm0_b;
+
+ load_tab16_table(&k_iptlo, ymm2_a); movdqa256(ymm2_a, ymm2_b);
+ movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b);
+ pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b);
+ psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b);
+ pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b);
+ pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b);
+ load_tab16_table(&k_ipthi, ymm0_a); movdqa256(ymm0_a, ymm0_b);
+
+ pshufb256_tab16(ymm1_a, ymm0_a); pshufb256_tab16(ymm1_b, ymm0_b);
+ broadcast128_256_amemld(config.sched_keys, ymm5);
+ pxor256(ymm5, ymm2_a); pxor256(ymm5, ymm2_b);
+ pxor256(ymm2_a, ymm0_a); pxor256(ymm2_b, ymm0_b);
+
+ config.sched_keys += 16;
+
+ while (1)
+ {
+ /* top of round */
+ movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b);
+ pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b);
+ psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b);
+ pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b);
+ movdqa256(ymm11, ymm2_a); movdqa256(ymm11, ymm2_b);
+ pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b);
+ pxor256(ymm1_a, ymm0_a); pxor256(ymm1_b, ymm0_b);
+ movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b);
+ pshufb256_tab16(ymm1_a, ymm3_a); pshufb256_tab16(ymm1_b, ymm3_b);
+ pxor256(ymm2_a, ymm3_a); pxor256(ymm2_b, ymm3_b);
+ movdqa256(ymm10, ymm4_a); movdqa256(ymm10, ymm4_b);
+ pshufb256_tab16(ymm0_a, ymm4_a); pshufb256_tab16(ymm0_b, ymm4_b);
+ pxor256(ymm2_a, ymm4_a); pxor256(ymm2_b, ymm4_b);
+ movdqa256(ymm10, ymm2_a); movdqa256(ymm10, ymm2_b);
+ pshufb256_tab16(ymm3_a, ymm2_a); pshufb256_tab16(ymm3_b, ymm2_b);
+ pxor256(ymm0_a, ymm2_a); pxor256(ymm0_b, ymm2_b);
+ movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b);
+ pshufb256_tab16(ymm4_a, ymm3_a); pshufb256_tab16(ymm4_b, ymm3_b);
+ pxor256(ymm1_a, ymm3_a); pxor256(ymm1_b, ymm3_b);
+
+ if (config.sched_keys == end_keys)
+ break;
+
+ /* middle of middle round */
+ movdqa256(ymm13, ymm4_a); movdqa256(ymm13, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ broadcast128_256_amemld(config.sched_keys, ymm5);
+ pxor256(ymm5, ymm4_a); pxor256(ymm5, ymm4_b);
+ movdqa256(ymm12, ymm0_a); movdqa256(ymm12, ymm0_b);
+ pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b);
+ pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b);
+ movdqa256(ymm15, ymm4_a); movdqa256(ymm15, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ load_tab32_mask(&k_mc_forward[mc_pos], ymm6);
+ movdqa256(ymm14, ymm2_a); movdqa256(ymm14, ymm2_b);
+ pshufb256_tab16(ymm3_a, ymm2_a); pshufb256_tab16(ymm3_b, ymm2_b);
+ pxor256(ymm4_a, ymm2_a); pxor256(ymm4_b, ymm2_b);
+ movdqa256(ymm0_a, ymm3_a); movdqa256(ymm0_b, ymm3_b);
+ pshufb256_tab32(ymm6, ymm0_a); pshufb256_tab32(ymm6, ymm0_b);
+ pxor256(ymm2_a, ymm0_a); pxor256(ymm2_b, ymm0_b);
+ load_tab32_mask(&k_mc_backward[mc_pos], ymm5);
+ pshufb256_tab32(ymm5, ymm3_a); pshufb256_tab32(ymm5, ymm3_b);
+ pxor256(ymm0_a, ymm3_a); pxor256(ymm0_b, ymm3_b);
+ pshufb256_tab32(ymm6, ymm0_a); pshufb256_tab32(ymm6, ymm0_b);
+ pxor256(ymm3_a, ymm0_a); pxor256(ymm3_b, ymm0_b);
+
+ config.sched_keys += 16;
+ mc_pos = (mc_pos + 1) % 4; /* next mc mod 4 */
+ }
+
+ /* middle of last round */
+ movdqa256_memld(&k_sbou, ymm4_a); movdqa256_memld(&k_sbou, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ broadcast128_256_amemld(config.sched_keys, ymm5);
+ pxor256(ymm5, ymm4_a); pxor256(ymm5, ymm4_b);
+ movdqa256_memld(&k_sbot, ymm0_a); movdqa256_memld(&k_sbot, ymm0_b);
+ pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b);
+ pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b);
+ load_tab32_mask(&k_sr[mc_pos], ymm5);
+ pshufb256_tab32(ymm5, ymm0_a); pshufb256_tab32(ymm5, ymm0_b);
+
+ *pymm0_a = ymm0_a;
+ *pymm0_b = ymm0_b;
+}
+
+#endif /* HAVE_SIMD256 */
+
static ASM_FUNC_ATTR_INLINE __m128i
aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
__m128i xmm9, __m128i xmm10, __m128i xmm11, __m128i xmm12,
@@ -1212,17 +1282,17 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
const byte *end_keys = config.sched_keys + 16 * config.nround;
unsigned int mc_pos = config.nround % 4;
- movdqa128(k_dipt[0], xmm2);
+ movdqa128_memld(&k_dipt[0], xmm2);
movdqa128(xmm9, xmm1);
pandn128(xmm0, xmm1);
- psrld128(4, xmm1);
+ psrl_byte_128(4, xmm1);
pand128(xmm9, xmm0);
pshufb128(xmm0, xmm2);
- movdqa128(k_dipt[1], xmm0);
+ movdqa128_memld(&k_dipt[1], xmm0);
pshufb128(xmm1, xmm0);
pxor128_amemld(config.sched_keys, xmm2);
pxor128(xmm2, xmm0);
- movdqa128(k_mc_forward[3], xmm5);
+ movdqa128_memld(&k_mc_forward[3], xmm5);
config.sched_keys += 16;
@@ -1231,7 +1301,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
/* top of round */
movdqa128(xmm9, xmm1); /* 1 : i */
pandn128(xmm0, xmm1); /* 1 = i<<4 */
- psrld128(4, xmm1); /* 1 = i */
+ psrl_byte_128(4, xmm1); /* 1 = i */
pand128(xmm9, xmm0); /* 0 = k */
movdqa128(xmm11, xmm2); /* 2 : a/k */
pshufb128(xmm0, xmm2); /* 2 = a/k */
@@ -1258,7 +1328,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
pxor128_amemld(config.sched_keys, xmm4);
movdqa128(xmm12, xmm0); /* 0 : sb9t */
pshufb128(xmm3, xmm0); /* 0 = sb9t */
- movdqa128(k_dsbd[1], xmm1); /* 1 : sbdt */
+ movdqa128_memld(&k_dsbd[1], xmm1); /* 1 : sbdt */
pxor128(xmm4, xmm0); /* 0 = ch */
pshufb128(xmm5, xmm0); /* MC ch */
@@ -1272,7 +1342,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
movdqa128(xmm14, xmm4); /* 4 : sbbu */
pshufb128(xmm2, xmm4); /* 4 = sbbu */
pxor128(xmm1, xmm4); /* 4 = ch */
- movdqa128(k_dsbb[1], xmm0); /* 0 : sbbt */
+ movdqa128_memld(&k_dsbb[1], xmm0); /* 0 : sbbt */
pshufb128(xmm3, xmm0); /* 0 = sbbt */
pxor128(xmm4, xmm0); /* 0 = ch */
@@ -1281,7 +1351,7 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
pshufb128(xmm2, xmm4); /* 4 = sbeu */
pshufd128_0x93(xmm5, xmm5);
pxor128(xmm0, xmm4); /* 4 = ch */
- movdqa128(k_dsbe[1], xmm0); /* 0 : sbet */
+ movdqa128_memld(&k_dsbe[1], xmm0); /* 0 : sbet */
pshufb128(xmm3, xmm0); /* 0 = sbet */
pxor128(xmm4, xmm0); /* 0 = ch */
@@ -1289,13 +1359,13 @@ aes_decrypt_core(__m128i xmm0, struct vp_aes_config_s config,
}
/* middle of last round */
- movdqa128(k_dsbo[0], xmm4); /* 3 : sbou */
+ movdqa128_memld(&k_dsbo[0], xmm4);/* 3 : sbou */
pshufb128(xmm2, xmm4); /* 4 = sbou */
pxor128_amemld(config.sched_keys, xmm4); /* 4 = sb1u + k */
- movdqa128(k_dsbo[1], xmm0); /* 0 : sbot */
+ movdqa128_memld(&k_dsbo[1], xmm0);/* 0 : sbot */
pshufb128(xmm3, xmm0); /* 0 = sb1t */
pxor128(xmm4, xmm0); /* 0 = A */
- pshufb128(k_sr[mc_pos], xmm0);
+ pshufb128_amemld(&k_sr[mc_pos], xmm0);
return xmm0;
}
@@ -1317,18 +1387,18 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
xmm0_a = *pxmm0_a;
xmm0_b = *pxmm0_b;
- movdqa128(k_dipt[0], xmm2_a); movdqa128(k_dipt[0], xmm2_b);
+ movdqa128_memld(&k_dipt[0], xmm2_a); movdqa128(xmm2_a, xmm2_b);
movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b);
pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b);
- psrld128(4, xmm1_a); psrld128(4, xmm1_b);
+ psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b);
pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b);
pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b);
- movdqa128(k_dipt[1], xmm0_a); movdqa128(k_dipt[1], xmm0_b);
+ movdqa128_memld(&k_dipt[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
pshufb128(xmm1_a, xmm0_a); pshufb128(xmm1_b, xmm0_b);
movdqu128_memld(config.sched_keys, xmm6);
pxor128(xmm6, xmm2_a); pxor128(xmm6, xmm2_b);
pxor128(xmm2_a, xmm0_a); pxor128(xmm2_b, xmm0_b);
- movdqa128(k_mc_forward[3], xmm5);
+ movdqa128_memld(&k_mc_forward[3], xmm5);
config.sched_keys += 16;
@@ -1337,7 +1407,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
/* top of round */
movdqa128(xmm9, xmm1_a); movdqa128(xmm9, xmm1_b);
pandn128(xmm0_a, xmm1_a); pandn128(xmm0_b, xmm1_b);
- psrld128(4, xmm1_a); psrld128(4, xmm1_b);
+ psrl_byte_128(4, xmm1_a); psrl_byte_128(4, xmm1_b);
pand128(xmm9, xmm0_a); pand128(xmm9, xmm0_b);
movdqa128(xmm11, xmm2_a); movdqa128(xmm11, xmm2_b);
pshufb128(xmm0_a, xmm2_a); pshufb128(xmm0_b, xmm2_b);
@@ -1365,7 +1435,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
pxor128(xmm6, xmm4_a); pxor128(xmm6, xmm4_b);
movdqa128(xmm12, xmm0_a); movdqa128(xmm12, xmm0_b);
pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b);
- movdqa128(k_dsbd[1], xmm1_a); movdqa128(k_dsbd[1], xmm1_b);
+ movdqa128_memld(&k_dsbd[1], xmm1_a); movdqa128(xmm1_a, xmm1_b);
pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b);
pshufb128(xmm5, xmm0_a); pshufb128(xmm5, xmm0_b);
@@ -1379,7 +1449,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
movdqa128(xmm14, xmm4_a); movdqa128(xmm14, xmm4_b);
pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b);
pxor128(xmm1_a, xmm4_a); pxor128(xmm1_b, xmm4_b);
- movdqa128(k_dsbb[1], xmm0_a); movdqa128(k_dsbb[1], xmm0_b);
+ movdqa128_memld(&k_dsbb[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b);
pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b);
@@ -1388,7 +1458,7 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b);
pshufd128_0x93(xmm5, xmm5);
pxor128(xmm0_a, xmm4_a); pxor128(xmm0_b, xmm4_b);
- movdqa128(k_dsbe[1], xmm0_a); movdqa128(k_dsbe[1], xmm0_b);
+ movdqa128_memld(&k_dsbe[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b);
pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b);
@@ -1396,20 +1466,144 @@ aes_decrypt_core_2blks(__m128i *pxmm0_a, __m128i *pxmm0_b,
}
/* middle of last round */
- movdqa128(k_dsbo[0], xmm4_a); movdqa128(k_dsbo[0], xmm4_b);
+ movdqa128_memld(&k_dsbo[0], xmm4_a); movdqa128(xmm4_a, xmm4_b);
pshufb128(xmm2_a, xmm4_a); pshufb128(xmm2_b, xmm4_b);
movdqu128_memld(config.sched_keys, xmm6);
pxor128(xmm6, xmm4_a); pxor128(xmm6, xmm4_b);
- movdqa128(k_dsbo[1], xmm0_a); movdqa128(k_dsbo[1], xmm0_b);
+ movdqa128_memld(&k_dsbo[1], xmm0_a); movdqa128(xmm0_a, xmm0_b);
pshufb128(xmm3_a, xmm0_a); pshufb128(xmm3_b, xmm0_b);
pxor128(xmm4_a, xmm0_a); pxor128(xmm4_b, xmm0_b);
- pshufb128(k_sr[mc_pos], xmm0_a);
- pshufb128(k_sr[mc_pos], xmm0_b);
+ movdqa128_memld(&k_sr[mc_pos], xmm5);
+ pshufb128(xmm5, xmm0_a); pshufb128(xmm5, xmm0_b);
*pxmm0_a = xmm0_a;
*pxmm0_b = xmm0_b;
}
+#ifdef HAVE_SIMD256
+
+static ASM_FUNC_ATTR_INLINE void
+aes_decrypt_core_4blks_simd256(__m256i *pymm0_a, __m256i *pymm0_b,
+ struct vp_aes_config_s config,
+ __m128i xmm9, __m128i xmm10, __m128i xmm11,
+ __m128i xmm12, __m128i xmm13, __m128i xmm14,
+ __m128i xmm15, __m128i xmm8)
+{
+ __m256i ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm8;
+ __m256i ymm0_a, ymm0_b;
+ __m256i ymm1_a, ymm2_a, ymm3_a, ymm4_a;
+ __m256i ymm1_b, ymm2_b, ymm3_b, ymm4_b;
+ __m256i ymm5, ymm6;
+ const byte *end_keys = config.sched_keys + 16 * config.nround;
+ unsigned int mc_pos = config.nround % 4;
+
+ broadcast128_256(xmm9, ymm9);
+ movdqa128_256(xmm10, ymm10);
+ movdqa128_256(xmm11, ymm11);
+ movdqa128_256(xmm12, ymm12);
+ movdqa128_256(xmm13, ymm13);
+ movdqa128_256(xmm14, ymm14);
+ movdqa128_256(xmm15, ymm15);
+ movdqa128_256(xmm8, ymm8);
+
+ ymm0_a = *pymm0_a;
+ ymm0_b = *pymm0_b;
+
+ load_tab16_table(&k_dipt[0], ymm2_a); movdqa256(ymm2_a, ymm2_b);
+ movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b);
+ pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b);
+ psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b);
+ pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b);
+ pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b);
+ load_tab16_table(&k_dipt[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+ pshufb256_tab16(ymm1_a, ymm0_a); pshufb256_tab16(ymm1_b, ymm0_b);
+ broadcast128_256_amemld(config.sched_keys, ymm6);
+ pxor256(ymm6, ymm2_a); pxor256(ymm6, ymm2_b);
+ pxor256(ymm2_a, ymm0_a); pxor256(ymm2_b, ymm0_b);
+ load_tab32_mask(&k_mc_forward[3], ymm5);
+
+ config.sched_keys += 16;
+
+ while (1)
+ {
+ /* top of round */
+ movdqa256(ymm9, ymm1_a); movdqa256(ymm9, ymm1_b);
+ pandn256(ymm0_a, ymm1_a); pandn256(ymm0_b, ymm1_b);
+ psrl_byte_256(4, ymm1_a); psrl_byte_256(4, ymm1_b);
+ pand256(ymm9, ymm0_a); pand256(ymm9, ymm0_b);
+ movdqa256(ymm11, ymm2_a); movdqa256(ymm11, ymm2_b);
+ pshufb256_tab16(ymm0_a, ymm2_a); pshufb256_tab16(ymm0_b, ymm2_b);
+ pxor256(ymm1_a, ymm0_a); pxor256(ymm1_b, ymm0_b);
+ movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b);
+ pshufb256_tab16(ymm1_a, ymm3_a); pshufb256_tab16(ymm1_b, ymm3_b);
+ pxor256(ymm2_a, ymm3_a); pxor256(ymm2_b, ymm3_b);
+ movdqa256(ymm10, ymm4_a); movdqa256(ymm10, ymm4_b);
+ pshufb256_tab16(ymm0_a, ymm4_a); pshufb256_tab16(ymm0_b, ymm4_b);
+ pxor256(ymm2_a, ymm4_a); pxor256(ymm2_b, ymm4_b);
+ movdqa256(ymm10, ymm2_a); movdqa256(ymm10, ymm2_b);
+ pshufb256_tab16(ymm3_a, ymm2_a); pshufb256_tab16(ymm3_b, ymm2_b);
+ pxor256(ymm0_a, ymm2_a); pxor256(ymm0_b, ymm2_b);
+ movdqa256(ymm10, ymm3_a); movdqa256(ymm10, ymm3_b);
+ pshufb256_tab16(ymm4_a, ymm3_a); pshufb256_tab16(ymm4_b, ymm3_b);
+ pxor256(ymm1_a, ymm3_a); pxor256(ymm1_b, ymm3_b);
+
+ if (config.sched_keys == end_keys)
+ break;
+
+ /* Inverse mix columns */
+ movdqa256(ymm13, ymm4_a); movdqa256(ymm13, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ broadcast128_256_amemld(config.sched_keys, ymm6);
+ pxor256(ymm6, ymm4_a); pxor256(ymm6, ymm4_b);
+ movdqa256(ymm12, ymm0_a); movdqa256(ymm12, ymm0_b);
+ pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b);
+ load_tab16_table(&k_dsbd[1], ymm1_a); movdqa256(ymm1_a, ymm1_b);
+ pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b);
+
+ pshufb256_tab32(ymm5, ymm0_a); pshufb256_tab32(ymm5, ymm0_b);
+ movdqa256(ymm15, ymm4_a); movdqa256(ymm15, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ pxor256(ymm0_a, ymm4_a); pxor256(ymm0_b, ymm4_b);
+ pshufb256_tab16(ymm3_a, ymm1_a); pshufb256_tab16(ymm3_b, ymm1_b);
+ pxor256(ymm4_a, ymm1_a); pxor256(ymm4_b, ymm1_b);
+
+ pshufb256_tab32(ymm5, ymm1_a); pshufb256_tab32(ymm5, ymm1_b);
+ movdqa256(ymm14, ymm4_a); movdqa256(ymm14, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ pxor256(ymm1_a, ymm4_a); pxor256(ymm1_b, ymm4_b);
+ load_tab16_table(&k_dsbb[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+ pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b);
+ pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b);
+
+ pshufb256_tab32(ymm5, ymm0_a); pshufb256_tab32(ymm5, ymm0_b);
+ movdqa256(ymm8, ymm4_a); movdqa256(ymm8, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ pshufd256_0x93(ymm5, ymm5);
+ pxor256(ymm0_a, ymm4_a); pxor256(ymm0_b, ymm4_b);
+ load_tab16_table(&k_dsbe[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+ pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b);
+ pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b);
+
+ config.sched_keys += 16;
+ }
+
+ /* middle of last round */
+ load_tab16_table(&k_dsbo[0], ymm4_a); movdqa256(ymm4_a, ymm4_b);
+ pshufb256_tab16(ymm2_a, ymm4_a); pshufb256_tab16(ymm2_b, ymm4_b);
+ broadcast128_256_amemld(config.sched_keys, ymm6);
+ pxor256(ymm6, ymm4_a); pxor256(ymm6, ymm4_b);
+ load_tab16_table(&k_dsbo[1], ymm0_a); movdqa256(ymm0_a, ymm0_b);
+ pshufb256_tab16(ymm3_a, ymm0_a); pshufb256_tab16(ymm3_b, ymm0_b);
+ pxor256(ymm4_a, ymm0_a); pxor256(ymm4_b, ymm0_b);
+ load_tab32_mask(&k_sr[mc_pos], ymm5);
+ pshufb256_tab16(ymm5, ymm0_a); pshufb256_tab16(ymm5, ymm0_b);
+
+ *pymm0_a = ymm0_a;
+ *pymm0_b = ymm0_b;
+}
+
+#endif /* HAVE_SIMD256 */
+
ASM_FUNC_ATTR_NOINLINE unsigned int
FUNC_ENCRYPT (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
@@ -1534,12 +1728,12 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7, xmm8;
__m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
- static const __m128i be_mask =
+ static const __m128i_const be_mask =
M128I_BYTE(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
- static const __m128i bigendian_add =
+ static const __m128i_const bigendian_add =
M128I_BYTE(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
- static const __m128i carry_add = M128I_U64(1, 1);
- static const __m128i nocarry_add = M128I_U64(1, 0);
+ static const __m128i_const carry_add = M128I_U64(1, 1);
+ static const __m128i_const nocarry_add = M128I_U64(1, 0);
u64 ctrlow = buf_get_be64(ctr + 8);
struct vp_aes_config_s config;
@@ -1548,9 +1742,77 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
- movdqa128(bigendian_add, xmm8); /* Preload byte add */
+ movdqa128_memld(&bigendian_add, xmm8); /* Preload byte add */
movdqu128_memld(ctr, xmm7); /* Preload CTR */
- movdqa128(be_mask, xmm6); /* Preload mask */
+ movdqa128_memld(&be_mask, xmm6); /* Preload mask */
+
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1, ymm2, ymm3;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqa128_256(xmm7, ymm0);
+
+ /* detect if 8-bit carry handling is needed */
+ if (UNLIKELY(((ctrlow += 4) & 0xff) <= 3))
+ {
+ static const __m128i_const *adders[5][4] =
+ {
+ { &nocarry_add, &nocarry_add, &nocarry_add, &carry_add },
+ { &nocarry_add, &nocarry_add, &carry_add, &nocarry_add },
+ { &nocarry_add, &carry_add, &nocarry_add, &nocarry_add },
+ { &carry_add, &nocarry_add, &nocarry_add, &nocarry_add },
+ { &nocarry_add, &nocarry_add, &nocarry_add, &nocarry_add }
+ };
+ unsigned int idx = ctrlow <= 3 ? ctrlow : 4;
+
+ pshufb128(xmm6, xmm7);
+
+ paddq128_amemld(adders[idx][0], xmm7);
+ movdqa128(xmm7, xmm2);
+ pshufb128(xmm6, xmm2);
+ insert256_hi128(xmm2, ymm0);
+ paddq128_amemld(adders[idx][1], xmm7);
+ movdqa128(xmm7, xmm2);
+ pshufb128(xmm6, xmm2);
+ movdqa128_256(xmm2, ymm1);
+ paddq128_amemld(adders[idx][2], xmm7);
+ movdqa128(xmm7, xmm2);
+ pshufb128(xmm6, xmm2);
+ insert256_hi128(xmm2, ymm1);
+ paddq128_amemld(adders[idx][3], xmm7);
+
+ pshufb128(xmm6, xmm7);
+ }
+ else
+ {
+ paddb128(xmm8, xmm7);
+ insert256_hi128(xmm7, ymm0);
+ paddb128(xmm8, xmm7);
+ movdqa128_256(xmm7, ymm1);
+ paddb128(xmm8, xmm7);
+ insert256_hi128(xmm7, ymm1);
+ paddb128(xmm8, xmm7);
+ }
+
+ aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
+ xmm15);
+
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3);
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ outbuf += 4 * BLOCKSIZE;
+ inbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
for (; nblocks >= 2; nblocks -= 2)
{
@@ -1564,24 +1826,24 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
/* detect if 64-bit carry handling is needed */
if (UNLIKELY(ctrlow == 1))
{
- paddq128(carry_add, xmm7);
+ paddq128_amemld(&carry_add, xmm7);
movdqa128(xmm7, xmm1);
pshufb128(xmm6, xmm1);
- paddq128(nocarry_add, xmm7);
+ paddq128_amemld(&nocarry_add, xmm7);
}
else if (UNLIKELY(ctrlow == 0))
{
- paddq128(nocarry_add, xmm7);
+ paddq128_amemld(&nocarry_add, xmm7);
movdqa128(xmm7, xmm1);
pshufb128(xmm6, xmm1);
- paddq128(carry_add, xmm7);
+ paddq128_amemld(&carry_add, xmm7);
}
else
{
- paddq128(nocarry_add, xmm7);
+ paddq128_amemld(&nocarry_add, xmm7);
movdqa128(xmm7, xmm1);
pshufb128(xmm6, xmm1);
- paddq128(nocarry_add, xmm7);
+ paddq128_amemld(&nocarry_add, xmm7);
}
pshufb128(xmm6, xmm7);
@@ -1617,7 +1879,7 @@ FUNC_CTR_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
pshufb128(xmm6, xmm7);
/* detect if 64-bit carry handling is needed */
- paddq128(UNLIKELY(ctrlow == 0) ? carry_add : nocarry_add, xmm7);
+ paddq128_amemld(UNLIKELY(ctrlow == 0) ? &carry_add : &nocarry_add, xmm7);
pshufb128(xmm6, xmm7);
}
@@ -1649,8 +1911,8 @@ FUNC_CTR32LE_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm7, xmm8;
__m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
- static const __m128i add_one = M128I_U64(1, 0);
- static const __m128i add_two = M128I_U64(2, 0);
+ static const __m128i_const add_one = M128I_U64(1, 0);
+ static const __m128i_const add_two = M128I_U64(2, 0);
struct vp_aes_config_s config;
config.nround = ctx->rounds;
@@ -1658,15 +1920,53 @@ FUNC_CTR32LE_ENC (RIJNDAEL_context *ctx, unsigned char *ctr,
enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
- movdqa128(add_one, xmm8); /* Preload byte add */
+ movdqa128_memld(&add_one, xmm8); /* Preload byte add */
movdqu128_memld(ctr, xmm7); /* Preload CTR */
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1, ymm2, ymm3;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqa128(xmm7, xmm0);
+ movdqa128(xmm7, xmm1);
+ paddd128(xmm8, xmm1);
+ paddd128_amemld(&add_two, xmm7);
+ movdqa128_256(xmm0, ymm0);
+ insert256_hi128(xmm1, ymm0);
+
+ movdqa128(xmm7, xmm1);
+ movdqa128(xmm7, xmm2);
+ paddd128(xmm8, xmm2);
+ paddd128_amemld(&add_two, xmm7);
+ movdqa128_256(xmm1, ymm1);
+ insert256_hi128(xmm2, ymm1);
+
+ aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15);
+
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3);
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ outbuf += 4 * BLOCKSIZE;
+ inbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqa128(xmm7, xmm0);
movdqa128(xmm7, xmm1);
paddd128(xmm8, xmm1);
- paddd128(add_two, xmm7);
+ paddd128_amemld(&add_two, xmm7);
aes_encrypt_core_2blks(&xmm0, &xmm1, config,
xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
@@ -1719,6 +2019,36 @@ FUNC_CFB_DEC (RIJNDAEL_context *ctx, unsigned char *iv,
movdqu128_memld(iv, xmm0);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm6, ymm1, ymm2, ymm3;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqa128_256(xmm0, ymm6);
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm2);
+ movdqa256_128(ymm2, xmm2);
+ insert256_hi128(xmm2, ymm6);
+ movdqu256_memld(inbuf + 1 * BLOCKSIZE, ymm1);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm3);
+ extract256_hi128(ymm3, xmm0);
+
+ aes_encrypt_core_4blks_simd256(&ymm6, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15);
+
+ pxor256(ymm2, ymm6);
+ pxor256(ymm3, ymm1);
+ movdqu256_memst(ymm6, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ outbuf += 4 * BLOCKSIZE;
+ inbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqa128(xmm0, xmm1);
@@ -1779,6 +2109,36 @@ FUNC_CBC_DEC (RIJNDAEL_context *ctx, unsigned char *iv,
movdqu128_memld(iv, xmm7);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1, ymm2, ymm3;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+ movdqa256_128(ymm0, xmm0);
+ movdqa128_256(xmm7, ymm2);
+ insert256_hi128(xmm0, ymm2);
+ movdqu256_memld(inbuf + 1 * BLOCKSIZE, ymm3);
+ extract256_hi128(ymm1, xmm7);
+
+ aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15, xmm8);
+
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ outbuf += 4 * BLOCKSIZE;
+ inbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqu128_memld(inbuf, xmm0);
@@ -1843,6 +2203,68 @@ aes_simd128_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
movdqu128_memld(c->u_iv.iv, xmm7);
movdqu128_memld(c->u_ctr.ctr, xmm6);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support() && nblocks >= 4)
+ {
+ __m256i ymm0, ymm1, ymm3, ymm6, ymm8;
+
+ movdqa128_256(xmm6, ymm6);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ const unsigned char *l;
+
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ movdqa128_256(xmm7, ymm3);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ insert256_hi128(xmm7, ymm3);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ movdqa128_256(xmm7, ymm8);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ insert256_hi128(xmm7, ymm8);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ pxor256(ymm0, ymm6);
+ pxor256(ymm1, ymm6);
+
+ pxor256(ymm3, ymm0);
+ pxor256(ymm8, ymm1);
+
+ aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15);
+
+ pxor256(ymm3, ymm0);
+ pxor256(ymm8, ymm1);
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ extract256_hi128(ymm6, xmm0);
+ movdqa256_128(ymm6, xmm6);
+ pxor128(xmm0, xmm6);
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
const unsigned char *l;
@@ -1942,6 +2364,69 @@ aes_simd128_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
movdqu128_memld(c->u_iv.iv, xmm7);
movdqu128_memld(c->u_ctr.ctr, xmm6);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support() && nblocks >= 4)
+ {
+ __m256i ymm0, ymm1, ymm3, ymm6, ymm8;
+
+ movdqa128_256(xmm6, ymm6);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ const unsigned char *l;
+
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ movdqa128_256(xmm7, ymm3);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ insert256_hi128(xmm7, ymm3);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ movdqa128_256(xmm7, ymm8);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ insert256_hi128(xmm7, ymm8);
+
+ pxor256(ymm3, ymm0);
+ pxor256(ymm8, ymm1);
+
+ aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15, xmm8);
+
+ pxor256(ymm3, ymm0);
+ pxor256(ymm8, ymm1);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ pxor256(ymm0, ymm6);
+ pxor256(ymm1, ymm6);
+
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+
+ extract256_hi128(ymm6, xmm0);
+ movdqa256_128(ymm6, xmm6);
+ pxor128(xmm0, xmm6);
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
const unsigned char *l;
@@ -2044,6 +2529,61 @@ FUNC_OCB_AUTH(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
movdqu128_memld(c->u_mode.ocb.aad_offset, xmm7);
movdqu128_memld(c->u_mode.ocb.aad_sum, xmm6);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support() && nblocks >= 4)
+ {
+ __m256i ymm0, ymm1, ymm3, ymm6, ymm8;
+
+ movdqa128_256(xmm6, ymm6);
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ const unsigned char *l;
+
+ movdqu256_memld(abuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(abuf + 2 * BLOCKSIZE, ymm1);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ movdqa128_256(xmm7, ymm3);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ insert256_hi128(xmm7, ymm3);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ movdqa128_256(xmm7, ymm8);
+
+ l = ocb_get_l(c, ++n);
+ movdqu128_memld(l, xmm2);
+ pxor128(xmm2, xmm7);
+ insert256_hi128(xmm7, ymm8);
+
+ pxor256(ymm3, ymm0);
+ pxor256(ymm8, ymm1);
+
+ aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15);
+
+ pxor256(ymm0, ymm6);
+ pxor256(ymm1, ymm6);
+
+ abuf += 4 * BLOCKSIZE;
+ }
+
+ extract256_hi128(ymm6, xmm0);
+ movdqa256_128(ymm6, xmm6);
+ pxor128(xmm0, xmm6);
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
const unsigned char *l;
@@ -2117,6 +2657,29 @@ aes_simd128_ecb_enc (void *context, void *outbuf_arg, const void *inbuf_arg,
enc_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+ aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15);
+
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqu128_memld(inbuf + 0 * BLOCKSIZE, xmm0);
@@ -2171,6 +2734,29 @@ aes_simd128_ecb_dec (void *context, void *outbuf_arg, const void *inbuf_arg,
dec_preload(xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm8);
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+ aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15, xmm8);
+
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ inbuf += 4 * BLOCKSIZE;
+ outbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqu128_memld(inbuf + 0 * BLOCKSIZE, xmm0);
@@ -2216,13 +2802,13 @@ FUNC_ECB_CRYPT (void *context, void *outbuf_arg, const void *inbuf_arg,
static ASM_FUNC_ATTR_INLINE __m128i xts_gfmul_byA (__m128i xmm5)
{
- static const __m128i xts_gfmul_const = M128I_U64(0x87, 0x01);
+ static const __m128i_const xts_gfmul_const = M128I_U64(0x87, 0x01);
__m128i xmm1;
pshufd128_0x4E(xmm5, xmm1);
psraq128(63, xmm1);
paddq128(xmm5, xmm5);
- pand128(xts_gfmul_const, xmm1);
+ pand128_amemld(&xts_gfmul_const, xmm1);
pxor128(xmm1, xmm5);
return xmm5;
@@ -2246,6 +2832,43 @@ aes_simd128_xts_enc (void *context, unsigned char *tweak, void *outbuf_arg,
movdqu128_memld(tweak, xmm7); /* Preload tweak */
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1, ymm2, ymm3;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+ movdqa128_256(xmm7, ymm2);
+ xmm7 = xts_gfmul_byA(xmm7);
+ insert256_hi128(xmm7, ymm2);
+ xmm7 = xts_gfmul_byA(xmm7);
+ movdqa128_256(xmm7, ymm3);
+ xmm7 = xts_gfmul_byA(xmm7);
+ insert256_hi128(xmm7, ymm3);
+ xmm7 = xts_gfmul_byA(xmm7);
+
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+
+ aes_encrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15);
+
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ outbuf += 4 * BLOCKSIZE;
+ inbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqu128_memld(inbuf, xmm0);
@@ -2315,6 +2938,43 @@ aes_simd128_xts_dec (void *context, unsigned char *tweak, void *outbuf_arg,
movdqu128_memld(tweak, xmm7); /* Preload tweak */
+#ifdef HAVE_SIMD256
+ if (check_simd256_support())
+ {
+ __m256i ymm0, ymm1, ymm2, ymm3;
+
+ for (; nblocks >= 4; nblocks -= 4)
+ {
+ movdqu256_memld(inbuf + 0 * BLOCKSIZE, ymm0);
+ movdqu256_memld(inbuf + 2 * BLOCKSIZE, ymm1);
+
+ movdqa128_256(xmm7, ymm2);
+ xmm7 = xts_gfmul_byA(xmm7);
+ insert256_hi128(xmm7, ymm2);
+ xmm7 = xts_gfmul_byA(xmm7);
+ movdqa128_256(xmm7, ymm3);
+ xmm7 = xts_gfmul_byA(xmm7);
+ insert256_hi128(xmm7, ymm3);
+ xmm7 = xts_gfmul_byA(xmm7);
+
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+
+ aes_decrypt_core_4blks_simd256(&ymm0, &ymm1, config,
+ xmm9, xmm10, xmm11, xmm12, xmm13,
+ xmm14, xmm15, xmm8);
+
+ pxor256(ymm2, ymm0);
+ pxor256(ymm3, ymm1);
+ movdqu256_memst(ymm0, outbuf + 0 * BLOCKSIZE);
+ movdqu256_memst(ymm1, outbuf + 2 * BLOCKSIZE);
+
+ outbuf += 4 * BLOCKSIZE;
+ inbuf += 4 * BLOCKSIZE;
+ }
+ }
+#endif /* HAVE_SIMD256 */
+
for (; nblocks >= 2; nblocks -= 2)
{
movdqu128_memld(inbuf, xmm0);
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 12c27319..0c48793b 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -224,6 +224,62 @@ extern void _gcry_aes_vp_aarch64_xts_crypt (void *context, unsigned char *tweak,
size_t nblocks, int encrypt);
#endif
+#ifdef USE_VP_RISCV
+/* RISC-V vector permutation implementation of AES */
+extern int _gcry_aes_vp_riscv_setup_acceleration(RIJNDAEL_context *ctx);
+
+extern void _gcry_aes_vp_riscv_do_setkey(RIJNDAEL_context *ctx,
+ const byte *key);
+extern void _gcry_aes_vp_riscv_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_vp_riscv_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_vp_riscv_decrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern void _gcry_aes_vp_riscv_cfb_enc (void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vp_riscv_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ int cbc_mac);
+extern void _gcry_aes_vp_riscv_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vp_riscv_ctr32le_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vp_riscv_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vp_riscv_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+extern size_t _gcry_aes_vp_riscv_ocb_crypt (gcry_cipher_hd_t c,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks,
+ int encrypt);
+extern size_t _gcry_aes_vp_riscv_ocb_auth (gcry_cipher_hd_t c,
+ const void *abuf_arg,
+ size_t nblocks);
+extern void _gcry_aes_vp_riscv_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+extern void _gcry_aes_vp_riscv_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks, int encrypt);
+#endif
+
#ifdef USE_PADLOCK
extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx,
@@ -718,6 +774,30 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->xts_crypt = _gcry_aes_vp_aarch64_xts_crypt;
}
#endif
+#ifdef USE_VP_RISCV
+ else if ((hwfeatures & HWF_RISCV_IMAFDC) && (hwfeatures & HWF_RISCV_V) &&
+ _gcry_aes_vp_riscv_setup_acceleration(ctx))
+ {
+ hw_setkey = _gcry_aes_vp_riscv_do_setkey;
+ ctx->encrypt_fn = _gcry_aes_vp_riscv_encrypt;
+ ctx->decrypt_fn = _gcry_aes_vp_riscv_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->prepare_decryption = _gcry_aes_vp_riscv_prepare_decryption;
+
+ /* Setup vector permute RISC-V bulk encryption routines. */
+ bulk_ops->cfb_enc = _gcry_aes_vp_riscv_cfb_enc;
+ bulk_ops->cfb_dec = _gcry_aes_vp_riscv_cfb_dec;
+ bulk_ops->cbc_enc = _gcry_aes_vp_riscv_cbc_enc;
+ bulk_ops->cbc_dec = _gcry_aes_vp_riscv_cbc_dec;
+ bulk_ops->ctr_enc = _gcry_aes_vp_riscv_ctr_enc;
+ bulk_ops->ctr32le_enc = _gcry_aes_vp_riscv_ctr32le_enc;
+ bulk_ops->ocb_crypt = _gcry_aes_vp_riscv_ocb_crypt;
+ bulk_ops->ocb_auth = _gcry_aes_vp_riscv_ocb_auth;
+ bulk_ops->ecb_crypt = _gcry_aes_vp_riscv_ecb_crypt;
+ bulk_ops->xts_crypt = _gcry_aes_vp_riscv_xts_crypt;
+ }
+#endif
#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
else if ((hwfeatures & HWF_PPC_VCRYPTO) && (hwfeatures & HWF_PPC_ARCH_3_00))
{
diff --git a/cipher/simd-common-riscv.h b/cipher/simd-common-riscv.h
new file mode 100644
index 00000000..8381000f
--- /dev/null
+++ b/cipher/simd-common-riscv.h
@@ -0,0 +1,48 @@
+/* simd-common-riscv.h - Common macros for RISC-V vector code
+ *
+ * Copyright (C) 2025 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_SIMD_COMMON_RISCV_H
+#define GCRY_SIMD_COMMON_RISCV_H
+
+#include <config.h>
+
+#define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+
+#define clear_vec_regs() __asm__ volatile("vsetvli zero, %0, e8, m1, ta, ma;\n" \
+ "vmv.v.i v0, 0;\n" \
+ "vmv.v.i v1, 0;\n" \
+ "vmv2r.v v2, v0;\n" \
+ "vmv4r.v v4, v0;\n" \
+ "vmv8r.v v8, v0;\n" \
+ "vmv8r.v v16, v0;\n" \
+ "vmv8r.v v24, v0;\n" \
+ : \
+ : "r" (~0) \
+ : "memory", "vl", "vtype", \
+ "v0", "v1", "v2", "v3", \
+ "v4", "v5", "v6", "v7", \
+ "v8", "v9", "v10", "v11", \
+ "v12", "v13", "v14", "v15", \
+ "v16", "v17", "v18", "v19", \
+ "v20", "v21", "v22", "v23", \
+ "v24", "v25", "v26", "v27", \
+ "v28", "v29", "v30", "v31")
+
+#endif /* GCRY_SIMD_COMMON_RISCV_H */
diff --git a/configure.ac b/configure.ac
index f20d654d..55d15fa3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2705,6 +2705,120 @@ if test "$gcry_cv_gcc_inline_asm_riscv_v" = "yes" ; then
fi
+#
+# Check whether compiler supports RISC-V vector intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics],
+ [gcry_cv_cc_riscv_vector_intrinsics],
+ [if test "$mpi_cpu_arch" != "riscv64" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_cc_riscv_vector_intrinsics="n/a"
+ else
+ gcry_cv_cc_riscv_vector_intrinsics=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000)
+ #error __riscv_v_intrinsic not defined or too old version
+ #endif
+ #include <riscv_vector.h>
+ typedef vuint8m1_t __m128i;
+ #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a))
+ #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a))
+ #define paddq128(a, o) (o = cast_u64_to_m128i( \
+ __riscv_vadd_vv_u64m1( \
+ cast_m128i_to_u64(o), \
+ cast_m128i_to_u64(a), 2)))
+ #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16))
+ #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+ #define clear_vec_reg_v0() \
+ __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \
+ "vmv.v.x v0, zero;\n" \
+ ::: "memory", "vtype", "vl", "v0")
+ static inline __attribute__((always_inline)) __m128i
+ fn2(__m128i a)
+ {
+ paddq128(a, a);
+ return a;
+ }
+ __m128i fn(__m128i in)
+ {
+ __m128i x;
+ memory_barrier_with_vec(in);
+ x = fn2(in);
+ memory_barrier_with_vec(x);
+ pshufb128(in, x);
+ memory_barrier_with_vec(in);
+ clear_vec_reg_v0();
+ return in;
+ }
+ ]])],
+ [gcry_cv_cc_riscv_vector_intrinsics=yes])
+ fi])
+if test "$gcry_cv_cc_riscv_vector_intrinsics" = "yes" ; then
+ AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1,
+ [Defined if underlying compiler supports RISC-V vector intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=rv64imafdcv -mstrict-align"
+
+if test "$gcry_cv_cc_riscv_vector_intrinsics" = "no" &&
+ test "$mpi_cpu_arch" = "riscv64" &&
+ test "$try_asm_modules" = "yes" ; then
+ AC_CACHE_CHECK([whether compiler supports RISC-V vector intrinsics with extra GCC flags],
+ [gcry_cv_cc_riscv_vector_intrinsics_cflags],
+ [gcry_cv_cc_riscv_vector_intrinsics_cflags=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[#if !(defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 12000)
+ #error __riscv_v_intrinsic not defined or too old version
+ #endif
+ #include <riscv_vector.h>
+ typedef vuint8m1_t __m128i;
+ #define cast_m128i_to_u64(a) (__riscv_vreinterpret_v_u8m1_u64m1(a))
+ #define cast_u64_to_m128i(a) (__riscv_vreinterpret_v_u64m1_u8m1(a))
+ #define paddq128(a, o) (o = cast_u64_to_m128i( \
+ __riscv_vadd_vv_u64m1( \
+ cast_m128i_to_u64(o), \
+ cast_m128i_to_u64(a), 2)))
+ #define pshufb128(m8, o) (o = __riscv_vrgather_vv_u8m1((o), (m8), 16))
+ #define memory_barrier_with_vec(a) __asm__("" : "+vr"(a) :: "memory")
+ #define clear_vec_reg_v0() \
+ __asm__ volatile("vsetivli zero, 16, e8, m1, ta, ma;\n" \
+ "vmv.v.x v0, zero;\n" \
+ ::: "memory", "vl", "v0")
+ static inline __attribute__((always_inline)) __m128i
+ fn2(__m128i a)
+ {
+ paddq128(a, a);
+ return a;
+ }
+ __m128i fn(__m128i in)
+ {
+ __m128i x;
+ memory_barrier_with_vec(in);
+ x = fn2(in);
+ memory_barrier_with_vec(x);
+ pshufb128(in, x);
+ memory_barrier_with_vec(in);
+ clear_vec_reg_v0();
+ return in;
+ }
+ ]])],
+ [gcry_cv_cc_riscv_vector_intrinsics_cflags=yes])])
+ if test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes" ; then
+ AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS,1,
+ [Defined if underlying compiler supports RISC-V vector intrinsics])
+ AC_DEFINE(HAVE_COMPATIBLE_CC_RISCV_VECTOR_INTRINSICS_WITH_CFLAGS,1,
+ [Defined if underlying compiler supports RISC-V vector intrinsics with extra GCC flags])
+ fi
+fi
+
+AM_CONDITIONAL(ENABLE_RISCV_VECTOR_INTRINSICS_EXTRA_CFLAGS,
+ test "$gcry_cv_cc_riscv_vector_intrinsics_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
#######################################
#### Checks for library functions. ####
#######################################
@@ -3183,6 +3297,10 @@ if test "$found" = "1" ; then
# Build with the crypto extension implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-ppc.lo"
;;
+ riscv64-*-*)
+ # Build with the vector permute SIMD128 implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS rijndael-vp-riscv.lo"
+ ;;
s390x-*-*)
# Big-Endian.
# Build with the crypto extension implementation
--
2.45.2