From guidovranken at gmail.com Sun Jul 3 17:02:21 2022 From: guidovranken at gmail.com (Guido Vranken) Date: Sun, 3 Jul 2022 17:02:21 +0200 Subject: Reject invalid HKDF key sizes In-Reply-To: <87v8su7goz.fsf@akagi.fsij.org> References: <87v8su7goz.fsf@akagi.fsij.org> Message-ID: Your fix introduces a memory leak. Please free 'h' before returning. diff --git a/cipher/kdf.c b/cipher/kdf.c index 2e5eef3..d371bdd 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -1699,7 +1699,10 @@ hkdf_open (gcry_kdf_hd_t *hd, int macalgo, } if (outlen > 255 * h->blklen) - return GPG_ERR_INV_VALUE; + { + xfree (h); + return GPG_ERR_INV_VALUE; + } ec = _gcry_mac_open (&h->md, macalgo, 0, NULL); if (ec) On Tue, Jun 21, 2022 at 7:02 AM NIIBE Yutaka wrote: > Guido Vranken wrote: > > HKDF prohibits output sizes which exceed digest size * 255. See section > 2.3 > > of RFC 5869. > > Thank you. > > Fixed in the commit: > > e0f0c788dc0f268965c0f63eb33d9f98c0575d58 > -- > -------------- next part -------------- An HTML attachment was scrubbed... URL: From cllang at redhat.com Tue Jul 5 17:31:11 2022 From: cllang at redhat.com (Clemens Lang) Date: Tue, 5 Jul 2022 17:31:11 +0200 Subject: [PATCH] tests/basic: Skip non-FIPS tests in FIPS mode Message-ID: <20220705153111.540552-1-cllang@redhat.com> * tests/basic.c (check_pubkey): Skip non-FIPS tests in FIPS mode, fixes a logic error previously introduced in e9698002. -- e96980022e5ec079c9d4e3492eb6a1131c68e0f2 did change the behavior of tests/basic.c to actually run non-FIPS tests in FIPS mode because the 'continue' statement was moved into the else block. This fails when libgcrypt is configured, for example, without support for RSA. GnuPG-bug-id: 6048 Signed-off-by: Clemens Lang --- tests/basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/basic.c b/tests/basic.c index ecbe58c2..3cf5ee2f 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -17098,8 +17098,8 @@ check_pubkey (void) else { show_pk_not_available (pubkeys[i].id); - continue; } + continue; } check_one_pubkey (i, pubkeys[i]); } -- 2.35.3 From cllang at redhat.com Wed Jul 6 16:55:59 2022 From: cllang at redhat.com (Clemens Lang) Date: Wed, 6 Jul 2022 16:55:59 +0200 Subject: [PATCH] tests: Test gcry_pk_hash_sign w/explicit hash algo Message-ID: <20220706145559.64970-1-cllang@redhat.com> * tests/t-ecdsa.c (one_test_sexp): Re-run signature operation with hash algorithm explicitly specified in data_tmpl as documented in the manpage. -- The code path to decode the explicit hash algorithm specification in data_tmpl was previously not covered by tests. Verifying with a data_tmpl that contains the hash algorithm as string currently fails and should be enabled later. See also https://dev.gnupg.org/T6066. 
Signed-off-by: Clemens Lang --- tests/t-ecdsa.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/t-ecdsa.c b/tests/t-ecdsa.c index fa0a2ef9..725fcb4f 100644 --- a/tests/t-ecdsa.c +++ b/tests/t-ecdsa.c @@ -225,10 +225,11 @@ one_test_sexp (const char *curvename, const char *sha_alg, gcry_ctx_t ctx = NULL; int md_algo; const char *data_tmpl; + char data_tmpl2[256]; gcry_md_hd_t hd = NULL; gcry_sexp_t s_pk = NULL; gcry_sexp_t s_sk = NULL; - gcry_sexp_t s_sig= NULL; + gcry_sexp_t s_sig = NULL, s_sig2 = NULL; gcry_sexp_t s_tmp, s_tmp2; unsigned char *out_r = NULL; unsigned char *out_s = NULL; @@ -373,6 +374,21 @@ one_test_sexp (const char *curvename, const char *sha_alg, goto leave; } + if (snprintf (data_tmpl2, sizeof(data_tmpl2), + "(data(flags raw)(hash %s %%b)(label %%b))", + gcry_md_algo_name(md_algo)) >= sizeof(data_tmpl2)) + { + fail ("snprintf out of bounds"); + goto leave; + } + err = gcry_pk_hash_sign (&s_sig2, data_tmpl2, s_sk, hd, ctx); + if (err) + { + fail ("gcry_pk_hash_sign with explicit hash algorithm %s failed: %s", + gcry_md_algo_name (md_algo), gpg_strerror (err)); + goto leave; + } + out_r_len = out_s_len = 0; out_s = out_r = NULL; s_tmp2 = NULL; @@ -470,11 +486,20 @@ one_test_sexp (const char *curvename, const char *sha_alg, if (err) fail ("gcry_pk_hash_verify failed for test: %s", gpg_strerror (err)); + + /* TODO Verifying with data_tmpl2 crashes because gcry_pk_hash_verify() + * does not support specifying the hash algorithm explicitly. See + * https://dev.gnupg.org/T6066, which tracks this problem. */ + err = gcry_pk_hash_verify (s_sig2, data_tmpl, s_pk, hd, ctx); + if (err) + fail ("gcry_pk_hash_verify with explicit hash algorithm %s failed: %s", + gcry_md_algo_name (md_algo), gpg_strerror (err)); } leave: gcry_ctx_release (ctx); gcry_sexp_release (s_sig); + gcry_sexp_release (s_sig2); gcry_sexp_release (s_sk); gcry_sexp_release (s_pk); if (hd) -- 2.35.3 From cllang at redhat.com Wed Jul 6 18:33:42 2022 From: cllang at redhat.com (Clemens Lang) Date: Wed, 6 Jul 2022 18:33:42 +0200 Subject: [PATCH] tests/t-kdf: Test KDF FIPS indicator Message-ID: <20220706163342.73999-1-cllang@redhat.com> * tests/t-kdf.c (check_fips_indicators): Add test for gcry_control (GCRYCTL_FIPS_SERVICE_INDICATOR_KDF). -- Add a tests that checks that gcry_control(GCRYCTL_FIPS_SERVICE_INDICATOR_KDF) works correctly, does not return unexpected values, and returns that only PBKDF2 is approved at the moment. 
Signed-off-by: Clemens Lang --- tests/t-kdf.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/t-kdf.c b/tests/t-kdf.c index 454b7c48..d9d57778 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1895,6 +1895,67 @@ check_hkdf (void) goto again; } +static void +check_fips_indicators (void) +{ + enum gcry_kdf_algos fips_kdf_algos[] = { + GCRY_KDF_PBKDF2, + }; + enum gcry_kdf_algos kdf_algos[] = { + GCRY_KDF_SIMPLE_S2K, + GCRY_KDF_SALTED_S2K, + GCRY_KDF_ITERSALTED_S2K, + GCRY_KDF_PBKDF1, + GCRY_KDF_PBKDF2, + GCRY_KDF_SCRYPT, + GCRY_KDF_ARGON2 , + GCRY_KDF_BALLOON , + GCRY_KDF_ONESTEP_KDF, + GCRY_KDF_ONESTEP_KDF_MAC, + GCRY_KDF_HKDF, + }; + size_t i, j; + + for (i = 0; i < sizeof(kdf_algos) / sizeof(*kdf_algos); i++) + { + int is_fips_kdf_algo = 0; + gcry_error_t err = gcry_control (GCRYCTL_FIPS_SERVICE_INDICATOR_KDF, kdf_algos[i]); + + if (verbose) + fprintf (stderr, "checking FIPS indicator for KDF %d: %s\n", + kdf_algos[i], gcry_strerror (err)); + + for (j = 0; j < sizeof(fips_kdf_algos) / sizeof(*fips_kdf_algos); j++) + { + if (kdf_algos[i] == fips_kdf_algos[j]) + { + is_fips_kdf_algo = 1; + break; + } + } + + switch (err & GPG_ERR_CODE_MASK) + { + case GPG_ERR_NO_ERROR: + if (!is_fips_kdf_algo) + fail ("KDF algorithm %d is marked as approved by" + " GCRYCTL_FIPS_SERVICE_INDICATOR_KDF, but only PBKDF2 should" + " be marked as approved.", kdf_algos[i]); + break; + case GPG_ERR_NOT_SUPPORTED: + if (is_fips_kdf_algo) + fail ("KDF algorithm %d is marked as not approved by" + " GCRYCTL_FIPS_SERVICE_INDICATOR_KDF, but it should be" + " approved", kdf_algos[i]); + break; + default: + fail ("Unexpected error '%s' (%d) returned by" + " GCRYCTL_FIPS_SERVICE_INDICATOR_KDF for KDF algorithm %d", + gcry_strerror (err), err, kdf_algos[i]); + } + } +} + int main (int argc, char **argv) @@ -1976,6 +2037,8 @@ main (int argc, char **argv) check_balloon (); check_onestep_kdf (); check_hkdf (); + if (in_fips_mode) + check_fips_indicators(); } return error_count ? 1 : 0; -- 2.35.3 From jussi.kivilinna at iki.fi Sat Jul 16 18:00:03 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 16 Jul 2022 19:00:03 +0300 Subject: [PATCH 1/3] hwf-arm: add ARM HW feature detection support for MacOS Message-ID: <20220716160005.1186238-1-jussi.kivilinna@iki.fi> * configure.ac: Add detection for header 'sys/sysctl.h' and system function 'sysctlbyname'. * src/hwf-arm.c (HAS_APPLE_SYSCTLBYNAME) (detect_arm_apple_sysctlbyname): New. (detect_arm_hwf_by_toolchain) [__ARM_FEATURE_CRYPTO]: Also check for ENABLE_ARM_CRYPTO_SUPPORT. (_gcry_hwf_detect_arm) [HAS_APPLE_SYSCTLBYNAME]: Check HWFs with 'detect_arm_apple_sysctlbyname' function. -- Signed-off-by: Jussi Kivilinna --- configure.ac | 4 +-- src/hwf-arm.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 946659c5..a907e085 100644 --- a/configure.ac +++ b/configure.ac @@ -806,7 +806,7 @@ AC_SEARCH_LIBS(setsockopt, [nsl]) #### Checks for header files. 
#### ################################## -AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h) +AC_CHECK_HEADERS(unistd.h sys/auxv.h sys/random.h sys/sysctl.h) ########################################## @@ -2211,7 +2211,7 @@ AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) -AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy) +AC_CHECK_FUNCS(explicit_bzero explicit_memset getentropy sysctlbyname) GNUPG_CHECK_MLOCK diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 70d375b2..a0205ee1 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -1,5 +1,5 @@ /* hwf-arm.c - Detect hardware features - ARM part - * Copyright (C) 2013,2019 Jussi Kivilinna + * Copyright (C) 2013,2019,2022 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -28,6 +28,10 @@ defined(HAVE_ELF_AUX_INFO)) #include #endif +#if defined(__APPLE__) && defined(HAVE_SYS_SYSCTL_H) && \ + defined(HAVE_SYSCTLBYNAME) +#include +#endif #include "g10lib.h" #include "hwf-common.h" @@ -385,6 +389,63 @@ detect_arm_proc_cpuinfo(unsigned int *broken_hwfs) #endif /* __linux__ */ + +#undef HAS_APPLE_SYSCTLBYNAME +#if defined(__APPLE__) && defined(HAVE_SYS_SYSCTL_H) && \ + defined(HAVE_SYSCTLBYNAME) +#define HAS_APPLE_SYSCTLBYNAME 1 + +static unsigned int +detect_arm_apple_sysctlbyname (void) +{ + static const struct + { + const char *feat_name; + unsigned int hwf_flag; + } hw_optional_arm_features[] = + { +#ifdef ENABLE_NEON_SUPPORT + { "hw.optional.neon", HWF_ARM_NEON }, + { "hw.optional.AdvSIMD", HWF_ARM_NEON }, +#endif +#ifdef ENABLE_ARM_CRYPTO_SUPPORT + { "hw.optional.arm.FEAT_AES", HWF_ARM_AES }, + { "hw.optional.arm.FEAT_SHA1", HWF_ARM_SHA1 }, + { "hw.optional.arm.FEAT_SHA256", HWF_ARM_SHA2 }, + { "hw.optional.arm.FEAT_PMULL", HWF_ARM_PMULL }, + { "hw.optional.arm.FEAT_SHA3", HWF_ARM_SHA3 }, + { "hw.optional.armv8_2_sha3", HWF_ARM_SHA3 }, + { "hw.optional.arm.FEAT_SHA512", HWF_ARM_SHA512 }, + { "hw.optional.armv8_2_sha512", HWF_ARM_SHA512 }, +#endif + }; + unsigned int i; + unsigned int hwf = 0; + + for (i = 0; i < DIM(hw_optional_arm_features); i++) + { + const char *name = hw_optional_arm_features[i].feat_name; + int sysctl_value = 0; + size_t value_size = sizeof(sysctl_value); + + if (sysctlbyname (name, &sysctl_value, &value_size, NULL, 0) != 0) + continue; + + if (value_size != sizeof(sysctl_value)) + continue; + + if (sysctl_value == 1) + { + hwf |= hw_optional_arm_features[i].hwf_flag; + } + } + + return hwf; +} + +#endif /* __APPLE__ */ + + static unsigned int detect_arm_hwf_by_toolchain (void) { @@ -414,7 +475,7 @@ detect_arm_hwf_by_toolchain (void) #endif /* __ARM_NEON */ -#if defined(__ARM_FEATURE_CRYPTO) +#if defined(__ARM_FEATURE_CRYPTO) && defined(ENABLE_ARM_CRYPTO_SUPPORT) /* ARMv8 crypto extensions include support for PMULL, AES, SHA1 and SHA2 * instructions. 
*/ ret |= HWF_ARM_PMULL; @@ -464,6 +525,10 @@ _gcry_hwf_detect_arm (void) ret |= detect_arm_proc_cpuinfo (&broken_hwfs); #endif +#if defined (HAS_APPLE_SYSCTLBYNAME) + ret |= detect_arm_apple_sysctlbyname (); +#endif + ret |= detect_arm_hwf_by_toolchain (); ret &= ~broken_hwfs; -- 2.34.1 From jussi.kivilinna at iki.fi Sat Jul 16 18:00:04 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 16 Jul 2022 19:00:04 +0300 Subject: [PATCH 2/3] hwf-x86: fix UBSAN warning In-Reply-To: <20220716160005.1186238-1-jussi.kivilinna@iki.fi> References: <20220716160005.1186238-1-jussi.kivilinna@iki.fi> Message-ID: <20220716160005.1186238-2-jussi.kivilinna@iki.fi> * src/hwf-x86.c (detect_x86_gnuc): Change `(1 << 31)` to `(1U << 31)` to fix undefined behaviour. -- Signed-off-by: Jussi Kivilinna --- src/hwf-x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 20420798..b440827e 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -431,7 +431,7 @@ detect_x86_gnuc (void) && (features & (1 << 21)) && (features & (1 << 28)) && (features & (1 << 30)) - && (features & (1 << 31)) + && (features & (1U << 31)) && (features2 & (1 << 1)) && (features2 & (1 << 6)) && (features2 & (1 << 11)) -- 2.34.1 From jussi.kivilinna at iki.fi Sat Jul 16 18:00:05 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 16 Jul 2022 19:00:05 +0300 Subject: [PATCH 3/3] visibility: add missing fips_is_operational check for gcry_md_extract In-Reply-To: <20220716160005.1186238-1-jussi.kivilinna@iki.fi> References: <20220716160005.1186238-1-jussi.kivilinna@iki.fi> Message-ID: <20220716160005.1186238-3-jussi.kivilinna@iki.fi> * src/visibility.c (gcry_md_extract): Add 'fips_is_operational' check. -- Signed-off-by: Jussi Kivilinna --- src/visibility.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/visibility.c b/src/visibility.c index aee5bffb..daaf4033 100644 --- a/src/visibility.c +++ b/src/visibility.c @@ -1244,7 +1244,9 @@ gcry_md_read (gcry_md_hd_t hd, int algo) gcry_error_t gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer, size_t length) { - return gpg_error (_gcry_md_extract(hd, algo, buffer, length)); + if (!fips_is_operational ()) + return gpg_error (fips_not_operational ()); + return gpg_error (_gcry_md_extract (hd, algo, buffer, length)); } void -- 2.34.1 From tianjia.zhang at linux.alibaba.com Wed Jul 20 10:25:35 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Wed, 20 Jul 2022 16:25:35 +0800 Subject: [PATCH 1/3] Add detection for HW feature "ARMv8 SVE" Message-ID: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> * configure.ac (svesupport, gcry_cv_gcc_inline_asm_aarch64_sve) (ENABLE_SVE_SUPPORT): New. * doc/gcrypt.texi: Add "arm-sve" to HW features list. * src/g10lib.h (HWF_ARM_SVE): New. * src/hwf-arm.c (arm_features): Add "sve". * src/hwfeatures.c (hwflist): Add "arm-sve". -- Signed-off-by: Tianjia Zhang --- configure.ac | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ doc/gcrypt.texi | 1 + src/g10lib.h | 1 + src/hwf-arm.c | 6 ++++++ src/hwfeatures.c | 1 + 5 files changed, 59 insertions(+) diff --git a/configure.ac b/configure.ac index 8bb8d66f2fb7..0bb345c1fead 100644 --- a/configure.ac +++ b/configure.ac @@ -698,6 +698,14 @@ AC_ARG_ENABLE(arm-crypto-support, armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) +# Implementation of the --disable-sve-support switch. 
+AC_MSG_CHECKING([whether SVE support is requested]) +AC_ARG_ENABLE(sve-support, + AS_HELP_STRING([--disable-sve-support], + [Disable support for the ARMv8 SVE instructions]), + svesupport=$enableval,svesupport=yes) +AC_MSG_RESULT($svesupport) + # Implementation of the --disable-ppc-crypto-support switch. AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, @@ -1321,6 +1329,7 @@ if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" + svesupport="n/a" fi fi @@ -1974,6 +1983,35 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then fi +# +# Check whether GCC inline assembler supports AArch64 SVE instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE instructions], + [gcry_cv_gcc_inline_asm_aarch64_sve], + [if test "$mpi_cpu_arch" != "aarch64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_gcc_inline_asm_aarch64_sve="n/a" + else + gcry_cv_gcc_inline_asm_aarch64_sve=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[__asm__( + ".cpu generic+simd+sve\n\t" + ".text\n\t" + "testfn:\n\t" + "mov x0, \#60;\n\t" + "whilelo p0.s, xzr, x0;\n\t" + "mov z0.s, p0/z, \#55;\n\t" + "ld1b {z0.b}, p0/z, [x1];\n\t" + ); + ]], [ testfn(); ])], + [gcry_cv_gcc_inline_asm_aarch64_sve=yes]) + fi]) +if test "$gcry_cv_gcc_inline_asm_aarch64_sve" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE,1, + [Defined if inline assembler supports AArch64 SVE instructions]) +fi + + # # Check whether PowerPC AltiVec/VSX intrinsics # @@ -2462,6 +2500,13 @@ if test x"$armcryptosupport" = xyes ; then fi fi fi +if test x"$svesupport" = xyes ; then + if test "$gcry_cv_gcc_inline_asm_sve" != "yes" ; then + if test "$gcry_cv_gcc_inline_asm_aarch64_sve" != "yes" ; then + svesupport="no (unsupported by compiler)" + fi + fi +fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, @@ -2503,6 +2548,10 @@ if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi +if test x"$svesupport" = xyes ; then + AC_DEFINE(ENABLE_SVE_SUPPORT,1, + [Enable support for ARMv8 SVE instructions.]) +fi if test x"$ppccryptosupport" = xyes ; then AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) @@ -3385,6 +3434,7 @@ GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support]) GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport]) GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport]) +GCRY_MSG_SHOW([Try using ARMv8 SVE: ],[$svesupport]) GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport]) GCRY_MSG_SHOW([],[]) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index b82535e236b8..5e07926bdaf0 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -601,6 +601,7 @@ are @item arm-sm3 @item arm-sm4 @item arm-sha512 + at item arm-sve @item ppc-vcrypto @item ppc-arch_3_00 @item ppc-arch_2_07 diff --git a/src/g10lib.h b/src/g10lib.h index a5bed0027eb1..91d53ff37d96 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -251,6 +251,7 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_ARM_SM3 (1 << 6) #define HWF_ARM_SM4 (1 << 7) #define HWF_ARM_SHA512 (1 << 8) +#define HWF_ARM_SVE (1 << 9) #elif defined(HAVE_CPU_ARCH_PPC) diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 70d375b21f55..3b6563190a9c 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -149,6 +149,9 @@ static const 
struct feature_map_s arm_features[] = #ifndef HWCAP_SHA512 # define HWCAP_SHA512 (1 << 21) #endif +#ifndef HWCAP_SVE +# define HWCAP_SVE (1 << 22) +#endif static const struct feature_map_s arm_features[] = { @@ -164,6 +167,9 @@ static const struct feature_map_s arm_features[] = { HWCAP_SM3, 0, " sm3", HWF_ARM_SM3 }, { HWCAP_SM4, 0, " sm4", HWF_ARM_SM4 }, { HWCAP_SHA512, 0, " sha512", HWF_ARM_SHA512 }, +#endif +#ifdef ENABLE_SVE_SUPPORT + { HWCAP_SVE, 0, " sve", HWF_ARM_SVE }, #endif }; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index af5daf62134d..dec5efd3c196 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -74,6 +74,7 @@ static struct { HWF_ARM_SM3, "arm-sm3" }, { HWF_ARM_SM4, "arm-sm4" }, { HWF_ARM_SHA512, "arm-sha512" }, + { HWF_ARM_SVE, "arm-sve" }, #elif defined(HAVE_CPU_ARCH_PPC) { HWF_PPC_VCRYPTO, "ppc-vcrypto" }, { HWF_PPC_ARCH_3_00, "ppc-arch_3_00" }, -- 2.24.3 (Apple Git-128) From tianjia.zhang at linux.alibaba.com Wed Jul 20 10:25:36 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Wed, 20 Jul 2022 16:25:36 +0800 Subject: [PATCH 2/3] hwf-arm: add ARMv9 SVE2 and optional Crypto Extension HW features In-Reply-To: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> References: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> Message-ID: <20220720082537.4973-2-tianjia.zhang@linux.alibaba.com> * doc/gcrypt.texi: Add "sve2, sveaes, svepmull, svesha3, svesm4" to ARM hardware features list. * src/g10lib.h (HWF_ARM_SVE2, HWF_ARM_SVEAES, HWF_ARM_SVEPMULL) (HWF_ARM_SVESHA3, HWF_ARM_SVESM4): New. * src/hwf-arm.c (arm_features): Add "sve2, sveaes, svepmull, svesha3, svesm4". * src/hwfeatures.c (hwflist): Add "arm-sve2, arm-sveaes, arm-svepmull, arm-svesha3, arm-svesm4". -- Signed-off-by: Tianjia Zhang --- doc/gcrypt.texi | 5 +++++ src/g10lib.h | 5 +++++ src/hwf-arm.c | 21 +++++++++++++++++++++ src/hwfeatures.c | 5 +++++ 4 files changed, 36 insertions(+) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 5e07926bdaf0..f2c1cc948d23 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -602,6 +602,11 @@ are @item arm-sm4 @item arm-sha512 @item arm-sve + at item arm-sve2 + at item arm-sveaes + at item arm-svepmull + at item arm-svesha3 + at item arm-svesm4 @item ppc-vcrypto @item ppc-arch_3_00 @item ppc-arch_2_07 diff --git a/src/g10lib.h b/src/g10lib.h index 91d53ff37d96..8ba0a5c2aa0f 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -252,6 +252,11 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_ARM_SM4 (1 << 7) #define HWF_ARM_SHA512 (1 << 8) #define HWF_ARM_SVE (1 << 9) +#define HWF_ARM_SVE2 (1 << 10) +#define HWF_ARM_SVEAES (1 << 11) +#define HWF_ARM_SVEPMULL (1 << 12) +#define HWF_ARM_SVESHA3 (1 << 13) +#define HWF_ARM_SVESM4 (1 << 14) #elif defined(HAVE_CPU_ARCH_PPC) diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 3b6563190a9c..969cfde6b87a 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -153,6 +153,22 @@ static const struct feature_map_s arm_features[] = # define HWCAP_SVE (1 << 22) #endif +#ifndef HWCAP2_SVE2 +# define HWCAP2_SVE2 (1 << 1) +#endif +#ifndef HWCAP2_SVEAES +# define HWCAP2_SVEAES (1 << 2) +#endif +#ifndef HWCAP2_SVEPMULL +# define HWCAP2_SVEPMULL (1 << 3) +#endif +#ifndef HWCAP2_SVESHA3 +# define HWCAP2_SVESHA3 (1 << 5) +#endif +#ifndef HWCAP2_SVESM4 +# define HWCAP2_SVESM4 (1 << 6) +#endif + static const struct feature_map_s arm_features[] = { #ifdef ENABLE_NEON_SUPPORT @@ -170,6 +186,11 @@ static const struct feature_map_s arm_features[] = #endif #ifdef ENABLE_SVE_SUPPORT { HWCAP_SVE, 0, " 
sve", HWF_ARM_SVE }, + { 0, HWCAP2_SVE2, " sve2", HWF_ARM_SVE2 }, + { 0, HWCAP2_SVEAES, " sveaes", HWF_ARM_SVEAES }, + { 0, HWCAP2_SVEPMULL, " svepmull", HWF_ARM_SVEPMULL }, + { 0, HWCAP2_SVESHA3, " svesha3", HWF_ARM_SVESHA3 }, + { 0, HWCAP2_SVESM4, " svesm4", HWF_ARM_SVESM4 }, #endif }; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index dec5efd3c196..b11cadefa9ef 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -75,6 +75,11 @@ static struct { HWF_ARM_SM4, "arm-sm4" }, { HWF_ARM_SHA512, "arm-sha512" }, { HWF_ARM_SVE, "arm-sve" }, + { HWF_ARM_SVE2, "arm-sve2" }, + { HWF_ARM_SVEAES, "arm-sveaes" }, + { HWF_ARM_SVEPMULL, "arm-svepmull" }, + { HWF_ARM_SVESHA3, "arm-svesha3" }, + { HWF_ARM_SVESM4, "arm-svesm4" }, #elif defined(HAVE_CPU_ARCH_PPC) { HWF_PPC_VCRYPTO, "ppc-vcrypto" }, { HWF_PPC_ARCH_3_00, "ppc-arch_3_00" }, -- 2.24.3 (Apple Git-128) From tianjia.zhang at linux.alibaba.com Wed Jul 20 10:25:37 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Wed, 20 Jul 2022 16:25:37 +0800 Subject: [PATCH 3/3] Add SM4 ARMv9 SVE CE assembly implementation In-Reply-To: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> References: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> Message-ID: <20220720082537.4973-3-tianjia.zhang@linux.alibaba.com> * cipher/Makefile.am: Add 'sm4-armv9-aarch64-sve-ce.S'. * cipher/sm4-armv9-aarch64-sve-ce.S: New. * cipher/sm4.c (USE_ARM_SVE_CE): New. (SM4_context) [USE_ARM_SVE_CE]: Add 'use_arm_sve_ce'. (_gcry_sm4_armv9_sve_ce_crypt, _gcry_sm4_armv9_sve_ce_ctr_enc) (_gcry_sm4_armv9_sve_ce_cbc_dec, _gcry_sm4_armv9_sve_ce_cfb_dec) (sm4_armv9_sve_ce_crypt_blk1_16): New. (sm4_setkey): Enable ARMv9 SVE CE if supported by HW. (sm4_get_crypt_blk1_16_fn) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. * configure.ac: Add 'sm4-armv9-aarch64-sve-ce.lo'. -- Signed-off-by: Tianjia Zhang --- cipher/Makefile.am | 1 + cipher/sm4-armv9-aarch64-sve-ce.S | 966 ++++++++++++++++++++++++++++++ cipher/sm4.c | 85 +++ configure.ac | 1 + 4 files changed, 1053 insertions(+) create mode 100644 cipher/sm4-armv9-aarch64-sve-ce.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 042dc0a7170d..97823cb48bd3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -120,6 +120,7 @@ EXTRA_libcipher_la_SOURCES = \ serpent.c serpent-sse2-amd64.S \ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \ sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \ + sm4-armv9-aarch64-sve-ce.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S new file mode 100644 index 000000000000..2f4cfcc9ecab --- /dev/null +++ b/cipher/sm4-armv9-aarch64-sve-ce.S @@ -0,0 +1,966 @@ +/* sm4-armv9-aarch64-sve-ce.S - ARMv9/AArch64 SVE Cryptography accelerated SM4 + * + * Copyright (C) 2022 Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include "asm-common-aarch64.h" + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \ + defined(USE_SM4) + +.cpu generic+simd+crypto+sve+sve2 + +/* Constants */ + +.text +.align 4 +ELF(.type _gcry_sm4_armv9_svesm4_consts, at object) +_gcry_sm4_armv9_svesm4_consts: +.Lbswap128_mask: + .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b + .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 + .byte 0x1c, 0x1d, 0x1e, 0x1f, 0x18, 0x19, 0x1a, 0x1b + .byte 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13 + .byte 0x2c, 0x2d, 0x2e, 0x2f, 0x28, 0x29, 0x2a, 0x2b + .byte 0x24, 0x25, 0x26, 0x27, 0x20, 0x21, 0x22, 0x23 + .byte 0x3c, 0x3d, 0x3e, 0x3f, 0x38, 0x39, 0x3a, 0x3b + .byte 0x34, 0x35, 0x36, 0x37, 0x30, 0x31, 0x32, 0x33 + .byte 0x4c, 0x4d, 0x4e, 0x4f, 0x48, 0x49, 0x4a, 0x4b + .byte 0x44, 0x45, 0x46, 0x47, 0x40, 0x41, 0x42, 0x43 + .byte 0x5c, 0x5d, 0x5e, 0x5f, 0x58, 0x59, 0x5a, 0x5b + .byte 0x54, 0x55, 0x56, 0x57, 0x50, 0x51, 0x52, 0x53 + .byte 0x6c, 0x6d, 0x6e, 0x6f, 0x68, 0x69, 0x6a, 0x6b + .byte 0x64, 0x65, 0x66, 0x67, 0x60, 0x61, 0x62, 0x63 + .byte 0x7c, 0x7d, 0x7e, 0x7f, 0x78, 0x79, 0x7a, 0x7b + .byte 0x74, 0x75, 0x76, 0x77, 0x70, 0x71, 0x72, 0x73 + .byte 0x8c, 0x8d, 0x8e, 0x8f, 0x88, 0x89, 0x8a, 0x8b + .byte 0x84, 0x85, 0x86, 0x87, 0x80, 0x81, 0x82, 0x83 + .byte 0x9c, 0x9d, 0x9e, 0x9f, 0x98, 0x99, 0x9a, 0x9b + .byte 0x94, 0x95, 0x96, 0x97, 0x90, 0x91, 0x92, 0x93 + .byte 0xac, 0xad, 0xae, 0xaf, 0xa8, 0xa9, 0xaa, 0xab + .byte 0xa4, 0xa5, 0xa6, 0xa7, 0xa0, 0xa1, 0xa2, 0xa3 + .byte 0xbc, 0xbd, 0xbe, 0xbf, 0xb8, 0xb9, 0xba, 0xbb + .byte 0xb4, 0xb5, 0xb6, 0xb7, 0xb0, 0xb1, 0xb2, 0xb3 + .byte 0xcc, 0xcd, 0xce, 0xcf, 0xc8, 0xc9, 0xca, 0xcb + .byte 0xc4, 0xc5, 0xc6, 0xc7, 0xc0, 0xc1, 0xc2, 0xc3 + .byte 0xdc, 0xdd, 0xde, 0xdf, 0xd8, 0xd9, 0xda, 0xdb + .byte 0xd4, 0xd5, 0xd6, 0xd7, 0xd0, 0xd1, 0xd2, 0xd3 + .byte 0xec, 0xed, 0xee, 0xef, 0xe8, 0xe9, 0xea, 0xeb + .byte 0xe4, 0xe5, 0xe6, 0xe7, 0xe0, 0xe1, 0xe2, 0xe3 + .byte 0xfc, 0xfd, 0xfe, 0xff, 0xf8, 0xf9, 0xfa, 0xfb + .byte 0xf4, 0xf5, 0xf6, 0xf7, 0xf0, 0xf1, 0xf2, 0xf3 + +.Lle128_inc: + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00 + .byte 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +ELF(.size _gcry_sm4_armv9_svesm4_consts,.-_gcry_sm4_armv9_svesm4_consts) + +/* Register macros */ + +#define RCTR z16 +#define RCTRv v16 +#define RIV z16 +#define RIVv v16 +#define RSWAP128 z17 +#define RZERO z18 +#define RLE128_INC z19 + +#define RTMP0 z20 +#define RTMP1 z21 +#define RTMP2 z22 +#define RTMP3 z23 +#define RTMP0v v20 + +#define vecnum_z0 0 +#define vecnum_z1 1 +#define vecnum_z2 2 +#define vecnum_z3 3 +#define vecnum_z4 4 +#define vecnum_z5 5 +#define vecnum_z6 6 +#define vecnum_z7 7 +#define vecnum_z8 8 +#define vecnum_z9 9 +#define vecnum_z10 10 +#define vecnum_z11 11 +#define vecnum_z12 12 +#define vecnum_z13 13 +#define vecnum_z14 14 +#define vecnum_z15 15 +#define vecnum_z16 16 +#define vecnum_z24 24 +#define vecnum_z25 25 +#define vecnum_z26 26 +#define vecnum_z27 27 +#define vecnum_z28 28 +#define vecnum_z29 29 +#define vecnum_z30 30 +#define vecnum_z31 31 + +#define vecnum_v0 0 +#define vecnum_v15 15 +#define vecnum_v24 24 +#define vecnum_v25 25 +#define vecnum_v26 26 +#define vecnum_v27 27 +#define vecnum_v28 28 +#define vecnum_v29 29 +#define vecnum_v30 30 +#define vecnum_v31 31 + +#define sm4e_ce(vd, vn) \ + .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm4e_sve(zd, zm) \ + .inst (0x4523e000 | (vecnum_##zm << 5) | vecnum_##zd) + +/* Helper macros. 
*/ + +#define PREPARE() \ + GET_LOCAL_POINTER(x7, .Lbswap128_mask); \ + ptrue p0.b, ALL; \ + rdvl x5, #1; \ + ld1b {RSWAP128.b}, p0/z, [x7]; \ + \ + ld1 {v24.16b-v27.16b}, [x0], #64; \ + ld1 {v28.16b-v31.16b}, [x0]; \ + dup z24.q, z24.q[0]; \ + dup z25.q, z25.q[0]; \ + dup z26.q, z26.q[0]; \ + dup z27.q, z27.q[0]; \ + dup z28.q, z28.q[0]; \ + dup z29.q, z29.q[0]; \ + dup z30.q, z30.q[0]; \ + dup z31.q, z31.q[0]; + + +#define SM4_SVE_CE_CRYPT_BLK(b0) \ + revb b0.s, p0/m, b0.s; \ + sm4e_sve(b0, z24); \ + sm4e_sve(b0, z25); \ + sm4e_sve(b0, z26); \ + sm4e_sve(b0, z27); \ + sm4e_sve(b0, z28); \ + sm4e_sve(b0, z29); \ + sm4e_sve(b0, z30); \ + sm4e_sve(b0, z31); \ + tbl b0.b, {b0.b}, RSWAP128.b; \ + revb b0.s, p0/m, b0.s; + + +#define SM4_SVE_CE_CRYPT_BLK4(b0, b1, b2, b3) \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; \ + sm4e_sve(b0, z24); \ + sm4e_sve(b1, z24); \ + sm4e_sve(b2, z24); \ + sm4e_sve(b3, z24); \ + sm4e_sve(b0, z25); \ + sm4e_sve(b1, z25); \ + sm4e_sve(b2, z25); \ + sm4e_sve(b3, z25); \ + sm4e_sve(b0, z26); \ + sm4e_sve(b1, z26); \ + sm4e_sve(b2, z26); \ + sm4e_sve(b3, z26); \ + sm4e_sve(b0, z27); \ + sm4e_sve(b1, z27); \ + sm4e_sve(b2, z27); \ + sm4e_sve(b3, z27); \ + sm4e_sve(b0, z28); \ + sm4e_sve(b1, z28); \ + sm4e_sve(b2, z28); \ + sm4e_sve(b3, z28); \ + sm4e_sve(b0, z29); \ + sm4e_sve(b1, z29); \ + sm4e_sve(b2, z29); \ + sm4e_sve(b3, z29); \ + sm4e_sve(b0, z30); \ + sm4e_sve(b1, z30); \ + sm4e_sve(b2, z30); \ + sm4e_sve(b3, z30); \ + sm4e_sve(b0, z31); \ + sm4e_sve(b1, z31); \ + sm4e_sve(b2, z31); \ + sm4e_sve(b3, z31); \ + tbl b0.b, {b0.b}, RSWAP128.b; \ + tbl b1.b, {b1.b}, RSWAP128.b; \ + tbl b2.b, {b2.b}, RSWAP128.b; \ + tbl b3.b, {b3.b}, RSWAP128.b; \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; + + +#define SM4_SVE_CE_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; \ + revb b4.s, p0/m, b4.s; \ + revb b5.s, p0/m, b5.s; \ + revb b6.s, p0/m, b6.s; \ + revb b7.s, p0/m, b7.s; \ + sm4e_sve(b0, z24); \ + sm4e_sve(b1, z24); \ + sm4e_sve(b2, z24); \ + sm4e_sve(b3, z24); \ + sm4e_sve(b4, z24); \ + sm4e_sve(b5, z24); \ + sm4e_sve(b6, z24); \ + sm4e_sve(b7, z24); \ + sm4e_sve(b0, z25); \ + sm4e_sve(b1, z25); \ + sm4e_sve(b2, z25); \ + sm4e_sve(b3, z25); \ + sm4e_sve(b4, z25); \ + sm4e_sve(b5, z25); \ + sm4e_sve(b6, z25); \ + sm4e_sve(b7, z25); \ + sm4e_sve(b0, z26); \ + sm4e_sve(b1, z26); \ + sm4e_sve(b2, z26); \ + sm4e_sve(b3, z26); \ + sm4e_sve(b4, z26); \ + sm4e_sve(b5, z26); \ + sm4e_sve(b6, z26); \ + sm4e_sve(b7, z26); \ + sm4e_sve(b0, z27); \ + sm4e_sve(b1, z27); \ + sm4e_sve(b2, z27); \ + sm4e_sve(b3, z27); \ + sm4e_sve(b4, z27); \ + sm4e_sve(b5, z27); \ + sm4e_sve(b6, z27); \ + sm4e_sve(b7, z27); \ + sm4e_sve(b0, z28); \ + sm4e_sve(b1, z28); \ + sm4e_sve(b2, z28); \ + sm4e_sve(b3, z28); \ + sm4e_sve(b4, z28); \ + sm4e_sve(b5, z28); \ + sm4e_sve(b6, z28); \ + sm4e_sve(b7, z28); \ + sm4e_sve(b0, z29); \ + sm4e_sve(b1, z29); \ + sm4e_sve(b2, z29); \ + sm4e_sve(b3, z29); \ + sm4e_sve(b4, z29); \ + sm4e_sve(b5, z29); \ + sm4e_sve(b6, z29); \ + sm4e_sve(b7, z29); \ + sm4e_sve(b0, z30); \ + sm4e_sve(b1, z30); \ + sm4e_sve(b2, z30); \ + sm4e_sve(b3, z30); \ + sm4e_sve(b4, z30); \ + sm4e_sve(b5, z30); \ + sm4e_sve(b6, z30); \ + sm4e_sve(b7, z30); \ + sm4e_sve(b0, z31); \ + sm4e_sve(b1, z31); \ + sm4e_sve(b2, z31); \ + sm4e_sve(b3, z31); \ + sm4e_sve(b4, z31); \ + 
sm4e_sve(b5, z31); \ + sm4e_sve(b6, z31); \ + sm4e_sve(b7, z31); \ + tbl b0.b, {b0.b}, RSWAP128.b; \ + tbl b1.b, {b1.b}, RSWAP128.b; \ + tbl b2.b, {b2.b}, RSWAP128.b; \ + tbl b3.b, {b3.b}, RSWAP128.b; \ + tbl b4.b, {b4.b}, RSWAP128.b; \ + tbl b5.b, {b5.b}, RSWAP128.b; \ + tbl b6.b, {b6.b}, RSWAP128.b; \ + tbl b7.b, {b7.b}, RSWAP128.b; \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; \ + revb b4.s, p0/m, b4.s; \ + revb b5.s, p0/m, b5.s; \ + revb b6.s, p0/m, b6.s; \ + revb b7.s, p0/m, b7.s; + + +#define SM4_CE_CRYPT_BLK(b0) \ + rev32 b0.16b, b0.16b; \ + sm4e_ce(b0, v24); \ + sm4e_ce(b0, v25); \ + sm4e_ce(b0, v26); \ + sm4e_ce(b0, v27); \ + sm4e_ce(b0, v28); \ + sm4e_ce(b0, v29); \ + sm4e_ce(b0, v30); \ + sm4e_ce(b0, v31); \ + rev64 b0.4s, b0.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + + +.align 3 +.global _gcry_sm4_armv9_sve_ce_crypt +ELF(.type _gcry_sm4_armv9_sve_ce_crypt,%function;) +_gcry_sm4_armv9_sve_ce_crypt: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: nblocks + */ + CFI_STARTPROC(); + + PREPARE(); + +.Lcrypt_loop_blks: + sub x3, x3, x5, LSR #1; /* x3 - (8 * VL) */ + tbnz x3, #63, .Lcrypt_tail8; + + ld1b {z0.b}, p0/z, [x2]; + ld1b {z1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z3.b}, p0/z, [x2, #3, MUL VL]; + ld1b {z4.b}, p0/z, [x2, #4, MUL VL]; + ld1b {z5.b}, p0/z, [x2, #5, MUL VL]; + ld1b {z6.b}, p0/z, [x2, #6, MUL VL]; + ld1b {z7.b}, p0/z, [x2, #7, MUL VL]; + addvl x2, x2, #8; + + SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7); + + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x1, x1, #8; + + cbz x3, .Lcrypt_end; + b .Lcrypt_loop_blks; + +.Lcrypt_tail8: + add x3, x3, x5, LSR #1; + cmp x3, x5, LSR #2; + blt .Lcrypt_tail4; + + sub x3, x3, x5, LSR #2; /* x3 - (4 * VL) */ + + ld1b {z0.b}, p0/z, [x2]; + ld1b {z1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z3.b}, p0/z, [x2, #3, MUL VL]; + addvl x2, x2, #4; + + SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3); + + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x1, x1, #4; + + cbz x3, .Lcrypt_end; + +.Lcrypt_tail4: + cmp x3, x5, LSR #4; + blt .Lcrypt_tail; + + sub x3, x3, x5, LSR #4; /* x3 - VL */ + + ld1b {z0.b}, p0/z, [x2]; + addvl x2, x2, #1; + + SM4_SVE_CE_CRYPT_BLK(z0); + + st1b {z0.b}, p0, [x1]; + addvl x1, x1, #1; + + cbz x3, .Lcrypt_end; + +.Lcrypt_tail: + sub x3, x3, #1; + + ld1 {v0.16b}, [x2], #16; + SM4_CE_CRYPT_BLK(v0); + st1 {v0.16b}, [x1], #16; + + cbnz x3, .Lcrypt_tail; + +.Lcrypt_end: + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_crypt,.-_gcry_sm4_armv9_sve_ce_crypt;) + +.align 3 +.global _gcry_sm4_armv9_sve_ce_cbc_dec +ELF(.type _gcry_sm4_armv9_sve_ce_cbc_dec,%function;) +_gcry_sm4_armv9_sve_ce_cbc_dec: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC(); + VPUSH_ABI; + + PREPARE(); + ld1 {RIVv.16b}, [x3]; + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcbc_loop_blks: + sub x4, x4, x5, LSR #1; /* x4 - (8 * VL) */ + tbnz x4, #63, .Lcbc_tail8; + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, 
p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + ld1b {z11.b}, p0/z, [x2, #4, MUL VL]; + ld1b {z10.b}, p0/z, [x2, #5, MUL VL]; + ld1b {z9.b}, p0/z, [x2, #6, MUL VL]; + ld1b {z8.b}, p0/z, [x2, #7, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev z4.b, z11.b; + rev z5.b, z10.b; + rev z6.b, z9.b; + rev z7.b, z8.b; + rev RTMP0.b, RIV.b; + ext z7.b, z7.b, z6.b, #16; + ext z6.b, z6.b, z5.b, #16; + ext z5.b, z5.b, z4.b, #16; + ext z4.b, z4.b, z3.b, #16; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z7.b, z7.b; + rev z6.b, z6.b; + rev z5.b, z5.b; + rev z4.b, z4.b; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z8.d; + + SM4_SVE_CE_CRYPT_BLK8(z15, z14, z13, z12, z11, z10, z9, z8); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + eor z4.d, z4.d, z11.d; + eor z5.d, z5.d, z10.d; + eor z6.d, z6.d, z9.d; + eor z7.d, z7.d, z8.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x2, x2, #8; + addvl x1, x1, #8; + + cbz x4, .Lcbc_end; + b .Lcbc_loop_blks; + +.Lcbc_tail8: + add x4, x4, x5, LSR #1; + cmp x4, x5, LSR #2; + blt .Lcbc_tail4; + + sub x4, x4, x5, LSR #2; /* x4 - (4 * VL) */ + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev RTMP0.b, RIV.b; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z12.d; + + SM4_SVE_CE_CRYPT_BLK4(z15, z14, z13, z12); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x2, x2, #4; + addvl x1, x1, #4; + + cbz x4, .Lcbc_end; + +.Lcbc_tail4: + cmp x4, x5, LSR #4; + blt .Lcbc_tail_ce; + + sub x4, x4, x5, LSR #4; /* x4 - VL */ + + ld1b {z15.b}, p0/z, [x2]; + rev RTMP0.b, RIV.b; + rev z0.b, z15.b; + ext z0.b, z0.b, RTMP0.b, #16; + rev z0.b, z0.b; + mov RIV.d, z15.d; + + SM4_SVE_CE_CRYPT_BLK(z15); + + eor z0.d, z0.d, z15.d; + st1b {z0.b}, p0, [x1]; + addvl x2, x2, #1; + addvl x1, x1, #1; + + cbz x4, .Lcbc_end; + b .Lcbc_tail4; + +.Lcbc_tail_ce: + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + +.Lcbc_tail: + sub x4, x4, #1; + + ld1 {v15.16b}, [x2], #16; + mov v0.16b, RIVv.16b; + mov RIVv.16b, v15.16b; + SM4_CE_CRYPT_BLK(v15); + eor v0.16b, v0.16b, v15.16b; + st1 {v0.16b}, [x1], #16; + + cbnz x4, .Lcbc_tail; + + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcbc_end: + /* store new IV */ + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + st1 {RIVv.16b}, [x3]; + + VPOP_ABI; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_cbc_dec,.-_gcry_sm4_armv9_sve_ce_cbc_dec;) + +.align 3 +.global _gcry_sm4_armv9_sve_ce_cfb_dec +ELF(.type _gcry_sm4_armv9_sve_ce_cfb_dec,%function;) +_gcry_sm4_armv9_sve_ce_cfb_dec: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big 
endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC(); + VPUSH_ABI; + + PREPARE(); + ld1 {RIVv.16b}, [x3]; + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcfb_loop_blks: + sub x4, x4, x5, LSR #1; /* x4 - (8 * VL) */ + tbnz x4, #63, .Lcfb_tail8; + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + ld1b {z11.b}, p0/z, [x2, #4, MUL VL]; + ld1b {z10.b}, p0/z, [x2, #5, MUL VL]; + ld1b {z9.b}, p0/z, [x2, #6, MUL VL]; + ld1b {z8.b}, p0/z, [x2, #7, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev z4.b, z11.b; + rev z5.b, z10.b; + rev z6.b, z9.b; + rev z7.b, z8.b; + rev RTMP0.b, RIV.b; + ext z7.b, z7.b, z6.b, #16; + ext z6.b, z6.b, z5.b, #16; + ext z5.b, z5.b, z4.b, #16; + ext z4.b, z4.b, z3.b, #16; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z7.b, z7.b; + rev z6.b, z6.b; + rev z5.b, z5.b; + rev z4.b, z4.b; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z8.d; + + SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + eor z4.d, z4.d, z11.d; + eor z5.d, z5.d, z10.d; + eor z6.d, z6.d, z9.d; + eor z7.d, z7.d, z8.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x2, x2, #8; + addvl x1, x1, #8; + + cbz x4, .Lcfb_end; + b .Lcfb_loop_blks; + +.Lcfb_tail8: + add x4, x4, x5, LSR #1; + cmp x4, x5, LSR #2; + blt .Lcfb_tail4; + + sub x4, x4, x5, LSR #2; /* x4 - (4 * VL) */ + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev RTMP0.b, RIV.b; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z12.d; + + SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x2, x2, #4; + addvl x1, x1, #4; + + cbz x4, .Lcfb_end; + +.Lcfb_tail4: + cmp x4, x5, LSR #4; + blt .Lcfb_tail_ce; + + sub x4, x4, x5, LSR #4; /* x4 - VL */ + + ld1b {z15.b}, p0/z, [x2]; + rev RTMP0.b, RIV.b; + rev z0.b, z15.b; + ext z0.b, z0.b, RTMP0.b, #16; + rev z0.b, z0.b; + mov RIV.d, z15.d; + + SM4_SVE_CE_CRYPT_BLK(z0); + + eor z0.d, z0.d, z15.d; + st1b {z0.b}, p0, [x1]; + addvl x2, x2, #1; + addvl x1, x1, #1; + + cbz x4, .Lcfb_end; + b .Lcfb_tail4; + +.Lcfb_tail_ce: + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + +.Lcfb_tail: + sub x4, x4, #1; + + ld1 {v15.16b}, [x2], #16; + mov v0.16b, RIVv.16b; + mov RIVv.16b, v15.16b; + SM4_CE_CRYPT_BLK(v0); + eor v0.16b, v0.16b, v15.16b; + st1 {v0.16b}, [x1], #16; + + cbnz x4, .Lcfb_tail; + + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcfb_end: + /* store new IV */ + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + st1 {RIVv.16b}, [x3]; + + VPOP_ABI; + ret_spec_stop; + 
CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_cfb_dec,.-_gcry_sm4_armv9_sve_ce_cfb_dec;) + +.align 3 +.global _gcry_sm4_armv9_sve_ce_ctr_enc +ELF(.type _gcry_sm4_armv9_sve_ce_ctr_enc,%function;) +_gcry_sm4_armv9_sve_ce_ctr_enc: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC(); + + PREPARE(); + + dup RZERO.d, #0; + GET_LOCAL_POINTER(x6, .Lle128_inc); + ld1b {RLE128_INC.b}, p0/z, [x6]; + + ldp x7, x8, [x3]; + rev x7, x7; + rev x8, x8; + +#define inc_le128(zctr) \ + mov RCTRv.d[1], x8; \ + mov RCTRv.d[0], x7; \ + mov zctr.d, RLE128_INC.d; \ + dup RCTR.q, RCTR.q[0]; \ + adds x8, x8, x5, LSR #4; \ + adc x7, x7, xzr; \ + adclt zctr.d, RCTR.d, RZERO.d; \ + adclt RCTR.d, zctr.d, RZERO.d; \ + trn1 zctr.d, RCTR.d, zctr.d; \ + revb zctr.d, p0/m, zctr.d; + +.Lctr_loop_blks: + sub x4, x4, x5, LSR #1; /* x4 - (8 * VL) */ + tbnz x4, #63, .Lctr_tail8; + + inc_le128(z0); + inc_le128(z1); + inc_le128(z2); + inc_le128(z3); + inc_le128(z4); + inc_le128(z5); + inc_le128(z6); + inc_le128(z7); + + SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7); + + ld1b {RTMP0.b}, p0/z, [x2]; + ld1b {RTMP1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {RTMP2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {RTMP3.b}, p0/z, [x2, #3, MUL VL]; + eor z0.d, z0.d, RTMP0.d; + eor z1.d, z1.d, RTMP1.d; + eor z2.d, z2.d, RTMP2.d; + eor z3.d, z3.d, RTMP3.d; + ld1b {RTMP0.b}, p0/z, [x2, #4, MUL VL]; + ld1b {RTMP1.b}, p0/z, [x2, #5, MUL VL]; + ld1b {RTMP2.b}, p0/z, [x2, #6, MUL VL]; + ld1b {RTMP3.b}, p0/z, [x2, #7, MUL VL]; + eor z4.d, z4.d, RTMP0.d; + eor z5.d, z5.d, RTMP1.d; + eor z6.d, z6.d, RTMP2.d; + eor z7.d, z7.d, RTMP3.d; + addvl x2, x2, #8; + + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x1, x1, #8; + + cbz x4, .Lctr_end; + b .Lctr_loop_blks; + +.Lctr_tail8: + add x4, x4, x5, LSR #1; + cmp x4, x5, LSR #2; + blt .Lctr_tail4; + + sub x4, x4, x5, LSR #2; /* x4 - (4 * VL) */ + + inc_le128(z0); + inc_le128(z1); + inc_le128(z2); + inc_le128(z3); + + SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3); + + ld1b {RTMP0.b}, p0/z, [x2]; + ld1b {RTMP1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {RTMP2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {RTMP3.b}, p0/z, [x2, #3, MUL VL]; + eor z0.d, z0.d, RTMP0.d; + eor z1.d, z1.d, RTMP1.d; + eor z2.d, z2.d, RTMP2.d; + eor z3.d, z3.d, RTMP3.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x2, x2, #4; + addvl x1, x1, #4; + + cbz x4, .Lctr_end; + +.Lctr_tail4: + cmp x4, x5, LSR #4; + blt .Lctr_tail; + + sub x4, x4, x5, LSR #4; /* x4 - VL */ + + inc_le128(z0); + SM4_SVE_CE_CRYPT_BLK(z0); + ld1b {RTMP0.b}, p0/z, [x2]; + eor z0.d, z0.d, RTMP0.d; + st1b {z0.b}, p0, [x1]; + addvl x2, x2, #1; + addvl x1, x1, #1; + + cbz x4, .Lctr_end; + b .Lctr_tail4; + +.Lctr_tail: + sub x4, x4, #1; + + /* inc_le128 for CE */ + mov v0.d[1], x8; + mov v0.d[0], x7; + adds x8, x8, #1; + adc x7, x7, xzr; + rev64 v0.16b, v0.16b; + + SM4_CE_CRYPT_BLK(v0); + ld1 {RTMP0v.16b}, [x2], #16; + eor v0.16b, v0.16b, RTMP0v.16b; + st1 {v0.16b}, [x1], #16; + + cbnz x4, .Lctr_tail; + +.Lctr_end: + /* store new CTR */ + rev x7, x7; + rev x8, x8; + stp x7, x8, [x3]; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size 
_gcry_sm4_armv9_sve_ce_ctr_enc,.-_gcry_sm4_armv9_sve_ce_ctr_enc;) + +.align 3 +.global _gcry_sm4_armv9_sve_get_vl +ELF(.type _gcry_sm4_armv9_sve_get_vl,%function;) +_gcry_sm4_armv9_sve_get_vl: + CFI_STARTPROC(); + + /* VL in bytes */ + rdvl x0, #1; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_get_vl,.-_gcry_sm4_armv9_sve_get_vl;) + +#endif diff --git a/cipher/sm4.c b/cipher/sm4.c index 1c54b339db82..bd56be0ebd7a 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -94,6 +94,16 @@ # endif #endif +#undef USE_ARM_SVE_CE +#ifdef ENABLE_SVE_SUPPORT +# if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) +# define USE_ARM_SVE_CE 1 +# endif +#endif + static const char *sm4_selftest (void); static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr, @@ -133,6 +143,9 @@ typedef struct #ifdef USE_ARM_CE unsigned int use_arm_ce:1; #endif +#ifdef USE_ARM_SVE_CE + unsigned int use_arm_sve_ce:1; +#endif } SM4_context; typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out, @@ -448,6 +461,37 @@ sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, #endif /* USE_ARM_CE */ +#ifdef USE_ARM_SVE_CE +extern void _gcry_sm4_armv9_sve_ce_crypt(const u32 *rk, byte *out, + const byte *in, + size_t nblocks); + +extern void _gcry_sm4_armv9_sve_ce_ctr_enc(const u32 *rk_enc, byte *out, + const byte *in, + byte *ctr, + size_t nblocks); + +extern void _gcry_sm4_armv9_sve_ce_cbc_dec(const u32 *rk_dec, byte *out, + const byte *in, + byte *iv, + size_t nblocks); + +extern void _gcry_sm4_armv9_sve_ce_cfb_dec(const u32 *rk_enc, byte *out, + const byte *in, + byte *iv, + size_t nblocks); + +static inline unsigned int +sm4_armv9_sve_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, + unsigned int num_blks) +{ + _gcry_sm4_armv9_sve_ce_crypt(rk, out, in, num_blks); + return 0; +} + +extern unsigned int _gcry_sm4_armv9_sve_get_vl(void); +#endif /* USE_ARM_SVE_CE */ + static inline void prefetch_sbox_table(void) { const volatile byte *vtab = (void *)&sbox_table; @@ -606,6 +650,11 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, #ifdef USE_ARM_CE ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4); #endif +#ifdef USE_ARM_SVE_CE + /* Only enabled when the SVE vector length is greater than 128 bits */ + ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVESM4) + && _gcry_sm4_armv9_sve_get_vl() > 16; +#endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) @@ -802,6 +851,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx) return &sm4_aesni_avx_crypt_blk1_16; } #endif +#ifdef USE_ARM_SVE_CE + else if (ctx->use_arm_sve_ce) + { + return &sm4_armv9_sve_ce_crypt_blk1_16; + } +#endif #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { @@ -879,6 +934,16 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr, } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + /* Process all blocks at a time. */ + _gcry_sm4_armv9_sve_ce_ctr_enc(ctx->rkey_enc, outbuf, inbuf, + ctr, nblocks); + nblocks = 0; + } +#endif + #ifdef USE_ARM_CE if (ctx->use_arm_ce) { @@ -990,6 +1055,16 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv, } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + /* Process all blocks at a time. 
*/ + _gcry_sm4_armv9_sve_ce_cbc_dec(ctx->rkey_dec, outbuf, inbuf, + iv, nblocks); + nblocks = 0; + } +#endif + #ifdef USE_ARM_CE if (ctx->use_arm_ce) { @@ -1101,6 +1176,16 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + /* Process all blocks at a time. */ + _gcry_sm4_armv9_sve_ce_cfb_dec(ctx->rkey_enc, outbuf, inbuf, + iv, nblocks); + nblocks = 0; + } +#endif + #ifdef USE_ARM_CE if (ctx->use_arm_ce) { diff --git a/configure.ac b/configure.ac index 0bb345c1fead..e7bfbcc0f57c 100644 --- a/configure.ac +++ b/configure.ac @@ -2904,6 +2904,7 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv9-aarch64-sve-ce.lo" esac fi -- 2.24.3 (Apple Git-128) From jussi.kivilinna at iki.fi Wed Jul 20 21:22:51 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 20 Jul 2022 22:22:51 +0300 Subject: [PATCH 3/3] Add SM4 ARMv9 SVE CE assembly implementation In-Reply-To: <20220720082537.4973-3-tianjia.zhang@linux.alibaba.com> References: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> <20220720082537.4973-3-tianjia.zhang@linux.alibaba.com> Message-ID: <003281b5-c41d-547c-e034-06f4799d76b8@iki.fi> Hello, On 20.7.2022 11.25, Tianjia Zhang wrote: > * cipher/Makefile.am: Add 'sm4-armv9-aarch64-sve-ce.S'. > * cipher/sm4-armv9-aarch64-sve-ce.S: New. > * cipher/sm4.c (USE_ARM_SVE_CE): New. > (SM4_context) [USE_ARM_SVE_CE]: Add 'use_arm_sve_ce'. > (_gcry_sm4_armv9_sve_ce_crypt, _gcry_sm4_armv9_sve_ce_ctr_enc) > (_gcry_sm4_armv9_sve_ce_cbc_dec, _gcry_sm4_armv9_sve_ce_cfb_dec) > (sm4_armv9_sve_ce_crypt_blk1_16): New. > (sm4_setkey): Enable ARMv9 SVE CE if supported by HW. > (sm4_get_crypt_blk1_16_fn) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE > bulk functions. > (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec) > [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. > * configure.ac: Add 'sm4-armv9-aarch64-sve-ce.lo'. > -- > > Signed-off-by: Tianjia Zhang > --- > cipher/Makefile.am | 1 + > cipher/sm4-armv9-aarch64-sve-ce.S | 966 ++++++++++++++++++++++++++++++ > cipher/sm4.c | 85 +++ > configure.ac | 1 + > 4 files changed, 1053 insertions(+) > create mode 100644 cipher/sm4-armv9-aarch64-sve-ce.S > > diff --git a/cipher/Makefile.am b/cipher/Makefile.am > index 042dc0a7170d..97823cb48bd3 100644 > --- a/cipher/Makefile.am > +++ b/cipher/Makefile.am > @@ -120,6 +120,7 @@ EXTRA_libcipher_la_SOURCES = \ > serpent.c serpent-sse2-amd64.S \ > sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \ > sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \ > + sm4-armv9-aarch64-sve-ce.S \ > serpent-avx2-amd64.S serpent-armv7-neon.S \ > sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ > sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ > diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S > new file mode 100644 > index 000000000000..2f4cfcc9ecab > --- /dev/null > +++ b/cipher/sm4-armv9-aarch64-sve-ce.S > @@ -0,0 +1,966 @@ > +/* sm4-armv9-aarch64-sve-ce.S - ARMv9/AArch64 SVE Cryptography accelerated SM4 > + * > + * Copyright (C) 2022 Alibaba Group. > + * Copyright (C) 2022 Tianjia Zhang > + * > + * This file is part of Libgcrypt. 
> + * > + * Libgcrypt is free software; you can redistribute it and/or modify > + * it under the terms of the GNU Lesser General Public License as > + * published by the Free Software Foundation; either version 2.1 of > + * the License, or (at your option) any later version. > + * > + * Libgcrypt is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this program; if not, see . > + */ > + > +#include "asm-common-aarch64.h" > + > +#if defined(__AARCH64EL__) && \ > + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ > + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ > + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \ > + defined(USE_SM4) > + > +.cpu generic+simd+crypto+sve+sve2 Should we add "gcry_cv_gcc_inline_asm_aarch64_sve2"/"HAVE_GCC_INLINE_ASM_AARCH64_SVE2" to configure.ac as SVE2 instructions are being used in assembly below? > + > +/* Constants */ > + > +#define inc_le128(zctr) \ > + mov RCTRv.d[1], x8; \ > + mov RCTRv.d[0], x7; \ > + mov zctr.d, RLE128_INC.d; \ > + dup RCTR.q, RCTR.q[0]; \ > + adds x8, x8, x5, LSR #4; \ > + adc x7, x7, xzr; \ > + adclt zctr.d, RCTR.d, RZERO.d; \ > + adclt RCTR.d, zctr.d, RZERO.d; \ > + trn1 zctr.d, RCTR.d, zctr.d; \ > + revb zctr.d, p0/m, zctr.d; 'adclt' here is SVE2. > diff --git a/cipher/sm4.c b/cipher/sm4.c > index 1c54b339db82..bd56be0ebd7a 100644 > --- a/cipher/sm4.c > +++ b/cipher/sm4.c > @@ -94,6 +94,16 @@ > # endif > #endif > > +#undef USE_ARM_SVE_CE > +#ifdef ENABLE_SVE_SUPPORT > +# if defined(__AARCH64EL__) && \ > + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ > + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ > + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) > +# define USE_ARM_SVE_CE 1 > +# endif > +#endif > @@ -606,6 +650,11 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, > #ifdef USE_ARM_CE > ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4); > #endif > +#ifdef USE_ARM_SVE_CE > + /* Only enabled when the SVE vector length is greater than 128 bits */ > + ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVESM4) > + && _gcry_sm4_armv9_sve_get_vl() > 16; > +#endif Should this also check for HWF_ARM_SVE2 or does HWF_ARM_SVESM4 imply that SVE2 is also available? Otherwise patches look good. I tested them with QEMU which I found out had SVE2 and SVE-SM4 support available. -Jussi From tianjia.zhang at linux.alibaba.com Thu Jul 21 05:45:58 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Thu, 21 Jul 2022 11:45:58 +0800 Subject: [PATCH 3/3] Add SM4 ARMv9 SVE CE assembly implementation In-Reply-To: <003281b5-c41d-547c-e034-06f4799d76b8@iki.fi> References: <20220720082537.4973-1-tianjia.zhang@linux.alibaba.com> <20220720082537.4973-3-tianjia.zhang@linux.alibaba.com> <003281b5-c41d-547c-e034-06f4799d76b8@iki.fi> Message-ID: <6132116e-f1e3-761d-dbe9-602f601f4065@linux.alibaba.com> Hi Jussi, On 7/21/22 3:22 AM, Jussi Kivilinna wrote: > Hello, > > On 20.7.2022 11.25, Tianjia Zhang wrote: >> * cipher/Makefile.am: Add 'sm4-armv9-aarch64-sve-ce.S'. >> * cipher/sm4-armv9-aarch64-sve-ce.S: New. >> * cipher/sm4.c (USE_ARM_SVE_CE): New. >> (SM4_context) [USE_ARM_SVE_CE]: Add 'use_arm_sve_ce'. 
>> (_gcry_sm4_armv9_sve_ce_crypt, _gcry_sm4_armv9_sve_ce_ctr_enc) >> (_gcry_sm4_armv9_sve_ce_cbc_dec, _gcry_sm4_armv9_sve_ce_cfb_dec) >> (sm4_armv9_sve_ce_crypt_blk1_16): New. >> (sm4_setkey): Enable ARMv9 SVE CE if supported by HW. >> (sm4_get_crypt_blk1_16_fn) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE >> bulk functions. >> (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec) >> [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. >> * configure.ac: Add 'sm4-armv9-aarch64-sve-ce.lo'. >> -- >> >> Signed-off-by: Tianjia Zhang >> --- >> ? cipher/Makefile.am??????????????? |?? 1 + >> ? cipher/sm4-armv9-aarch64-sve-ce.S | 966 ++++++++++++++++++++++++++++++ >> ? cipher/sm4.c????????????????????? |? 85 +++ >> ? configure.ac????????????????????? |?? 1 + >> ? 4 files changed, 1053 insertions(+) >> ? create mode 100644 cipher/sm4-armv9-aarch64-sve-ce.S >> >> diff --git a/cipher/Makefile.am b/cipher/Makefile.am >> index 042dc0a7170d..97823cb48bd3 100644 >> --- a/cipher/Makefile.am >> +++ b/cipher/Makefile.am >> @@ -120,6 +120,7 @@ EXTRA_libcipher_la_SOURCES = \ >> ????? serpent.c serpent-sse2-amd64.S \ >> ????? sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \ >> ????? sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \ >> +??? sm4-armv9-aarch64-sve-ce.S \ >> ????? serpent-avx2-amd64.S serpent-armv7-neon.S \ >> ????? sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ >> ????? sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ >> diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S >> b/cipher/sm4-armv9-aarch64-sve-ce.S >> new file mode 100644 >> index 000000000000..2f4cfcc9ecab >> --- /dev/null >> +++ b/cipher/sm4-armv9-aarch64-sve-ce.S >> @@ -0,0 +1,966 @@ >> +/* sm4-armv9-aarch64-sve-ce.S - ARMv9/AArch64 SVE Cryptography >> accelerated SM4 >> + * >> + * Copyright (C) 2022 Alibaba Group. >> + * Copyright (C) 2022 Tianjia Zhang >> + * >> + * This file is part of Libgcrypt. >> + * >> + * Libgcrypt is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU Lesser General Public License as >> + * published by the Free Software Foundation; either version 2.1 of >> + * the License, or (at your option) any later version. >> + * >> + * Libgcrypt is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.? See the >> + * GNU Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with this program; if not, see >> . >> + */ >> + >> +#include "asm-common-aarch64.h" >> + >> +#if defined(__AARCH64EL__) && \ >> +??? defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ >> +??? defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ >> +??? defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \ >> +??? defined(USE_SM4) >> + >> +.cpu generic+simd+crypto+sve+sve2 > > Should we add > "gcry_cv_gcc_inline_asm_aarch64_sve2"/"HAVE_GCC_INLINE_ASM_AARCH64_SVE2" > to configure.ac as SVE2 instructions are being used in assembly below? > Will be added in the next patch. >> + >> +/* Constants */ >> + > > +#define inc_le128(zctr)???????????????????????????? \ >> +??????? mov???????? RCTRv.d[1], x8;???????????????? \ >> +??????? mov???????? RCTRv.d[0], x7;???????????????? \ >> +??????? mov???????? zctr.d, RLE128_INC.d;?????????? \ >> +??????? dup???????? RCTR.q, RCTR.q[0];????????????? \ >> +??????? adds??????? x8, x8, x5, LSR #4;???????????? \ >> +??????? adc???????? 
x7, x7, xzr;??????????????????? \ >> +??????? adclt?????? zctr.d, RCTR.d, RZERO.d;??????? \ >> +??????? adclt?????? RCTR.d, zctr.d, RZERO.d;??????? \ >> +??????? trn1??????? zctr.d, RCTR.d, zctr.d;???????? \ >> +??????? revb??????? zctr.d, p0/m, zctr.d; > > 'adclt' here is SVE2. > >> diff --git a/cipher/sm4.c b/cipher/sm4.c >> index 1c54b339db82..bd56be0ebd7a 100644 >> --- a/cipher/sm4.c >> +++ b/cipher/sm4.c >> @@ -94,6 +94,16 @@ >> ? # endif >> ? #endif >> +#undef USE_ARM_SVE_CE >> +#ifdef ENABLE_SVE_SUPPORT >> +# if defined(__AARCH64EL__) && \ >> +???? defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ >> +???? defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ >> +???? defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) >> +#?? define USE_ARM_SVE_CE 1 >> +# endif >> +#endif > >> @@ -606,6 +650,11 @@ sm4_setkey (void *context, const byte *key, const >> unsigned keylen, >> ? #ifdef USE_ARM_CE >> ??? ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4); >> ? #endif >> +#ifdef USE_ARM_SVE_CE >> +? /* Only enabled when the SVE vector length is greater than 128 bits */ >> +? ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVESM4) >> +??????? && _gcry_sm4_armv9_sve_get_vl() > 16; >> +#endif > > Should this also check for HWF_ARM_SVE2 or does HWF_ARM_SVESM4 imply > that SVE2 is also available? > > > Otherwise patches look good. I tested them with QEMU which I found out > had SVE2 and SVE-SM4 support available. > > -Jussi Thanks for your suggestion and test. HWF_ARM_SVESM4 already implies the existence of HWF_ARM_SVE2, but it is still a good practice to check SVE2 explicitly, I will add this check. Best regards, Tianjia From tianjia.zhang at linux.alibaba.com Thu Jul 21 08:32:15 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Thu, 21 Jul 2022 14:32:15 +0800 Subject: [PATCH v2 1/3] Add detection for HW feature "ARMv8 SVE" Message-ID: <20220721063217.20034-1-tianjia.zhang@linux.alibaba.com> * configure.ac (svesupport, gcry_cv_gcc_inline_asm_aarch64_sve) (ENABLE_SVE_SUPPORT): New. * doc/gcrypt.texi: Add "arm-sve" to HW features list. * src/g10lib.h (HWF_ARM_SVE): New. * src/hwf-arm.c (arm_features): Add "sve". * src/hwfeatures.c (hwflist): Add "arm-sve". -- Signed-off-by: Tianjia Zhang --- configure.ac | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ doc/gcrypt.texi | 1 + src/g10lib.h | 1 + src/hwf-arm.c | 6 ++++++ src/hwfeatures.c | 1 + 5 files changed, 59 insertions(+) diff --git a/configure.ac b/configure.ac index 74150ae13c3c..b6c51ab9998d 100644 --- a/configure.ac +++ b/configure.ac @@ -698,6 +698,14 @@ AC_ARG_ENABLE(arm-crypto-support, armcryptosupport=$enableval,armcryptosupport=yes) AC_MSG_RESULT($armcryptosupport) +# Implementation of the --disable-sve-support switch. +AC_MSG_CHECKING([whether SVE support is requested]) +AC_ARG_ENABLE(sve-support, + AS_HELP_STRING([--disable-sve-support], + [Disable support for the ARMv8 SVE instructions]), + svesupport=$enableval,svesupport=yes) +AC_MSG_RESULT($svesupport) + # Implementation of the --disable-ppc-crypto-support switch. 
AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, @@ -1321,6 +1329,7 @@ if test "$mpi_cpu_arch" != "arm" ; then if test "$mpi_cpu_arch" != "aarch64" ; then neonsupport="n/a" armcryptosupport="n/a" + svesupport="n/a" fi fi @@ -1974,6 +1983,35 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then fi +# +# Check whether GCC inline assembler supports AArch64 SVE instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE instructions], + [gcry_cv_gcc_inline_asm_aarch64_sve], + [if test "$mpi_cpu_arch" != "aarch64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_gcc_inline_asm_aarch64_sve="n/a" + else + gcry_cv_gcc_inline_asm_aarch64_sve=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[__asm__( + ".cpu generic+simd+sve\n\t" + ".text\n\t" + "testfn:\n\t" + "mov x0, \#60;\n\t" + "whilelo p0.s, xzr, x0;\n\t" + "mov z0.s, p0/z, \#55;\n\t" + "ld1b {z0.b}, p0/z, [x1];\n\t" + ); + ]], [ testfn(); ])], + [gcry_cv_gcc_inline_asm_aarch64_sve=yes]) + fi]) +if test "$gcry_cv_gcc_inline_asm_aarch64_sve" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE,1, + [Defined if inline assembler supports AArch64 SVE instructions]) +fi + + # # Check whether PowerPC AltiVec/VSX intrinsics # @@ -2462,6 +2500,13 @@ if test x"$armcryptosupport" = xyes ; then fi fi fi +if test x"$svesupport" = xyes ; then + if test "$gcry_cv_gcc_inline_asm_sve" != "yes" ; then + if test "$gcry_cv_gcc_inline_asm_aarch64_sve" != "yes" ; then + svesupport="no (unsupported by compiler)" + fi + fi +fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, @@ -2503,6 +2548,10 @@ if test x"$armcryptosupport" = xyes ; then AC_DEFINE(ENABLE_ARM_CRYPTO_SUPPORT,1, [Enable support for ARMv8 Crypto Extension instructions.]) fi +if test x"$svesupport" = xyes ; then + AC_DEFINE(ENABLE_SVE_SUPPORT,1, + [Enable support for ARMv8 SVE instructions.]) +fi if test x"$ppccryptosupport" = xyes ; then AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) @@ -3385,6 +3434,7 @@ GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support]) GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport]) GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport]) +GCRY_MSG_SHOW([Try using ARMv8 SVE: ],[$svesupport]) GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport]) GCRY_MSG_SHOW([],[]) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index b82535e236b8..5e07926bdaf0 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -601,6 +601,7 @@ are @item arm-sm3 @item arm-sm4 @item arm-sha512 + at item arm-sve @item ppc-vcrypto @item ppc-arch_3_00 @item ppc-arch_2_07 diff --git a/src/g10lib.h b/src/g10lib.h index a5bed0027eb1..91d53ff37d96 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -251,6 +251,7 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_ARM_SM3 (1 << 6) #define HWF_ARM_SM4 (1 << 7) #define HWF_ARM_SHA512 (1 << 8) +#define HWF_ARM_SVE (1 << 9) #elif defined(HAVE_CPU_ARCH_PPC) diff --git a/src/hwf-arm.c b/src/hwf-arm.c index a0205ee103d2..0bc2713b677f 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -153,6 +153,9 @@ static const struct feature_map_s arm_features[] = #ifndef HWCAP_SHA512 # define HWCAP_SHA512 (1 << 21) #endif +#ifndef HWCAP_SVE +# define HWCAP_SVE (1 << 22) +#endif static const struct feature_map_s arm_features[] = { @@ -168,6 +171,9 @@ static const struct feature_map_s arm_features[] = { HWCAP_SM3, 0, " sm3", 
HWF_ARM_SM3 }, { HWCAP_SM4, 0, " sm4", HWF_ARM_SM4 }, { HWCAP_SHA512, 0, " sha512", HWF_ARM_SHA512 }, +#endif +#ifdef ENABLE_SVE_SUPPORT + { HWCAP_SVE, 0, " sve", HWF_ARM_SVE }, #endif }; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index af5daf62134d..dec5efd3c196 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -74,6 +74,7 @@ static struct { HWF_ARM_SM3, "arm-sm3" }, { HWF_ARM_SM4, "arm-sm4" }, { HWF_ARM_SHA512, "arm-sha512" }, + { HWF_ARM_SVE, "arm-sve" }, #elif defined(HAVE_CPU_ARCH_PPC) { HWF_PPC_VCRYPTO, "ppc-vcrypto" }, { HWF_PPC_ARCH_3_00, "ppc-arch_3_00" }, -- 2.24.3 (Apple Git-128) From tianjia.zhang at linux.alibaba.com Thu Jul 21 08:32:16 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Thu, 21 Jul 2022 14:32:16 +0800 Subject: [PATCH v2 2/3] Add ARMv9 SVE2 and optional Crypto Extension HW features In-Reply-To: <20220721063217.20034-1-tianjia.zhang@linux.alibaba.com> References: <20220721063217.20034-1-tianjia.zhang@linux.alibaba.com> Message-ID: <20220721063217.20034-2-tianjia.zhang@linux.alibaba.com> * configure.ac (sve2support, gcry_cv_gcc_inline_asm_aarch64_sve2) (ENABLE_SVE2_SUPPORT): New. * doc/gcrypt.texi: Add "sve2, sveaes, svepmull, svesha3, svesm4" to ARM hardware features list. * src/g10lib.h (HWF_ARM_SVE2, HWF_ARM_SVEAES, HWF_ARM_SVEPMULL) (HWF_ARM_SVESHA3, HWF_ARM_SVESM4): New. * src/hwf-arm.c (arm_features): Add "sve2, sveaes, svepmull, svesha3, svesm4". * src/hwfeatures.c (hwflist): Add "arm-sve2, arm-sveaes, arm-svepmull, arm-svesha3, arm-svesm4". -- Signed-off-by: Tianjia Zhang --- configure.ac | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ doc/gcrypt.texi | 5 +++++ src/g10lib.h | 5 +++++ src/hwf-arm.c | 21 +++++++++++++++++++ src/hwfeatures.c | 5 +++++ 5 files changed, 90 insertions(+) diff --git a/configure.ac b/configure.ac index b6c51ab9998d..31bcd77e3c75 100644 --- a/configure.ac +++ b/configure.ac @@ -706,6 +706,14 @@ AC_ARG_ENABLE(sve-support, svesupport=$enableval,svesupport=yes) AC_MSG_RESULT($svesupport) +# Implementation of the --disable-sve2-support switch. +AC_MSG_CHECKING([whether SVE2 support is requested]) +AC_ARG_ENABLE(sve2-support, + AS_HELP_STRING([--disable-sve2-support], + [Disable support for the ARMv9 SVE2 instructions]), + sve2support=$enableval,sve2support=yes) +AC_MSG_RESULT($sve2support) + # Implementation of the --disable-ppc-crypto-support switch. 
AC_MSG_CHECKING([whether PPC crypto support is requested]) AC_ARG_ENABLE(ppc-crypto-support, @@ -1330,6 +1338,7 @@ if test "$mpi_cpu_arch" != "arm" ; then neonsupport="n/a" armcryptosupport="n/a" svesupport="n/a" + sve2support="n/a" fi fi @@ -2012,6 +2021,39 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_sve" = "yes" ; then fi +# +# Check whether GCC inline assembler supports AArch64 SVE2 instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SVE2 instructions], + [gcry_cv_gcc_inline_asm_aarch64_sve2], + [if test "$mpi_cpu_arch" != "aarch64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_gcc_inline_asm_aarch64_sve2="n/a" + else + gcry_cv_gcc_inline_asm_aarch64_sve2=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[__asm__( + ".cpu generic+simd+sve2\n\t" + ".text\n\t" + "testfn:\n\t" + ";\n\t" + "eor3 z0.d, z0.d, z1.d, z2.d;\n\t" + "ext z8.b, {z20.b, z21.b}, \#3;\n\t" + "adclt z0.d, z1.d, z2.d;\n\t" + "tbl z0.b, {z8.b, z9.b}, z1.b;\n\t" + "addhnb z16.s, z17.d, z18.d;\n\t" + "mov z0.s, p0/z, \#55;\n\t" + "ld1b {z0.b}, p0/z, [x1];\n\t" + ); + ]], [ testfn(); ])], + [gcry_cv_gcc_inline_asm_aarch64_sve2=yes]) + fi]) +if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SVE2,1, + [Defined if inline assembler supports AArch64 SVE2 instructions]) +fi + + # # Check whether PowerPC AltiVec/VSX intrinsics # @@ -2507,6 +2549,13 @@ if test x"$svesupport" = xyes ; then fi fi fi +if test x"$sve2support" = xyes ; then + if test "$gcry_cv_gcc_inline_asm_sve2" != "yes" ; then + if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" != "yes" ; then + sve2support="no (unsupported by compiler)" + fi + fi +fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, @@ -2552,6 +2601,10 @@ if test x"$svesupport" = xyes ; then AC_DEFINE(ENABLE_SVE_SUPPORT,1, [Enable support for ARMv8 SVE instructions.]) fi +if test x"$sve2support" = xyes ; then + AC_DEFINE(ENABLE_SVE2_SUPPORT,1, + [Enable support for ARMv9 SVE2 instructions.]) +fi if test x"$ppccryptosupport" = xyes ; then AC_DEFINE(ENABLE_PPC_CRYPTO_SUPPORT,1, [Enable support for POWER 8 (PowerISA 2.07) crypto extension.]) @@ -3435,6 +3488,7 @@ GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport]) GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport]) GCRY_MSG_SHOW([Try using ARMv8 SVE: ],[$svesupport]) +GCRY_MSG_SHOW([Try using ARMv9 SVE2: ],[$sve2support]) GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport]) GCRY_MSG_SHOW([],[]) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 5e07926bdaf0..f2c1cc948d23 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -602,6 +602,11 @@ are @item arm-sm4 @item arm-sha512 @item arm-sve + at item arm-sve2 + at item arm-sveaes + at item arm-svepmull + at item arm-svesha3 + at item arm-svesm4 @item ppc-vcrypto @item ppc-arch_3_00 @item ppc-arch_2_07 diff --git a/src/g10lib.h b/src/g10lib.h index 91d53ff37d96..8ba0a5c2aa0f 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -252,6 +252,11 @@ char **_gcry_strtokenize (const char *string, const char *delim); #define HWF_ARM_SM4 (1 << 7) #define HWF_ARM_SHA512 (1 << 8) #define HWF_ARM_SVE (1 << 9) +#define HWF_ARM_SVE2 (1 << 10) +#define HWF_ARM_SVEAES (1 << 11) +#define HWF_ARM_SVEPMULL (1 << 12) +#define HWF_ARM_SVESHA3 (1 << 13) +#define HWF_ARM_SVESM4 (1 << 14) #elif defined(HAVE_CPU_ARCH_PPC) diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 0bc2713b677f..500cd97a7179 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -157,6 +157,22 
@@ static const struct feature_map_s arm_features[] = # define HWCAP_SVE (1 << 22) #endif +#ifndef HWCAP2_SVE2 +# define HWCAP2_SVE2 (1 << 1) +#endif +#ifndef HWCAP2_SVEAES +# define HWCAP2_SVEAES (1 << 2) +#endif +#ifndef HWCAP2_SVEPMULL +# define HWCAP2_SVEPMULL (1 << 3) +#endif +#ifndef HWCAP2_SVESHA3 +# define HWCAP2_SVESHA3 (1 << 5) +#endif +#ifndef HWCAP2_SVESM4 +# define HWCAP2_SVESM4 (1 << 6) +#endif + static const struct feature_map_s arm_features[] = { #ifdef ENABLE_NEON_SUPPORT @@ -174,6 +190,11 @@ static const struct feature_map_s arm_features[] = #endif #ifdef ENABLE_SVE_SUPPORT { HWCAP_SVE, 0, " sve", HWF_ARM_SVE }, + { 0, HWCAP2_SVE2, " sve2", HWF_ARM_SVE2 }, + { 0, HWCAP2_SVEAES, " sveaes", HWF_ARM_SVEAES }, + { 0, HWCAP2_SVEPMULL, " svepmull", HWF_ARM_SVEPMULL }, + { 0, HWCAP2_SVESHA3, " svesha3", HWF_ARM_SVESHA3 }, + { 0, HWCAP2_SVESM4, " svesm4", HWF_ARM_SVESM4 }, #endif }; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index dec5efd3c196..b11cadefa9ef 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -75,6 +75,11 @@ static struct { HWF_ARM_SM4, "arm-sm4" }, { HWF_ARM_SHA512, "arm-sha512" }, { HWF_ARM_SVE, "arm-sve" }, + { HWF_ARM_SVE2, "arm-sve2" }, + { HWF_ARM_SVEAES, "arm-sveaes" }, + { HWF_ARM_SVEPMULL, "arm-svepmull" }, + { HWF_ARM_SVESHA3, "arm-svesha3" }, + { HWF_ARM_SVESM4, "arm-svesm4" }, #elif defined(HAVE_CPU_ARCH_PPC) { HWF_PPC_VCRYPTO, "ppc-vcrypto" }, { HWF_PPC_ARCH_3_00, "ppc-arch_3_00" }, -- 2.24.3 (Apple Git-128) From tianjia.zhang at linux.alibaba.com Thu Jul 21 08:32:17 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Thu, 21 Jul 2022 14:32:17 +0800 Subject: [PATCH v2 3/3] Add SM4 ARMv9 SVE CE assembly implementation In-Reply-To: <20220721063217.20034-1-tianjia.zhang@linux.alibaba.com> References: <20220721063217.20034-1-tianjia.zhang@linux.alibaba.com> Message-ID: <20220721063217.20034-3-tianjia.zhang@linux.alibaba.com> * cipher/Makefile.am: Add 'sm4-armv9-aarch64-sve-ce.S'. * cipher/sm4-armv9-aarch64-sve-ce.S: New. * cipher/sm4.c (USE_ARM_SVE_CE): New. (SM4_context) [USE_ARM_SVE_CE]: Add 'use_arm_sve_ce'. (_gcry_sm4_armv9_sve_ce_crypt, _gcry_sm4_armv9_sve_ce_ctr_enc) (_gcry_sm4_armv9_sve_ce_cbc_dec, _gcry_sm4_armv9_sve_ce_cfb_dec) (sm4_armv9_sve_ce_crypt_blk1_16): New. (sm4_setkey): Enable ARMv9 SVE CE if supported by HW. (sm4_get_crypt_blk1_16_fn) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. * configure.ac: Add 'sm4-armv9-aarch64-sve-ce.lo'. 
-- Signed-off-by: Tianjia Zhang --- cipher/Makefile.am | 1 + cipher/sm4-armv9-aarch64-sve-ce.S | 967 ++++++++++++++++++++++++++++++ cipher/sm4.c | 86 +++ configure.ac | 1 + 4 files changed, 1055 insertions(+) create mode 100644 cipher/sm4-armv9-aarch64-sve-ce.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 042dc0a7170d..97823cb48bd3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -120,6 +120,7 @@ EXTRA_libcipher_la_SOURCES = \ serpent.c serpent-sse2-amd64.S \ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \ sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \ + sm4-armv9-aarch64-sve-ce.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S new file mode 100644 index 000000000000..21e34e6ffc84 --- /dev/null +++ b/cipher/sm4-armv9-aarch64-sve-ce.S @@ -0,0 +1,967 @@ +/* sm4-armv9-aarch64-sve-ce.S - ARMv9/AArch64 SVE Cryptography accelerated SM4 + * + * Copyright (C) 2022 Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include "asm-common-aarch64.h" + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE2) && \ + defined(USE_SM4) + +.cpu generic+simd+crypto+sve+sve2 + +/* Constants */ + +.text +.align 4 +ELF(.type _gcry_sm4_armv9_svesm4_consts, at object) +_gcry_sm4_armv9_svesm4_consts: +.Lbswap128_mask: + .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b + .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 + .byte 0x1c, 0x1d, 0x1e, 0x1f, 0x18, 0x19, 0x1a, 0x1b + .byte 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13 + .byte 0x2c, 0x2d, 0x2e, 0x2f, 0x28, 0x29, 0x2a, 0x2b + .byte 0x24, 0x25, 0x26, 0x27, 0x20, 0x21, 0x22, 0x23 + .byte 0x3c, 0x3d, 0x3e, 0x3f, 0x38, 0x39, 0x3a, 0x3b + .byte 0x34, 0x35, 0x36, 0x37, 0x30, 0x31, 0x32, 0x33 + .byte 0x4c, 0x4d, 0x4e, 0x4f, 0x48, 0x49, 0x4a, 0x4b + .byte 0x44, 0x45, 0x46, 0x47, 0x40, 0x41, 0x42, 0x43 + .byte 0x5c, 0x5d, 0x5e, 0x5f, 0x58, 0x59, 0x5a, 0x5b + .byte 0x54, 0x55, 0x56, 0x57, 0x50, 0x51, 0x52, 0x53 + .byte 0x6c, 0x6d, 0x6e, 0x6f, 0x68, 0x69, 0x6a, 0x6b + .byte 0x64, 0x65, 0x66, 0x67, 0x60, 0x61, 0x62, 0x63 + .byte 0x7c, 0x7d, 0x7e, 0x7f, 0x78, 0x79, 0x7a, 0x7b + .byte 0x74, 0x75, 0x76, 0x77, 0x70, 0x71, 0x72, 0x73 + .byte 0x8c, 0x8d, 0x8e, 0x8f, 0x88, 0x89, 0x8a, 0x8b + .byte 0x84, 0x85, 0x86, 0x87, 0x80, 0x81, 0x82, 0x83 + .byte 0x9c, 0x9d, 0x9e, 0x9f, 0x98, 0x99, 0x9a, 0x9b + .byte 0x94, 0x95, 0x96, 0x97, 0x90, 0x91, 0x92, 0x93 + .byte 0xac, 0xad, 0xae, 0xaf, 0xa8, 0xa9, 0xaa, 0xab + .byte 0xa4, 0xa5, 0xa6, 0xa7, 0xa0, 0xa1, 0xa2, 0xa3 + .byte 0xbc, 0xbd, 0xbe, 0xbf, 0xb8, 0xb9, 0xba, 0xbb + .byte 0xb4, 0xb5, 0xb6, 0xb7, 0xb0, 0xb1, 0xb2, 0xb3 + .byte 0xcc, 0xcd, 0xce, 0xcf, 0xc8, 0xc9, 0xca, 0xcb + .byte 0xc4, 0xc5, 0xc6, 0xc7, 0xc0, 0xc1, 0xc2, 0xc3 + .byte 0xdc, 0xdd, 0xde, 0xdf, 0xd8, 0xd9, 0xda, 0xdb + .byte 0xd4, 0xd5, 0xd6, 0xd7, 0xd0, 0xd1, 0xd2, 0xd3 + .byte 0xec, 0xed, 0xee, 0xef, 0xe8, 0xe9, 0xea, 0xeb + .byte 0xe4, 0xe5, 0xe6, 0xe7, 0xe0, 0xe1, 0xe2, 0xe3 + .byte 0xfc, 0xfd, 0xfe, 0xff, 0xf8, 0xf9, 0xfa, 0xfb + .byte 0xf4, 0xf5, 0xf6, 0xf7, 0xf0, 0xf1, 0xf2, 0xf3 + +.Lle128_inc: + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00 + .byte 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +ELF(.size _gcry_sm4_armv9_svesm4_consts,.-_gcry_sm4_armv9_svesm4_consts) + +/* Register macros */ + +#define RCTR z16 +#define RCTRv v16 +#define RIV z16 +#define RIVv v16 +#define RSWAP128 z17 +#define RZERO z18 +#define RLE128_INC z19 + +#define RTMP0 z20 +#define RTMP1 z21 +#define RTMP2 z22 +#define RTMP3 z23 +#define RTMP0v v20 + +#define vecnum_z0 0 +#define vecnum_z1 1 +#define vecnum_z2 2 +#define vecnum_z3 3 +#define vecnum_z4 4 +#define vecnum_z5 5 +#define vecnum_z6 6 +#define vecnum_z7 7 +#define vecnum_z8 8 +#define vecnum_z9 9 +#define vecnum_z10 10 +#define vecnum_z11 11 +#define vecnum_z12 12 +#define vecnum_z13 13 +#define vecnum_z14 14 +#define vecnum_z15 15 +#define vecnum_z16 16 +#define vecnum_z24 24 +#define vecnum_z25 25 +#define vecnum_z26 26 +#define vecnum_z27 27 +#define vecnum_z28 28 +#define vecnum_z29 29 +#define vecnum_z30 30 +#define vecnum_z31 31 + +#define vecnum_v0 0 +#define vecnum_v15 15 +#define vecnum_v24 24 +#define vecnum_v25 25 +#define vecnum_v26 26 +#define vecnum_v27 27 +#define vecnum_v28 28 +#define vecnum_v29 29 +#define vecnum_v30 30 +#define vecnum_v31 31 + +#define sm4e_ce(vd, vn) \ + .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm4e_sve(zd, zm) \ + .inst (0x4523e000 | (vecnum_##zm << 5) | vecnum_##zd) + +/* Helper macros. */ + +#define PREPARE() \ + GET_LOCAL_POINTER(x7, .Lbswap128_mask); \ + ptrue p0.b, ALL; \ + rdvl x5, #1; \ + ld1b {RSWAP128.b}, p0/z, [x7]; \ + \ + ld1 {v24.16b-v27.16b}, [x0], #64; \ + ld1 {v28.16b-v31.16b}, [x0]; \ + dup z24.q, z24.q[0]; \ + dup z25.q, z25.q[0]; \ + dup z26.q, z26.q[0]; \ + dup z27.q, z27.q[0]; \ + dup z28.q, z28.q[0]; \ + dup z29.q, z29.q[0]; \ + dup z30.q, z30.q[0]; \ + dup z31.q, z31.q[0]; + + +#define SM4_SVE_CE_CRYPT_BLK(b0) \ + revb b0.s, p0/m, b0.s; \ + sm4e_sve(b0, z24); \ + sm4e_sve(b0, z25); \ + sm4e_sve(b0, z26); \ + sm4e_sve(b0, z27); \ + sm4e_sve(b0, z28); \ + sm4e_sve(b0, z29); \ + sm4e_sve(b0, z30); \ + sm4e_sve(b0, z31); \ + tbl b0.b, {b0.b}, RSWAP128.b; \ + revb b0.s, p0/m, b0.s; + + +#define SM4_SVE_CE_CRYPT_BLK4(b0, b1, b2, b3) \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; \ + sm4e_sve(b0, z24); \ + sm4e_sve(b1, z24); \ + sm4e_sve(b2, z24); \ + sm4e_sve(b3, z24); \ + sm4e_sve(b0, z25); \ + sm4e_sve(b1, z25); \ + sm4e_sve(b2, z25); \ + sm4e_sve(b3, z25); \ + sm4e_sve(b0, z26); \ + sm4e_sve(b1, z26); \ + sm4e_sve(b2, z26); \ + sm4e_sve(b3, z26); \ + sm4e_sve(b0, z27); \ + sm4e_sve(b1, z27); \ + sm4e_sve(b2, z27); \ + sm4e_sve(b3, z27); \ + sm4e_sve(b0, z28); \ + sm4e_sve(b1, z28); \ + sm4e_sve(b2, z28); \ + sm4e_sve(b3, z28); \ + sm4e_sve(b0, z29); \ + sm4e_sve(b1, z29); \ + sm4e_sve(b2, z29); \ + sm4e_sve(b3, z29); \ + sm4e_sve(b0, z30); \ + sm4e_sve(b1, z30); \ + sm4e_sve(b2, z30); \ + sm4e_sve(b3, z30); \ + sm4e_sve(b0, z31); \ + sm4e_sve(b1, z31); \ + sm4e_sve(b2, z31); \ + sm4e_sve(b3, z31); \ + tbl b0.b, {b0.b}, RSWAP128.b; \ + tbl b1.b, {b1.b}, RSWAP128.b; \ + tbl b2.b, {b2.b}, RSWAP128.b; \ + tbl b3.b, {b3.b}, RSWAP128.b; \ + revb b0.s, p0/m, 
b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; + + +#define SM4_SVE_CE_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; \ + revb b4.s, p0/m, b4.s; \ + revb b5.s, p0/m, b5.s; \ + revb b6.s, p0/m, b6.s; \ + revb b7.s, p0/m, b7.s; \ + sm4e_sve(b0, z24); \ + sm4e_sve(b1, z24); \ + sm4e_sve(b2, z24); \ + sm4e_sve(b3, z24); \ + sm4e_sve(b4, z24); \ + sm4e_sve(b5, z24); \ + sm4e_sve(b6, z24); \ + sm4e_sve(b7, z24); \ + sm4e_sve(b0, z25); \ + sm4e_sve(b1, z25); \ + sm4e_sve(b2, z25); \ + sm4e_sve(b3, z25); \ + sm4e_sve(b4, z25); \ + sm4e_sve(b5, z25); \ + sm4e_sve(b6, z25); \ + sm4e_sve(b7, z25); \ + sm4e_sve(b0, z26); \ + sm4e_sve(b1, z26); \ + sm4e_sve(b2, z26); \ + sm4e_sve(b3, z26); \ + sm4e_sve(b4, z26); \ + sm4e_sve(b5, z26); \ + sm4e_sve(b6, z26); \ + sm4e_sve(b7, z26); \ + sm4e_sve(b0, z27); \ + sm4e_sve(b1, z27); \ + sm4e_sve(b2, z27); \ + sm4e_sve(b3, z27); \ + sm4e_sve(b4, z27); \ + sm4e_sve(b5, z27); \ + sm4e_sve(b6, z27); \ + sm4e_sve(b7, z27); \ + sm4e_sve(b0, z28); \ + sm4e_sve(b1, z28); \ + sm4e_sve(b2, z28); \ + sm4e_sve(b3, z28); \ + sm4e_sve(b4, z28); \ + sm4e_sve(b5, z28); \ + sm4e_sve(b6, z28); \ + sm4e_sve(b7, z28); \ + sm4e_sve(b0, z29); \ + sm4e_sve(b1, z29); \ + sm4e_sve(b2, z29); \ + sm4e_sve(b3, z29); \ + sm4e_sve(b4, z29); \ + sm4e_sve(b5, z29); \ + sm4e_sve(b6, z29); \ + sm4e_sve(b7, z29); \ + sm4e_sve(b0, z30); \ + sm4e_sve(b1, z30); \ + sm4e_sve(b2, z30); \ + sm4e_sve(b3, z30); \ + sm4e_sve(b4, z30); \ + sm4e_sve(b5, z30); \ + sm4e_sve(b6, z30); \ + sm4e_sve(b7, z30); \ + sm4e_sve(b0, z31); \ + sm4e_sve(b1, z31); \ + sm4e_sve(b2, z31); \ + sm4e_sve(b3, z31); \ + sm4e_sve(b4, z31); \ + sm4e_sve(b5, z31); \ + sm4e_sve(b6, z31); \ + sm4e_sve(b7, z31); \ + tbl b0.b, {b0.b}, RSWAP128.b; \ + tbl b1.b, {b1.b}, RSWAP128.b; \ + tbl b2.b, {b2.b}, RSWAP128.b; \ + tbl b3.b, {b3.b}, RSWAP128.b; \ + tbl b4.b, {b4.b}, RSWAP128.b; \ + tbl b5.b, {b5.b}, RSWAP128.b; \ + tbl b6.b, {b6.b}, RSWAP128.b; \ + tbl b7.b, {b7.b}, RSWAP128.b; \ + revb b0.s, p0/m, b0.s; \ + revb b1.s, p0/m, b1.s; \ + revb b2.s, p0/m, b2.s; \ + revb b3.s, p0/m, b3.s; \ + revb b4.s, p0/m, b4.s; \ + revb b5.s, p0/m, b5.s; \ + revb b6.s, p0/m, b6.s; \ + revb b7.s, p0/m, b7.s; + + +#define SM4_CE_CRYPT_BLK(b0) \ + rev32 b0.16b, b0.16b; \ + sm4e_ce(b0, v24); \ + sm4e_ce(b0, v25); \ + sm4e_ce(b0, v26); \ + sm4e_ce(b0, v27); \ + sm4e_ce(b0, v28); \ + sm4e_ce(b0, v29); \ + sm4e_ce(b0, v30); \ + sm4e_ce(b0, v31); \ + rev64 b0.4s, b0.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + + +.align 3 +.global _gcry_sm4_armv9_sve_ce_crypt +ELF(.type _gcry_sm4_armv9_sve_ce_crypt,%function;) +_gcry_sm4_armv9_sve_ce_crypt: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: nblocks + */ + CFI_STARTPROC(); + + PREPARE(); + +.Lcrypt_loop_blks: + sub x3, x3, x5, LSR #1; /* x3 - (8 * VL) */ + tbnz x3, #63, .Lcrypt_tail8; + + ld1b {z0.b}, p0/z, [x2]; + ld1b {z1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z3.b}, p0/z, [x2, #3, MUL VL]; + ld1b {z4.b}, p0/z, [x2, #4, MUL VL]; + ld1b {z5.b}, p0/z, [x2, #5, MUL VL]; + ld1b {z6.b}, p0/z, [x2, #6, MUL VL]; + ld1b {z7.b}, p0/z, [x2, #7, MUL VL]; + addvl x2, x2, #8; + + SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7); + + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, 
MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x1, x1, #8; + + cbz x3, .Lcrypt_end; + b .Lcrypt_loop_blks; + +.Lcrypt_tail8: + add x3, x3, x5, LSR #1; + cmp x3, x5, LSR #2; + blt .Lcrypt_tail4; + + sub x3, x3, x5, LSR #2; /* x3 - (4 * VL) */ + + ld1b {z0.b}, p0/z, [x2]; + ld1b {z1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z3.b}, p0/z, [x2, #3, MUL VL]; + addvl x2, x2, #4; + + SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3); + + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x1, x1, #4; + + cbz x3, .Lcrypt_end; + +.Lcrypt_tail4: + cmp x3, x5, LSR #4; + blt .Lcrypt_tail; + + sub x3, x3, x5, LSR #4; /* x3 - VL */ + + ld1b {z0.b}, p0/z, [x2]; + addvl x2, x2, #1; + + SM4_SVE_CE_CRYPT_BLK(z0); + + st1b {z0.b}, p0, [x1]; + addvl x1, x1, #1; + + cbz x3, .Lcrypt_end; + +.Lcrypt_tail: + sub x3, x3, #1; + + ld1 {v0.16b}, [x2], #16; + SM4_CE_CRYPT_BLK(v0); + st1 {v0.16b}, [x1], #16; + + cbnz x3, .Lcrypt_tail; + +.Lcrypt_end: + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_crypt,.-_gcry_sm4_armv9_sve_ce_crypt;) + +.align 3 +.global _gcry_sm4_armv9_sve_ce_cbc_dec +ELF(.type _gcry_sm4_armv9_sve_ce_cbc_dec,%function;) +_gcry_sm4_armv9_sve_ce_cbc_dec: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC(); + VPUSH_ABI; + + PREPARE(); + ld1 {RIVv.16b}, [x3]; + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcbc_loop_blks: + sub x4, x4, x5, LSR #1; /* x4 - (8 * VL) */ + tbnz x4, #63, .Lcbc_tail8; + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + ld1b {z11.b}, p0/z, [x2, #4, MUL VL]; + ld1b {z10.b}, p0/z, [x2, #5, MUL VL]; + ld1b {z9.b}, p0/z, [x2, #6, MUL VL]; + ld1b {z8.b}, p0/z, [x2, #7, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev z4.b, z11.b; + rev z5.b, z10.b; + rev z6.b, z9.b; + rev z7.b, z8.b; + rev RTMP0.b, RIV.b; + ext z7.b, z7.b, z6.b, #16; + ext z6.b, z6.b, z5.b, #16; + ext z5.b, z5.b, z4.b, #16; + ext z4.b, z4.b, z3.b, #16; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z7.b, z7.b; + rev z6.b, z6.b; + rev z5.b, z5.b; + rev z4.b, z4.b; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z8.d; + + SM4_SVE_CE_CRYPT_BLK8(z15, z14, z13, z12, z11, z10, z9, z8); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + eor z4.d, z4.d, z11.d; + eor z5.d, z5.d, z10.d; + eor z6.d, z6.d, z9.d; + eor z7.d, z7.d, z8.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x2, x2, #8; + addvl x1, x1, #8; + + cbz x4, .Lcbc_end; + b .Lcbc_loop_blks; + +.Lcbc_tail8: + add x4, x4, x5, LSR #1; + cmp x4, x5, LSR #2; + blt .Lcbc_tail4; + + sub x4, x4, x5, LSR #2; /* x4 - (4 * VL) */ + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, 
z12.b; + rev RTMP0.b, RIV.b; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z12.d; + + SM4_SVE_CE_CRYPT_BLK4(z15, z14, z13, z12); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x2, x2, #4; + addvl x1, x1, #4; + + cbz x4, .Lcbc_end; + +.Lcbc_tail4: + cmp x4, x5, LSR #4; + blt .Lcbc_tail_ce; + + sub x4, x4, x5, LSR #4; /* x4 - VL */ + + ld1b {z15.b}, p0/z, [x2]; + rev RTMP0.b, RIV.b; + rev z0.b, z15.b; + ext z0.b, z0.b, RTMP0.b, #16; + rev z0.b, z0.b; + mov RIV.d, z15.d; + + SM4_SVE_CE_CRYPT_BLK(z15); + + eor z0.d, z0.d, z15.d; + st1b {z0.b}, p0, [x1]; + addvl x2, x2, #1; + addvl x1, x1, #1; + + cbz x4, .Lcbc_end; + b .Lcbc_tail4; + +.Lcbc_tail_ce: + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + +.Lcbc_tail: + sub x4, x4, #1; + + ld1 {v15.16b}, [x2], #16; + mov v0.16b, RIVv.16b; + mov RIVv.16b, v15.16b; + SM4_CE_CRYPT_BLK(v15); + eor v0.16b, v0.16b, v15.16b; + st1 {v0.16b}, [x1], #16; + + cbnz x4, .Lcbc_tail; + + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcbc_end: + /* store new IV */ + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + st1 {RIVv.16b}, [x3]; + + VPOP_ABI; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_cbc_dec,.-_gcry_sm4_armv9_sve_ce_cbc_dec;) + +.align 3 +.global _gcry_sm4_armv9_sve_ce_cfb_dec +ELF(.type _gcry_sm4_armv9_sve_ce_cfb_dec,%function;) +_gcry_sm4_armv9_sve_ce_cfb_dec: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: iv (big endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC(); + VPUSH_ABI; + + PREPARE(); + ld1 {RIVv.16b}, [x3]; + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcfb_loop_blks: + sub x4, x4, x5, LSR #1; /* x4 - (8 * VL) */ + tbnz x4, #63, .Lcfb_tail8; + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + ld1b {z11.b}, p0/z, [x2, #4, MUL VL]; + ld1b {z10.b}, p0/z, [x2, #5, MUL VL]; + ld1b {z9.b}, p0/z, [x2, #6, MUL VL]; + ld1b {z8.b}, p0/z, [x2, #7, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev z4.b, z11.b; + rev z5.b, z10.b; + rev z6.b, z9.b; + rev z7.b, z8.b; + rev RTMP0.b, RIV.b; + ext z7.b, z7.b, z6.b, #16; + ext z6.b, z6.b, z5.b, #16; + ext z5.b, z5.b, z4.b, #16; + ext z4.b, z4.b, z3.b, #16; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z7.b, z7.b; + rev z6.b, z6.b; + rev z5.b, z5.b; + rev z4.b, z4.b; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z8.d; + + SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + eor z4.d, z4.d, z11.d; + eor z5.d, z5.d, z10.d; + eor z6.d, z6.d, z9.d; + eor z7.d, z7.d, z8.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x2, x2, #8; + addvl x1, x1, #8; + + cbz x4, .Lcfb_end; + b .Lcfb_loop_blks; + +.Lcfb_tail8: + add x4, x4, x5, 
LSR #1; + cmp x4, x5, LSR #2; + blt .Lcfb_tail4; + + sub x4, x4, x5, LSR #2; /* x4 - (4 * VL) */ + + ld1b {z15.b}, p0/z, [x2]; + ld1b {z14.b}, p0/z, [x2, #1, MUL VL]; + ld1b {z13.b}, p0/z, [x2, #2, MUL VL]; + ld1b {z12.b}, p0/z, [x2, #3, MUL VL]; + rev z0.b, z15.b; + rev z1.b, z14.b; + rev z2.b, z13.b; + rev z3.b, z12.b; + rev RTMP0.b, RIV.b; + ext z3.b, z3.b, z2.b, #16; + ext z2.b, z2.b, z1.b, #16; + ext z1.b, z1.b, z0.b, #16; + ext z0.b, z0.b, RTMP0.b, #16; + rev z3.b, z3.b; + rev z2.b, z2.b; + rev z1.b, z1.b; + rev z0.b, z0.b; + mov RIV.d, z12.d; + + SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3); + + eor z0.d, z0.d, z15.d; + eor z1.d, z1.d, z14.d; + eor z2.d, z2.d, z13.d; + eor z3.d, z3.d, z12.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x2, x2, #4; + addvl x1, x1, #4; + + cbz x4, .Lcfb_end; + +.Lcfb_tail4: + cmp x4, x5, LSR #4; + blt .Lcfb_tail_ce; + + sub x4, x4, x5, LSR #4; /* x4 - VL */ + + ld1b {z15.b}, p0/z, [x2]; + rev RTMP0.b, RIV.b; + rev z0.b, z15.b; + ext z0.b, z0.b, RTMP0.b, #16; + rev z0.b, z0.b; + mov RIV.d, z15.d; + + SM4_SVE_CE_CRYPT_BLK(z0); + + eor z0.d, z0.d, z15.d; + st1b {z0.b}, p0, [x1]; + addvl x2, x2, #1; + addvl x1, x1, #1; + + cbz x4, .Lcfb_end; + b .Lcfb_tail4; + +.Lcfb_tail_ce: + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + +.Lcfb_tail: + sub x4, x4, #1; + + ld1 {v15.16b}, [x2], #16; + mov v0.16b, RIVv.16b; + mov RIVv.16b, v15.16b; + SM4_CE_CRYPT_BLK(v0); + eor v0.16b, v0.16b, v15.16b; + st1 {v0.16b}, [x1], #16; + + cbnz x4, .Lcfb_tail; + + ext RIV.b, RIV.b, RIV.b, #16; + +.Lcfb_end: + /* store new IV */ + rev RIV.s, RIV.s; + tbl RIV.b, {RIV.b}, RSWAP128.b; + st1 {RIVv.16b}, [x3]; + + VPOP_ABI; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_cfb_dec,.-_gcry_sm4_armv9_sve_ce_cfb_dec;) + +.align 3 +.global _gcry_sm4_armv9_sve_ce_ctr_enc +ELF(.type _gcry_sm4_armv9_sve_ce_ctr_enc,%function;) +_gcry_sm4_armv9_sve_ce_ctr_enc: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: ctr (big endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC(); + + PREPARE(); + + dup RZERO.d, #0; + GET_LOCAL_POINTER(x6, .Lle128_inc); + ld1b {RLE128_INC.b}, p0/z, [x6]; + + ldp x7, x8, [x3]; + rev x7, x7; + rev x8, x8; + +#define inc_le128(zctr) \ + mov RCTRv.d[1], x8; \ + mov RCTRv.d[0], x7; \ + mov zctr.d, RLE128_INC.d; \ + dup RCTR.q, RCTR.q[0]; \ + adds x8, x8, x5, LSR #4; \ + adc x7, x7, xzr; \ + adclt zctr.d, RCTR.d, RZERO.d; \ + adclt RCTR.d, zctr.d, RZERO.d; \ + trn1 zctr.d, RCTR.d, zctr.d; \ + revb zctr.d, p0/m, zctr.d; + +.Lctr_loop_blks: + sub x4, x4, x5, LSR #1; /* x4 - (8 * VL) */ + tbnz x4, #63, .Lctr_tail8; + + inc_le128(z0); + inc_le128(z1); + inc_le128(z2); + inc_le128(z3); + inc_le128(z4); + inc_le128(z5); + inc_le128(z6); + inc_le128(z7); + + SM4_SVE_CE_CRYPT_BLK8(z0, z1, z2, z3, z4, z5, z6, z7); + + ld1b {RTMP0.b}, p0/z, [x2]; + ld1b {RTMP1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {RTMP2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {RTMP3.b}, p0/z, [x2, #3, MUL VL]; + eor z0.d, z0.d, RTMP0.d; + eor z1.d, z1.d, RTMP1.d; + eor z2.d, z2.d, RTMP2.d; + eor z3.d, z3.d, RTMP3.d; + ld1b {RTMP0.b}, p0/z, [x2, #4, MUL VL]; + ld1b {RTMP1.b}, p0/z, [x2, #5, MUL VL]; + ld1b {RTMP2.b}, p0/z, [x2, #6, MUL VL]; + ld1b {RTMP3.b}, p0/z, [x2, #7, MUL VL]; + eor z4.d, z4.d, RTMP0.d; + eor z5.d, z5.d, RTMP1.d; + eor z6.d, z6.d, RTMP2.d; + eor z7.d, z7.d, RTMP3.d; + addvl x2, x2, #8; + + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, 
p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + st1b {z4.b}, p0, [x1, #4, MUL VL]; + st1b {z5.b}, p0, [x1, #5, MUL VL]; + st1b {z6.b}, p0, [x1, #6, MUL VL]; + st1b {z7.b}, p0, [x1, #7, MUL VL]; + addvl x1, x1, #8; + + cbz x4, .Lctr_end; + b .Lctr_loop_blks; + +.Lctr_tail8: + add x4, x4, x5, LSR #1; + cmp x4, x5, LSR #2; + blt .Lctr_tail4; + + sub x4, x4, x5, LSR #2; /* x4 - (4 * VL) */ + + inc_le128(z0); + inc_le128(z1); + inc_le128(z2); + inc_le128(z3); + + SM4_SVE_CE_CRYPT_BLK4(z0, z1, z2, z3); + + ld1b {RTMP0.b}, p0/z, [x2]; + ld1b {RTMP1.b}, p0/z, [x2, #1, MUL VL]; + ld1b {RTMP2.b}, p0/z, [x2, #2, MUL VL]; + ld1b {RTMP3.b}, p0/z, [x2, #3, MUL VL]; + eor z0.d, z0.d, RTMP0.d; + eor z1.d, z1.d, RTMP1.d; + eor z2.d, z2.d, RTMP2.d; + eor z3.d, z3.d, RTMP3.d; + st1b {z0.b}, p0, [x1]; + st1b {z1.b}, p0, [x1, #1, MUL VL]; + st1b {z2.b}, p0, [x1, #2, MUL VL]; + st1b {z3.b}, p0, [x1, #3, MUL VL]; + addvl x2, x2, #4; + addvl x1, x1, #4; + + cbz x4, .Lctr_end; + +.Lctr_tail4: + cmp x4, x5, LSR #4; + blt .Lctr_tail; + + sub x4, x4, x5, LSR #4; /* x4 - VL */ + + inc_le128(z0); + SM4_SVE_CE_CRYPT_BLK(z0); + ld1b {RTMP0.b}, p0/z, [x2]; + eor z0.d, z0.d, RTMP0.d; + st1b {z0.b}, p0, [x1]; + addvl x2, x2, #1; + addvl x1, x1, #1; + + cbz x4, .Lctr_end; + b .Lctr_tail4; + +.Lctr_tail: + sub x4, x4, #1; + + /* inc_le128 for CE */ + mov v0.d[1], x8; + mov v0.d[0], x7; + adds x8, x8, #1; + adc x7, x7, xzr; + rev64 v0.16b, v0.16b; + + SM4_CE_CRYPT_BLK(v0); + ld1 {RTMP0v.16b}, [x2], #16; + eor v0.16b, v0.16b, RTMP0v.16b; + st1 {v0.16b}, [x1], #16; + + cbnz x4, .Lctr_tail; + +.Lctr_end: + /* store new CTR */ + rev x7, x7; + rev x8, x8; + stp x7, x8, [x3]; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_ce_ctr_enc,.-_gcry_sm4_armv9_sve_ce_ctr_enc;) + +.align 3 +.global _gcry_sm4_armv9_sve_get_vl +ELF(.type _gcry_sm4_armv9_sve_get_vl,%function;) +_gcry_sm4_armv9_sve_get_vl: + CFI_STARTPROC(); + + /* VL in bytes */ + rdvl x0, #1; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_armv9_sve_get_vl,.-_gcry_sm4_armv9_sve_get_vl;) + +#endif diff --git a/cipher/sm4.c b/cipher/sm4.c index 1c54b339db82..062a14f4c670 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -94,6 +94,17 @@ # endif #endif +#undef USE_ARM_SVE_CE +#ifdef ENABLE_SVE_SUPPORT +# if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SVE2) +# define USE_ARM_SVE_CE 1 +# endif +#endif + static const char *sm4_selftest (void); static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr, @@ -133,6 +144,9 @@ typedef struct #ifdef USE_ARM_CE unsigned int use_arm_ce:1; #endif +#ifdef USE_ARM_SVE_CE + unsigned int use_arm_sve_ce:1; +#endif } SM4_context; typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out, @@ -448,6 +462,37 @@ sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, #endif /* USE_ARM_CE */ +#ifdef USE_ARM_SVE_CE +extern void _gcry_sm4_armv9_sve_ce_crypt(const u32 *rk, byte *out, + const byte *in, + size_t nblocks); + +extern void _gcry_sm4_armv9_sve_ce_ctr_enc(const u32 *rk_enc, byte *out, + const byte *in, + byte *ctr, + size_t nblocks); + +extern void _gcry_sm4_armv9_sve_ce_cbc_dec(const u32 *rk_dec, byte *out, + const byte *in, + byte *iv, + size_t nblocks); + +extern void _gcry_sm4_armv9_sve_ce_cfb_dec(const u32 *rk_enc, byte *out, + const byte *in, + byte *iv, + size_t nblocks); + +static inline 
unsigned int +sm4_armv9_sve_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, + unsigned int num_blks) +{ + _gcry_sm4_armv9_sve_ce_crypt(rk, out, in, num_blks); + return 0; +} + +extern unsigned int _gcry_sm4_armv9_sve_get_vl(void); +#endif /* USE_ARM_SVE_CE */ + static inline void prefetch_sbox_table(void) { const volatile byte *vtab = (void *)&sbox_table; @@ -606,6 +651,11 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, #ifdef USE_ARM_CE ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4); #endif +#ifdef USE_ARM_SVE_CE + /* Only enabled when the SVE vector length is greater than 128 bits */ + ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVE2) && (hwf & HWF_ARM_SVESM4) + && _gcry_sm4_armv9_sve_get_vl() > 16; +#endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) @@ -802,6 +852,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx) return &sm4_aesni_avx_crypt_blk1_16; } #endif +#ifdef USE_ARM_SVE_CE + else if (ctx->use_arm_sve_ce) + { + return &sm4_armv9_sve_ce_crypt_blk1_16; + } +#endif #ifdef USE_ARM_CE else if (ctx->use_arm_ce) { @@ -879,6 +935,16 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr, } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + /* Process all blocks at a time. */ + _gcry_sm4_armv9_sve_ce_ctr_enc(ctx->rkey_enc, outbuf, inbuf, + ctr, nblocks); + nblocks = 0; + } +#endif + #ifdef USE_ARM_CE if (ctx->use_arm_ce) { @@ -990,6 +1056,16 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv, } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + /* Process all blocks at a time. */ + _gcry_sm4_armv9_sve_ce_cbc_dec(ctx->rkey_dec, outbuf, inbuf, + iv, nblocks); + nblocks = 0; + } +#endif + #ifdef USE_ARM_CE if (ctx->use_arm_ce) { @@ -1101,6 +1177,16 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + /* Process all blocks at a time. */ + _gcry_sm4_armv9_sve_ce_cfb_dec(ctx->rkey_enc, outbuf, inbuf, + iv, nblocks); + nblocks = 0; + } +#endif + #ifdef USE_ARM_CE if (ctx->use_arm_ce) { diff --git a/configure.ac b/configure.ac index 31bcd77e3c75..b55510d81eec 100644 --- a/configure.ac +++ b/configure.ac @@ -2957,6 +2957,7 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv9-aarch64-sve-ce.lo" esac fi -- 2.24.3 (Apple Git-128) From jussi.kivilinna at iki.fi Thu Jul 21 10:09:29 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 21 Jul 2022 11:09:29 +0300 Subject: [PATCH 2/3] sm4 & camellia: add generic bulk acceleration for CTR32LE mode (GCM-SIV) In-Reply-To: <20220721080930.3271436-1-jussi.kivilinna@iki.fi> References: <20220721080930.3271436-1-jussi.kivilinna@iki.fi> Message-ID: <20220721080930.3271436-2-jussi.kivilinna@iki.fi> * cipher/bulkhelp.h (bulk_ctr32le_enc_128): New. * cipher/camellia-glue.c (_gcry_camellia_ctr32le_enc): New. (camellia_setkey): Setup `bulk_ops->ctr32le_enc` if any AVX2 implementation is available. * cipher/sm4.c (_gcry_sm4_ctr32le_enc): New. (sm4_setkey): Setup `bulk_ops->ctr32le_enc`. * tests/basic.c (check_gcm_siv_cipher): Add large bulk encryption test vectors for SM4 and CAMELLIA128. -- On Intel tigerlake, SM4-GCM-SIV encryption performance is now 1.69 cycles/byte (was 32.9 c/B). CAMELLIA128-GCM-SIV encryption is now 1.38 cycles/byte (was 21.2 c/B). 
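For context, CTR32LE is the counter layout used by GCM-SIV (RFC 8452): only the
first four bytes of the 16-byte counter block form a little-endian 32-bit counter
that is incremented per block and wraps modulo 2^32, while the remaining twelve
bytes stay fixed. The snippet below is a minimal standalone C sketch of that
counter update, given for illustration only; the function name ctr32le_add is
made up for the example and the code deliberately avoids libgcrypt's internal
buf_* helpers.

#include <stdint.h>

/* Advance a GCM-SIV style counter block by 'nblocks'.  Only bytes 0..3
 * are interpreted as a little-endian 32-bit counter; bytes 4..15 are
 * left untouched.  The addition wraps modulo 2^32 because 'lo' is a
 * uint32_t. */
static void
ctr32le_add (unsigned char ctr[16], uint32_t nblocks)
{
  uint32_t lo = (uint32_t)ctr[0]
                | ((uint32_t)ctr[1] << 8)
                | ((uint32_t)ctr[2] << 16)
                | ((uint32_t)ctr[3] << 24);

  lo += nblocks;

  ctr[0] = (unsigned char)(lo);
  ctr[1] = (unsigned char)(lo >> 8);
  ctr[2] = (unsigned char)(lo >> 16);
  ctr[3] = (unsigned char)(lo >> 24);
}

This is the same update that bulk_ctr32le_enc_128 below performs with
buf_put_le32 (ctr, (u32)ctr_lo + curr_blks); the bulk helper merely generates
several such counter blocks per call before encrypting them.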
Signed-off-by: Jussi Kivilinna --- cipher/bulkhelp.h | 49 +++++ cipher/camellia-glue.c | 42 ++++- cipher/sm4.c | 34 ++++ tests/basic.c | 410 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 531 insertions(+), 4 deletions(-) diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h index 8c322ede..444973ab 100644 --- a/cipher/bulkhelp.h +++ b/cipher/bulkhelp.h @@ -177,6 +177,55 @@ bulk_ctr_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, } +static inline unsigned int +bulk_ctr32le_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, + const byte *inbuf, size_t nblocks, byte *ctr, + byte *tmpbuf, size_t tmpbuf_nblocks, + unsigned int *num_used_tmpblocks) +{ + unsigned int tmp_used = 16; + unsigned int burn_depth = 0; + unsigned int nburn; + + while (nblocks >= 1) + { + size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks; + u64 ctr_lo = buf_get_le64(ctr + 0 * 8); + u64 ctr_hi = buf_get_he64(ctr + 1 * 8); + size_t i; + + if (curr_blks * 16 > tmp_used) + tmp_used = curr_blks * 16; + + cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16); + for (i = 1; i < curr_blks; i++) + { + u32 lo_u32 = (u32)ctr_lo + i; + u64 lo_u64 = ctr_lo & ~(u64)(u32)-1; + lo_u64 += lo_u32; + buf_put_le64(&tmpbuf[0 * 8 + i * 16], lo_u64); + buf_put_he64(&tmpbuf[1 * 8 + i * 16], ctr_hi); + } + buf_put_le32(ctr, (u32)ctr_lo + curr_blks); + + nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks); + burn_depth = nburn > burn_depth ? nburn : burn_depth; + + for (i = 0; i < curr_blks; i++) + { + cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16); + outbuf += 16; + inbuf += 16; + } + + nblocks -= curr_blks; + } + + *num_used_tmpblocks = tmp_used; + return burn_depth; +} + + static inline unsigned int bulk_cbc_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, const byte *inbuf, size_t nblocks, byte *iv, diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index c938be71..b2a50233 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -407,6 +407,9 @@ static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +static void _gcry_camellia_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); @@ -469,7 +472,13 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, bulk_ops->ocb_auth = _gcry_camellia_ocb_auth; #ifdef USE_AESNI_AVX2 if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) - bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; + { + bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; + bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; + } +#else + (void)_gcry_camellia_xts_crypt; + (void)_gcry_camellia_ctr32le_enc; #endif if (0) @@ -1149,6 +1158,37 @@ _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, _gcry_burn_stack(burn_stack_depth); } +/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV). */ +static void +_gcry_camellia_ctr32le_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + CAMELLIA_context *ctx = context; + byte *outbuf = outbuf_arg; + const byte *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. 
*/ + if (nblocks) + { + byte tmpbuf[64 * CAMELLIA_BLOCK_SIZE]; + unsigned int tmp_used = CAMELLIA_BLOCK_SIZE; + size_t nburn; + + nburn = bulk_ctr32le_enc_128 (ctx, camellia_encrypt_blk1_64, outbuf, + inbuf, nblocks, ctr, tmpbuf, + sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + /* Bulk encryption/decryption of complete blocks in OCB mode. */ static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, diff --git a/cipher/sm4.c b/cipher/sm4.c index 02c399a9..f68197c4 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -129,6 +129,9 @@ static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv, static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +static void _gcry_sm4_ctr32le_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); @@ -786,6 +789,7 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, bulk_ops->cfb_dec = _gcry_sm4_cfb_dec; bulk_ops->ctr_enc = _gcry_sm4_ctr_enc; bulk_ops->xts_crypt = _gcry_sm4_xts_crypt; + bulk_ops->ctr32le_enc = _gcry_sm4_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt; bulk_ops->ocb_auth = _gcry_sm4_ocb_auth; @@ -1520,6 +1524,36 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, _gcry_burn_stack(burn_stack_depth); } +/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV). */ +static void +_gcry_sm4_ctr32le_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + SM4_context *ctx = context; + byte *outbuf = outbuf_arg; + const byte *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + byte tmpbuf[32 * 16]; + unsigned int tmp_used = 16; + size_t nburn; + + nburn = bulk_ctr32le_enc_128 (ctx, sm4_encrypt_blk1_32, outbuf, inbuf, + nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + /* Bulk encryption/decryption of complete blocks in OCB mode. 
*/ static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, diff --git a/tests/basic.c b/tests/basic.c index 92d21b04..f88277cb 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5333,9 +5333,9 @@ check_gcm_siv_cipher (void) char nonce[12]; char ad[MAX_DATA_LEN]; int adlen; - unsigned char plaintext[MAX_DATA_LEN]; + unsigned char plaintext[MAX_DATA_LEN * 2]; int inlen; - char out[MAX_DATA_LEN]; + char out[MAX_DATA_LEN * 2]; char tag[MAX_DATA_LEN]; } tv[] = { @@ -6158,11 +6158,415 @@ check_gcm_siv_cipher (void) "\xcb\xec\x6a\x28\xa3\xf3\x4a\x6c\x0d\xb0\x79\x34\x13\x10\x64\xfc" "\xee\x12\x55\x82\x25\x25\x30\xb9\xa6\xf8\x3c\x81\x36\xcd\xef", "\xce\xc3\x13\x6c\x40\x2a\xcc\x51\xa1\xce\xb3\xed\xe8\xa6\x5b\x04", + }, + { + GCRY_CIPHER_SM4, + "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "", + 0, + "\x72\x94\x7b\x5d\x3c\x14\xc0\xa6\x27\x8d\x8d\xee\xbd\xe8\x8c\x6a" + "\x21\x34\xce\x64\x8f\x01\x01\xc6\xe4\x5d\xed\x2e\xb9\xec\xac\x53" + "\xf2\x07\xed\x60\xc8\xa2\x2f\x2e\x83\x0e\xf2\xbc\x42\x51\x24\x3b" + "\x41\x4f\x26\x84\xf0\x25\x69\x3f\x38\x29\xfb\xe9\xbb\x1a\x94\xd1" + "\x94\x0c\xce\xad\x8e\x66\xeb\xda\xc9\x1c\x72\x5a\x7f\x95\x4f\x9c" + "\x02\x27\x79\x8f\xe7\x51\x51\x3d\x1e\x2c\x4e\xcd\x07\xe5\xd1\xf0" + "\x6c\x95\x82\x37\x00\x50\x5e\xff\x82\xfb\x69\x0b\x4e\x7f\x10\x12" + "\x7d\x18\x7f\xa8\x88\x59\xfb\x55\x9b\x70\x36\xfc\xde\x75\xed\x77" + "\xf9\x09\x87\x29\x30\x7c\x81\x41\x12\xc2\xbd\xcd\x9f\x86\x98\x38" + "\x96\x44\x4c\xda\x2e\xbe\x7a\xfb\xdd\x4a\x4e\xa0\x84\x94\xd5\x76" + "\xa6\xae\x02\xcb\x1b\xd4\xd8\xcb\xa5\x24\x28\xe1\x3c\x1e\xdc\x3d" + "\x25\x50\xe7\xfb\x92\xad\xd9\x80\x33\xe0\xb2\x50\x07\xd4\x43\x40" + "\x41\x63\x98\x63\xa6\x1a\xfc\x56\x84\x3f\xf7\x4f\x31\xe7\xfe\xc5" + "\x73\x52\xfd\x6d\x9b\xbb\x9b\xf8\x19\xf8\xdc\x9f\x3a\x88\xa6\x7c" + "\xf3\x6b\xbe\xde\xda\x05\x2e\x79\x54\xb9\x3e\x59\x43\x0a\x1b\x16" + "\xcf\x94\x97\x71\x03\x74\x12\x37\xaf\xd4\x0a\x4b\x30\x16\x9b\x8b" + "\x9f\xae\x78\x46\x83\xde\x34\xc5\x31\x71\x67\x5e\xdb\x8d\x93\x71" + "\x90\x03\x72\x00\x9f\x4e\x1e\x7d\xf3\x3f\xf8\x31\xe7\xf6\xb4\x6d" + "\x8d\xdc\xa0\x85\x32\x7b\x32\x40\x8c\xa9\x90\x69\xac\x03\xdb\xd4" + "\xa5\x62\x9c\xfd\x78\xde\xc8\x4a\x18\x67\xa0\xee\x5e\x1e\xad\x1a" + "\x1c\xee\x78\xbd\xea\xdc\xc8\x34\xd1\x92\x20\xa7\x0d\x12\x90\x88" + "\x91\xe4\x6c\x3c\x06\x78\x13\x00\xdc\xc7\x3e\xd7\x91\xf7\xc1\xd6" + "\x5a\x99\x95\x23\xb5\xd8\x3d\x0f\x12\xaf\x25\xd8\xcf\xe8\x27\x7f" + "\xbc\x7c\xe2\xad\x34\x66\x7f\xfb\xf5\xa8\x11\xc1\xe6\x04\x37\x41" + "\xaf\x96\xb3\xb7\xee\x05\xf5\xd7\x7c\xc6\xfe\x2e\xa9\x07\x47\x08" + "\xa4\x50\x65\xc0\x2e\xd7\x27\xd8\x70\x8c\xf1\x12\x30\x4a\x82\xf6" + "\xb7\x68\xdb\x9d\x73\xc2\x82\x3d\x44\xda\xfb\xdd\x03\xc1\xdc\xfc" + "\x3f\x7f\x2e\xe2\xd3\x73\x24\xaf\xd1\x35\xa9\x4f\x3a\xad\x9d\x5c" + "\xd7\xc6\xa3\xb1\x11\xf1\xbb\xa0\x23\xe1\x22\x88\x5b\x10\xb3\xd6" + "\x01\x78\x5f\x9e\x4d\x96\x7b\xeb\x81\x6b\xce\x2d\xf5\x6a\xd1\xa8" + "\xb7\x56\xdd\xd0\x4b\xb0\xc9\x64\x7a\x2f\x63\xcb\xd6\x61\x84\x4b" + "\x9e\x4d\x0b\x2c\x99\xbc\xa2\x94\xf5\x07\x20\xe6\xe9\xc2\xd2\xa6" + "\x1c\x37\xd5\x88\x01\x71\xe2\x16\xcd\x10\x7a\x07\x8b\xf3\xb5\x49" + "\x75\xbe\x0b\xe1\xb2\x28\x15\x88\x2b\xb4\xee\x34\xfd\x67\x30\xd8" + "\xdc\x38\x90\x66\xb6\x51\x90\xb3\xdb\xee\x4e\x66\xc3\x05\xdf\xee" + "\x32\xac\x8b\xa2\x00\xcc\xff\xa2\x52\x19\x79\x7e\x6c\xc9\x68\xb2" + "\xab\xe4\x69\x11\xea\x00\xc9\x2b\x58\x77\x8b\x6c\x28\x0e\x40\x42" + 
"\xcc\xa7\xb2\x58\xed\x5e\x0b\x19\x49\xe5\x5e\xb1\xb5\x24\x61\x63" + "\x7d\x5b\x6a\x7d\x3d\xc1\x6e\x09\x64\x43\x66\x31\x3c\xb0\x26\x2e" + "\xc8\x27\xf6\x5a\x5f\x22\x94\x42\x62\x2a\xf6\x5a\x7d\xc2\x4a\x0d" + "\xd2\xad\xaa\x0e\xb2\xa4\x29\x1c\xb8\x3b\xaa\xc9\x1d\x1a\x30\xf8" + "\x0b\x35\xb2\x84\x75\xc3\x08\x0c\xe5\x36\xa9\xff\xfe\xb9\xc2\xb7" + "\x51\xab\x2d\x9d\x3e\x1c\x08\x8c\x6c\x64\xe1\xd9\x97\xf4\xfc\x4d" + "\x77\x6d\x0e\xce\x73\x0b\x7f\x57\x41\xed\xdf\x96\x11\xb3\xcc\x61" + "\xe8\x12\x31\x16\x72\x4c\x10\xd4\x52\x14\x4c\x83\xaa\x3c\x29\x6c" + "\x51\x40\x9a\x4d\x9b\xd0\xe6\x7f\xad\x31\x54\x88\x90\xe1\xa8\x0e" + "\xd8\xf4\x84\x11\xdb\x02\x41\xff\xb0\x8a\x92\x95\x97\xd6\x98\x8a" + "\xa0\x43\xda\x70\xbb\x17\xd0\x5a\x81\x3e\xf7\xcf\xc9\x33\xd9\x76" + "\x2f\x53\xa2\xac\xa0\x8a\x73\xe4\x0c\x81\xbe\x26\x01\x3f\x6d\x79" + "\x8a\x37\x59\x5b\x0a\x9a\x10\x6b\x04\x30\xed\xda\x11\x73\x73\xd9" + "\xa2\x9a\xf8\x8e\x67\x82\x5a\x8d\xc0\x52\xe8\x42\x89\xcd\x9c\xb1" + "\x5c\x3d\xd4\x75\x03\x71\x03\x3f\xdc\x6b\x79\xb4\x02\xb6\xac\xc4" + "\x11\x0f\x61\xc8\xf7\x5d\xc6\xbf\x48\x02\xa3\xdc\xa8\x37\x10\x85" + "\xb2\x8d\xbd\xb0\x79\x09\xb0\x5f\x30\x6c\x40\xba\x03\xbb\x22\xcc" + "\x80\xa1\xc3\x91\x88\x25\x92\xbe\xa6\xfa\x14\x77\x56\xb3\xc0\xb5" + "\x69\x8c\x6f\xed\x21\xaf\x0c\x79\x07\x64\xa2\xea\xeb\x47\x2c\x1e" + "\x7d\x6c\x12\xae\x75\xc4\xee\x12\x46\x72\x87\x65\x73\x51\xee\xf8" + "\x08\x63\x20\xa1\x61\xca\x73\x8f\xdf\xcb\x97\xf8\xfc\xb0\x56\xea" + "\x34\x9d\xce\xb8\x91\xb8\xfc\xec\x76\xd0\x71\xb7\x92\xc9\xb2\x28" + "\xee\x0b\x5d\x7c\x4a\xf6\x73\x4d\xc2\x5b\x5b\xae\x7b\xa6\x9c\xba" + "\x29\x7d\x7d\x3c\x29\x01\x04\x2d\xd1\x6c\x8d\x8d\xe5\xb4\x6b\xf9" + "\x2a\x83\xb8\x14\x00\x1c\x91\x72\x5e\x8f\x13\x56\x6d\x9b\x6d\x27" + "\xe8\x22\x55\x4b\x2f\x8a\x31\x16\x98\x03\x51\x73\xa7\x2e\x18\x81" + "\x51\x0a\x8f\x6d\x17\xd0\xea\x04\x1c\x11\xb9\x6b\x8e\xaa\x76", + 1023, + "\x00\xf8\xa8\x64\x74\x1e\x9c\x18\x72\xa5\x55\x9e\x83\x8f\xde\xa5" + "\xd2\x34\x8e\x06\x25\xe8\x00\x15\xac\x4f\x26\x8d\x12\xe7\x3b\x7b" + "\xbb\xa7\x16\x54\x9b\xad\x82\x94\x0f\xdf\x3b\x3e\xfe\x42\xc0\x0f" + "\x23\x4d\x5b\x8e\xb5\x13\xf8\xb7\x40\x9b\xf6\x09\x1b\xc2\x9d\x2f" + "\xa4\x38\x6f\x19\x86\xd5\x6f\xac\x1f\xe4\x2c\x5c\x74\xc2\xdb\x7a" + "\x77\xac\xed\x83\xdb\xfe\x5f\x1c\x2a\x4f\xba\x00\xfc\x47\x8b\xe2" + "\x77\xb8\x38\x86\x7c\x21\x10\x64\xde\x37\x0f\x4c\x09\xcd\x6f\x0a" + "\x3f\x6d\xf4\xf1\x55\x6c\xe2\x29\x7f\xf8\xd6\x84\x31\xd5\x9c\x08" + "\x10\x94\x45\x7d\x62\x73\xfa\x28\x5e\x81\x90\x13\xb8\x0a\xd2\x4e" + "\xfd\x11\x99\x42\xd7\xb3\x90\x38\x26\x05\xda\xad\x11\x43\x84\x28" + "\x64\x0b\x50\xd9\x47\xa7\xad\x7c\xba\x51\xd2\x9c\xe4\xe9\xdf\x67" + "\x09\xe5\xb2\xfe\xb1\x60\xe9\x3f\x93\x6b\x4a\xe8\x71\x71\x9c\xdf" + "\xbe\x82\x59\x1c\x25\x8b\x72\xe8\x9d\x64\x4c\x21\x4b\x61\x11\x82" + "\x65\xb5\xf7\x80\x5c\xec\xee\x08\x7c\x35\x5a\x40\xb5\x64\xf6\xa2" + "\xa9\xda\x81\xff\x92\xf9\x49\x4f\x08\x24\xdc\x6a\x2f\x3f\xe6\xac" + "\x68\xf8\x5a\x10\xd4\x3b\xc7\x60\x0c\x69\x6c\x42\x99\xa3\x03\x2d" + "\x98\x64\x03\xe7\x4d\x07\x49\x91\x82\x1b\x34\x11\x9b\x16\xef\x2c" + "\x77\x10\xb4\xd7\xc5\xa7\xca\xbe\xb9\x71\xa0\x74\xb7\xfc\x06\xcd" + "\x82\x9f\xb3\xb0\xde\x49\xe2\x5a\x9c\xc6\x3b\x4b\xd7\x3e\x8b\xdf" + "\xd0\x27\xb8\x7a\xb5\x45\x05\xe5\xfa\xff\x6c\x9d\x6e\x9c\x0b\x4e" + "\xc3\x7d\xd1\x28\xd2\x30\x58\xc9\xa9\x7f\xa0\xd3\x65\xaf\x7b\x04" + "\x27\x39\xb3\x80\x6c\x68\x5c\x27\x60\xab\x98\x3b\xca\x16\xb4\x98" + "\x3c\xed\xf1\xd6\xf9\x43\x55\x51\xf8\xba\xdb\x96\xc2\xbb\xc4\x53" + "\x8e\x49\x0b\x8f\x82\x92\x75\x9c\x83\x7a\xf9\x7b\x0f\x30\x4f\x6d" + 
"\x8b\x6a\x05\xd9\x6e\x47\x88\x09\xfc\x56\x57\x91\x9a\xcd\xbb\xa9" + "\x39\x45\x20\x81\xd9\x23\x72\x1d\xfa\xea\x24\xb7\xeb\x2a\xcf\x19" + "\xcc\xcc\x63\xd6\xbb\x29\x5f\x9f\x71\x7c\x45\x15\x7b\x37\x12\x82" + "\x64\x41\xad\xe6\x20\xf1\x5d\xd0\x14\xff\x7b\x0c\x72\xe9\xc3\xf5" + "\x8a\xf2\xa3\x2e\x30\xdd\x32\xdc\x10\x9d\x9e\x05\xd8\x0d\xd8\x22" + "\xdd\xa6\x7f\x0d\xf5\x00\x3e\x7a\x92\xa6\x01\x3c\xc7\xdc\xf7\xae" + "\x73\x0c\xbf\xd4\x98\xfc\x30\xa5\xe8\xc1\x69\xb8\x57\xc9\x31\x4c" + "\x82\x1e\x3e\x17\x5f\x4d\x0c\x4d\x31\xbe\x21\x60\x79\x31\x52\x12" + "\x08\x09\x52\x8d\xf7\xbc\x73\x21\x95\x28\x09\x1f\x9b\xcd\x79\x42" + "\x61\x1f\x9f\x9e\x87\x53\x4c\x39\x50\x90\x74\xc4\xe1\xf7\x4f\x72" + "\xe6\x95\xf3\x38\xcb\x41\x3c\x26\x48\x00\x12\x0f\xbb\x3e\xd3\x17" + "\x7c\x03\xe1\x6e\x76\x58\xfc\x87\xa0\x99\x7f\x1e\x00\xea\x9e\x4e" + "\xef\x4c\x10\xee\xee\x79\xeb\x13\x8c\x19\x01\xd0\x2a\x74\x48\x99" + "\x66\x7e\x77\x1e\xa4\xee\x31\xae\xaf\x7b\x8f\x80\x06\x51\x5d\x7d" + "\x5d\x9f\x68\x1d\xea\xa8\x43\x99\xff\xac\x5d\x04\xb0\x30\x70\xf8" + "\x4a\xd3\xba\x6c\xd6\xb2\x01\x86\x8f\x4b\x2e\x6b\x5a\xd4\xc3\x74" + "\x1c\xb1\xe8\x4e\xbf\x7e\x18\xf3\x14\xe8\xf6\x05\xb5\xb6\x6c\xa7" + "\x94\xce\xba\xd2\x70\x3b\x49\x32\x80\xef\xaa\xdd\xa3\xfd\x49\x0d" + "\x0e\x24\x36\x69\x0a\x20\x7e\xbf\xfa\xca\x1b\xc9\xd9\xfd\x2b\x83" + "\x5d\xab\x3a\xa1\x2c\x43\xc7\xf1\xc4\x43\x37\x97\xa9\xd2\x39\x67" + "\x5d\xac\xdd\xf6\x0b\x6e\x99\x9a\x4b\x83\xaf\xba\x74\xbb\xf6\x67" + "\xc1\xf3\x38\x16\xc3\x56\x7f\x0d\x4e\x87\xbc\xd0\x85\xa0\x5d\x48" + "\x48\x44\x24\x79\x3d\x0d\xd3\x7a\x70\x38\xac\xd6\x3c\xe1\x6e\x2e" + "\xea\xb9\xee\x89\xea\xe2\x1d\xe9\xd1\xa5\x0f\x75\x46\xa8\x8d\x0d" + "\xf5\x72\x37\xc8\xe0\xaa\x48\x0f\x0e\xa4\x08\xce\x31\x74\x78\xdb" + "\x92\x30\x54\x70\x0c\x62\xe0\x62\x00\x90\xdd\x08\xf7\x3c\xa3\x1b" + "\x79\x78\x6a\xb6\xdb\xa3\x77\xd9\x3a\x38\xfc\x98\x29\x08\xba\x71" + "\x60\xa5\xf6\xcb\xc0\xe7\xe5\x35\x87\x97\xaf\x87\x81\x33\xe4\x1a" + "\x3c\x4b\x21\x7d\x7d\x84\x96\x52\x30\xac\xf2\x1b\x47\x28\xc0\x6f" + "\xf8\x6d\x9c\x2d\x69\x19\x49\x2e\x37\x1b\x89\x31\xa4\xb5\xbf\x60" + "\x9b\x32\x55\x83\x8f\x78\x50\x6b\xc5\x9c\xf6\x58\x8b\x0d\x93\xc0" + "\x30\x74\x98\x62\xec\xaa\x0e\x6e\xe7\x9b\x7c\x9b\x28\x97\x9e\xaf" + "\x38\xb8\x56\x4d\x78\xbe\x76\x69\xb5\xe0\x84\x2b\x1f\x11\x8e\xf7" + "\x18\x90\x4b\xfa\x82\x06\x57\xdd\xc7\xe3\x1d\xd6\x1f\x72\x12\x50" + "\x93\x20\x4c\xf7\x4b\x08\x1c\x28\x3f\x46\x47\xd0\x12\x40\xaa\xa9" + "\x38\x27\x04\x1e\x5f\x2c\xd0\x6b\xe8\xd0\xcd\xd9\x9d\xcc\x88\x67" + "\x8b\x5c\x5f\x80\xca\x54\xd8\x85\x26\x20\x31\xe8\xb8\xd9\xd4\xe9" + "\x40\x99\x11\x24\x86\x56\x82\xbe\x75\x5e\x53\x19\xf4\xfd\x38\x06" + "\x15\x9d\x58\x4c\x92\xb2\x09\xd1\x69\x03\x6f\xd2\x58\x9f\x85\x09" + "\x64\x15\x17\x55\x60\x71\xb4\xaf\xcd\xc8\x90\x25\xc8\xc8\x62", + "\xe2\x32\xda\x3a\x5a\x0e\x45\x1b\x8e\xf8\xbb\xe6\x60\x71\x81\xeb", + }, + { + GCRY_CIPHER_CAMELLIA128, + "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "", + 0, + "\x72\x94\x7b\x5d\x3c\x14\xc0\xa6\x27\x8d\x8d\xee\xbd\xe8\x8c\x6a" + "\x21\x34\xce\x64\x8f\x01\x01\xc6\xe4\x5d\xed\x2e\xb9\xec\xac\x53" + "\xf2\x07\xed\x60\xc8\xa2\x2f\x2e\x83\x0e\xf2\xbc\x42\x51\x24\x3b" + "\x41\x4f\x26\x84\xf0\x25\x69\x3f\x38\x29\xfb\xe9\xbb\x1a\x94\xd1" + "\x94\x0c\xce\xad\x8e\x66\xeb\xda\xc9\x1c\x72\x5a\x7f\x95\x4f\x9c" + "\x02\x27\x79\x8f\xe7\x51\x51\x3d\x1e\x2c\x4e\xcd\x07\xe5\xd1\xf0" + "\x6c\x95\x82\x37\x00\x50\x5e\xff\x82\xfb\x69\x0b\x4e\x7f\x10\x12" + 
"\x7d\x18\x7f\xa8\x88\x59\xfb\x55\x9b\x70\x36\xfc\xde\x75\xed\x77" + "\xf9\x09\x87\x29\x30\x7c\x81\x41\x12\xc2\xbd\xcd\x9f\x86\x98\x38" + "\x96\x44\x4c\xda\x2e\xbe\x7a\xfb\xdd\x4a\x4e\xa0\x84\x94\xd5\x76" + "\xa6\xae\x02\xcb\x1b\xd4\xd8\xcb\xa5\x24\x28\xe1\x3c\x1e\xdc\x3d" + "\x25\x50\xe7\xfb\x92\xad\xd9\x80\x33\xe0\xb2\x50\x07\xd4\x43\x40" + "\x41\x63\x98\x63\xa6\x1a\xfc\x56\x84\x3f\xf7\x4f\x31\xe7\xfe\xc5" + "\x73\x52\xfd\x6d\x9b\xbb\x9b\xf8\x19\xf8\xdc\x9f\x3a\x88\xa6\x7c" + "\xf3\x6b\xbe\xde\xda\x05\x2e\x79\x54\xb9\x3e\x59\x43\x0a\x1b\x16" + "\xcf\x94\x97\x71\x03\x74\x12\x37\xaf\xd4\x0a\x4b\x30\x16\x9b\x8b" + "\x9f\xae\x78\x46\x83\xde\x34\xc5\x31\x71\x67\x5e\xdb\x8d\x93\x71" + "\x90\x03\x72\x00\x9f\x4e\x1e\x7d\xf3\x3f\xf8\x31\xe7\xf6\xb4\x6d" + "\x8d\xdc\xa0\x85\x32\x7b\x32\x40\x8c\xa9\x90\x69\xac\x03\xdb\xd4" + "\xa5\x62\x9c\xfd\x78\xde\xc8\x4a\x18\x67\xa0\xee\x5e\x1e\xad\x1a" + "\x1c\xee\x78\xbd\xea\xdc\xc8\x34\xd1\x92\x20\xa7\x0d\x12\x90\x88" + "\x91\xe4\x6c\x3c\x06\x78\x13\x00\xdc\xc7\x3e\xd7\x91\xf7\xc1\xd6" + "\x5a\x99\x95\x23\xb5\xd8\x3d\x0f\x12\xaf\x25\xd8\xcf\xe8\x27\x7f" + "\xbc\x7c\xe2\xad\x34\x66\x7f\xfb\xf5\xa8\x11\xc1\xe6\x04\x37\x41" + "\xaf\x96\xb3\xb7\xee\x05\xf5\xd7\x7c\xc6\xfe\x2e\xa9\x07\x47\x08" + "\xa4\x50\x65\xc0\x2e\xd7\x27\xd8\x70\x8c\xf1\x12\x30\x4a\x82\xf6" + "\xb7\x68\xdb\x9d\x73\xc2\x82\x3d\x44\xda\xfb\xdd\x03\xc1\xdc\xfc" + "\x3f\x7f\x2e\xe2\xd3\x73\x24\xaf\xd1\x35\xa9\x4f\x3a\xad\x9d\x5c" + "\xd7\xc6\xa3\xb1\x11\xf1\xbb\xa0\x23\xe1\x22\x88\x5b\x10\xb3\xd6" + "\x01\x78\x5f\x9e\x4d\x96\x7b\xeb\x81\x6b\xce\x2d\xf5\x6a\xd1\xa8" + "\xb7\x56\xdd\xd0\x4b\xb0\xc9\x64\x7a\x2f\x63\xcb\xd6\x61\x84\x4b" + "\x9e\x4d\x0b\x2c\x99\xbc\xa2\x94\xf5\x07\x20\xe6\xe9\xc2\xd2\xa6" + "\x1c\x37\xd5\x88\x01\x71\xe2\x16\xcd\x10\x7a\x07\x8b\xf3\xb5\x49" + "\x75\xbe\x0b\xe1\xb2\x28\x15\x88\x2b\xb4\xee\x34\xfd\x67\x30\xd8" + "\xdc\x38\x90\x66\xb6\x51\x90\xb3\xdb\xee\x4e\x66\xc3\x05\xdf\xee" + "\x32\xac\x8b\xa2\x00\xcc\xff\xa2\x52\x19\x79\x7e\x6c\xc9\x68\xb2" + "\xab\xe4\x69\x11\xea\x00\xc9\x2b\x58\x77\x8b\x6c\x28\x0e\x40\x42" + "\xcc\xa7\xb2\x58\xed\x5e\x0b\x19\x49\xe5\x5e\xb1\xb5\x24\x61\x63" + "\x7d\x5b\x6a\x7d\x3d\xc1\x6e\x09\x64\x43\x66\x31\x3c\xb0\x26\x2e" + "\xc8\x27\xf6\x5a\x5f\x22\x94\x42\x62\x2a\xf6\x5a\x7d\xc2\x4a\x0d" + "\xd2\xad\xaa\x0e\xb2\xa4\x29\x1c\xb8\x3b\xaa\xc9\x1d\x1a\x30\xf8" + "\x0b\x35\xb2\x84\x75\xc3\x08\x0c\xe5\x36\xa9\xff\xfe\xb9\xc2\xb7" + "\x51\xab\x2d\x9d\x3e\x1c\x08\x8c\x6c\x64\xe1\xd9\x97\xf4\xfc\x4d" + "\x77\x6d\x0e\xce\x73\x0b\x7f\x57\x41\xed\xdf\x96\x11\xb3\xcc\x61" + "\xe8\x12\x31\x16\x72\x4c\x10\xd4\x52\x14\x4c\x83\xaa\x3c\x29\x6c" + "\x51\x40\x9a\x4d\x9b\xd0\xe6\x7f\xad\x31\x54\x88\x90\xe1\xa8\x0e" + "\xd8\xf4\x84\x11\xdb\x02\x41\xff\xb0\x8a\x92\x95\x97\xd6\x98\x8a" + "\xa0\x43\xda\x70\xbb\x17\xd0\x5a\x81\x3e\xf7\xcf\xc9\x33\xd9\x76" + "\x2f\x53\xa2\xac\xa0\x8a\x73\xe4\x0c\x81\xbe\x26\x01\x3f\x6d\x79" + "\x8a\x37\x59\x5b\x0a\x9a\x10\x6b\x04\x30\xed\xda\x11\x73\x73\xd9" + "\xa2\x9a\xf8\x8e\x67\x82\x5a\x8d\xc0\x52\xe8\x42\x89\xcd\x9c\xb1" + "\x5c\x3d\xd4\x75\x03\x71\x03\x3f\xdc\x6b\x79\xb4\x02\xb6\xac\xc4" + "\x11\x0f\x61\xc8\xf7\x5d\xc6\xbf\x48\x02\xa3\xdc\xa8\x37\x10\x85" + "\xb2\x8d\xbd\xb0\x79\x09\xb0\x5f\x30\x6c\x40\xba\x03\xbb\x22\xcc" + "\x80\xa1\xc3\x91\x88\x25\x92\xbe\xa6\xfa\x14\x77\x56\xb3\xc0\xb5" + "\x69\x8c\x6f\xed\x21\xaf\x0c\x79\x07\x64\xa2\xea\xeb\x47\x2c\x1e" + "\x7d\x6c\x12\xae\x75\xc4\xee\x12\x46\x72\x87\x65\x73\x51\xee\xf8" + "\x08\x63\x20\xa1\x61\xca\x73\x8f\xdf\xcb\x97\xf8\xfc\xb0\x56\xea" + 
"\x34\x9d\xce\xb8\x91\xb8\xfc\xec\x76\xd0\x71\xb7\x92\xc9\xb2\x28" + "\xee\x0b\x5d\x7c\x4a\xf6\x73\x4d\xc2\x5b\x5b\xae\x7b\xa6\x9c\xba" + "\x29\x7d\x7d\x3c\x29\x01\x04\x2d\xd1\x6c\x8d\x8d\xe5\xb4\x6b\xf9" + "\x2a\x83\xb8\x14\x00\x1c\x91\x72\x5e\x8f\x13\x56\x6d\x9b\x6d\x27" + "\xe8\x22\x55\x4b\x2f\x8a\x31\x16\x98\x03\x51\x73\xa7\x2e\x18\x81" + "\x00\xf8\xa8\x64\x74\x1e\x9c\x18\x72\xa5\x55\x9e\x83\x8f\xde\xa5" + "\xd2\x34\x8e\x06\x25\xe8\x00\x15\xac\x4f\x26\x8d\x12\xe7\x3b\x7b" + "\xbb\xa7\x16\x54\x9b\xad\x82\x94\x0f\xdf\x3b\x3e\xfe\x42\xc0\x0f" + "\x23\x4d\x5b\x8e\xb5\x13\xf8\xb7\x40\x9b\xf6\x09\x1b\xc2\x9d\x2f" + "\xa4\x38\x6f\x19\x86\xd5\x6f\xac\x1f\xe4\x2c\x5c\x74\xc2\xdb\x7a" + "\x77\xac\xed\x83\xdb\xfe\x5f\x1c\x2a\x4f\xba\x00\xfc\x47\x8b\xe2" + "\x77\xb8\x38\x86\x7c\x21\x10\x64\xde\x37\x0f\x4c\x09\xcd\x6f\x0a" + "\x3f\x6d\xf4\xf1\x55\x6c\xe2\x29\x7f\xf8\xd6\x84\x31\xd5\x9c\x08" + "\x10\x94\x45\x7d\x62\x73\xfa\x28\x5e\x81\x90\x13\xb8\x0a\xd2\x4e" + "\xfd\x11\x99\x42\xd7\xb3\x90\x38\x26\x05\xda\xad\x11\x43\x84\x28" + "\x64\x0b\x50\xd9\x47\xa7\xad\x7c\xba\x51\xd2\x9c\xe4\xe9\xdf\x67" + "\x09\xe5\xb2\xfe\xb1\x60\xe9\x3f\x93\x6b\x4a\xe8\x71\x71\x9c\xdf" + "\xbe\x82\x59\x1c\x25\x8b\x72\xe8\x9d\x64\x4c\x21\x4b\x61\x11\x82" + "\x65\xb5\xf7\x80\x5c\xec\xee\x08\x7c\x35\x5a\x40\xb5\x64\xf6\xa2" + "\xa9\xda\x81\xff\x92\xf9\x49\x4f\x08\x24\xdc\x6a\x2f\x3f\xe6\xac" + "\x68\xf8\x5a\x10\xd4\x3b\xc7\x60\x0c\x69\x6c\x42\x99\xa3\x03\x2d" + "\x98\x64\x03\xe7\x4d\x07\x49\x91\x82\x1b\x34\x11\x9b\x16\xef\x2c" + "\x77\x10\xb4\xd7\xc5\xa7\xca\xbe\xb9\x71\xa0\x74\xb7\xfc\x06\xcd" + "\x82\x9f\xb3\xb0\xde\x49\xe2\x5a\x9c\xc6\x3b\x4b\xd7\x3e\x8b\xdf" + "\xd0\x27\xb8\x7a\xb5\x45\x05\xe5\xfa\xff\x6c\x9d\x6e\x9c\x0b\x4e" + "\xc3\x7d\xd1\x28\xd2\x30\x58\xc9\xa9\x7f\xa0\xd3\x65\xaf\x7b\x04" + "\x27\x39\xb3\x80\x6c\x68\x5c\x27\x60\xab\x98\x3b\xca\x16\xb4\x98" + "\x3c\xed\xf1\xd6\xf9\x43\x55\x51\xf8\xba\xdb\x96\xc2\xbb\xc4\x53" + "\x8e\x49\x0b\x8f\x82\x92\x75\x9c\x83\x7a\xf9\x7b\x0f\x30\x4f\x6d" + "\x8b\x6a\x05\xd9\x6e\x47\x88\x09\xfc\x56\x57\x91\x9a\xcd\xbb\xa9" + "\x39\x45\x20\x81\xd9\x23\x72\x1d\xfa\xea\x24\xb7\xeb\x2a\xcf\x19" + "\xcc\xcc\x63\xd6\xbb\x29\x5f\x9f\x71\x7c\x45\x15\x7b\x37\x12\x82" + "\x64\x41\xad\xe6\x20\xf1\x5d\xd0\x14\xff\x7b\x0c\x72\xe9\xc3\xf5" + "\x8a\xf2\xa3\x2e\x30\xdd\x32\xdc\x10\x9d\x9e\x05\xd8\x0d\xd8\x22" + "\xdd\xa6\x7f\x0d\xf5\x00\x3e\x7a\x92\xa6\x01\x3c\xc7\xdc\xf7\xae" + "\x73\x0c\xbf\xd4\x98\xfc\x30\xa5\xe8\xc1\x69\xb8\x57\xc9\x31\x4c" + "\x82\x1e\x3e\x17\x5f\x4d\x0c\x4d\x31\xbe\x21\x60\x79\x31\x52\x12" + "\x08\x09\x52\x8d\xf7\xbc\x73\x21\x95\x28\x09\x1f\x9b\xcd\x79\x42" + "\x61\x1f\x9f\x9e\x87\x53\x4c\x39\x50\x90\x74\xc4\xe1\xf7\x4f\x72" + "\xe6\x95\xf3\x38\xcb\x41\x3c\x26\x48\x00\x12\x0f\xbb\x3e\xd3\x17" + "\x7c\x03\xe1\x6e\x76\x58\xfc\x87\xa0\x99\x7f\x1e\x00\xea\x9e\x4e" + "\xef\x4c\x10\xee\xee\x79\xeb\x13\x8c\x19\x01\xd0\x2a\x74\x48\x99" + "\x66\x7e\x77\x1e\xa4\xee\x31\xae\xaf\x7b\x8f\x80\x06\x51\x5d\x7d" + "\x5d\x9f\x68\x1d\xea\xa8\x43\x99\xff\xac\x5d\x04\xb0\x30\x70\xf8" + "\x4a\xd3\xba\x6c\xd6\xb2\x01\x86\x8f\x4b\x2e\x6b\x5a\xd4\xc3\x74" + "\x1c\xb1\xe8\x4e\xbf\x7e\x18\xf3\x14\xe8\xf6\x05\xb5\xb6\x6c\xa7" + "\x94\xce\xba\xd2\x70\x3b\x49\x32\x80\xef\xaa\xdd\xa3\xfd\x49\x0d" + "\x0e\x24\x36\x69\x0a\x20\x7e\xbf\xfa\xca\x1b\xc9\xd9\xfd\x2b\x83" + "\x5d\xab\x3a\xa1\x2c\x43\xc7\xf1\xc4\x43\x37\x97\xa9\xd2\x39\x67" + "\x5d\xac\xdd\xf6\x0b\x6e\x99\x9a\x4b\x83\xaf\xba\x74\xbb\xf6\x67" + "\xc1\xf3\x38\x16\xc3\x56\x7f\x0d\x4e\x87\xbc\xd0\x85\xa0\x5d\x48" + 
"\x48\x44\x24\x79\x3d\x0d\xd3\x7a\x70\x38\xac\xd6\x3c\xe1\x6e\x2e" + "\xea\xb9\xee\x89\xea\xe2\x1d\xe9\xd1\xa5\x0f\x75\x46\xa8\x8d\x0d" + "\xf5\x72\x37\xc8\xe0\xaa\x48\x0f\x0e\xa4\x08\xce\x31\x74\x78\xdb" + "\x92\x30\x54\x70\x0c\x62\xe0\x62\x00\x90\xdd\x08\xf7\x3c\xa3\x1b" + "\x79\x78\x6a\xb6\xdb\xa3\x77\xd9\x3a\x38\xfc\x98\x29\x08\xba\x71" + "\x60\xa5\xf6\xcb\xc0\xe7\xe5\x35\x87\x97\xaf\x87\x81\x33\xe4\x1a" + "\x3c\x4b\x21\x7d\x7d\x84\x96\x52\x30\xac\xf2\x1b\x47\x28\xc0\x6f" + "\xf8\x6d\x9c\x2d\x69\x19\x49\x2e\x37\x1b\x89\x31\xa4\xb5\xbf\x60" + "\x9b\x32\x55\x83\x8f\x78\x50\x6b\xc5\x9c\xf6\x58\x8b\x0d\x93\xc0" + "\x30\x74\x98\x62\xec\xaa\x0e\x6e\xe7\x9b\x7c\x9b\x28\x97\x9e\xaf" + "\x38\xb8\x56\x4d\x78\xbe\x76\x69\xb5\xe0\x84\x2b\x1f\x11\x8e\xf7" + "\x18\x90\x4b\xfa\x82\x06\x57\xdd\xc7\xe3\x1d\xd6\x1f\x72\x12\x50" + "\x93\x20\x4c\xf7\x4b\x08\x1c\x28\x3f\x46\x47\xd0\x12\x40\xaa\xa9" + "\x38\x27\x04\x1e\x5f\x2c\xd0\x6b\xe8\xd0\xcd\xd9\x9d\xcc\x88\x67" + "\x8b\x5c\x5f\x80\xca\x54\xd8\x85\x26\x20\x31\xe8\xb8\xd9\xd4\xe9" + "\x40\x99\x11\x24\x86\x56\x82\xbe\x75\x5e\x53\x19\xf4\xfd\x38\x06" + "\x15\x9d\x58\x4c\x92\xb2\x09\xd1\x69\x03\x6f\xd2\x58\x9f\x85\x09" + "\xe2\x32\xda\x3a\x5a\x0e\x45\x1b\x8e\xf8\xbb\xe6\x60\x71\x81\xeb" + "\x51\x0a\x8f\x6d\x17\xd0\xea\x04\x1c\x11\xb9\x6b\x8e\xaa\x76", + 2047, + "\x66\x22\x20\xcc\x13\x5b\xe5\xac\x17\xf7\x54\xe4\xa3\xfd\x9c\xb6" + "\xbf\x4d\x20\x8f\x9a\x28\xc4\xd0\xe0\xa7\x8f\x36\xa1\xeb\x0d\xbf" + "\xc2\x79\x44\xd2\x42\xf6\xdb\x57\x34\xf3\x07\xac\x43\xdc\xa1\xc7" + "\x54\x9f\x0b\xfb\xc1\xb6\x12\x11\xb1\x67\xf1\x80\xef\x70\x0a\x9c" + "\x71\x34\xf1\x55\x1e\x9f\x2f\x0f\x93\x2a\x61\xb5\xf2\x01\x26\xfa" + "\xa2\xef\xe9\x4f\x00\xf5\xec\x08\x3d\x72\x80\x66\xfe\xde\x72\x9c" + "\xf4\x04\x24\xf3\x71\x3b\x07\xa3\x3d\xac\xcf\x59\x4d\xab\xec\x93" + "\xb2\x2d\x9a\x40\x66\xd3\xfb\x48\x66\xa5\x0f\xb7\xe6\x37\x50\x86" + "\x3d\xf1\x66\x8c\xae\x8d\xce\x1f\xca\xe4\x7e\x80\xb1\x15\x3b\x05" + "\x29\x71\xf9\x72\x68\x51\x2d\x5d\x94\xf7\x12\xd7\x24\x9e\x89\xd9" + "\x86\x8b\x4c\x1e\xf0\xdb\xec\x86\x1b\x9b\xb9\x84\x72\xce\x6a\x41" + "\x37\x1f\x86\x5c\x58\x75\x90\xeb\x2a\xac\xd5\xa2\x31\xc3\x99\xa9" + "\xb7\x62\x05\x8c\x2a\x16\x87\xa6\xb7\x9a\xe0\x11\xf9\x3d\xaf\x68" + "\x05\x96\x38\x87\x9d\x7c\x45\x22\xf4\xb6\xf1\x02\x03\xca\xbb\x76" + "\xd7\x9a\x7c\x55\x0b\x52\x25\x51\x02\x74\x62\x2c\xea\x76\x36\x8d" + "\x41\xac\x1b\x8c\xd8\x72\x71\xed\xa7\x93\xa7\x4b\x86\x4e\x0b\xf2" + "\xfa\x7d\x0e\xdb\xd9\x05\xc5\x94\x8c\x5f\x42\xc6\x16\xbc\x45\x66" + "\xa5\x81\xd9\xba\x14\x46\x38\xd0\x33\x15\x20\x75\xeb\x6a\xff\x27" + "\x9b\x91\x98\x5e\x9f\x8b\xd6\x5f\x86\xf6\x16\xba\xa7\x4d\x31\xf9" + "\x93\xd5\x85\x09\x5d\xbe\xf2\x41\x6a\xb3\x1a\xc8\x13\xce\xef\x63" + "\xc9\x31\x38\x0a\xa0\x22\xab\x4e\xb5\x30\xfe\xa0\x19\xf0\xa2\xa5" + "\x90\xc0\xef\xa4\xdd\x0f\xae\x78\x4e\x68\x95\x47\x20\xb9\x89\xbd" + "\x0c\x86\xb1\xe8\x1d\x73\x15\x60\xe0\x0c\xb9\x01\x70\x6b\xdf\xdb" + "\x6a\x40\xb7\x3d\xe9\x14\x10\xbe\x01\x9e\xe0\xc5\x57\x37\xe9\x81" + "\xc2\xe6\x0d\x4f\x82\x25\xe0\xa4\x85\x1f\x1d\xb6\x2e\x03\x22\x82" + "\x76\x02\x7c\x3c\x9a\xe1\xa6\xc4\x6b\x12\xbc\x5d\x8a\x94\xa0\x91" + "\xf8\x3b\x00\xce\x28\x07\x70\xe8\x5d\xe1\xf3\x0f\x11\xdf\x0a\xef" + "\x70\x98\x5b\x43\xe3\xbf\x0b\x0c\xf4\x95\xfd\x32\x99\xd1\x96\xee" + "\x1b\xe8\x5f\x20\xe6\x63\x84\x9b\xe0\xf2\x0f\xaa\xc0\x7b\x9c\x0e" + "\x8e\x2c\xec\x1b\x22\xa4\x2b\x84\xd9\x1c\x12\x5d\x21\x82\x6a\x6a" + "\x54\x65\x42\x36\x3c\x60\x42\x2b\xfa\x58\xac\xbd\x67\x20\xd6\x56" + "\x68\x9b\xfa\xc9\x96\x85\x8a\x67\x5c\x0c\x31\xbf\xba\xe8\xcb\x0d" + 
"\xd2\x5d\xd0\xec\x78\x2c\xa3\x13\xdb\x1c\x40\x41\x9f\xed\xea\xc3" + "\xc8\x8e\x5a\x52\x54\x75\xe0\xab\x42\x61\x70\x7c\x45\xdd\x02\xac" + "\xaf\x7b\x6a\x15\x11\xa0\xad\x48\xe1\x1c\x53\x24\xc7\xd3\x4d\x5c" + "\x2f\xc8\xa3\x72\xa5\x09\x45\xd1\x8e\xf8\xbc\x7a\xfd\xfd\x28\x5e" + "\x53\xdb\x1d\xe7\x08\x9c\xe8\x08\xc2\xe0\xd6\x21\x4d\x19\xcd\x44" + "\x77\xdf\xc8\x09\xb8\xbb\x2a\x88\xc5\xbe\x5a\x0b\x2c\x86\xd9\x9e" + "\xed\x85\x8d\x64\x98\x36\x29\x23\xdb\x6f\x8b\x02\x0d\x89\x86\xb0" + "\xee\xc9\x5b\xe3\x01\xab\xf3\x3c\xa4\xc1\x99\x27\xe0\xf9\xb1\xa9" + "\xc1\xb9\x9e\x8d\xc5\x06\xb8\xb6\xb8\x42\x50\x73\xef\x33\x71\x24" + "\x83\x4d\xc7\xe2\x71\xd9\x22\x9e\xad\x26\xc7\xbf\x00\xdb\x80\x34" + "\xb9\xf4\xc5\x59\xf8\x18\x66\x9a\x1c\x5f\x71\x22\x26\x1d\x9d\x84" + "\xfb\xe6\x5d\x3a\x5b\x6f\x21\x19\x17\x6a\x71\x28\xad\xd1\x67\x86" + "\x35\xec\x7b\x88\x3a\x64\xd2\xcb\x18\x2e\xa4\x06\x87\x7e\x5b\x5a" + "\x77\xe9\xb9\x68\xd2\xd3\x4b\x16\xaa\x5d\xed\xd4\xcc\x48\x9b\x55" + "\xd4\x02\x43\xa2\xc4\x3b\xe4\x67\xd3\x78\x42\x78\xbe\xa7\xb5\x07" + "\x8d\x6c\x6c\x96\xd8\x9c\x75\x91\xdb\xe7\x02\xb5\xe5\x00\xed\xf2" + "\xa4\x94\xeb\x02\xe8\xbc\x2c\xd8\x3b\xcc\x53\x17\xb2\xa6\x1c\xc0" + "\x7d\x4d\x5a\xf9\x52\xab\xb8\xba\xcf\x60\x8c\x7f\x5a\xb2\x51\x8a" + "\x7a\x87\xd2\xa2\xee\x78\x70\xe1\xfb\x28\x78\x24\xf9\x9a\x48\x2c" + "\x48\xfc\xb2\x28\xdc\xe4\x22\x94\x5a\xf3\xab\x6d\x57\x6e\x4b\x46" + "\x76\x5b\x84\xaf\x7c\xbf\x7c\x0b\x1d\x59\x65\x9e\x18\xbb\x26\xdb" + "\x52\x6c\x94\x9f\x52\x5f\xb6\x16\x93\x17\x37\x45\x38\x70\x73\x30" + "\x3c\x9c\x38\x9d\xb5\x5e\x6a\x53\x4b\xc0\xd1\xec\x40\xbe\x3f\x61" + "\x57\x12\x43\xc5\x4c\xe8\x76\xb5\xff\x39\x70\xc3\x2c\x9e\x33\xa0" + "\x45\x5d\xdd\xf4\xf1\x5c\xec\x6f\xd6\x22\x23\xa6\xa4\xf3\x55\x69" + "\x7e\x5f\xd8\x3d\xc3\xc8\x74\x83\xba\x36\xca\x3f\x94\xf9\x77\x2c" + "\x38\xe2\x87\x05\x08\x55\x7f\xa4\x43\x95\xeb\x75\x89\xee\xc2\x4e" + "\xf2\x04\xc2\xda\xd7\x05\xf1\xc6\xc0\x3c\x1c\x37\xae\x3f\x6e\x5c" + "\xd3\x85\xa9\x01\x70\x91\x55\xf0\x7f\xf2\xd5\x9c\x19\x8d\x21\xfd" + "\x01\xc1\xc6\x8a\x2a\x73\x34\x5d\x66\x24\x09\x66\x8f\xe7\x3b\x98" + "\xd5\x72\x69\xb9\xea\x8a\x16\xcf\x8b\xea\x4b\x6b\x65\x42\x42\x39" + "\xf1\xdb\xfa\x54\x69\xc9\xc0\xeb\x92\xd0\x4f\x10\xed\x69\xc4\xf7" + "\xf7\xa2\xcc\x94\xb3\x54\x56\x11\x17\xf5\xdc\xcc\x3a\xa7\x5b\x3f" + "\x5e\xfd\x2f\xc8\x5d\xe0\xa7\x35\xbc\xd2\xdd\xf7\x45\x89\xfb\xc9" + "\x28\xc2\x82\x19\x9c\x06\xda\xf7\x93\x64\xf0\x41\x41\xff\x00\x41" + "\x44\x1e\x9b\x01\x54\x9d\x37\x07\xab\x16\x91\x55\xf2\xf5\x5e\x28" + "\x5f\x40\x99\xb0\x09\x8c\xd8\xa9\xd3\xef\xff\x89\xba\xb4\xad\x09" + "\x98\x0f\x7c\xb7\x60\x6d\x60\x79\x4b\x9b\x28\x7c\x1a\x69\x7a\x23" + "\xe1\xed\xad\x0c\xf1\x61\xd2\xab\xf9\xa4\xe7\xd7\x3b\x5b\xfe\x28" + "\x7c\xa7\x92\x53\x90\xd7\x5b\xb6\x5d\x9f\x0f\xbe\xfb\xf9\x4f\xd0" + "\x4a\x23\x4a\x49\xd9\x29\xf1\x9c\xd7\xbc\x5c\x0e\xb4\x60\x2e\x95" + "\x6e\xe2\x24\x02\x8a\x80\x72\x55\xb1\xfa\xc2\x38\xdc\xa1\x4b\x6f" + "\xc7\xe1\xb3\xbb\x38\x9b\xdf\xf1\x49\xf1\x11\xbe\x40\xfd\x27\x9f" + "\x5a\x1d\x56\x3a\xc2\xa8\x76\xd7\xf4\x9a\x01\x1a\x9e\x40\x10\x79" + "\x7e\xa1\x31\xe8\xc8\x35\xcd\x9e\xa5\xa7\x29\x57\xf9\xd6\x1c\xc9" + "\x9e\x93\xb8\xfe\x0f\xd9\x8c\xdd\xcc\x77\x9f\xf8\x2c\x70\x36\x48" + "\x81\x75\xe4\x61\x01\x98\x9b\xea\x11\xf9\x47\xfb\x77\x1b\x9e\x16" + "\x12\x56\x72\x55\xfe\x64\x5f\xa4\xff\x16\x3d\x00\xbd\xa2\xe0\xd1" + "\xd4\x47\x36\x7b\x7e\x94\xd1\x22\x7f\xfb\xcb\x5c\x99\x01\x4e\xaf" + "\x82\x50\xf3\x2f\xb1\xcb\x12\x7a\x12\x4a\x5c\x62\x9b\x91\x43\xf2" + "\x73\xbe\xfd\x87\xfc\xd0\x59\x4c\xa4\xfb\x7f\x15\x55\x0d\x90\x83" + 
"\xd7\xf7\xe8\x20\xb4\x6a\xc7\xb9\xe7\x32\xc6\xda\xb3\x57\x15\x49" + "\x96\xf4\xbc\x03\xa6\x98\xe8\xbf\x3d\x61\x0b\x34\xe5\xad\xd6\xb8" + "\xd8\x1c\xc6\x1d\x39\x58\xb6\xef\xb2\xd0\x8b\xe5\x60\x9a\x90\x07" + "\x9e\x62\xcc\xf2\x5b\xe8\x20\xe5\x88\x57\xf0\x12\xc8\x66\x96\x27" + "\x1d\x9e\x34\x56\x2a\x62\x7e\x75\x94\x10\x93\x69\x50\x68\x6a\x48" + "\x8d\x02\xda\x4f\x9e\x82\xe2\x8c\xf1\xaf\x07\xe8\x01\x6d\x04\xce" + "\x3f\xc2\xbf\x01\x27\xe6\xd6\x73\xfe\x53\x00\xa2\x0e\x1b\xe0\x9f" + "\x4d\x3f\x69\x12\xd6\xc9\xf6\x1d\x5b\x50\xbc\x1f\x3d\x8e\xd4\x7f" + "\x57\xb9\x3f\xe4\x52\xe4\xae\xde\x54\xef\x09\xbe\xf8\xc8\x67\x0f" + "\xda\x1d\x1b\xe6\xf5\x7f\x61\x78\xac\xbb\xc8\xff\xe4\x42\xdd\xbb" + "\x44\x19\x94\xeb\xa1\x2a\xfe\x7d\xb9\x2d\xcb\xbe\x50\xff\x9a\xca" + "\x44\xf1\x75\xe3\xc6\x4d\x1d\x0c\x5f\xbd\x39\x25\xce\x43\xa6\xfb" + "\x9d\x0a\x55\x79\xf4\x6a\x4e\x36\xff\xda\x25\x63\xd4\xae\xb0\xa9" + "\x9b\x2f\x1d\x9a\x1e\xfb\x96\xd1\x85\x53\x54\xc4\x2b\x76\xf7\xe0" + "\xee\x1c\xcb\x03\xad\xd4\x49\x5c\xeb\x5e\xc3\x6c\x1e\xa9\x2f\x8a" + "\x20\xd9\x11\x8a\x94\xd7\x15\x42\xdf\x2c\x15\xa5\x69\xec\x6b\x9b" + "\x00\x71\x59\xfc\xbf\x7d\x3f\x77\xa3\xb8\x89\xc8\x66\x75\xc5\xb1" + "\xd9\x69\x79\xe4\x6a\x74\x01\xe3\x5a\x6b\x1e\xc6\xa5\xd0\x11\x29" + "\xa1\x5a\x1d\xf0\xe2\xf1\x35\x30\x65\xa6\x14\xf4\xcb\x57\xf2\xa1" + "\x0d\x92\xcb\x4c\x24\x50\x8d\xfe\x4f\xbf\x55\x9c\xa1\x54\x54\x9b" + "\x72\x45\xdc\x13\xea\x57\x8e\xab\xdf\xe6\xa4\x3f\x2f\x7b\xbe\x86" + "\x63\x04\x09\x99\xe6\x38\x19\x2b\x88\x50\xe3\x78\x64\x85\x56\x45" + "\x53\xc4\xce\xfb\xa0\xc3\xf8\x77\x87\xa7\xa8\x54\x57\xb7\x18\x2c" + "\x87\xb6\x87\xca\xe0\x45\x58\x06\xe1\x3e\xc3\x4a\x0c\xaa\xce\xca" + "\x25\xee\x53\x85\x2a\x37\x3d\x37\x29\x85\xda\x1d\x2c\x24\xb3\xd4" + "\x63\xbf\xe9\x34\xdd\x35\x01\x41\x2e\x27\x9b\x05\x44\x25\xd1\x5b" + "\xdf\x59\xcc\x26\xf9\xd4\xdf\x2e\x23\x71\xe6\xc4\x6e\x3b\xac\xe9" + "\x75\x27\x74\xe8\xd4\x0d\xc9\xb5\x8f\x58\x27\x25\xef\x9e\x66\x4b" + "\x69\x2e\xfb\x07\x39\x91\x4c\x9a\x00\xf6\x62\xd2\xfb\x15\x73\xb4" + "\xe1\x7a\x3b\xd1\x7d\x16\x74\xa5\x09\xa0\xc4\x99\x42\xea\x6d\x64" + "\xd6\x15\x18\xa6\x9a\x94\x94\x49\x8c\x5e\xe9\x5f\xc5\x40\x14\xa1" + "\xc9\xd8\xf2\x1f\x4a\x75\x3d\x14\xde\x3e\x8c\x89\xa0\xf2\xd4\xf3" + "\x3d\xb1\x89\xe4\x0a\x8a\x06\x33\x60\x73\x45\xc5\x3b\x99\x29\xbf" + "\xd2\x87\x44\x7b\x80\xf8\xe6\x31\x92\xf5\xd6\x44\xcb\xc9\x02\xd9" + "\xf9\x66\xc6\x6a\x40\x1f\x40\x1a\x31\x39\xd9\xcc\x4d\xa2\x5b\x6f" + "\x42\xc3\x4d\x53\x89\x66\xb3\x72\xb8\x3d\x6b\x21\xc7\xa4\xe1\x14" + "\x06\xd3\x3b\xf7\x7b\x1d\x5e\xd9\xb6\xb4\xc7\xeb\xc9\xea\x51\xe7" + "\x33\xaa\xdf\xf9\xe6\x5c\x66\x93\xa9\x4b\x03\x73\xdf\x6b\x3f\x3b" + "\xad\x27\xb6\xa4\x09\x30\x31\x06\x30\x17\x3a\xaf\xd2\xa6\x71\xd8" + "\x60\x8d\x16\xb5\x0d\xec\x1a\xf7\x42\x38\x32\x08\xf2\x32\xeb\x6a" + "\x12\xdc\x77\x42\x93\xe8\x29\x2a\x06\xe9\x2f\xbf\x4d\xfb\xb1\x59" + "\x31\xaf\xe7\xfa\x3e\xb1\x54\x33\xe2\xfa\xde\x99\x16\x2e\xeb\x35" + "\x52\x98\x12\x12\x7c\x69\xa1\x7f\x9a\xe1\xd8\x2d\x51\xba\xda\xb6" + "\x90\xf1\x8a\x57\xed\x86\x20\xfe\x79\x08\x57\xc5\x80\x15\x7b\xf5" + "\xfb\x92\x1d\xd1\x31\x8d\x9d\xa1\xc3\x46\x69\xfa\x71\x2c\x42", + "\xc6\x54\xf9\xf0\x22\x2c\xc3\xee\xdd\x13\x02\xb8\xe7\x5a\x2e\x7e" } }; gcry_cipher_hd_t hde, hdd; - unsigned char out[MAX_DATA_LEN]; + unsigned char out[MAX_DATA_LEN * 2]; unsigned char tag[16]; int i, keylen; gcry_error_t err = 0; -- 2.34.1 From jussi.kivilinna at iki.fi Thu Jul 21 10:09:30 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 21 Jul 2022 11:09:30 +0300 Subject: [PATCH 3/3] sm4-arm-sve-ce: use 32 parallel blocks for 
XTS and CTR32LE In-Reply-To: <20220721080930.3271436-1-jussi.kivilinna@iki.fi> References: <20220721080930.3271436-1-jussi.kivilinna@iki.fi> Message-ID: <20220721080930.3271436-3-jussi.kivilinna@iki.fi> * cipher/sm4.c (sm4_crypt_blk1_32) [USE_ARM_SVE_CE]: Add SVE-SM4 code path. -- Signed-off-by: Jussi Kivilinna --- cipher/sm4.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cipher/sm4.c b/cipher/sm4.c index f68197c4..b5d4691d 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -1462,6 +1462,13 @@ sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, return _gcry_sm4_gfni_avx512_crypt_blk32 (rk, outbuf, inbuf); } #endif +#ifdef USE_ARM_SVE_CE + if (ctx->use_arm_sve_ce) + { + _gcry_sm4_armv9_sve_ce_crypt (rk, outbuf, inbuf, num_blks); + return 0; + } +#endif do { -- 2.34.1 From jussi.kivilinna at iki.fi Thu Jul 21 10:25:34 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 21 Jul 2022 11:25:34 +0300 Subject: [PATCH] sha3: Add x86-64 AVX512 accelerated implementation Message-ID: <20220721082534.3331439-1-jussi.kivilinna@iki.fi> * LICENSES: Add 'cipher/keccak-amd64-avx512.S'. * configure.ac: Add 'keccak-amd64-avx512.lo'. * cipher/Makefile.am: Add 'keccak-amd64-avx512.S'. * cipher/keccak-amd64-avx512.S: New. * cipher/keccak.c (USE_64BIT_AVX512): New. [USE_64BIT_AVX512] (_gcry_keccak_f1600_state_permute64_avx512) (_gcry_keccak_absorb_blocks_avx512, keccak_f1600_state_permute64_avx512) (keccak_absorb_lanes64_avx512, keccak_avx512_64_ops): New. (keccak_init) [USE_64BIT_AVX512]: Enable x86-64 AVX512 implementation if supported by HW features. -- Benchmark on Intel Core i3-1115G4 (tigerlake): Before (BMI2 instructions): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz SHA3-224 | 1.77 ns/B 540.3 MiB/s 7.22 c/B 4088 SHA3-256 | 1.86 ns/B 514.0 MiB/s 7.59 c/B 4089 SHA3-384 | 2.43 ns/B 393.1 MiB/s 9.92 c/B 4089 SHA3-512 | 3.49 ns/B 273.2 MiB/s 14.27 c/B 4088 SHAKE128 | 1.52 ns/B 629.1 MiB/s 6.20 c/B 4089 SHAKE256 | 1.86 ns/B 511.6 MiB/s 7.62 c/B 4089 After (~33% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz SHA3-224 | 1.32 ns/B 721.8 MiB/s 5.40 c/B 4089 SHA3-256 | 1.40 ns/B 681.7 MiB/s 5.72 c/B 4089 SHA3-384 | 1.83 ns/B 522.5 MiB/s 7.46 c/B 4089 SHA3-512 | 2.63 ns/B 362.1 MiB/s 10.77 c/B 4088 SHAKE128 | 1.13 ns/B 840.4 MiB/s 4.64 c/B 4089 SHAKE256 | 1.40 ns/B 682.1 MiB/s 5.72 c/B 4089 Signed-off-by: Jussi Kivilinna --- LICENSES | 1 + cipher/Makefile.am | 3 +- cipher/keccak-amd64-avx512.S | 583 +++++++++++++++++++++++++++++++++++ cipher/keccak.c | 73 +++++ configure.ac | 2 +- 5 files changed, 660 insertions(+), 2 deletions(-) create mode 100644 cipher/keccak-amd64-avx512.S diff --git a/LICENSES b/LICENSES index 67b80e64..c2fea82d 100644 --- a/LICENSES +++ b/LICENSES @@ -139,6 +139,7 @@ with any binary distributions derived from the GNU C Library. 
For files: - cipher/cipher-gcm-ppc.c + - cipher/keccak-amd64-avx512.S #+begin_quote Copyright (c) 2006, CRYPTOGAMS by diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 97823cb4..29690358 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -134,7 +134,8 @@ EXTRA_libcipher_la_SOURCES = \ sha512-armv7-neon.S sha512-arm.S \ sha512-ppc.c sha512-ssse3-i386.c \ sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \ - keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \ + keccak.c keccak_permute_32.h keccak_permute_64.h \ + keccak-armv7-neon.S keccak-amd64-avx512.S \ stribog.c \ tiger.c \ whirlpool.c whirlpool-sse2-amd64.S \ diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S new file mode 100644 index 00000000..f44e0285 --- /dev/null +++ b/cipher/keccak-amd64-avx512.S @@ -0,0 +1,583 @@ +/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * --- + * + * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation + * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`. + * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC + * preprocessed assembly and fitted with new absorb function optimized for + * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB. + * + * Original copyright license follows: + * + * Copyright (c) 2006, CRYPTOGAMS by + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain copyright notices, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * * Neither the name of the CRYPTOGAMS nor the names of its + * copyright holder and contributors may be used to endorse or + * promote products derived from this software without specific + * prior written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this + * product may be distributed under the terms of the GNU General Public + * License (GPL), in which case the provisions of the GPL apply INSTEAD OF + * those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) + +#include "asm-common-amd64.h" + +.text + +/* Register macros. */ +#define A_0_0 %xmm31 +#define A_0_1 %xmm30 +#define A_0_2 %xmm29 +#define A_0_3 %xmm28 +#define A_0_4 %xmm27 +#define A_1_0 %xmm26 +#define A_1_1 %xmm25 +#define A_1_2 %xmm24 +#define A_1_3 %xmm23 +#define A_1_4 %xmm22 +#define A_2_0 %xmm21 +#define A_2_1 %xmm20 +#define A_2_2 %xmm19 +#define A_2_3 %xmm18 +#define A_2_4 %xmm17 +#define A_3_0 %xmm16 +#define A_3_1 %xmm15 +#define A_3_2 %xmm14 +#define A_3_3 %xmm13 +#define A_3_4 %xmm12 +#define A_4_0 %xmm11 +#define A_4_1 %xmm10 +#define A_4_2 %xmm9 +#define A_4_3 %xmm8 +#define A_4_4 %xmm7 + +#define C_0 %xmm6 +#define C_1 %xmm5 +#define C_2 %xmm4 +#define C_3 %xmm3 +#define C_4 %xmm2 +#define C_5 %xmm1 +#define C_6 %xmm0 + +#define D_0 C_4 +#define D_1 C_5 +#define D_2 C_6 +#define D_3 C_2 +#define D_4 C_3 + +/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */ +#define eor3_d(dst_s1, s2, s3) \ + vpternlogq $0x96, s3, s2, dst_s1; + +#define eor3(dst, s1, s2, s3) \ + vmovdqa s1, dst; \ + eor3_d(dst, s2, s3); + +#define rax1_c(dst, s1, s2_rol1) \ + vprolq $1, s2_rol1, dst; \ + vpxor s1, dst, dst; + +#define rax1_t(dst_s1, s2_rol1, tmp) \ + vprolq $1, s2_rol1, tmp; \ + vpxor tmp, dst_s1, dst_s1; + +#define rax1_s(dst_s1, s2_rol1) \ + vprolq $1, s2_rol1, s2_rol1; \ + vpxor s2_rol1, dst_s1, dst_s1; + +#define xar(dst, s1, s2, rol) \ + vpxorq s2, s1, dst; \ + vprolq $(rol), dst, dst; + +#define xar_x(dst, s1, s2, rol) \ + vpxor s2, s1, dst; \ + vprolq $(rol), dst, dst; + +#define bcax_d(dst_s1, s2, s3) \ + vpternlogq $0xb4, s3, s2, dst_s1; + +#define bcax(dst, s1, s2, s3) \ + vmovdqa64 s1, dst; \ + bcax_d(dst, s2, s3); + +#define bcax_x(dst, s1, s2, s3) \ + vmovdqa s1, dst; \ + bcax_d(dst, s2, s3); + +#define eor(dst, s1, s2) \ + vpxorq s2, s1, dst; + +/* Misc helper macros. 
*/ +#define clear_avx512_4regs(a, b, c, d) \ + eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d; + +#define clear_regs() \ + vzeroall; /* xmm0-xmm15 */ \ + clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \ + clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \ + clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \ + clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31); + +ELF(.type KeccakF1600_ce,@function) +.align 64, 0xcc +KeccakF1600_ce: +.Loop_ce: + CFI_STARTPROC() + + ////////////////////////////////////////////////// Theta + eor3( C_0, A_4_0, A_3_0, A_2_0) + eor3( C_1, A_4_1, A_3_1, A_2_1) + eor3( C_3, A_4_3, A_3_3, A_2_3) + eor3( C_2, A_4_2, A_3_2, A_2_2) + eor3( C_4, A_4_4, A_3_4, A_2_4) + eor3_d( C_0, A_1_0, A_0_0) + eor3_d( C_1, A_1_1, A_0_1) + eor3_d( C_3, A_1_3, A_0_3) + eor3_d( C_2, A_1_2, A_0_2) + eor3_d( C_4, A_1_4, A_0_4) + + rax1_c( C_5, C_0, C_2) // D[1] + rax1_t( C_2, C_4, C_6) // D[3] + rax1_c( C_6, C_1, C_3) // D[2] + rax1_s( C_3, C_0) // D[4] + rax1_s( C_4, C_1) // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi + xar( C_0, A_0_1, D_1, 1) // C[0]=A[2][0] + + xar( A_0_1, A_1_1, D_1, 44) + xar( A_1_1, A_1_4, D_4, 20) + xar( A_1_4, A_4_2, D_2, 61) + xar( A_4_2, A_2_4, D_4, 39) + xar( A_2_4, A_4_0, D_0, 18) + + xar( C_1, A_0_2, D_2, 62) // C[1]=A[4][0] + + xar( A_0_2, A_2_2, D_2, 43) + xar( A_2_2, A_2_3, D_3, 25) + xar( A_2_3, A_3_4, D_4, 8) + xar_x( A_3_4, A_4_3, D_3, 56) + xar( A_4_3, A_3_0, D_0, 41) + + xar( A_3_0, A_0_4, D_4, 27) + + xar_x( D_4, A_4_4, D_4, 14) // D[4]=A[0][4] + xar_x( A_4_4, A_4_1, D_1, 2) + xar( A_1_3, A_1_3, D_3, 55) // A[1][3]=A[4][1] + xar( A_0_4, A_3_1, D_1, 45) // A[0][4]=A[1][3] + xar( A_3_1, A_1_0, D_0, 36) + + xar( A_1_0, A_0_3, D_3, 28) + + eor( A_0_0, A_0_0, D_0) + + xar_x( D_3, A_3_3, D_3, 21) // D[3]=A[0][3] + xar( A_0_3, A_3_2, D_2, 15) // A[0][3]=A[3][3] + xar( D_1, A_2_1, D_1, 10) // D[1]=A[3][2] + xar( D_2, A_1_2, D_2, 6) // D[2]=A[2][1] + xar( D_0, A_2_0, D_0, 3) // D[0]=A[1][2] + + ////////////////////////////////////////////////// Chi+Iota + bcax_x( A_4_0, C_1, A_4_2, A_1_3) // A[1][3]=A[4][1] + bcax( A_4_1, A_1_3, A_4_3, A_4_2) // A[1][3]=A[4][1] + bcax_d( A_4_2, A_4_4, A_4_3) + bcax_d( A_4_3, C_1, A_4_4) + bcax_d( A_4_4, A_1_3, C_1) // A[1][3]=A[4][1] + + bcax_x( A_3_2, D_1, A_3_4, A_0_3) // A[0][3]=A[3][3] + bcax( A_3_3, A_0_3, A_3_0, A_3_4) // A[0][3]=A[3][3] + bcax_d( A_3_4, A_3_1, A_3_0) + bcax_d( A_3_0, D_1, A_3_1) + bcax_d( A_3_1, A_0_3, D_1) // A[0][3]=A[3][3] + + bcax( A_2_0, C_0, A_2_2, D_2) + bcax( A_2_1, D_2, A_2_3, A_2_2) + bcax_d( A_2_2, A_2_4, A_2_3) + bcax_d( A_2_3, C_0, A_2_4) + bcax_d( A_2_4, D_2, C_0) + + bcax( A_1_2, D_0, A_1_4, A_0_4) // A[0][4]=A[1][3] + bcax( A_1_3, A_0_4, A_1_0, A_1_4) // A[0][4]=A[1][3] + bcax_d( A_1_4, A_1_1, A_1_0) + bcax_d( A_1_0, D_0, A_1_1) + bcax_d( A_1_1, A_0_4, D_0) // A[0][4]=A[1][3] + + bcax( A_0_3, D_3, A_0_0, D_4) + bcax( A_0_4, D_4, A_0_1, A_0_0) + bcax_d( A_0_0, A_0_2, A_0_1) + bcax_d( A_0_1, D_3, A_0_2) + bcax_d( A_0_2, D_4, D_3) + eor( A_0_0, A_0_0, (%r10)) + + cmpq %r10, %r11 + je .Lend_ce + + addq $8, %r10 + jmp .Loop_ce + +.align 64, 0xcc +.Lend_ce: + ret_spec_stop + CFI_ENDPROC() +ELF(.size KeccakF1600_ce,.-KeccakF1600_ce) + +.globl _gcry_keccak_f1600_state_permute64_avx512 +ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function) +.align 64, 0xcc +_gcry_keccak_f1600_state_permute64_avx512: + /* input: + * %rdi: state + * %rsi: round constants + */ + CFI_STARTPROC() + + leaq 12*8(%rdi), %rax + leaq (24-1)*8(%rsi),
%r11 + + vmovdqu64 0*8(%rdi), A_0_0 + vmovdqu64 1*8(%rdi), A_0_1 + vmovdqu64 2*8(%rdi), A_0_2 + vmovdqu64 3*8(%rdi), A_0_3 + vmovdqu64 4*8(%rdi), A_0_4 + vmovdqu64 5*8(%rdi), A_1_0 + vmovdqu64 6*8(%rdi), A_1_1 + vmovdqu64 7*8(%rdi), A_1_2 + vmovdqu64 8*8(%rdi), A_1_3 + vmovdqu64 9*8(%rdi), A_1_4 + vmovdqu64 10*8(%rdi), A_2_0 + vmovdqu64 11*8(%rdi), A_2_1 + vmovdqu64 0*8(%rax), A_2_2 + vmovdqu64 1*8(%rax), A_2_3 + vmovdqu64 2*8(%rax), A_2_4 + vmovdqu64 3*8(%rax), A_3_0 + vmovdqu 4*8(%rax), A_3_1 + vmovdqu 5*8(%rax), A_3_2 + vmovdqu 6*8(%rax), A_3_3 + vmovdqu 7*8(%rax), A_3_4 + vmovdqu 8*8(%rax), A_4_0 + vmovdqu 9*8(%rax), A_4_1 + vmovdqu 10*8(%rax), A_4_2 + vmovdqu 11*8(%rax), A_4_3 + vmovq 12*8(%rax), A_4_4 + + movq %rsi, %r10 + call KeccakF1600_ce + + vpunpcklqdq A_0_1, A_0_0, A_0_0 + vpunpcklqdq A_0_3, A_0_2, A_0_2 + vpunpcklqdq A_1_0, A_0_4, A_0_4 + vpunpcklqdq A_1_2, A_1_1, A_1_1 + vpunpcklqdq A_1_4, A_1_3, A_1_3 + vpunpcklqdq A_2_1, A_2_0, A_2_0 + vpunpcklqdq A_2_3, A_2_2, A_2_2 + vpunpcklqdq A_3_0, A_2_4, A_2_4 + vpunpcklqdq A_3_2, A_3_1, A_3_1 + vpunpcklqdq A_3_4, A_3_3, A_3_3 + vpunpcklqdq A_4_1, A_4_0, A_4_0 + vpunpcklqdq A_4_3, A_4_2, A_4_2 + vmovdqu64 A_0_0, 0*8(%rdi) + vmovdqu64 A_0_2, 2*8(%rdi) + vmovdqu64 A_0_4, 4*8(%rdi) + vmovdqu64 A_1_1, 6*8(%rdi) + vmovdqu64 A_1_3, 8*8(%rdi) + vmovdqu64 A_2_0, 10*8(%rdi) + vmovdqu64 A_2_2, 0*8(%rax) + vmovdqu64 A_2_4, 2*8(%rax) + vmovdqu A_3_1, 4*8(%rax) + vmovdqu A_3_3, 6*8(%rax) + vmovdqu A_4_0, 8*8(%rax) + vmovdqu A_4_2, 10*8(%rax) + vmovq A_4_4, 12*8(%rax) + + xorl %eax, %eax + + clear_regs() + ret_spec_stop + CFI_ENDPROC() +ELF(.size _gcry_keccak_f1600_state_permute64_avx512, + .-_gcry_keccak_f1600_state_permute64_avx512) + +.globl _gcry_keccak_absorb_blocks_avx512 +ELF(.type _gcry_keccak_absorb_blocks_avx512,@function) +.align 64, 0xcc +_gcry_keccak_absorb_blocks_avx512: + /* input: + * %rdi: state + * %rsi: round constants + * %rdx: lanes + * %rcx: nlanes + * %r8 : blocklanes + * %r9 : lanes output pointer + */ + CFI_STARTPROC() + + leaq 12*8(%rdi), %rax + leaq (24-1)*8(%rsi), %r11 + + vmovdqu64 0*8(%rdi), A_0_0 + vmovdqu64 1*8(%rdi), A_0_1 + vmovdqu64 2*8(%rdi), A_0_2 + vmovdqu64 3*8(%rdi), A_0_3 + vmovdqu64 4*8(%rdi), A_0_4 + vmovdqu64 5*8(%rdi), A_1_0 + vmovdqu64 6*8(%rdi), A_1_1 + vmovdqu64 7*8(%rdi), A_1_2 + vmovdqu64 8*8(%rdi), A_1_3 + vmovdqu64 9*8(%rdi), A_1_4 + vmovdqu64 10*8(%rdi), A_2_0 + vmovdqu64 11*8(%rdi), A_2_1 + vmovdqu64 0*8(%rax), A_2_2 + vmovdqu64 1*8(%rax), A_2_3 + vmovdqu64 2*8(%rax), A_2_4 + vmovdqu64 3*8(%rax), A_3_0 + vmovdqu 4*8(%rax), A_3_1 + vmovdqu 5*8(%rax), A_3_2 + vmovdqu 6*8(%rax), A_3_3 + vmovdqu 7*8(%rax), A_3_4 + vmovdqu 8*8(%rax), A_4_0 + vmovdqu 9*8(%rax), A_4_1 + vmovdqu 10*8(%rax), A_4_2 + vmovdqu 11*8(%rax), A_4_3 + vmovq 12*8(%rax), A_4_4 + + cmpq $(104 >> 3), %r8 + jb .Loop_absorb_72_ce + je .Loop_absorb_104_ce + cmpq $(144 >> 3), %r8 + jb .Loop_absorb_136_ce + je .Loop_absorb_144_ce + jmp .Loop_absorb_168_ce + +.align 64, 0xcc +.Loop_absorb_168_ce: + subq %r8, %rcx // len - bsz + jb .Labsorbed_ce + + vpxorq 0*8(%rdx), A_0_0, A_0_0 + vpxorq 1*8(%rdx), A_0_1, A_0_1 + vpxorq 2*8(%rdx), A_0_2, A_0_2 + vpxorq 3*8(%rdx), A_0_3, A_0_3 + vpxorq 4*8(%rdx), A_0_4, A_0_4 + vpxorq 5*8(%rdx), A_1_0, A_1_0 + vpxorq 6*8(%rdx), A_1_1, A_1_1 + vpxorq 7*8(%rdx), A_1_2, A_1_2 + vpxorq 8*8(%rdx), A_1_3, A_1_3 + vpxorq 9*8(%rdx), A_1_4, A_1_4 + vpxorq 10*8(%rdx), A_2_0, A_2_0 + vpxorq 11*8(%rdx), A_2_1, A_2_1 + vpxorq 12*8(%rdx), A_2_2, A_2_2 + vpxorq 13*8(%rdx), A_2_3, A_2_3 + vpxorq 14*8(%rdx), A_2_4,
A_2_4 + vpxorq 15*8(%rdx), A_3_0, A_3_0 + vpxor 16*8(%rdx), A_3_1, A_3_1 + vpxor 17*8(%rdx), A_3_2, A_3_2 + vpxor 18*8(%rdx), A_3_3, A_3_3 + vpxor 19*8(%rdx), A_3_4, A_3_4 + vmovq 20*8(%rdx), C_0 + leaq 21*8(%rdx), %rdx + vpxorq C_0, A_4_0, A_4_0 + + movq %rsi, %r10 + call KeccakF1600_ce + + jmp .Loop_absorb_168_ce + +.align 64, 0xcc +.Loop_absorb_144_ce: + subq %r8, %rcx // len - bsz + jb .Labsorbed_ce + + vpxorq 0*8(%rdx), A_0_0, A_0_0 + vpxorq 1*8(%rdx), A_0_1, A_0_1 + vpxorq 2*8(%rdx), A_0_2, A_0_2 + vpxorq 3*8(%rdx), A_0_3, A_0_3 + vpxorq 4*8(%rdx), A_0_4, A_0_4 + vpxorq 5*8(%rdx), A_1_0, A_1_0 + vpxorq 6*8(%rdx), A_1_1, A_1_1 + vpxorq 7*8(%rdx), A_1_2, A_1_2 + vpxorq 8*8(%rdx), A_1_3, A_1_3 + vpxorq 9*8(%rdx), A_1_4, A_1_4 + vpxorq 10*8(%rdx), A_2_0, A_2_0 + vpxorq 11*8(%rdx), A_2_1, A_2_1 + vpxorq 12*8(%rdx), A_2_2, A_2_2 + vpxorq 13*8(%rdx), A_2_3, A_2_3 + vpxorq 14*8(%rdx), A_2_4, A_2_4 + vpxorq 15*8(%rdx), A_3_0, A_3_0 + vpxor 16*8(%rdx), A_3_1, A_3_1 + vmovq 17*8(%rdx), C_0 + leaq 18*8(%rdx), %rdx + vpxor C_0, A_3_2, A_3_2 + + movq %rsi, %r10 + call KeccakF1600_ce + + jmp .Loop_absorb_144_ce + +.align 64, 0xcc +.Loop_absorb_136_ce: + subq %r8, %rcx // len - bsz + jb .Labsorbed_ce + + vpxorq 0*8(%rdx), A_0_0, A_0_0 + vpxorq 1*8(%rdx), A_0_1, A_0_1 + vpxorq 2*8(%rdx), A_0_2, A_0_2 + vpxorq 3*8(%rdx), A_0_3, A_0_3 + vpxorq 4*8(%rdx), A_0_4, A_0_4 + vpxorq 5*8(%rdx), A_1_0, A_1_0 + vpxorq 6*8(%rdx), A_1_1, A_1_1 + vpxorq 7*8(%rdx), A_1_2, A_1_2 + vpxorq 8*8(%rdx), A_1_3, A_1_3 + vpxorq 9*8(%rdx), A_1_4, A_1_4 + vpxorq 10*8(%rdx), A_2_0, A_2_0 + vpxorq 11*8(%rdx), A_2_1, A_2_1 + vpxorq 12*8(%rdx), A_2_2, A_2_2 + vpxorq 13*8(%rdx), A_2_3, A_2_3 + vpxorq 14*8(%rdx), A_2_4, A_2_4 + vpxorq 15*8(%rdx), A_3_0, A_3_0 + vmovq 16*8(%rdx), C_0 + leaq 17*8(%rdx), %rdx + vpxor C_0, A_3_1, A_3_1 + + movq %rsi, %r10 + call KeccakF1600_ce + + jmp .Loop_absorb_136_ce + +.align 64, 0xcc +.Loop_absorb_104_ce: + subq %r8, %rcx // len - bsz + jb .Labsorbed_ce + + vpxorq 0*8(%rdx), A_0_0, A_0_0 + vpxorq 1*8(%rdx), A_0_1, A_0_1 + vpxorq 2*8(%rdx), A_0_2, A_0_2 + vpxorq 3*8(%rdx), A_0_3, A_0_3 + vpxorq 4*8(%rdx), A_0_4, A_0_4 + vpxorq 5*8(%rdx), A_1_0, A_1_0 + vpxorq 6*8(%rdx), A_1_1, A_1_1 + vpxorq 7*8(%rdx), A_1_2, A_1_2 + vpxorq 8*8(%rdx), A_1_3, A_1_3 + vpxorq 9*8(%rdx), A_1_4, A_1_4 + vpxorq 10*8(%rdx), A_2_0, A_2_0 + vpxorq 11*8(%rdx), A_2_1, A_2_1 + vmovq 12*8(%rdx), C_0 + leaq 13*8(%rdx), %rdx + vpxorq C_0, A_2_2, A_2_2 + + movq %rsi, %r10 + call KeccakF1600_ce + + jmp .Loop_absorb_104_ce + +.align 64, 0xcc +.Loop_absorb_72_ce: + subq %r8, %rcx // len - bsz + jb .Labsorbed_ce + + vpxorq 0*8(%rdx), A_0_0, A_0_0 + vpxorq 1*8(%rdx), A_0_1, A_0_1 + vpxorq 2*8(%rdx), A_0_2, A_0_2 + vpxorq 3*8(%rdx), A_0_3, A_0_3 + vpxorq 4*8(%rdx), A_0_4, A_0_4 + vpxorq 5*8(%rdx), A_1_0, A_1_0 + vpxorq 6*8(%rdx), A_1_1, A_1_1 + vpxorq 7*8(%rdx), A_1_2, A_1_2 + vmovq 8*8(%rdx), C_0 + leaq 9*8(%rdx), %rdx + vpxorq C_0, A_1_3, A_1_3 + + movq %rsi, %r10 + call KeccakF1600_ce + + jmp .Loop_absorb_72_ce + +.align 64, 0xcc +.Labsorbed_ce: + vpunpcklqdq A_0_1, A_0_0, A_0_0 + vpunpcklqdq A_0_3, A_0_2, A_0_2 + vpunpcklqdq A_1_0, A_0_4, A_0_4 + vpunpcklqdq A_1_2, A_1_1, A_1_1 + vpunpcklqdq A_1_4, A_1_3, A_1_3 + vpunpcklqdq A_2_1, A_2_0, A_2_0 + vpunpcklqdq A_2_3, A_2_2, A_2_2 + vpunpcklqdq A_3_0, A_2_4, A_2_4 + vpunpcklqdq A_3_2, A_3_1, A_3_1 + vpunpcklqdq A_3_4, A_3_3, A_3_3 + vpunpcklqdq A_4_1, A_4_0, A_4_0 + vpunpcklqdq A_4_3, A_4_2, A_4_2 + vmovdqu64 A_0_0, 0*8(%rdi) + vmovdqu64 A_0_2, 2*8(%rdi) + vmovdqu64 A_0_4, 4*8(%rdi) + 
vmovdqu64 A_1_1, 6*8(%rdi) + vmovdqu64 A_1_3, 8*8(%rdi) + vmovdqu64 A_2_0, 10*8(%rdi) + vmovdqu64 A_2_2, 0*8(%rax) + vmovdqu64 A_2_4, 2*8(%rax) + vmovdqu A_3_1, 4*8(%rax) + vmovdqu A_3_3, 6*8(%rax) + vmovdqu A_4_0, 8*8(%rax) + vmovdqu A_4_2, 10*8(%rax) + vmovq A_4_4, 12*8(%rax) + + leaq (%r8, %rcx), %rax // return value + movq %rdx, (%r9) // return buffer pointer + + clear_regs() + ret_spec_stop + CFI_ENDPROC() +ELF(.size _gcry_keccak_absorb_blocks_avx512, + .-_gcry_keccak_absorb_blocks_avx512) + +#endif /* HAVE_GCC_INLINE_ASM_AVX512 */ +#endif /* __x86_64 */ diff --git a/cipher/keccak.c b/cipher/keccak.c index f3502022..6c027eac 100644 --- a/cipher/keccak.c +++ b/cipher/keccak.c @@ -62,6 +62,16 @@ #endif +/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */ +#undef USE_64BIT_AVX512 +#if defined(USE_64BIT) && defined(__x86_64__) && \ + defined(HAVE_GCC_INLINE_ASM_AVX512) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_64BIT_AVX512 1 +#endif + + /* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly * code. */ #undef USE_64BIT_ARM_NEON @@ -428,6 +438,65 @@ static const keccak_ops_t keccak_bmi2_64_ops = #endif /* USE_64BIT_BMI2 */ +/* 64-bit Intel AVX512 implementation. */ +#ifdef USE_64BIT_AVX512 + +extern unsigned int +_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst); + +extern unsigned int +_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst, + const byte *lanes, size_t nlanes, + size_t blocklanes, const byte **new_lanes); + +static unsigned int +keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd) +{ + return _gcry_keccak_f1600_state_permute64_avx512 ( + hd->u.state64, _gcry_keccak_round_consts_64bit); +} + +static unsigned int +keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + while (nlanes) + { + if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes) + { + nlanes = _gcry_keccak_absorb_blocks_avx512 ( + hd->u.state64, _gcry_keccak_round_consts_64bit, + lanes, nlanes, blocklanes, &lanes); + } + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64 (lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + keccak_f1600_state_permute64_avx512 (hd); + pos = 0; + break; + } + } + } + + return 0; +} + +static const keccak_ops_t keccak_avx512_64_ops = +{ + .permute = keccak_f1600_state_permute64_avx512, + .absorb = keccak_absorb_lanes64_avx512, + .extract = keccak_extract64, +}; + +#endif /* USE_64BIT_AVX512 */ + + /* 64-bit ARMv7/NEON implementation. */ #ifdef USE_64BIT_ARM_NEON @@ -894,6 +963,10 @@ keccak_init (int algo, void *context, unsigned int flags) /* Select optimized implementation based in hw features. 
*/ if (0) {} +#ifdef USE_64BIT_AVX512 + else if (features & HWF_INTEL_AVX512) + ctx->ops = &keccak_avx512_64_ops; +#endif #ifdef USE_64BIT_ARM_NEON else if (features & HWF_ARM_NEON) ctx->ops = &keccak_armv7_neon_64_ops; diff --git a/configure.ac b/configure.ac index b55510d8..3abee22d 100644 --- a/configure.ac +++ b/configure.ac @@ -3152,7 +3152,7 @@ if test "$found" = "1" ; then case "${host}" in x86_64-*-*) # Build with the assembly implementation - : + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-amd64-avx512.lo" ;; esac -- 2.34.1 From jussi.kivilinna at iki.fi Thu Jul 21 10:25:46 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 21 Jul 2022 11:25:46 +0300 Subject: [PATCH] sha512: add AArch64 crypto/SHA512 extension implementation Message-ID: <20220721082546.3331466-1-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'sha512-armv8-aarch64-ce.S'. * cipher/sha512-armv8-aarch64-ce.S: New. * cipher/sha512.c (ATTR_ALIGNED_64, USE_ARM64_SHA512): New. (k): Make array aligned to 64 bytes. [USE_ARM64_SHA512] (_gcry_sha512_transform_armv8_ce): New. [USE_ARM64_SHA512] (do_sha512_transform_armv8_ce): New. (sha512_init_common) [USE_ARM64_SHA512]: Use ARMv8-SHA512 accelerated implementation if HW feature available. * configure.ac: Add 'sha512-armv8-aarch64-ce.lo'. (gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4) (HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4): New. -- Benchmark on AWS Graviton3: Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz SHA512 | 2.36 ns/B 404.2 MiB/s 6.13 c/B 2600 After (2.4x faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz SHA512 | 0.977 ns/B 976.6 MiB/s 2.54 c/B 2600 Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 +- cipher/sha512-armv8-aarch64-ce.S | 383 +++++++++++++++++++++++++++++++ cipher/sha512.c | 40 +++- configure.ac | 54 +++++ 4 files changed, 477 insertions(+), 2 deletions(-) create mode 100644 cipher/sha512-armv8-aarch64-ce.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 97823cb4..e27bb0bc 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -131,7 +131,7 @@ EXTRA_libcipher_la_SOURCES = \ sha256-intel-shaext.c sha256-ppc.c \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \ sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \ - sha512-armv7-neon.S sha512-arm.S \ + sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \ sha512-ppc.c sha512-ssse3-i386.c \ sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \ diff --git a/cipher/sha512-armv8-aarch64-ce.S b/cipher/sha512-armv8-aarch64-ce.S new file mode 100644 index 00000000..73fe7ced --- /dev/null +++ b/cipher/sha512-armv8-aarch64-ce.S @@ -0,0 +1,383 @@ +/* sha512-armv8-aarch64-ce.S - ARM/CE accelerated SHA-512 transform function + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include "asm-common-aarch64.h" + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4) && \ + defined(USE_SHA512) + +.arch armv8.2-a+sha3+sm4 + +.text + + +/* Register macros */ + +#define Qv0 q0 +#define Qv1 q1 +#define Qv2 q2 +#define Qv3 q3 +#define Qv4 q4 + +#define vT0 v5 +#define vT1 v6 +#define QvT1 q6 +#define vT2 v7 +#define vT3 v16 + +#define vH01 v17 +#define vH23 v18 +#define vH45 v19 +#define vH67 v20 + +#define vW0 v21 +#define vW1 v22 +#define vW2 v23 +#define vW3 v24 +#define vW4 v25 +#define vW5 v26 +#define vW6 v27 +#define vW7 v28 + +#define vK0 v29 +#define vK1 v30 +#define vK2 v31 + + +/* Round macros */ + +#define _(...) /*_*/ + +#define do_add(a, b) add a.2d, a.2d, b.2d; + +#define load_k_3() ld1 {vK0.2d-vK2.2d}, [x3], #48; +#define load_k_last() ld1 {vK0.2d}, [x3]; + +#define load_msg1(...) \ + ld1 {vW0.16b-vW3.16b}, [x1], #64; + +#define load_msg2(...) \ + rev64 vW0.16b, vW0.16b; + +#define load_msg3(...) \ + rev64 vW1.16b, vW1.16b; + +#define load_msg4(...) \ + ld1 {vW4.16b-vW7.16b}, [x1], #64; + +#define load_msg5(...) \ + rev64 vW2.16b, vW2.16b; + +#define load_msg6(...) \ + rev64 vW3.16b, vW3.16b; + +#define load_msg7(...) \ + rev64 vW4.16b, vW4.16b; + +#define load_msg8(...) \ + rev64 vW5.16b, vW5.16b; + +#define load_msg9(...) \ + rev64 vW6.16b, vW6.16b; + +#define load_msg10(...) \ + rev64 vW7.16b, vW7.16b; + +#define schedule1(w0, w1, w2, w3, w4, w5, w6, w7) \ + sha512su0 w0.2d, w1.2d; \ + +#define schedule2(w0, w1, w2, w3, w4, w5, w6, w7) \ + ext vT2.16b, w4.16b, w5.16b, #8; \ + sha512su1 w0.2d, w7.2d, vT2.2d; + +#define do_round2(ab, cd, ef, gh, cd_out, \ + load_nextk_op, k, \ + sched_op1, sched_op2, w0, w1, w2, w3, w4, w5, w6, w7) \ + add vT3.2d, k.2d, w0.2d; \ + load_nextk_op(); \ + ext vT1.16b, ef.16b, gh.16b, #8; \ + ext vT3.16b, vT3.16b, vT3.16b, #8; \ + ext vT0.16b, cd.16b, ef.16b, #8; \ + add gh.2d, gh.2d, vT3.2d; \ + sched_op1(w0, w1, w2, w3, w4, w5, w6, w7); \ + sha512h Q##gh, Q##vT1, vT0.2d; \ + sched_op2(w0, w1, w2, w3, w4, w5, w6, w7); \ + add cd_out.2d, gh.2d, cd.2d; \ + sha512h2 Q##gh, Q##cd, ab.2d; \ + + +/* Other functional macros */ + +#undef CLEAR_REG +#define CLEAR_REG(reg, ...) 
movi reg.16b, #0; + + +/* + * unsigned int + * _gcry_sha512_transform_armv8_ce (u64 state[8], const void *input_data, + * size_t num_blks, const u64 k[80]) + */ +.align 3 +.globl _gcry_sha512_transform_armv8_ce +ELF(.type _gcry_sha512_transform_armv8_ce,%function;) +_gcry_sha512_transform_armv8_ce: + /* input: + * x0: ctx, CTX + * x1: data (128*nblks bytes) + * x2: nblks + * x3: k table + */ + CFI_STARTPROC() + + cbz x2, .Ldo_nothing + + mov x4, x3 + + ld1 {vH01.2d-vH67.2d}, [x0] /* load state */ + + load_msg1() + mov v0.16b, vH01.16b + mov v1.16b, vH23.16b + load_k_3() + load_msg2() + load_msg3() + load_msg4() + mov v2.16b, vH45.16b + mov v3.16b, vH67.16b + load_msg5() + load_msg6() + load_msg7() + load_msg8() + load_msg9() + load_msg10() + +.Loop: + sub x2, x2, #1 + + # rounds 1-16 + do_round2(v0, v1, v2, v3, v4, + _, vK0, + schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7) + do_round2(v3, v0, v4, v2, v1, + _, vK1, + schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0) + do_round2(v2, v3, v1, v4, v0, + load_k_3, vK2, + schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1) + do_round2(v4, v2, v0, v1, v3, + _, vK0, + schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2) + do_round2(v1, v4, v3, v0, v2, + _, vK1, + schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3) + do_round2(v0, v1, v2, v3, v4, + load_k_3, vK2, + schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4) + do_round2(v3, v0, v4, v2, v1, + _, vK0, + schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5) + do_round2(v2, v3, v1, v4, v0, + _, vK1, + schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6) + + # rounds 17-32 + do_round2(v4, v2, v0, v1, v3, + load_k_3, vK2, + schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7) + do_round2(v1, v4, v3, v0, v2, + _, vK0, + schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0) + do_round2(v0, v1, v2, v3, v4, + _, vK1, + schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1) + do_round2(v3, v0, v4, v2, v1, + load_k_3, vK2, + schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2) + do_round2(v2, v3, v1, v4, v0, + _, vK0, + schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3) + do_round2(v4, v2, v0, v1, v3, + _, vK1, + schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4) + do_round2(v1, v4, v3, v0, v2, + load_k_3, vK2, + schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5) + do_round2(v0, v1, v2, v3, v4, + _, vK0, + schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6) + + # rounds 33-48 + do_round2(v3, v0, v4, v2, v1, + _, vK1, + schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7) + do_round2(v2, v3, v1, v4, v0, + load_k_3, vK2, + schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0) + do_round2(v4, v2, v0, v1, v3, + _, vK0, + schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1) + do_round2(v1, v4, v3, v0, v2, + _, vK1, + schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2) + do_round2(v0, v1, v2, v3, v4, + load_k_3, vK2, + schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3) + do_round2(v3, v0, v4, v2, v1, + _, vK0, + schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4) + do_round2(v2, v3, v1, v4, v0, + _, vK1, + schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5) + do_round2(v4, v2, v0, v1, v3, + load_k_3, vK2, + schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6) + + # rounds 49-64 + do_round2(v1, v4, v3, v0, v2, + _, vK0, + schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, 
vW7) + do_round2(v0, v1, v2, v3, v4, + _, vK1, + schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0) + do_round2(v3, v0, v4, v2, v1, + load_k_3, vK2, + schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1) + do_round2(v2, v3, v1, v4, v0, + _, vK0, + schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2) + do_round2(v4, v2, v0, v1, v3, + _, vK1, + schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3) + do_round2(v1, v4, v3, v0, v2, + load_k_3, vK2, + schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4) + do_round2(v0, v1, v2, v3, v4, + _, vK0, + schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5) + do_round2(v3, v0, v4, v2, v1, + _, vK1, + schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6) + + cbz x2, .Lend + + # rounds 65-80 + do_round2(v2, v3, v1, v4, v0, + load_k_3, vK2, + _, _, vW0, , , , , , , ) + do_round2(v4, v2, v0, v1, v3, + _, vK0, + _, _, vW1, , , , , , , ) + do_round2(v1, v4, v3, v0, v2, + _, vK1, + _, _, vW2, , , , , , , ) + do_round2(v0, v1, v2, v3, v4, + load_k_3, vK2, + _, _, vW3, , , , , , , ) + do_round2(v3, v0, v4, v2, v1, + _, vK0, + load_msg1, _, vW4, , , , , , , ) + do_round2(v2, v3, v1, v4, v0, + _, vK1, + load_msg2, _, vW5, , , , , , , ) + do_round2(v4, v2, v0, v1, v3, + load_k_last, vK2, + load_msg3, _, vW6, , , , , , , ) + mov x3, x4 + do_round2(v1, v4, v3, v0, v2, + load_k_3, vK0, + load_msg4, load_msg5, vW7, , , , , , , ) + + load_msg6() + load_msg7() + + add vH01.2d, vH01.2d, v0.2d + add vH23.2d, vH23.2d, v1.2d + add vH45.2d, vH45.2d, v2.2d + add vH67.2d, vH67.2d, v3.2d + load_msg8() + load_msg9() + load_msg10() + mov v0.16b, vH01.16b + mov v1.16b, vH23.16b + mov v2.16b, vH45.16b + mov v3.16b, vH67.16b + + b .Loop + +.Lend: + + # rounds 65-80 + do_round2(v2, v3, v1, v4, v0, + load_k_3, vK2, + CLEAR_REG, _, vW0, , , , , , , ) + do_round2(v4, v2, v0, v1, v3, + _, vK0, + CLEAR_REG, _, vW1, , , , , , , ) + do_round2(v1, v4, v3, v0, v2, + _, vK1, + CLEAR_REG, _, vW2, , , , , , , ) + do_round2(v0, v1, v2, v3, v4, + load_k_3, vK2, + CLEAR_REG, _, vW3, , , , , , , ) + do_round2(v3, v0, v4, v2, v1, + _, vK0, + CLEAR_REG, _, vW4, , , , , , , ) + do_round2(v2, v3, v1, v4, v0, + _, vK1, + CLEAR_REG, _, vW5, , , , , , , ) + CLEAR_REG(vK1) + do_round2(v4, v2, v0, v1, v3, + load_k_last, vK2, + CLEAR_REG, _, vW6, , , , , , , ) + CLEAR_REG(vK2) + do_round2(v1, v4, v3, v0, v2, + _, vK0, + CLEAR_REG, _, vW7, , , , , , , ) + CLEAR_REG(vK0) + + CLEAR_REG(v4) + add vH01.2d, vH01.2d, v0.2d + CLEAR_REG(v0) + add vH23.2d, vH23.2d, v1.2d + CLEAR_REG(v1) + add vH45.2d, vH45.2d, v2.2d + CLEAR_REG(v2) + add vH67.2d, vH67.2d, v3.2d + CLEAR_REG(v3) + CLEAR_REG(vT0) + CLEAR_REG(vT1) + CLEAR_REG(vT2) + CLEAR_REG(vT3) + + st1 {vH01.2d-vH67.2d}, [x0] /* store state */ + + CLEAR_REG(vH01) + CLEAR_REG(vH23) + CLEAR_REG(vH45) + CLEAR_REG(vH67) + +.Ldo_nothing: + mov x0, #0 + ret_spec_stop + CFI_ENDPROC() +ELF(.size _gcry_sha512_transform_armv8_ce,.-_gcry_sha512_transform_armv8_ce;) + +#endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 42eaf1fe..9ac412b3 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -55,6 +55,14 @@ #include "hash-common.h" +/* Helper macro to force alignment to 64 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_64 __attribute__ ((aligned (64))) +#else +# define ATTR_ALIGNED_64 +#endif + + /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. 
*/ #undef USE_ARM_NEON_ASM #ifdef ENABLE_NEON_SUPPORT @@ -72,6 +80,17 @@ # define USE_ARM_ASM 1 #endif +/* USE_ARM64_SHA512 indicates whether to enable ARMv8 SHA512 extension assembly + * code. */ +#undef USE_ARM64_SHA512 +#ifdef ENABLE_ARM_CRYPTO_SUPPORT +# if defined(__AARCH64EL__) \ + && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \ + && defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4) +# define USE_ARM64_SHA512 1 +# endif +#endif + /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 @@ -158,7 +177,7 @@ typedef struct } SHA512_CONTEXT; -static const u64 k[] = +static ATTR_ALIGNED_64 const u64 k[] = { U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), @@ -219,6 +238,21 @@ static const u64 k[] = #endif +#ifdef USE_ARM64_SHA512 +unsigned int _gcry_sha512_transform_armv8_ce (u64 state[8], + const unsigned char *data, + size_t num_blks, + const u64 k[]); + +static unsigned int +do_sha512_transform_armv8_ce(void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA512_CONTEXT *hd = ctx; + return _gcry_sha512_transform_armv8_ce (hd->state.h, data, nblks, k); +} +#endif + #ifdef USE_ARM_NEON_ASM unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, const unsigned char *data, @@ -415,6 +449,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) if ((features & HWF_ARM_NEON) != 0) ctx->bctx.bwrite = do_sha512_transform_armv7_neon; #endif +#ifdef USE_ARM64_SHA512 + if ((features & HWF_ARM_NEON) && (features & HWF_ARM_SHA512)) + ctx->bctx.bwrite = do_sha512_transform_armv8_ce; +#endif #ifdef USE_SSSE3 if ((features & HWF_INTEL_SSSE3) != 0) ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; diff --git a/configure.ac b/configure.ac index b55510d8..ddba42c0 100644 --- a/configure.ac +++ b/configure.ac @@ -2054,6 +2054,56 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then fi +# +# Check whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions], + [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4], + [if test "$mpi_cpu_arch" != "aarch64" || + test "$try_asm_modules" != "yes" ; then + gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4="n/a" + else + gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=no + AC_LINK_IFELSE([AC_LANG_PROGRAM( + [[__asm__( + ".arch armv8.2-a+sha3+sm4\n\t" + ".text\n\t" + "testfn:\n\t" + + /* Test for SHA512 instructions */ + "sha512h q0, q0, v0.2d;\n\t" + "sha512h2 q0, q0, v0.2d;\n\t" + "sha512su0 v0.2d, v0.2d;\n\t" + "sha512su1 v0.2d, v0.2d, v31.2d;\n\t" + + /* Test for SHA3 instructions */ + "bcax v0.16b, v1.16b, v2.16b, v3.16b;\n\t" + "eor3 v0.16b, v1.16b, v2.16b, v3.16b;\n\t" + "rax1 v0.2d, v1.2d, v2.2d;\n\t" + "xar v0.2d, v1.2d, v2.2d, \#1;\n\t" + + /* Test for SM3 instructions */ + "sm3partw1 v0.4s, v1.4s, v2.4s;\n\t" + "sm3partw2 v0.4s, v1.4s, v2.4s;\n\t" + "sm3ss1 v0.4s, v1.4s, v2.4s, v3.4s;\n\t" + "sm3tt1a v0.4s, v1.4s, v2.s[0];\n\t" + "sm3tt1b v0.4s, v1.4s, v2.s[0];\n\t" + "sm3tt2a v0.4s, v1.4s, v2.s[0];\n\t" + "sm3tt2b v0.4s, v1.4s, v2.s[0];\n\t" + + /* Test for SM4 instructions */ + "sm4e v0.4s, v1.4s;\n\t" + "sm4ekey v0.4s, v1.4s, v2.4s;\n\t" + ); + ]], [ testfn(); ])], + [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=yes]) + fi]) +if test "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4,1, + [Defined if inline 
assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions]) +fi + + # # Check whether PowerPC AltiVec/VSX intrinsics # @@ -3123,6 +3173,10 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo" ;; + aarch64-*-*) + # Build with the assembly implementation + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv8-aarch64-ce.lo" + ;; powerpc64le-*-*) # Build with the crypto extension implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo" -- 2.34.1 From jussi.kivilinna at iki.fi Thu Jul 21 10:38:02 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 21 Jul 2022 11:38:02 +0300 Subject: [PATCH] blake2: add AVX512 accelerated implementations Message-ID: <20220721083802.3355448-1-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'blake2b-amd64-avx512.S' and 'blake2s-amd64-avx512.S'. * cipher/blake2.c (USE_AVX512): New. (ASM_FUNC_ABI): Setup attribute if USE_AVX2 or USE_AVX512 enabled in addition to USE_AVX. (BLAKE2B_CONTEXT_S, BLAKE2S_CONTEXT_S): Add 'use_avx512'. (_gcry_blake2b_transform_amd64_avx512) (_gcry_blake2s_transform_amd64_avx512): New. (blake2b_transform, blake2s_transform) [USE_AVX512]: Add AVX512 path. (blake2b_init_ctx, blake2s_init_ctx) [USE_AVX512]: Use AVX512 if HW feature available. * cipher/blake2b-amd64-avx512.S: New. * cipher/blake2s-amd64-avx512.S: New. * configure.ac: Add 'blake2b-amd64-avx512.lo' and 'blake2s-amd64-avx512.lo'. -- Benchmark on Intel Core i3-1115G4 (tigerlake): Before (AVX/AVX2 implementations): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz BLAKE2B_512 | 0.841 ns/B 1134 MiB/s 3.44 c/B 4089 BLAKE2S_256 | 1.29 ns/B 741.2 MiB/s 5.26 c/B 4089 After (blake2s ~19% faster, blake2b ~25% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz BLAKE2B_512 | 0.705 ns/B 1353 MiB/s 2.88 c/B 4088 BLAKE2S_256 | 1.02 ns/B 933.3 MiB/s 4.18 c/B 4088 Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 3 +- cipher/blake2.c | 49 +++++- cipher/blake2b-amd64-avx512.S | 312 ++++++++++++++++++++++++++++++++++ cipher/blake2s-amd64-avx512.S | 261 ++++++++++++++++++++++++++++ configure.ac | 2 + 5 files changed, 622 insertions(+), 5 deletions(-) create mode 100644 cipher/blake2b-amd64-avx512.S create mode 100644 cipher/blake2s-amd64-avx512.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 97823cb4..c7674453 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -147,7 +147,8 @@ EXTRA_libcipher_la_SOURCES = \ camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \ camellia-arm.S camellia-aarch64.S \ blake2.c \ - blake2b-amd64-avx2.S blake2s-amd64-avx.S + blake2b-amd64-avx2.S blake2b-amd64-avx512.S \ + blake2s-amd64-avx.S blake2s-amd64-avx512.S gost28147.lo: gost-sb.h gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD) diff --git a/cipher/blake2.c b/cipher/blake2.c index d7f9a7e4..45f74a56 100644 --- a/cipher/blake2.c +++ b/cipher/blake2.c @@ -46,11 +46,20 @@ # define USE_AVX2 1 #endif +/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */ +#undef USE_AVX512 +#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_AVX512 1 +#endif + /* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. 
*/ #undef ASM_FUNC_ABI #undef ASM_EXTRA_STACK -#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) +#if (defined(USE_AVX) || defined(USE_AVX2) || defined(USE_AVX512)) \ + && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) # define ASM_FUNC_ABI __attribute__((sysv_abi)) # define ASM_EXTRA_STACK (10 * 16) #else @@ -98,6 +107,9 @@ typedef struct BLAKE2B_CONTEXT_S #ifdef USE_AVX2 unsigned int use_avx2:1; #endif +#ifdef USE_AVX512 + unsigned int use_avx512:1; +#endif } BLAKE2B_CONTEXT; typedef struct @@ -132,6 +144,9 @@ typedef struct BLAKE2S_CONTEXT_S #ifdef USE_AVX unsigned int use_avx:1; #endif +#ifdef USE_AVX512 + unsigned int use_avx512:1; +#endif } BLAKE2S_CONTEXT; typedef unsigned int (*blake2_transform_t)(void *S, const void *inblk, @@ -346,6 +361,12 @@ unsigned int _gcry_blake2b_transform_amd64_avx2(BLAKE2B_STATE *S, size_t nblks) ASM_FUNC_ABI; #endif +#ifdef USE_AVX512 +unsigned int _gcry_blake2b_transform_amd64_avx512(BLAKE2B_STATE *S, + const void *inblks, + size_t nblks) ASM_FUNC_ABI; +#endif + static unsigned int blake2b_transform(void *ctx, const void *inblks, size_t nblks) { @@ -354,8 +375,12 @@ static unsigned int blake2b_transform(void *ctx, const void *inblks, if (0) {} +#ifdef USE_AVX512 + else if (c->use_avx512) + nburn = _gcry_blake2b_transform_amd64_avx512(&c->state, inblks, nblks); +#endif #ifdef USE_AVX2 - if (c->use_avx2) + else if (c->use_avx2) nburn = _gcry_blake2b_transform_amd64_avx2(&c->state, inblks, nblks); #endif else @@ -468,6 +493,9 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags, #ifdef USE_AVX2 c->use_avx2 = !!(features & HWF_INTEL_AVX2); #endif +#ifdef USE_AVX512 + c->use_avx512 = !!(features & HWF_INTEL_AVX512); +#endif c->outlen = dbits / 8; c->buflen = 0; @@ -670,6 +698,12 @@ unsigned int _gcry_blake2s_transform_amd64_avx(BLAKE2S_STATE *S, size_t nblks) ASM_FUNC_ABI; #endif +#ifdef USE_AVX512 +unsigned int _gcry_blake2s_transform_amd64_avx512(BLAKE2S_STATE *S, + const void *inblks, + size_t nblks) ASM_FUNC_ABI; +#endif + static unsigned int blake2s_transform(void *ctx, const void *inblks, size_t nblks) { @@ -677,9 +711,13 @@ static unsigned int blake2s_transform(void *ctx, const void *inblks, unsigned int nburn; if (0) - {} + { } +#ifdef USE_AVX512 + else if (c->use_avx512) + nburn = _gcry_blake2s_transform_amd64_avx512(&c->state, inblks, nblks); +#endif #ifdef USE_AVX - if (c->use_avx) + else if (c->use_avx) nburn = _gcry_blake2s_transform_amd64_avx(&c->state, inblks, nblks); #endif else @@ -792,6 +830,9 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags, #ifdef USE_AVX c->use_avx = !!(features & HWF_INTEL_AVX); #endif +#ifdef USE_AVX512 + c->use_avx512 = !!(features & HWF_INTEL_AVX512); +#endif c->outlen = dbits / 8; c->buflen = 0; diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S new file mode 100644 index 00000000..db53474d --- /dev/null +++ b/cipher/blake2b-amd64-avx512.S @@ -0,0 +1,312 @@ +/* blake2b-amd64-avx512.S - AVX512 implementation of BLAKE2b + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version.
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* The code is based on public-domain/CC0 BLAKE2 reference implementation + * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse + * Copyright 2012, Samuel Neves + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) + +#include "asm-common-amd64.h" + +.text + +/* register macros */ +#define RSTATE %rdi +#define RINBLKS %rsi +#define RNBLKS %rdx +#define RIV %rcx + +/* state structure */ +#define STATE_H 0 +#define STATE_T (STATE_H + 8 * 8) +#define STATE_F (STATE_T + 2 * 8) + +/* vector registers */ +#define ROW1 %ymm0 +#define ROW2 %ymm1 +#define ROW3 %ymm2 +#define ROW4 %ymm3 +#define TMP1 %ymm4 +#define TMP1x %xmm4 + +#define MA1 %ymm5 +#define MA2 %ymm6 +#define MA3 %ymm7 +#define MA4 %ymm8 +#define MA1x %xmm5 +#define MA2x %xmm6 +#define MA3x %xmm7 +#define MA4x %xmm8 + +#define MB1 %ymm9 +#define MB2 %ymm10 +#define MB3 %ymm11 +#define MB4 %ymm12 +#define MB1x %xmm9 +#define MB2x %xmm10 +#define MB3x %xmm11 +#define MB4x %xmm12 + +/********************************************************************** + blake2b/AVX2 + **********************************************************************/ + +#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, gather_masks) \ + vmovdqa gather_masks + (4*4) * 0 rRIP, m2x; \ + vmovdqa gather_masks + (4*4) * 1 rRIP, m3x; \ + vmovdqa gather_masks + (4*4) * 2 rRIP, m4x; \ + vmovdqa gather_masks + (4*4) * 3 rRIP, TMP1x; \ + vpgatherdq (RINBLKS, m2x), m1 {%k1}; \ + vpgatherdq (RINBLKS, m3x), m2 {%k2}; \ + vpgatherdq (RINBLKS, m4x), m3 {%k3}; \ + vpgatherdq (RINBLKS, TMP1x), m4 {%k4} + +#define GEN_GMASK(s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15) \ + .long (s0)*8, (s2)*8, (s4)*8, (s6)*8, \ + (s1)*8, (s3)*8, (s5)*8, (s7)*8, \ + (s8)*8, (s10)*8, (s12)*8, (s14)*8, \ + (s9)*8, (s11)*8, (s13)*8, (s15)*8 + +#define RESET_KMASKS() \ + kmovw %k0, %k1; \ + kmovw %k0, %k2; \ + kmovw %k0, %k3; \ + kmovw %k0, %k4 + +#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask0); \ + RESET_KMASKS() +#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask1); \ + RESET_KMASKS() +#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask2); \ + RESET_KMASKS() +#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask3); \ + RESET_KMASKS() +#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask4); \ + RESET_KMASKS() +#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask5); \ + RESET_KMASKS() +#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask6); \ + RESET_KMASKS() +#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask7); \ + RESET_KMASKS() +#define 
LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask8); \ + RESET_KMASKS() +#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask9); \ + RESET_KMASKS() +#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask0); \ + RESET_KMASKS() +#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \ + GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask1); + +#define LOAD_MSG(r, m1, m2, m3, m4) \ + LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x) + +#define ROR_32(in, out) vpshufd $0xb1, in, out + +#define ROR_24(in, out) vprorq $24, in, out + +#define ROR_16(in, out) vprorq $16, in, out + +#define ROR_63(in, out) vprorq $63, in, out + +#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \ + vpaddq m, r1, r1; \ + vpaddq r2, r1, r1; \ + vpxor r1, r4, r4; \ + ROR_A(r4, r4); \ + vpaddq r4, r3, r3; \ + vpxor r3, r2, r2; \ + ROR_B(r2, r2) + +#define G1(r1, r2, r3, r4, m) \ + G(r1, r2, r3, r4, m, ROR_32, ROR_24) + +#define G2(r1, r2, r3, r4, m) \ + G(r1, r2, r3, r4, m, ROR_16, ROR_63) + +#define MM_SHUFFLE(z,y,x,w) \ + (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +#define DIAGONALIZE(r1, r2, r3, r4) \ + vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \ + vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \ + vpermq $MM_SHUFFLE(2,1,0,3), r4, r4 + +#define UNDIAGONALIZE(r1, r2, r3, r4) \ + vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \ + vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \ + vpermq $MM_SHUFFLE(0,3,2,1), r4, r4 + +#define ROUND(r, m1, m2, m3, m4) \ + G1(ROW1, ROW2, ROW3, ROW4, m1); \ + G2(ROW1, ROW2, ROW3, ROW4, m2); \ + DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \ + G1(ROW1, ROW2, ROW3, ROW4, m3); \ + G2(ROW1, ROW2, ROW3, ROW4, m4); \ + UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4) + +ELF(.type blake2b_data, at object;) +blake2b_data: +.align 32 +.Liv: + .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b + .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 + .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f + .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 +.Lgmask0: + GEN_GMASK(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +.Lgmask1: + GEN_GMASK(14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3) +.Lgmask2: + GEN_GMASK(11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4) +.Lgmask3: + GEN_GMASK(7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8) +.Lgmask4: + GEN_GMASK(9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13) +.Lgmask5: + GEN_GMASK(2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9) +.Lgmask6: + GEN_GMASK(12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11) +.Lgmask7: + GEN_GMASK(13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10) +.Lgmask8: + GEN_GMASK(6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5) +.Lgmask9: + GEN_GMASK(10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) + +.align 64 +.globl _gcry_blake2b_transform_amd64_avx512 +ELF(.type _gcry_blake2b_transform_amd64_avx512, at function;) + +_gcry_blake2b_transform_amd64_avx512: + /* input: + * %rdi: state + * %rsi: blks + * %rdx: num_blks + */ + CFI_STARTPROC(); + + movl $0xf, %eax; + kmovw %eax, %k0; + xorl %eax, %eax; + RESET_KMASKS(); + + addq $128, (STATE_T + 0)(RSTATE); + adcq $0, (STATE_T + 8)(RSTATE); + + vmovdqa .Liv+(0 * 8) rRIP, ROW3; + vmovdqa .Liv+(4 * 8) rRIP, ROW4; + + vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1; + vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2; + + vpxor (STATE_T)(RSTATE), ROW4, ROW4; + + LOAD_MSG(0, MA1, MA2, MA3, MA4); + LOAD_MSG(1, MB1, MB2, MB3, MB4); + jmp .Loop; + +.align 64, 
0xcc +.Loop: + ROUND(0, MA1, MA2, MA3, MA4); + LOAD_MSG(2, MA1, MA2, MA3, MA4); + ROUND(1, MB1, MB2, MB3, MB4); + LOAD_MSG(3, MB1, MB2, MB3, MB4); + ROUND(2, MA1, MA2, MA3, MA4); + LOAD_MSG(4, MA1, MA2, MA3, MA4); + ROUND(3, MB1, MB2, MB3, MB4); + LOAD_MSG(5, MB1, MB2, MB3, MB4); + ROUND(4, MA1, MA2, MA3, MA4); + LOAD_MSG(6, MA1, MA2, MA3, MA4); + ROUND(5, MB1, MB2, MB3, MB4); + LOAD_MSG(7, MB1, MB2, MB3, MB4); + ROUND(6, MA1, MA2, MA3, MA4); + LOAD_MSG(8, MA1, MA2, MA3, MA4); + ROUND(7, MB1, MB2, MB3, MB4); + LOAD_MSG(9, MB1, MB2, MB3, MB4); + ROUND(8, MA1, MA2, MA3, MA4); + LOAD_MSG(10, MA1, MA2, MA3, MA4); + ROUND(9, MB1, MB2, MB3, MB4); + LOAD_MSG(11, MB1, MB2, MB3, MB4); + sub $1, RNBLKS; + jz .Loop_end; + RESET_KMASKS(); + + lea 128(RINBLKS), RINBLKS; + addq $128, (STATE_T + 0)(RSTATE); + adcq $0, (STATE_T + 8)(RSTATE); + + ROUND(10, MA1, MA2, MA3, MA4); + LOAD_MSG(0, MA1, MA2, MA3, MA4); + ROUND(11, MB1, MB2, MB3, MB4); + LOAD_MSG(1, MB1, MB2, MB3, MB4); + + vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1; + vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2; + + vmovdqa .Liv+(0 * 8) rRIP, ROW3; + vmovdqa .Liv+(4 * 8) rRIP, ROW4; + + vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); + vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE); + + vpxor (STATE_T)(RSTATE), ROW4, ROW4; + + jmp .Loop; + +.align 64, 0xcc +.Loop_end: + ROUND(10, MA1, MA2, MA3, MA4); + ROUND(11, MB1, MB2, MB3, MB4); + + vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1; + vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2; + + vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); + vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE); + + kxorw %k0, %k0, %k0; + vzeroall; + RESET_KMASKS(); + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_blake2b_transform_amd64_avx512, + .-_gcry_blake2b_transform_amd64_avx512;) + +#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ +#endif /*__x86_64*/ diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S new file mode 100644 index 00000000..4457ca99 --- /dev/null +++ b/cipher/blake2s-amd64-avx512.S @@ -0,0 +1,261 @@ +/* blake2s-amd64-avx512.S - AVX512 implementation of BLAKE2s + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +/* The code is based on public-domain/CC0 BLAKE2 reference implementation + * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse + * Copyright 2012, Samuel Neves + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \ + (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) + +#include "asm-common-amd64.h" + +.text + +/* register macros */ +#define RSTATE %rdi +#define RINBLKS %rsi +#define RNBLKS %rdx +#define RIV %rcx + +/* state structure */ +#define STATE_H 0 +#define STATE_T (STATE_H + 8 * 4) +#define STATE_F (STATE_T + 2 * 4) + +/* vector registers */ +#define ROW1 %xmm0 +#define ROW2 %xmm1 +#define ROW3 %xmm2 +#define ROW4 %xmm3 +#define TMP1 %xmm4 +#define TMP1x %xmm4 + +#define MA1 %xmm5 +#define MA2 %xmm6 +#define MA3 %xmm7 +#define MA4 %xmm8 + +#define MB1 %xmm9 +#define MB2 %xmm10 +#define MB3 %xmm11 +#define MB4 %xmm12 + +/********************************************************************** + blake2s/AVX + **********************************************************************/ + +/* On Intel tigerlake, vmovd+vpinsrd approach is faster than vpgatherdd. */ +#define GATHER_MSG(m1, m2, m3, m4, \ + s0, s1, s2, s3, s4, s5, s6, s7, s8, \ + s9, s10, s11, s12, s13, s14, s15) \ + vmovd (s0)*4(RINBLKS), m1; \ + vmovd (s1)*4(RINBLKS), m2; \ + vmovd (s8)*4(RINBLKS), m3; \ + vmovd (s9)*4(RINBLKS), m4; \ + vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \ + vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \ + vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \ + vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \ + vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \ + vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \ + vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \ + vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \ + vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \ + vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \ + vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \ + vpinsrd $3, (s15)*4(RINBLKS), m4, m4; + +#define LOAD_MSG_0(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +#define LOAD_MSG_1(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3) +#define LOAD_MSG_2(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4) +#define LOAD_MSG_3(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8) +#define LOAD_MSG_4(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13) +#define LOAD_MSG_5(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9) +#define LOAD_MSG_6(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11) +#define LOAD_MSG_7(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10) +#define LOAD_MSG_8(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5) +#define LOAD_MSG_9(m1, m2, m3, m4) \ + GATHER_MSG(m1, m2, m3, m4, \ + 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) + +#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4) + +#define ROR_16(in, out) vprord $16, in, out; + +#define ROR_8(in, out) vprord $8, in, out; + +#define ROR_12(in, out) vprord $12, in, out; + +#define ROR_7(in, out) vprord $7, in, out; + +#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \ + vpaddd m, r1, r1; \ + vpaddd r2, r1, r1; \ + vpxor r1, r4, r4; \ + 
ROR_A(r4, r4); \ + vpaddd r4, r3, r3; \ + vpxor r3, r2, r2; \ + ROR_B(r2, r2); + +#define G1(r1, r2, r3, r4, m) \ + G(r1, r2, r3, r4, m, ROR_16, ROR_12); + +#define G2(r1, r2, r3, r4, m) \ + G(r1, r2, r3, r4, m, ROR_8, ROR_7); + +#define MM_SHUFFLE(z,y,x,w) \ + (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +#define DIAGONALIZE(r1, r2, r3, r4) \ + vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \ + vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \ + vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4; + +#define UNDIAGONALIZE(r1, r2, r3, r4) \ + vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \ + vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \ + vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4; + +#define ROUND(r, m1, m2, m3, m4) \ + G1(ROW1, ROW2, ROW3, ROW4, m1); \ + G2(ROW1, ROW2, ROW3, ROW4, m2); \ + DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \ + G1(ROW1, ROW2, ROW3, ROW4, m3); \ + G2(ROW1, ROW2, ROW3, ROW4, m4); \ + UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); + +ELF(.type blake2s_data, at object;) +blake2s_data: +.align 16 +.Liv: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + +.align 64 +.globl _gcry_blake2s_transform_amd64_avx512 +ELF(.type _gcry_blake2s_transform_amd64_avx512, at function;) + +_gcry_blake2s_transform_amd64_avx512: + /* input: + * %rdi: state + * %rsi: blks + * %rdx: num_blks + */ + CFI_STARTPROC(); + + addq $64, (STATE_T + 0)(RSTATE); + + vmovdqa .Liv+(0 * 4) rRIP, ROW3; + vmovdqa .Liv+(4 * 4) rRIP, ROW4; + + vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1; + vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2; + + vpxor (STATE_T)(RSTATE), ROW4, ROW4; + + LOAD_MSG(0, MA1, MA2, MA3, MA4); + LOAD_MSG(1, MB1, MB2, MB3, MB4); + jmp .Loop; + +.align 64, 0xcc +.Loop: + ROUND(0, MA1, MA2, MA3, MA4); + LOAD_MSG(2, MA1, MA2, MA3, MA4); + ROUND(1, MB1, MB2, MB3, MB4); + LOAD_MSG(3, MB1, MB2, MB3, MB4); + ROUND(2, MA1, MA2, MA3, MA4); + LOAD_MSG(4, MA1, MA2, MA3, MA4); + ROUND(3, MB1, MB2, MB3, MB4); + LOAD_MSG(5, MB1, MB2, MB3, MB4); + ROUND(4, MA1, MA2, MA3, MA4); + LOAD_MSG(6, MA1, MA2, MA3, MA4); + ROUND(5, MB1, MB2, MB3, MB4); + LOAD_MSG(7, MB1, MB2, MB3, MB4); + ROUND(6, MA1, MA2, MA3, MA4); + LOAD_MSG(8, MA1, MA2, MA3, MA4); + ROUND(7, MB1, MB2, MB3, MB4); + LOAD_MSG(9, MB1, MB2, MB3, MB4); + sub $1, RNBLKS; + jz .Loop_end; + + lea 64(RINBLKS), RINBLKS; + addq $64, (STATE_T + 0)(RSTATE); + + ROUND(8, MA1, MA2, MA3, MA4); + LOAD_MSG(0, MA1, MA2, MA3, MA4); + ROUND(9, MB1, MB2, MB3, MB4); + LOAD_MSG(1, MB1, MB2, MB3, MB4); + + vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1; + vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2; + + vmovdqa .Liv+(0 * 4) rRIP, ROW3; + vmovdqa .Liv+(4 * 4) rRIP, ROW4; + + vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); + vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE); + + vpxor (STATE_T)(RSTATE), ROW4, ROW4; + + jmp .Loop; + +.align 64, 0xcc +.Loop_end: + ROUND(8, MA1, MA2, MA3, MA4); + ROUND(9, MB1, MB2, MB3, MB4); + + vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1; + vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2; + + vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); + vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE); + + xorl %eax, %eax; + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_blake2s_transform_amd64_avx512, + .-_gcry_blake2s_transform_amd64_avx512;) + +#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ +#endif /*__x86_64*/ diff --git a/configure.ac b/configure.ac index b55510d8..f30198bc 100644 --- a/configure.ac +++ b/configure.ac @@ -3190,7 +3190,9 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation 
      GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo"
+     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx512.lo"
      GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo"
+     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx512.lo"
      ;;
   esac
fi
--
2.34.1

From jussi.kivilinna at iki.fi  Thu Jul 21 10:09:28 2022
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Thu, 21 Jul 2022 11:09:28 +0300
Subject: [PATCH 1/3] sm4: add amd64 GFNI/AVX512 implementation
Message-ID: <20220721080930.3271436-1-jussi.kivilinna@iki.fi>

* cipher/Makefile.am: Add 'sm4-gfni-avx512-amd64.S'.
* cipher/sm4-gfni-avx512-amd64.S: New.
* cipher/sm4.c (USE_GFNI_AVX512): New.
(SM4_context): Add 'use_gfni_avx512' and 'crypt_blk1_16'.
(_gcry_sm4_gfni_avx512_expand_key, _gcry_sm4_gfni_avx512_ctr_enc)
(_gcry_sm4_gfni_avx512_cbc_dec, _gcry_sm4_gfni_avx512_cfb_dec)
(_gcry_sm4_gfni_avx512_ocb_enc, _gcry_sm4_gfni_avx512_ocb_dec)
(_gcry_sm4_gfni_avx512_ocb_auth, _gcry_sm4_gfni_avx512_ctr_enc_blk32)
(_gcry_sm4_gfni_avx512_cbc_dec_blk32)
(_gcry_sm4_gfni_avx512_cfb_dec_blk32)
(_gcry_sm4_gfni_avx512_ocb_enc_blk32)
(_gcry_sm4_gfni_avx512_ocb_dec_blk32)
(_gcry_sm4_gfni_avx512_crypt_blk1_16)
(_gcry_sm4_gfni_avx512_crypt_blk32, sm4_gfni_avx512_crypt_blk1_16)
(sm4_crypt_blk1_32, sm4_encrypt_blk1_32, sm4_decrypt_blk1_32): New.
(sm4_expand_key): Add GFNI/AVX512 code-path.
(sm4_setkey): Use GFNI/AVX512 if supported by CPU; Setup
`ctx->crypt_blk1_16`.
(sm4_encrypt, sm4_decrypt, sm4_get_crypt_blk1_16_fn, _gcry_sm4_ctr_enc)
(_gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec, _gcry_sm4_ocb_crypt)
(_gcry_sm4_ocb_auth) [USE_GFNI_AVX512]: Add GFNI/AVX512 code path.
(_gcry_sm4_xts_crypt): Change parallel block size from 16 to 32.
* configure.ac: Add 'sm4-gfni-avx512-amd64.lo'.
--

Benchmark on Intel i3-1115G4 (tigerlake):

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC enc |      9.45 ns/B     101.0 MiB/s     38.63 c/B      4089
        CBC dec |     0.647 ns/B      1475 MiB/s      2.64 c/B      4089
        CFB enc |      9.43 ns/B     101.1 MiB/s     38.57 c/B      4089
        CFB dec |     0.648 ns/B      1472 MiB/s      2.65 c/B      4089
        CTR enc |     0.661 ns/B      1443 MiB/s      2.70 c/B      4089
        CTR dec |     0.661 ns/B      1444 MiB/s      2.70 c/B      4089
        XTS enc |     0.767 ns/B      1243 MiB/s      3.14 c/B      4089
        XTS dec |     0.772 ns/B      1235 MiB/s      3.16 c/B      4089
        OCB enc |     0.671 ns/B      1421 MiB/s      2.74 c/B      4089
        OCB dec |     0.676 ns/B      1410 MiB/s      2.77 c/B      4089
       OCB auth |     0.668 ns/B      1428 MiB/s      2.73 c/B      4090

After:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC enc |      7.80 ns/B     122.2 MiB/s     31.91 c/B      4090
        CBC dec |     0.293 ns/B      3258 MiB/s      1.20 c/B    4095±3
        CFB enc |      7.80 ns/B     122.2 MiB/s     31.90 c/B      4089
        CFB dec |     0.294 ns/B      3247 MiB/s      1.20 c/B    4096±3
        CTR enc |     0.306 ns/B      3120 MiB/s      1.25 c/B    4098±4
        CTR dec |     0.300 ns/B      3182 MiB/s      1.23 c/B    4103±6
        XTS enc |     0.431 ns/B      2211 MiB/s      1.77 c/B    4107±9
        XTS dec |     0.431 ns/B      2213 MiB/s      1.77 c/B    4102±6
        OCB enc |     0.324 ns/B      2946 MiB/s      1.33 c/B    4096±3
        OCB dec |     0.326 ns/B      2923 MiB/s      1.34 c/B    4093±2
       OCB auth |     0.536 ns/B      1779 MiB/s      2.19 c/B      4089

CBC/CFB enc: 1.20x faster
CBC/CFB dec: 2.20x faster
CTR: 2.18x faster
XTS: 1.78x faster
OCB enc/dec: 2.07x faster
OCB auth: 1.24x faster

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am             |    6 +-
 cipher/sm4-gfni-avx512-amd64.S | 1750 ++++++++++++++++++++++++++++++++
 cipher/sm4.c                   |  336 +++++-
 configure.ac                   |    1 +
 4 files changed, 2076 insertions(+), 17 deletions(-)
 create mode 100644 cipher/sm4-gfni-avx512-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 97823cb4..3d95a794 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -118,9 +118,9 @@
EXTRA_libcipher_la_SOURCES = \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S \ - sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \ - sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \ - sm4-armv9-aarch64-sve-ce.S \ + sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \ + sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \ + sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S new file mode 100644 index 00000000..1d5e9a48 --- /dev/null +++ b/cipher/sm4-gfni-avx512-amd64.S @@ -0,0 +1,1750 @@ +/* sm4-gfni-avx512-amd64.S - GFNI/AVX512 implementation of SM4 cipher + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#ifdef __x86_64 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) + +#include "asm-common-amd64.h" + +/********************************************************************** + helper macros + **********************************************************************/ + +/* Transpose four 32-bit words between 128-bit vectors. 
*/ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/********************************************************************** + 4-way && 8-way SM4 with GFNI and AVX512 (128-bit vectors) + **********************************************************************/ + +/* vector registers */ +#define RX0 %ymm0 +#define RX1 %ymm1 +#define RX0x %xmm0 +#define RX1x %xmm1 +#define RX0z %zmm0 +#define RX1z %zmm1 + +#define RTMP0 %ymm2 +#define RTMP1 %ymm3 +#define RTMP2 %ymm4 +#define RTMP3 %ymm5 +#define RTMP4 %ymm6 +#define RTMP0x %xmm2 +#define RTMP1x %xmm3 +#define RTMP2x %xmm4 +#define RTMP3x %xmm5 +#define RTMP4x %xmm6 +#define RTMP0z %zmm2 +#define RTMP1z %zmm3 +#define RTMP2z %zmm4 +#define RTMP3z %zmm5 +#define RTMP4z %zmm6 + +#define RNOT %ymm7 +#define RNOTx %xmm7 +#define RNOTz %zmm7 + +#define RA0 %ymm8 +#define RA1 %ymm9 +#define RA2 %ymm10 +#define RA3 %ymm11 +#define RA0x %xmm8 +#define RA1x %xmm9 +#define RA2x %xmm10 +#define RA3x %xmm11 +#define RA0z %zmm8 +#define RA1z %zmm9 +#define RA2z %zmm10 +#define RA3z %zmm11 + +#define RB0 %ymm12 +#define RB1 %ymm13 +#define RB2 %ymm14 +#define RB3 %ymm15 +#define RB0x %xmm12 +#define RB1x %xmm13 +#define RB2x %xmm14 +#define RB3x %xmm15 +#define RB0z %zmm12 +#define RB1z %zmm13 +#define RB2z %zmm14 +#define RB3z %zmm15 + +.text +.align 32 + +/* Affine transform, SM4 field to AES field */ +.Lpre_affine_s: + .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34 + .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34 + .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34 + .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34 + +/* Affine transform, AES field to SM4 field */ +.Lpost_affine_s: + .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7 + .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7 + .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7 + .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* For input word byte-swap */ +.Lbswap32_mask: + .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +.Lcounter2222_lo: + .quad 2, 0 +.Lcounter4444_lo: + .quad 4, 0 +.Lcounter8888_lo: + .quad 8, 0 +.Lcounter16161616_lo: + .quad 16, 0 +.Lcounter1111_hi: + .quad 0, 1 + +.align 64 +.Lcounter0123_lo: + .quad 0, 0 + .quad 1, 0 + .quad 2, 0 + .quad 3, 0 + +.align 16 +.globl _gcry_sm4_gfni_avx512_expand_key +ELF(.type _gcry_sm4_gfni_avx512_expand_key, at function;) +_gcry_sm4_gfni_avx512_expand_key: + /* input: + * %rdi: 128-bit key + * %rsi: rkey_enc + * %rdx: rkey_dec + * %rcx: fk array + * %r8: ck array + */ + CFI_STARTPROC(); + + vmovd 0*4(%rdi), RA0x; + vmovd 1*4(%rdi), RA1x; + vmovd 2*4(%rdi), RA2x; + vmovd 3*4(%rdi), RA3x; + + vmovdqa .Lbswap32_mask rRIP, RTMP2x; + vpshufb RTMP2x, RA0x, RA0x; + vpshufb RTMP2x, RA1x, RA1x; + vpshufb RTMP2x, RA2x, RA2x; + vpshufb RTMP2x, RA3x, RA3x; + + vmovd 0*4(%rcx), RB0x; + vmovd 1*4(%rcx), RB1x; + vmovd 2*4(%rcx), RB2x; + vmovd 3*4(%rcx), RB3x; + vpxor RB0x, RA0x, RA0x; + vpxor RB1x, RA1x, RA1x; + vpxor RB2x, RA2x, RA2x; + vpxor RB3x, RA3x, RA3x; + +#define ROUND(round, s0, s1, s2, s3) \ + vpxord (4*(round))(%r8) {1to4}, s1, RX0x; \ + vpternlogd $0x96, s2, s3, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + vgf2p8affineqb $0x65, 
.Lpre_affine_s rRIP, RX0x, RX0x; \ + vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \ + \ + /* linear part */ \ + vpxor RX0x, s0, s0; /* s0 ^ x */ \ + vprold $13, RX0x, RTMP1x; \ + vprold $23, RX0x, RTMP3x; \ + vpternlogd $0x96, RTMP1x, RTMP3x, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */ + + leaq (32*4)(%r8), %rax; + leaq (32*4)(%rdx), %rdx; +.align 16 +.Lroundloop_expand_key: + leaq (-4*4)(%rdx), %rdx; + ROUND(0, RA0x, RA1x, RA2x, RA3x); + ROUND(1, RA1x, RA2x, RA3x, RA0x); + ROUND(2, RA2x, RA3x, RA0x, RA1x); + ROUND(3, RA3x, RA0x, RA1x, RA2x); + leaq (4*4)(%r8), %r8; + vmovd RA0x, (0*4)(%rsi); + vmovd RA1x, (1*4)(%rsi); + vmovd RA2x, (2*4)(%rsi); + vmovd RA3x, (3*4)(%rsi); + vmovd RA0x, (3*4)(%rdx); + vmovd RA1x, (2*4)(%rdx); + vmovd RA2x, (1*4)(%rdx); + vmovd RA3x, (0*4)(%rdx); + leaq (4*4)(%rsi), %rsi; + cmpq %rax, %r8; + jne .Lroundloop_expand_key; + +#undef ROUND + + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_expand_key,.-_gcry_sm4_gfni_avx512_expand_key;) + +.align 16 +ELF(.type sm4_gfni_avx512_crypt_blk1_4, at function;) +sm4_gfni_avx512_crypt_blk1_4: + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..4 blocks) + * %rdx: src (1..4 blocks) + * %rcx: num blocks (1..4) + */ + CFI_STARTPROC(); + + vmovdqu 0*16(%rdx), RA0x; + vmovdqa RA0x, RA1x; + vmovdqa RA0x, RA2x; + vmovdqa RA0x, RA3x; + cmpq $2, %rcx; + jb .Lblk4_load_input_done; + vmovdqu 1*16(%rdx), RA1x; + je .Lblk4_load_input_done; + vmovdqu 2*16(%rdx), RA2x; + cmpq $3, %rcx; + je .Lblk4_load_input_done; + vmovdqu 3*16(%rdx), RA3x; + +.Lblk4_load_input_done: + + vmovdqa .Lbswap32_mask rRIP, RTMP2x; + vpshufb RTMP2x, RA0x, RA0x; + vpshufb RTMP2x, RA1x, RA1x; + vpshufb RTMP2x, RA2x, RA2x; + vpshufb RTMP2x, RA3x, RA3x; + + transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x); + +#define ROUND(round, s0, s1, s2, s3) \ + vpxord (4*(round))(%rdi) {1to4}, s1, RX0x; \ + vpternlogd $0x96, s2, s3, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \ + vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \ + \ + /* linear part */ \ + vprold $2, RX0x, RTMP0x; \ + vprold $10, RX0x, RTMP1x; \ + vprold $18, RX0x, RTMP2x; \ + vpternlogd $0x96, RTMP0x, RX0x, s0; /* s0 ^ x ^ rol(x,2) */ \ + vprold $24, RX0x, RX0x; \ + vpternlogd $0x96, RTMP1x, RTMP2x, RX0x; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RX0x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk4: + ROUND(0, RA0x, RA1x, RA2x, RA3x); + ROUND(1, RA1x, RA2x, RA3x, RA0x); + ROUND(2, RA2x, RA3x, RA0x, RA1x); + ROUND(3, RA3x, RA0x, RA1x, RA2x); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk4; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2x; + + transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x); + vpshufb RTMP2x, RA0x, RA0x; + vpshufb RTMP2x, RA1x, RA1x; + vpshufb RTMP2x, RA2x, RA2x; + vpshufb RTMP2x, RA3x, RA3x; + + vmovdqu RA0x, 0*16(%rsi); + cmpq $2, %rcx; + jb .Lblk4_store_output_done; + vmovdqu RA1x, 1*16(%rsi); + je .Lblk4_store_output_done; + vmovdqu RA2x, 2*16(%rsi); + cmpq $3, %rcx; + je .Lblk4_store_output_done; + vmovdqu RA3x, 3*16(%rsi); + +.Lblk4_store_output_done: + vzeroall; + xorl %eax, %eax; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size sm4_gfni_avx512_crypt_blk1_4,.-sm4_gfni_avx512_crypt_blk1_4;) + +.align 16 +ELF(.type __sm4_gfni_crypt_blk8, at function;) +__sm4_gfni_crypt_blk8: + /* input: + * %rdi: round key array, CTX + * RA0, 
RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + */ + CFI_STARTPROC(); + + vmovdqa .Lbswap32_mask rRIP, RTMP2x; + vpshufb RTMP2x, RA0x, RA0x; + vpshufb RTMP2x, RA1x, RA1x; + vpshufb RTMP2x, RA2x, RA2x; + vpshufb RTMP2x, RA3x, RA3x; + vpshufb RTMP2x, RB0x, RB0x; + vpshufb RTMP2x, RB1x, RB1x; + vpshufb RTMP2x, RB2x, RB2x; + vpshufb RTMP2x, RB3x, RB3x; + + transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x); + transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vpbroadcastd (4*(round))(%rdi), RX1x; \ + vmovdqa .Lpre_affine_s rRIP, RTMP2x; \ + vmovdqa .Lpost_affine_s rRIP, RTMP3x; \ + vpxor s1, RX1x, RX0x; \ + vpternlogd $0x96, s2, s3, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \ + vpxor r1, RX1x, RX1x; \ + vpternlogd $0x96, r2, r3, RX1x; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + vgf2p8affineqb $0x65, RTMP2x, RX0x, RX0x; \ + vgf2p8affineinvqb $0xd3, RTMP3x, RX0x, RX0x; \ + vgf2p8affineqb $0x65, RTMP2x, RX1x, RX1x; \ + vgf2p8affineinvqb $0xd3, RTMP3x, RX1x, RX1x; \ + \ + /* linear part */ \ + vprold $2, RX0x, RTMP0x; \ + vprold $10, RX0x, RTMP1x; \ + vprold $18, RX0x, RTMP2x; \ + vpternlogd $0x96, RTMP0x, RX0x, s0; /* s0 ^ x ^ rol(x,2) */ \ + vprold $24, RX0x, RX0x; \ + vprold $2, RX1x, RTMP3x; \ + vprold $10, RX1x, RTMP4x; \ + vprold $18, RX1x, RTMP0x; \ + vpternlogd $0x96, RTMP3x, RX1x, r0; /* r0 ^ x ^ rol(x,2) */ \ + vprold $24, RX1x, RX1x; \ + vpternlogd $0x96, RTMP1x, RTMP2x, RX0x; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpternlogd $0x96, RTMP4x, RTMP0x, RX1x; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RX0x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RX1x, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk8: + ROUND(0, RA0x, RA1x, RA2x, RA3x, RB0x, RB1x, RB2x, RB3x); + ROUND(1, RA1x, RA2x, RA3x, RA0x, RB1x, RB2x, RB3x, RB0x); + ROUND(2, RA2x, RA3x, RA0x, RA1x, RB2x, RB3x, RB0x, RB1x); + ROUND(3, RA3x, RA0x, RA1x, RA2x, RB3x, RB0x, RB1x, RB2x); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk8; + +#undef ROUND + + vmovdqa .Lbswap128_mask rRIP, RTMP2x; + + transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x); + transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x); + vpshufb RTMP2x, RA0x, RA0x; + vpshufb RTMP2x, RA1x, RA1x; + vpshufb RTMP2x, RA2x, RA2x; + vpshufb RTMP2x, RA3x, RA3x; + vpshufb RTMP2x, RB0x, RB0x; + vpshufb RTMP2x, RB1x, RB1x; + vpshufb RTMP2x, RB2x, RB2x; + vpshufb RTMP2x, RB3x, RB3x; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __sm4_gfni_crypt_blk8,.-__sm4_gfni_crypt_blk8;) + +.align 16 +ELF(.type _gcry_sm4_gfni_avx512_crypt_blk1_8, at function;) +_gcry_sm4_gfni_avx512_crypt_blk1_8: + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..8 blocks) + * %rdx: src (1..8 blocks) + * %rcx: num blocks (1..8) + */ + CFI_STARTPROC(); + + cmpq $5, %rcx; + jb sm4_gfni_avx512_crypt_blk1_4; + vmovdqu (0 * 16)(%rdx), RA0x; + vmovdqu (1 * 16)(%rdx), RA1x; + vmovdqu (2 * 16)(%rdx), RA2x; + vmovdqu (3 * 16)(%rdx), RA3x; + vmovdqu (4 * 16)(%rdx), RB0x; + vmovdqa RB0x, RB1x; + vmovdqa RB0x, RB2x; + vmovdqa RB0x, RB3x; + je .Lblk8_load_input_done; + vmovdqu (5 * 16)(%rdx), RB1x; + cmpq $7, %rcx; + jb .Lblk8_load_input_done; + vmovdqu (6 * 16)(%rdx), RB2x; + je .Lblk8_load_input_done; + vmovdqu (7 * 16)(%rdx), RB3x; + +.Lblk8_load_input_done: + call 
__sm4_gfni_crypt_blk8; + + cmpq $6, %rcx; + vmovdqu RA0x, (0 * 16)(%rsi); + vmovdqu RA1x, (1 * 16)(%rsi); + vmovdqu RA2x, (2 * 16)(%rsi); + vmovdqu RA3x, (3 * 16)(%rsi); + vmovdqu RB0x, (4 * 16)(%rsi); + jb .Lblk8_store_output_done; + vmovdqu RB1x, (5 * 16)(%rsi); + je .Lblk8_store_output_done; + vmovdqu RB2x, (6 * 16)(%rsi); + cmpq $7, %rcx; + je .Lblk8_store_output_done; + vmovdqu RB3x, (7 * 16)(%rsi); + +.Lblk8_store_output_done: + vzeroall; + xorl %eax, %eax; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_crypt_blk1_8,.-_gcry_sm4_gfni_avx512_crypt_blk1_8;) + +/********************************************************************** + 16-way SM4 with GFNI and AVX512 (256-bit vectors) + **********************************************************************/ + +.align 16 +ELF(.type __sm4_gfni_crypt_blk16, at function;) +__sm4_gfni_crypt_blk16: + /* input: + * %rdi: ctx, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * plaintext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel + * ciphertext blocks + */ + CFI_STARTPROC(); + + vbroadcasti128 .Lbswap32_mask rRIP, RTMP2; + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vpbroadcastd (4*(round))(%rdi), RX1; \ + vbroadcasti128 .Lpre_affine_s rRIP, RTMP2; \ + vbroadcasti128 .Lpost_affine_s rRIP, RTMP3; \ + vpxor s1, RX1, RX0; \ + vpternlogd $0x96, s2, s3, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \ + vpxor r1, RX1, RX1; \ + vpternlogd $0x96, r2, r3, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + vgf2p8affineqb $0x65, RTMP2, RX0, RX0; \ + vgf2p8affineinvqb $0xd3, RTMP3, RX0, RX0; \ + vgf2p8affineqb $0x65, RTMP2, RX1, RX1; \ + vgf2p8affineinvqb $0xd3, RTMP3, RX1, RX1; \ + \ + /* linear part */ \ + vprold $2, RX0, RTMP0; \ + vprold $10, RX0, RTMP1; \ + vprold $18, RX0, RTMP2; \ + vpternlogd $0x96, RTMP0, RX0, s0; /* s0 ^ x ^ rol(x,2) */ \ + vprold $24, RX0, RX0; \ + vprold $2, RX1, RTMP3; \ + vprold $10, RX1, RTMP4; \ + vprold $18, RX1, RTMP0; \ + vpternlogd $0x96, RTMP3, RX1, r0; /* r0 ^ x ^ rol(x,2) */ \ + vprold $24, RX1, RX1; \ + vpternlogd $0x96, RTMP1, RTMP2, RX0; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpternlogd $0x96, RTMP4, RTMP0, RX1; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RX0, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxor RX1, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk16: + ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3); + ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0); + ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1); + ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk16; + +#undef ROUND + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP2; + + transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1); + vpshufb RTMP2, RA0, RA0; + vpshufb RTMP2, RA1, RA1; + vpshufb RTMP2, RA2, RA2; + vpshufb RTMP2, RA3, RA3; + vpshufb RTMP2, RB0, RB0; + vpshufb RTMP2, RB1, RB1; + vpshufb RTMP2, RB2, RB2; + vpshufb RTMP2, RB3, RB3; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size 
__sm4_gfni_crypt_blk16,.-__sm4_gfni_crypt_blk16;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_crypt_blk1_16 +ELF(.type _gcry_sm4_gfni_avx512_crypt_blk1_16, at function;) +_gcry_sm4_gfni_avx512_crypt_blk1_16: + /* input: + * %rdi: round key array, CTX + * %rsi: dst (1..16 blocks) + * %rdx: src (1..16 blocks) + * %rcx: num blocks (1..16) + */ + CFI_STARTPROC(); + +#define LOAD_INPUT(offset, yreg) \ + cmpq $(1 + 2 * (offset)), %rcx; \ + jb .Lblk16_load_input_done; \ + ja 1f; \ + vmovdqu (offset) * 32(%rdx), yreg##x; \ + jmp .Lblk16_load_input_done; \ + 1: \ + vmovdqu (offset) * 32(%rdx), yreg; + + cmpq $8, %rcx; + jbe _gcry_sm4_gfni_avx512_crypt_blk1_8; + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + LOAD_INPUT(4, RB0); + LOAD_INPUT(5, RB1); + LOAD_INPUT(6, RB2); + LOAD_INPUT(7, RB3); +#undef LOAD_INPUT + +.Lblk16_load_input_done: + call __sm4_gfni_crypt_blk16; + +#define STORE_OUTPUT(yreg, offset) \ + cmpq $(1 + 2 * (offset)), %rcx; \ + jb .Lblk16_store_output_done; \ + ja 1f; \ + vmovdqu yreg##x, (offset) * 32(%rsi); \ + jmp .Lblk16_store_output_done; \ + 1: \ + vmovdqu yreg, (offset) * 32(%rsi); + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + STORE_OUTPUT(RB0, 4); + STORE_OUTPUT(RB1, 5); + STORE_OUTPUT(RB2, 6); + STORE_OUTPUT(RB3, 7); +#undef STORE_OUTPUT + +.Lblk16_store_output_done: + vzeroall; + xorl %eax, %eax; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_crypt_blk1_16,.-_gcry_sm4_gfni_avx512_crypt_blk1_16;) + +#define add_le128(out, in, lo_counter, hi_counter1) \ + vpaddq lo_counter, in, out; \ + vpcmpuq $1, lo_counter, out, %k1; \ + kaddb %k1, %k1, %k1; \ + vpaddq hi_counter1, out, out{%k1}; + +.align 16 +.globl _gcry_sm4_gfni_avx512_ctr_enc +ELF(.type _gcry_sm4_gfni_avx512_ctr_enc, at function;) +_gcry_sm4_gfni_avx512_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + + vbroadcasti128 .Lbswap128_mask rRIP, RTMP0; + vmovdqa .Lcounter0123_lo rRIP, RTMP1; + vbroadcasti128 .Lcounter2222_lo rRIP, RTMP2; + vbroadcasti128 .Lcounter4444_lo rRIP, RTMP3; + vbroadcasti128 .Lcounter8888_lo rRIP, RTMP4; + + /* load IV and byteswap */ + movq 8(%rcx), %r11; + bswapq %r11; + vbroadcasti128 (%rcx), RB3; + vpshufb RTMP0, RB3, RB3; + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 16), %r11; + ja .Lhandle_ctr_carry_blk16; + + /* construct IVs */ + vpaddq RTMP1, RB3, RA0; /* +0:+1 */ + vpaddq RTMP2, RA0, RA1; /* +2:+3 */ + vpaddq RTMP3, RA0, RA2; /* +4:+5 */ + vpaddq RTMP3, RA1, RA3; /* +6:+7 */ + vpaddq RTMP4, RA0, RB0; /* +8... */ + vpaddq RTMP4, RA1, RB1; /* +10... */ + vpaddq RTMP4, RA2, RB2; /* +12... */ + vpaddq RTMP4, RA3, RB3; /* +14... */ + + /* Update counter */ + leaq 16(%r11), %r11; + bswapq %r11; + movq %r11, 8(%rcx); + + jmp .Lctr_carry_done_blk16; + +.Lhandle_ctr_carry_blk16: + vbroadcasti128 .Lcounter1111_hi rRIP, RNOT; + + /* construct IVs */ + add_le128(RA0, RB3, RTMP1, RNOT); /* +0:+1 */ + add_le128(RA1, RA0, RTMP2, RNOT); /* +2:+3 */ + add_le128(RA2, RA0, RTMP3, RNOT); /* +4:+5 */ + add_le128(RA3, RA1, RTMP3, RNOT); /* +6:+7 */ + add_le128(RB0, RA0, RTMP4, RNOT); /* +8... */ + add_le128(RB1, RA1, RTMP4, RNOT); /* +10... */ + add_le128(RB2, RA2, RTMP4, RNOT); /* +12... */ + add_le128(RB3, RA3, RTMP4, RNOT); /* +14... 
*/ + + /* Update counter */ + addq $16, %r11; + movq (%rcx), %r10; + bswapq %r10; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + +.align 16 +.Lctr_carry_done_blk16: + /* Byte-swap IVs. */ + vpshufb RTMP0, RA0, RA0; + vpshufb RTMP0, RA1, RA1; + vpshufb RTMP0, RA2, RA2; + vpshufb RTMP0, RA3, RA3; + vpshufb RTMP0, RB0, RB0; + vpshufb RTMP0, RB1, RB1; + vpshufb RTMP0, RB2, RB2; + vpshufb RTMP0, RB3, RB3; + + call __sm4_gfni_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + kxorq %k1, %k1, %k1; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ctr_enc,.-_gcry_sm4_gfni_avx512_ctr_enc;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_cbc_dec +ELF(.type _gcry_sm4_gfni_avx512_cbc_dec, at function;) +_gcry_sm4_gfni_avx512_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + call __sm4_gfni_crypt_blk16; + + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RNOT; + vpxor RNOT, RA0, RA0; + vpxor (0 * 32 + 16)(%rdx), RA1, RA1; + vpxor (1 * 32 + 16)(%rdx), RA2, RA2; + vpxor (2 * 32 + 16)(%rdx), RA3, RA3; + vpxor (3 * 32 + 16)(%rdx), RB0, RB0; + vpxor (4 * 32 + 16)(%rdx), RB1, RB1; + vpxor (5 * 32 + 16)(%rdx), RB2, RB2; + vpxor (6 * 32 + 16)(%rdx), RB3, RB3; + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_cbc_dec,.-_gcry_sm4_gfni_avx512_cbc_dec;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_cfb_dec +ELF(.type _gcry_sm4_gfni_avx512_cfb_dec, at function;) +_gcry_sm4_gfni_avx512_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + + /* Load input */ + vmovdqu (%rcx), RNOTx; + vinserti128 $1, (%rdx), RNOT, RA0; + vmovdqu (0 * 32 + 16)(%rdx), RA1; + vmovdqu (1 * 32 + 16)(%rdx), RA2; + vmovdqu (2 * 32 + 16)(%rdx), RA3; + vmovdqu (3 * 32 + 16)(%rdx), RB0; + vmovdqu (4 * 32 + 16)(%rdx), RB1; + vmovdqu (5 * 32 + 16)(%rdx), RB2; + vmovdqu (6 * 32 + 16)(%rdx), RB3; + + /* Update IV */ + vmovdqu (7 * 32 + 16)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); + + call __sm4_gfni_crypt_blk16; + + vpxor (0 * 32)(%rdx), RA0, RA0; + vpxor (1 * 32)(%rdx), RA1, RA1; + vpxor (2 * 32)(%rdx), RA2, RA2; + vpxor (3 * 32)(%rdx), RA3, RA3; + vpxor (4 * 32)(%rdx), RB0, RB0; + vpxor (5 * 32)(%rdx), RB1, RB1; + vpxor (6 * 32)(%rdx), RB2, RB2; + vpxor (7 * 32)(%rdx), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 
32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_cfb_dec,.-_gcry_sm4_gfni_avx512_cfb_dec;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_ocb_enc +ELF(.type _gcry_sm4_gfni_avx512_ocb_enc, at function;) + +_gcry_sm4_gfni_avx512_ocb_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[16]) + */ + CFI_STARTPROC(); + + subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + + vmovdqu (%rcx), RTMP0x; + vmovdqu (%r8), RTMP1x; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, yreg, inreg) \ + vmovdqu (n * 32)(%rdx), inreg; \ + vpxor (l0reg), RTMP0x, RNOTx; \ + vpxor (l1reg), RNOTx, RTMP0x; \ + vinserti128 $1, RTMP0x, RNOT, RNOT; \ + vpxor inreg, RNOT, yreg; \ + vmovdqu RNOT, (n * 32)(%rsi); + + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, %r11, RA0, RTMP2); + OCB_INPUT(1, %r12, %r13, RA1, RTMP3); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(2, %r10, %r11, RA2, RTMP4); + vpternlogd $0x96, RTMP2, RTMP3, RTMP4; + OCB_INPUT(3, %r12, %r13, RA3, RX0); + movq (8 * 8)(%r9), %r10; + movq (9 * 8)(%r9), %r11; + movq (10 * 8)(%r9), %r12; + movq (11 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, %r11, RB0, RX1); + OCB_INPUT(5, %r12, %r13, RB1, RTMP2); + vpternlogd $0x96, RX0, RX1, RTMP2; + movq (12 * 8)(%r9), %r10; + movq (13 * 8)(%r9), %r11; + movq (14 * 8)(%r9), %r12; + movq (15 * 8)(%r9), %r13; + OCB_INPUT(6, %r10, %r11, RB2, RTMP3); + OCB_INPUT(7, %r12, %r13, RB3, RX0); + vpternlogd $0x96, RTMP3, RX0, RTMP1; +#undef OCB_INPUT + + vpternlogd $0x96, RTMP4, RTMP2, RTMP1; + vextracti128 $1, RTMP1, RNOTx; + vmovdqu RTMP0x, (%rcx); + vpxor RNOTx, RTMP1x, RTMP1x; + vmovdqu RTMP1x, (%r8); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + + call __sm4_gfni_crypt_blk16; + + addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); + + vpxor (0 * 32)(%rsi), RA0, RA0; + vpxor (1 * 32)(%rsi), RA1, RA1; + vpxor (2 * 32)(%rsi), RA2, RA2; + vpxor (3 * 32)(%rsi), RA3, RA3; + vpxor (4 * 32)(%rsi), RB0, RB0; + vpxor (5 * 32)(%rsi), RB1, RB1; + vpxor (6 * 32)(%rsi), RB2, RB2; + vpxor (7 * 32)(%rsi), RB3, RB3; + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ocb_enc,.-_gcry_sm4_gfni_avx512_ocb_enc;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_ocb_dec +ELF(.type _gcry_sm4_gfni_avx512_ocb_dec, at function;) + +_gcry_sm4_gfni_avx512_ocb_dec: + /* input: + * %rdi: ctx, CTX + 
* %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[16]) + */ + CFI_STARTPROC(); + + subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + + vmovdqu (%rcx), RTMP0x; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, yreg) \ + vmovdqu (n * 32)(%rdx), yreg; \ + vpxor (l0reg), RTMP0x, RNOTx; \ + vpxor (l1reg), RNOTx, RTMP0x; \ + vinserti128 $1, RTMP0x, RNOT, RNOT; \ + vpxor yreg, RNOT, yreg; \ + vmovdqu RNOT, (n * 32)(%rsi); + + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, %r11, RA0); + OCB_INPUT(1, %r12, %r13, RA1); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(2, %r10, %r11, RA2); + OCB_INPUT(3, %r12, %r13, RA3); + movq (8 * 8)(%r9), %r10; + movq (9 * 8)(%r9), %r11; + movq (10 * 8)(%r9), %r12; + movq (11 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, %r11, RB0); + OCB_INPUT(5, %r12, %r13, RB1); + movq (12 * 8)(%r9), %r10; + movq (13 * 8)(%r9), %r11; + movq (14 * 8)(%r9), %r12; + movq (15 * 8)(%r9), %r13; + OCB_INPUT(6, %r10, %r11, RB2); + OCB_INPUT(7, %r12, %r13, RB3); +#undef OCB_INPUT + + vmovdqu RTMP0x, (%rcx); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + + call __sm4_gfni_crypt_blk16; + + addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); + + vpxor (0 * 32)(%rsi), RA0, RA0; + vpxor (1 * 32)(%rsi), RA1, RA1; + vpxor (2 * 32)(%rsi), RA2, RA2; + vpxor (3 * 32)(%rsi), RA3, RA3; + vpxor (4 * 32)(%rsi), RB0, RB0; + vpxor (5 * 32)(%rsi), RB1, RB1; + vpxor (6 * 32)(%rsi), RB2, RB2; + vpxor (7 * 32)(%rsi), RB3, RB3; + + /* Checksum_i = Checksum_{i-1} xor P_i */ + + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + + vpternlogd $0x96, RA0, RA1, RA2; + vpternlogd $0x96, RA3, RB0, RB1; + vpternlogd $0x96, RB2, RB3, RA2; + vpxord RA2, RB1, RTMP1; + + vextracti128 $1, RTMP1, RNOTx; + vpternlogd $0x96, (%r8), RNOTx, RTMP1x; + vmovdqu RTMP1x, (%r8); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ocb_dec,.-_gcry_sm4_gfni_avx512_ocb_dec;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_ocb_auth +ELF(.type _gcry_sm4_gfni_avx512_ocb_auth, at function;) + +_gcry_sm4_gfni_avx512_ocb_auth: + /* input: + * %rdi: ctx, CTX + * %rsi: abuf (16 blocks) + * %rdx: offset + * %rcx: checksum + * %r8 : L pointers (void *L[16]) + */ + CFI_STARTPROC(); + + subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + + vmovdqu (%rdx), RTMP0x; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, 
l1reg, yreg) \ + vmovdqu (n * 32)(%rsi), yreg; \ + vpxor (l0reg), RTMP0x, RNOTx; \ + vpxor (l1reg), RNOTx, RTMP0x; \ + vinserti128 $1, RTMP0x, RNOT, RNOT; \ + vpxor yreg, RNOT, yreg; + + movq (0 * 8)(%r8), %r10; + movq (1 * 8)(%r8), %r11; + movq (2 * 8)(%r8), %r12; + movq (3 * 8)(%r8), %r13; + OCB_INPUT(0, %r10, %r11, RA0); + OCB_INPUT(1, %r12, %r13, RA1); + movq (4 * 8)(%r8), %r10; + movq (5 * 8)(%r8), %r11; + movq (6 * 8)(%r8), %r12; + movq (7 * 8)(%r8), %r13; + OCB_INPUT(2, %r10, %r11, RA2); + OCB_INPUT(3, %r12, %r13, RA3); + movq (8 * 8)(%r8), %r10; + movq (9 * 8)(%r8), %r11; + movq (10 * 8)(%r8), %r12; + movq (11 * 8)(%r8), %r13; + OCB_INPUT(4, %r10, %r11, RB0); + OCB_INPUT(5, %r12, %r13, RB1); + movq (12 * 8)(%r8), %r10; + movq (13 * 8)(%r8), %r11; + movq (14 * 8)(%r8), %r12; + movq (15 * 8)(%r8), %r13; + OCB_INPUT(6, %r10, %r11, RB2); + OCB_INPUT(7, %r12, %r13, RB3); +#undef OCB_INPUT + + vmovdqu RTMP0x, (%rdx); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + + call __sm4_gfni_crypt_blk16; + + addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); + + vpternlogd $0x96, RA0, RA1, RA2; + vpternlogd $0x96, RA3, RB0, RB1; + vpternlogd $0x96, RB2, RB3, RA2; + vpxor RA2, RB1, RTMP1; + + vextracti128 $1, RTMP1, RNOTx; + vpternlogd $0x96, (%rcx), RNOTx, RTMP1x; + vmovdqu RTMP1x, (%rcx); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ocb_auth,.-_gcry_sm4_gfni_avx512_ocb_auth;) + +/********************************************************************** + 32-way SM4 with GFNI and AVX512 (512-bit vectors) + **********************************************************************/ + +.align 16 +ELF(.type __sm4_gfni_crypt_blk32, at function;) +__sm4_gfni_crypt_blk32: + /* input: + * %rdi: ctx, CTX + * RA0z, RA1z, RA2z, RA3z, RB0z, RB1z, RB2z, RB3z: 32 parallel plaintext blocks + * output: + * RA0z, RA1z, RA2z, RA3z, RB0z, RB1z, RB2z, RB3z: 32 parallel ciphertext blocks + */ + CFI_STARTPROC(); + + vbroadcasti32x4 .Lbswap32_mask rRIP, RTMP2z; + vpshufb RTMP2z, RA0z, RA0z; + vpshufb RTMP2z, RA1z, RA1z; + vpshufb RTMP2z, RA2z, RA2z; + vpshufb RTMP2z, RA3z, RA3z; + vpshufb RTMP2z, RB0z, RB0z; + vpshufb RTMP2z, RB1z, RB1z; + vpshufb RTMP2z, RB2z, RB2z; + vpshufb RTMP2z, RB3z, RB3z; + + vbroadcasti32x4 .Lpre_affine_s rRIP, %zmm16; + vbroadcasti32x4 .Lpost_affine_s rRIP, %zmm17; + + transpose_4x4(RA0z, RA1z, RA2z, RA3z, RTMP0z, RTMP1z); + transpose_4x4(RB0z, RB1z, RB2z, RB3z, RTMP0z, RTMP1z); + +#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \ + vpbroadcastd (4*(round))(%rdi), RX1z; \ + vpxord s1, RX1z, RX0z; \ + vpternlogd $0x96, s2, s3, RX0z; /* s1 ^ s2 ^ s3 ^ rk */ \ + vpxord r1, RX1z, RX1z; \ + vpternlogd $0x96, r2, r3, RX1z; /* r1 ^ r2 ^ r3 ^ rk */ \ + \ + /* sbox, non-linear part */ \ + vgf2p8affineqb $0x65, %zmm16, RX0z, RX0z; \ + vgf2p8affineinvqb $0xd3, %zmm17, RX0z, RX0z; \ + vgf2p8affineqb $0x65, %zmm16, RX1z, RX1z; \ + vgf2p8affineinvqb $0xd3, %zmm17, RX1z, RX1z; \ + \ + /* linear part */ \ + vprold $2, RX0z, RTMP0z; \ + vprold $10, RX0z, RTMP1z; \ + vprold $18, RX0z, RTMP2z; \ + vpternlogd $0x96, RTMP0z, RX0z, s0; /* s0 ^ x ^ rol(x,2) */ \ + vprold $24, RX0z, RX0z; \ + vprold $2, RX1z, RTMP3z; \ + vprold $10, RX1z, RTMP4z; \ + vprold $18, RX1z, RTMP0z; \ + vpternlogd $0x96, RTMP3z, RX1z, r0; /* r0 ^ x ^ rol(x,2) */ \ + vprold $24, RX1z, RX1z; \ + vpternlogd $0x96, RTMP1z, RTMP2z, RX0z; /* rol(x,10) ^ 
rol(x,18) ^ rol(x,24) */ \ + vpternlogd $0x96, RTMP4z, RTMP0z, RX1z; /* rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxord RX0z, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \ + vpxord RX1z, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ + + leaq (32*4)(%rdi), %rax; +.align 16 +.Lroundloop_blk32: + ROUND(0, RA0z, RA1z, RA2z, RA3z, RB0z, RB1z, RB2z, RB3z); + ROUND(1, RA1z, RA2z, RA3z, RA0z, RB1z, RB2z, RB3z, RB0z); + ROUND(2, RA2z, RA3z, RA0z, RA1z, RB2z, RB3z, RB0z, RB1z); + ROUND(3, RA3z, RA0z, RA1z, RA2z, RB3z, RB0z, RB1z, RB2z); + leaq (4*4)(%rdi), %rdi; + cmpq %rax, %rdi; + jne .Lroundloop_blk32; + +#undef ROUND + + vbroadcasti32x4 .Lbswap128_mask rRIP, RTMP2z; + + transpose_4x4(RA0z, RA1z, RA2z, RA3z, RTMP0z, RTMP1z); + transpose_4x4(RB0z, RB1z, RB2z, RB3z, RTMP0z, RTMP1z); + vpshufb RTMP2z, RA0z, RA0z; + vpshufb RTMP2z, RA1z, RA1z; + vpshufb RTMP2z, RA2z, RA2z; + vpshufb RTMP2z, RA3z, RA3z; + vpshufb RTMP2z, RB0z, RB0z; + vpshufb RTMP2z, RB1z, RB1z; + vpshufb RTMP2z, RB2z, RB2z; + vpshufb RTMP2z, RB3z, RB3z; + + vpxord %zmm16, %zmm16, %zmm16; + vpxord %zmm17, %zmm17, %zmm17; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size __sm4_gfni_crypt_blk32,.-__sm4_gfni_crypt_blk32;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_crypt_blk32 +ELF(.type _gcry_sm4_gfni_avx512_crypt_blk32, at function;) +_gcry_sm4_gfni_avx512_crypt_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + /* Load input */ + vmovdqu32 (0 * 64)(%rdx), RA0z; + vmovdqu32 (1 * 64)(%rdx), RA1z; + vmovdqu32 (2 * 64)(%rdx), RA2z; + vmovdqu32 (3 * 64)(%rdx), RA3z; + vmovdqu32 (4 * 64)(%rdx), RB0z; + vmovdqu32 (5 * 64)(%rdx), RB1z; + vmovdqu32 (6 * 64)(%rdx), RB2z; + vmovdqu32 (7 * 64)(%rdx), RB3z; + + call __sm4_gfni_crypt_blk32; + + vmovdqu32 RA0z, (0 * 64)(%rsi); + vmovdqu32 RA1z, (1 * 64)(%rsi); + vmovdqu32 RA2z, (2 * 64)(%rsi); + vmovdqu32 RA3z, (3 * 64)(%rsi); + vmovdqu32 RB0z, (4 * 64)(%rsi); + vmovdqu32 RB1z, (5 * 64)(%rsi); + vmovdqu32 RB2z, (6 * 64)(%rsi); + vmovdqu32 RB3z, (7 * 64)(%rsi); + + xorl %eax, %eax; + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_crypt_blk32,.-_gcry_sm4_gfni_avx512_crypt_blk32;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_ctr_enc_blk32 +ELF(.type _gcry_sm4_gfni_avx512_ctr_enc_blk32, at function;) +_gcry_sm4_gfni_avx512_ctr_enc_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv (big endian, 128bit) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z; + vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z; + vbroadcasti64x2 .Lcounter4444_lo rRIP, RTMP2z; + vbroadcasti64x2 .Lcounter8888_lo rRIP, RTMP3z; + vbroadcasti64x2 .Lcounter16161616_lo rRIP, RTMP4z; + + /* load IV and byteswap */ + movq 8(%rcx), %r11; + bswapq %r11; + vbroadcasti64x2 (%rcx), RB3z; + vpshufb RTMP0z, RB3z, RB3z; + + /* check need for handling 64-bit overflow and carry */ + cmpq $(0xffffffffffffffff - 32), %r11; + ja .Lhandle_ctr_carry_blk32; + + /* construct IVs */ + vpaddq RTMP1z, RB3z, RA0z; /* +0:+1:+2:+3 */ + vpaddq RTMP2z, RA0z, RA1z; /* +4:+5:+6:+7 */ + vpaddq RTMP3z, RA0z, RA2z; /* +8:+9:+10:+11 */ + vpaddq RTMP3z, RA1z, RA3z; /* +12:+13:+14:+15 */ + vpaddq RTMP4z, RA0z, RB0z; /* +16... */ + vpaddq RTMP4z, RA1z, RB1z; /* +20... */ + vpaddq RTMP4z, RA2z, RB2z; /* +24... 
*/ + vpaddq RTMP4z, RA3z, RB3z; /* +28... */ + + /* Update counter */ + leaq 32(%r11), %r11; + bswapq %r11; + movq %r11, 8(%rcx); + + jmp .Lctr_carry_done_blk32; + +.Lhandle_ctr_carry_blk32: + vbroadcasti64x2 .Lcounter1111_hi rRIP, RNOTz; + + /* construct IVs */ + add_le128(RA0z, RB3z, RTMP1z, RNOTz); /* +0:+1:+2:+3 */ + add_le128(RA1z, RA0z, RTMP2z, RNOTz); /* +4:+5:+6:+7 */ + add_le128(RA2z, RA0z, RTMP3z, RNOTz); /* +8:+9:+10:+11 */ + add_le128(RA3z, RA1z, RTMP3z, RNOTz); /* +12:+13:+14:+15 */ + add_le128(RB0z, RA0z, RTMP4z, RNOTz); /* +16... */ + add_le128(RB1z, RA1z, RTMP4z, RNOTz); /* +20... */ + add_le128(RB2z, RA2z, RTMP4z, RNOTz); /* +24... */ + add_le128(RB3z, RA3z, RTMP4z, RNOTz); /* +28... */ + + /* Update counter */ + addq $32, %r11; + movq (%rcx), %r10; + bswapq %r10; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + +.align 16 +.Lctr_carry_done_blk32: + /* Byte-swap IVs. */ + vpshufb RTMP0z, RA0z, RA0z; + vpshufb RTMP0z, RA1z, RA1z; + vpshufb RTMP0z, RA2z, RA2z; + vpshufb RTMP0z, RA3z, RA3z; + vpshufb RTMP0z, RB0z, RB0z; + vpshufb RTMP0z, RB1z, RB1z; + vpshufb RTMP0z, RB2z, RB2z; + vpshufb RTMP0z, RB3z, RB3z; + + call __sm4_gfni_crypt_blk32; + + vpxord (0 * 64)(%rdx), RA0z, RA0z; + vpxord (1 * 64)(%rdx), RA1z, RA1z; + vpxord (2 * 64)(%rdx), RA2z, RA2z; + vpxord (3 * 64)(%rdx), RA3z, RA3z; + vpxord (4 * 64)(%rdx), RB0z, RB0z; + vpxord (5 * 64)(%rdx), RB1z, RB1z; + vpxord (6 * 64)(%rdx), RB2z, RB2z; + vpxord (7 * 64)(%rdx), RB3z, RB3z; + + vmovdqu32 RA0z, (0 * 64)(%rsi); + vmovdqu32 RA1z, (1 * 64)(%rsi); + vmovdqu32 RA2z, (2 * 64)(%rsi); + vmovdqu32 RA3z, (3 * 64)(%rsi); + vmovdqu32 RB0z, (4 * 64)(%rsi); + vmovdqu32 RB1z, (5 * 64)(%rsi); + vmovdqu32 RB2z, (6 * 64)(%rsi); + vmovdqu32 RB3z, (7 * 64)(%rsi); + + vzeroall; + kxorq %k1, %k1, %k1; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ctr_enc_blk32,.-_gcry_sm4_gfni_avx512_ctr_enc_blk32;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_cbc_dec_blk32 +ELF(.type _gcry_sm4_gfni_avx512_cbc_dec_blk32, at function;) +_gcry_sm4_gfni_avx512_cbc_dec_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + vmovdqu32 (0 * 64)(%rdx), RA0z; + vmovdqu32 (1 * 64)(%rdx), RA1z; + vmovdqu32 (2 * 64)(%rdx), RA2z; + vmovdqu32 (3 * 64)(%rdx), RA3z; + vmovdqu32 (4 * 64)(%rdx), RB0z; + vmovdqu32 (5 * 64)(%rdx), RB1z; + vmovdqu32 (6 * 64)(%rdx), RB2z; + vmovdqu32 (7 * 64)(%rdx), RB3z; + + call __sm4_gfni_crypt_blk32; + + vmovdqu (%rcx), RNOTx; + vinserti64x2 $1, (0 * 16)(%rdx), RNOT, RNOT; + vinserti64x4 $1, (1 * 16)(%rdx), RNOTz, RNOTz; + vpxord RNOTz, RA0z, RA0z; + vpxord (0 * 64 + 48)(%rdx), RA1z, RA1z; + vpxord (1 * 64 + 48)(%rdx), RA2z, RA2z; + vpxord (2 * 64 + 48)(%rdx), RA3z, RA3z; + vpxord (3 * 64 + 48)(%rdx), RB0z, RB0z; + vpxord (4 * 64 + 48)(%rdx), RB1z, RB1z; + vpxord (5 * 64 + 48)(%rdx), RB2z, RB2z; + vpxord (6 * 64 + 48)(%rdx), RB3z, RB3z; + vmovdqu (7 * 64 + 48)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); /* store new IV */ + + vmovdqu32 RA0z, (0 * 64)(%rsi); + vmovdqu32 RA1z, (1 * 64)(%rsi); + vmovdqu32 RA2z, (2 * 64)(%rsi); + vmovdqu32 RA3z, (3 * 64)(%rsi); + vmovdqu32 RB0z, (4 * 64)(%rsi); + vmovdqu32 RB1z, (5 * 64)(%rsi); + vmovdqu32 RB2z, (6 * 64)(%rsi); + vmovdqu32 RB3z, (7 * 64)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_cbc_dec_blk32,.-_gcry_sm4_gfni_avx512_cbc_dec_blk32;) + +.align 16 +.globl 
_gcry_sm4_gfni_avx512_cfb_dec_blk32 +ELF(.type _gcry_sm4_gfni_avx512_cfb_dec_blk32, at function;) +_gcry_sm4_gfni_avx512_cfb_dec_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: iv + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + /* Load input */ + vmovdqu (%rcx), RA0x; + vinserti64x2 $1, (%rdx), RA0, RA0; + vinserti64x4 $1, 16(%rdx), RA0z, RA0z; + vmovdqu32 (0 * 64 + 48)(%rdx), RA1z; + vmovdqu32 (1 * 64 + 48)(%rdx), RA2z; + vmovdqu32 (2 * 64 + 48)(%rdx), RA3z; + vmovdqu32 (3 * 64 + 48)(%rdx), RB0z; + vmovdqu32 (4 * 64 + 48)(%rdx), RB1z; + vmovdqu32 (5 * 64 + 48)(%rdx), RB2z; + vmovdqu32 (6 * 64 + 48)(%rdx), RB3z; + + /* Update IV */ + vmovdqu (7 * 64 + 48)(%rdx), RNOTx; + vmovdqu RNOTx, (%rcx); + + call __sm4_gfni_crypt_blk32; + + vpxord (0 * 64)(%rdx), RA0z, RA0z; + vpxord (1 * 64)(%rdx), RA1z, RA1z; + vpxord (2 * 64)(%rdx), RA2z, RA2z; + vpxord (3 * 64)(%rdx), RA3z, RA3z; + vpxord (4 * 64)(%rdx), RB0z, RB0z; + vpxord (5 * 64)(%rdx), RB1z, RB1z; + vpxord (6 * 64)(%rdx), RB2z, RB2z; + vpxord (7 * 64)(%rdx), RB3z, RB3z; + + vmovdqu32 RA0z, (0 * 64)(%rsi); + vmovdqu32 RA1z, (1 * 64)(%rsi); + vmovdqu32 RA2z, (2 * 64)(%rsi); + vmovdqu32 RA3z, (3 * 64)(%rsi); + vmovdqu32 RB0z, (4 * 64)(%rsi); + vmovdqu32 RB1z, (5 * 64)(%rsi); + vmovdqu32 RB2z, (6 * 64)(%rsi); + vmovdqu32 RB3z, (7 * 64)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_cfb_dec_blk32,.-_gcry_sm4_gfni_avx512_cfb_dec_blk32;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_ocb_enc_blk32 +ELF(.type _gcry_sm4_gfni_avx512_ocb_enc_blk32, at function;) +_gcry_sm4_gfni_avx512_ocb_enc_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[32]) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); + + movq %r12, (0 * 8)(%rsp); + movq %r13, (1 * 8)(%rsp); + movq %r14, (2 * 8)(%rsp); + movq %r15, (3 * 8)(%rsp); + movq %rbx, (4 * 8)(%rsp); + CFI_REL_OFFSET(%r12, 0 * 8); + CFI_REL_OFFSET(%r13, 1 * 8); + CFI_REL_OFFSET(%r14, 2 * 8); + CFI_REL_OFFSET(%r15, 3 * 8); + CFI_REL_OFFSET(%rbx, 4 * 8); + + vmovdqu (%rcx), RTMP0x; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg, zplain) \ + vmovdqu32 (n * 64)(%rdx), zplain; \ + vpxor (l0reg), RTMP0x, RNOTx; \ + vpxor (l1reg), RNOTx, RTMP0x; \ + vinserti64x2 $1, RTMP0x, RNOT, RNOT; \ + vpxor (l2reg), RTMP0x, RTMP0x; \ + vinserti64x2 $2, RTMP0x, RNOTz, RNOTz; \ + vpxor (l3reg), RTMP0x, RTMP0x; \ + vinserti64x2 $3, RTMP0x, RNOTz, RNOTz; \ + vpxord zplain, RNOTz, zreg; \ + vmovdqu32 RNOTz, (n * 64)(%rsi); + +#define OCB_LOAD_PTRS(n) \ + movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \ + movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \ + movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \ + movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \ + movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \ + movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \ + movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \ + movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx; + + OCB_LOAD_PTRS(0); + OCB_INPUT(0, %r10, %r11, %r12, %r13, RA0z, RTMP1z); + OCB_INPUT(1, %r14, %r15, %rax, %rbx, RA1z, RTMP2z); + OCB_LOAD_PTRS(2); + OCB_INPUT(2, %r10, %r11, %r12, %r13, RA2z, RTMP3z); + vpternlogd $0x96, RTMP1z, RTMP2z, RTMP3z; + 
OCB_INPUT(3, %r14, %r15, %rax, %rbx, RA3z, RTMP4z); + OCB_LOAD_PTRS(4); + OCB_INPUT(4, %r10, %r11, %r12, %r13, RB0z, RX0z); + OCB_INPUT(5, %r14, %r15, %rax, %rbx, RB1z, RX1z); + vpternlogd $0x96, RTMP4z, RX0z, RX1z; + OCB_LOAD_PTRS(6); + OCB_INPUT(6, %r10, %r11, %r12, %r13, RB2z, RTMP4z); + OCB_INPUT(7, %r14, %r15, %rax, %rbx, RB3z, RX0z); +#undef OCB_LOAD_PTRS +#undef OCB_INPUT + + vpternlogd $0x96, RTMP3z, RTMP4z, RX0z; + vpxord RX1z, RX0z, RNOTz; + vextracti64x4 $1, RNOTz, RTMP1; + vpxor RTMP1, RNOT, RNOT; + vextracti128 $1, RNOT, RTMP1x; + vpternlogd $0x96, (%r8), RTMP1x, RNOTx; + + movq (0 * 8)(%rsp), %r12; + movq (1 * 8)(%rsp), %r13; + movq (2 * 8)(%rsp), %r14; + movq (3 * 8)(%rsp), %r15; + movq (4 * 8)(%rsp), %rbx; + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); + CFI_RESTORE(%rbx); + + vmovdqu RTMP0x, (%rcx); + vmovdqu RNOTx, (%r8); + + call __sm4_gfni_crypt_blk32; + + addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); + + vpxord (0 * 64)(%rsi), RA0z, RA0z; + vpxord (1 * 64)(%rsi), RA1z, RA1z; + vpxord (2 * 64)(%rsi), RA2z, RA2z; + vpxord (3 * 64)(%rsi), RA3z, RA3z; + vpxord (4 * 64)(%rsi), RB0z, RB0z; + vpxord (5 * 64)(%rsi), RB1z, RB1z; + vpxord (6 * 64)(%rsi), RB2z, RB2z; + vpxord (7 * 64)(%rsi), RB3z, RB3z; + + vmovdqu32 RA0z, (0 * 64)(%rsi); + vmovdqu32 RA1z, (1 * 64)(%rsi); + vmovdqu32 RA2z, (2 * 64)(%rsi); + vmovdqu32 RA3z, (3 * 64)(%rsi); + vmovdqu32 RB0z, (4 * 64)(%rsi); + vmovdqu32 RB1z, (5 * 64)(%rsi); + vmovdqu32 RB2z, (6 * 64)(%rsi); + vmovdqu32 RB3z, (7 * 64)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ocb_enc_blk32,.-_gcry_sm4_gfni_avx512_ocb_enc_blk32;) + +.align 16 +.globl _gcry_sm4_gfni_avx512_ocb_dec_blk32 +ELF(.type _gcry_sm4_gfni_avx512_ocb_dec_blk32, at function;) +_gcry_sm4_gfni_avx512_ocb_dec_blk32: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (32 blocks) + * %rdx: src (32 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[32]) + */ + CFI_STARTPROC(); + vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + + subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); + + movq %r12, (0 * 8)(%rsp); + movq %r13, (1 * 8)(%rsp); + movq %r14, (2 * 8)(%rsp); + movq %r15, (3 * 8)(%rsp); + movq %rbx, (4 * 8)(%rsp); + CFI_REL_OFFSET(%r12, 0 * 8); + CFI_REL_OFFSET(%r13, 1 * 8); + CFI_REL_OFFSET(%r14, 2 * 8); + CFI_REL_OFFSET(%r15, 3 * 8); + CFI_REL_OFFSET(%rbx, 4 * 8); + + vmovdqu (%rcx), RTMP0x; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg) \ + vmovdqu32 (n * 64)(%rdx), RTMP1z; \ + vpxor (l0reg), RTMP0x, RNOTx; \ + vpxor (l1reg), RNOTx, RTMP0x; \ + vinserti64x2 $1, RTMP0x, RNOT, RNOT; \ + vpxor (l2reg), RTMP0x, RTMP0x; \ + vinserti64x2 $2, RTMP0x, RNOTz, RNOTz; \ + vpxor (l3reg), RTMP0x, RTMP0x; \ + vinserti64x2 $3, RTMP0x, RNOTz, RNOTz; \ + vpxord RTMP1z, RNOTz, zreg; \ + vmovdqu32 RNOTz, (n * 64)(%rsi); + +#define OCB_LOAD_PTRS(n) \ + movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \ + movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \ + movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \ + movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \ + movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \ + movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \ + movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \ + movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx; + + OCB_LOAD_PTRS(0); + OCB_INPUT(0, %r10, %r11, %r12, %r13, RA0z); + OCB_INPUT(1, %r14, %r15, %rax, %rbx, RA1z); + OCB_LOAD_PTRS(2); + OCB_INPUT(2, %r10, %r11, 
%r12, %r13, RA2z); + OCB_INPUT(3, %r14, %r15, %rax, %rbx, RA3z); + OCB_LOAD_PTRS(4); + OCB_INPUT(4, %r10, %r11, %r12, %r13, RB0z); + OCB_INPUT(5, %r14, %r15, %rax, %rbx, RB1z); + OCB_LOAD_PTRS(6); + OCB_INPUT(6, %r10, %r11, %r12, %r13, RB2z); + OCB_INPUT(7, %r14, %r15, %rax, %rbx, RB3z); +#undef OCB_LOAD_PTRS +#undef OCB_INPUT + + movq (0 * 8)(%rsp), %r12; + movq (1 * 8)(%rsp), %r13; + movq (2 * 8)(%rsp), %r14; + movq (3 * 8)(%rsp), %r15; + movq (4 * 8)(%rsp), %rbx; + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); + CFI_RESTORE(%rbx); + + vmovdqu RTMP0x, (%rcx); + + call __sm4_gfni_crypt_blk32; + + addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); + + vpxord (0 * 64)(%rsi), RA0z, RA0z; + vpxord (1 * 64)(%rsi), RA1z, RA1z; + vpxord (2 * 64)(%rsi), RA2z, RA2z; + vpxord (3 * 64)(%rsi), RA3z, RA3z; + vpxord (4 * 64)(%rsi), RB0z, RB0z; + vpxord (5 * 64)(%rsi), RB1z, RB1z; + vpxord (6 * 64)(%rsi), RB2z, RB2z; + vpxord (7 * 64)(%rsi), RB3z, RB3z; + + vmovdqu32 RA0z, (0 * 64)(%rsi); + vmovdqu32 RA1z, (1 * 64)(%rsi); + vmovdqu32 RA2z, (2 * 64)(%rsi); + vmovdqu32 RA3z, (3 * 64)(%rsi); + vmovdqu32 RB0z, (4 * 64)(%rsi); + vmovdqu32 RB1z, (5 * 64)(%rsi); + vmovdqu32 RB2z, (6 * 64)(%rsi); + vmovdqu32 RB3z, (7 * 64)(%rsi); + + /* Checksum_i = Checksum_{i-1} xor C_i */ + vpternlogd $0x96, RA0z, RA1z, RA2z; + vpternlogd $0x96, RA3z, RB0z, RB1z; + vpternlogd $0x96, RB2z, RB3z, RA2z; + vpxord RA2z, RB1z, RTMP1z; + + vextracti64x4 $1, RTMP1z, RNOT; + vpxor RNOT, RTMP1, RTMP1; + vextracti128 $1, RTMP1, RNOTx; + vpternlogd $0x96, (%r8), RNOTx, RTMP1x; + vmovdqu RTMP1x, (%r8); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm4_gfni_avx512_ocb_dec_blk32,.-_gcry_sm4_gfni_avx512_ocb_dec_blk32;) + +#endif /*defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)*/ +#endif /*__x86_64*/ diff --git a/cipher/sm4.c b/cipher/sm4.c index 062a14f4..02c399a9 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -65,10 +65,20 @@ # endif #endif +/* USE_GFNI_AVX512 inidicates whether to compile with Intel GFNI/AVX512 code. */ +#undef USE_GFNI_AVX512 +#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# define USE_GFNI_AVX512 1 +# endif +#endif + /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. 
*/ #undef ASM_FUNC_ABI -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_GFNI_AVX2) +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \ + defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512) # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ASM_FUNC_ABI __attribute__((sysv_abi)) # else @@ -125,10 +135,15 @@ static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out, + const byte *in, + unsigned int num_blks); + typedef struct { u32 rkey_enc[32]; u32 rkey_dec[32]; + crypt_blk1_16_fn_t crypt_blk1_16; #ifdef USE_AESNI_AVX unsigned int use_aesni_avx:1; #endif @@ -138,6 +153,9 @@ typedef struct #ifdef USE_GFNI_AVX2 unsigned int use_gfni_avx2:1; #endif +#ifdef USE_GFNI_AVX512 + unsigned int use_gfni_avx512:1; +#endif #ifdef USE_AARCH64_SIMD unsigned int use_aarch64_simd:1; #endif @@ -149,10 +167,6 @@ typedef struct #endif } SM4_context; -typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out, - const byte *in, - unsigned int num_blks); - static const u32 fk[4] = { 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc @@ -217,6 +231,8 @@ static const u32 ck[] = 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279 }; +static inline crypt_blk1_16_fn_t sm4_get_crypt_blk1_16_fn(SM4_context *ctx); + #ifdef USE_AESNI_AVX extern void _gcry_sm4_aesni_avx_expand_key(const byte *key, u32 *rk_enc, u32 *rk_dec, const u32 *fk, @@ -374,6 +390,86 @@ sm4_gfni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in, #endif /* USE_GFNI_AVX2 */ +#ifdef USE_GFNI_AVX512 +extern void _gcry_sm4_gfni_avx512_expand_key(const byte *key, u32 *rk_enc, + u32 *rk_dec, const u32 *fk, + const u32 *ck) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ctr_enc(const u32 *rk_enc, byte *out, + const byte *in, + byte *ctr) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_cbc_dec(const u32 *rk_dec, byte *out, + const byte *in, + byte *iv) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_cfb_dec(const u32 *rk_enc, byte *out, + const byte *in, + byte *iv) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ocb_enc(const u32 *rk_enc, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ocb_dec(const u32 *rk_dec, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ocb_auth(const u32 *rk_enc, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ctr_enc_blk32(const u32 *rk_enc, byte *out, + const byte *in, + byte *ctr) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_cbc_dec_blk32(const u32 *rk_enc, byte *out, + const byte *in, + byte *iv) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_cfb_dec_blk32(const u32 *rk_enc, byte *out, + const byte *in, + byte *iv) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ocb_enc_blk32(const u32 *rk_enc, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_sm4_gfni_avx512_ocb_dec_blk32(const u32 *rk_dec, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern 
unsigned int +_gcry_sm4_gfni_avx512_crypt_blk1_16(const u32 *rk, byte *out, const byte *in, + unsigned int num_blks) ASM_FUNC_ABI; + +extern unsigned int +_gcry_sm4_gfni_avx512_crypt_blk32(const u32 *rk, byte *out, + const byte *in) ASM_FUNC_ABI; + +static inline unsigned int +sm4_gfni_avx512_crypt_blk1_16(const void *rk, byte *out, const byte *in, + unsigned int num_blks) +{ + return _gcry_sm4_gfni_avx512_crypt_blk1_16(rk, out, in, num_blks); +} + +#endif /* USE_GFNI_AVX2 */ + #ifdef USE_AARCH64_SIMD extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out, const byte *in, @@ -561,6 +657,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key) u32 rk[4]; int i; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + _gcry_sm4_gfni_avx512_expand_key (key, ctx->rkey_enc, ctx->rkey_dec, + fk, ck); + return; + } +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { @@ -645,6 +750,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, #ifdef USE_GFNI_AVX2 ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); #endif +#ifdef USE_GFNI_AVX512 + ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512); +#endif #ifdef USE_AARCH64_SIMD ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON); #endif @@ -670,6 +778,8 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, } #endif + ctx->crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); + /* Setup bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); bulk_ops->cbc_dec = _gcry_sm4_cbc_dec; @@ -715,6 +825,11 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf) { SM4_context *ctx = context; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1); +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1); @@ -735,6 +850,11 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf) { SM4_context *ctx = context; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1); +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1); @@ -834,6 +954,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx) { if (0) ; +#ifdef USE_GFNI_AVX512 + else if (ctx->use_gfni_avx512) + { + return &sm4_gfni_avx512_crypt_blk1_16; + } +#endif #ifdef USE_GFNI_AVX2 else if (ctx->use_gfni_avx2) { @@ -890,6 +1016,32 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr, const byte *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + /* Process data in 32 block chunks. */ + while (nblocks >= 32) + { + _gcry_sm4_gfni_avx512_ctr_enc_blk32(ctx->rkey_enc, + outbuf, inbuf, ctr); + + nblocks -= 32; + outbuf += 32 * 16; + inbuf += 32 * 16; + } + + /* Process data in 16 block chunks. */ + if (nblocks >= 16) + { + _gcry_sm4_gfni_avx512_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr); + + nblocks -= 16; + outbuf += 16 * 16; + inbuf += 16 * 16; + } + } +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { @@ -982,7 +1134,7 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr, /* Process remaining blocks. 
*/ if (nblocks) { - crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; byte tmpbuf[16 * 16]; unsigned int tmp_used = 16; size_t nburn; @@ -1011,6 +1163,31 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv, const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + /* Process data in 32 block chunks. */ + while (nblocks >= 32) + { + _gcry_sm4_gfni_avx512_cbc_dec_blk32(ctx->rkey_dec, outbuf, inbuf, iv); + + nblocks -= 32; + outbuf += 32 * 16; + inbuf += 32 * 16; + } + + /* Process data in 16 block chunks. */ + if (nblocks >= 16) + { + _gcry_sm4_gfni_avx512_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * 16; + inbuf += 16 * 16; + } + } +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { @@ -1103,7 +1280,7 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv, /* Process remaining blocks. */ if (nblocks) { - crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; unsigned char tmpbuf[16 * 16]; unsigned int tmp_used = 16; size_t nburn; @@ -1132,6 +1309,31 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + /* Process data in 32 block chunks. */ + while (nblocks >= 32) + { + _gcry_sm4_gfni_avx512_cfb_dec_blk32(ctx->rkey_enc, outbuf, inbuf, iv); + + nblocks -= 32; + outbuf += 32 * 16; + inbuf += 32 * 16; + } + + /* Process data in 16 block chunks. */ + if (nblocks >= 16) + { + _gcry_sm4_gfni_avx512_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * 16; + inbuf += 16 * 16; + } + } +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { @@ -1224,7 +1426,7 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, /* Process remaining blocks. */ if (nblocks) { - crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; unsigned char tmpbuf[16 * 16]; unsigned int tmp_used = 16; size_t nburn; @@ -1241,6 +1443,52 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } +static unsigned int +sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, + unsigned int num_blks, const u32 *rk) +{ + unsigned int stack_burn_size = 0; + unsigned int nburn; + + gcry_assert (num_blks <= 32); + +#ifdef USE_GFNI_AVX512 + if (num_blks == 32 && ctx->use_gfni_avx512) + { + return _gcry_sm4_gfni_avx512_crypt_blk32 (rk, outbuf, inbuf); + } +#endif + + do + { + unsigned int curr_blks = num_blks > 16 ? 16 : num_blks; + nburn = ctx->crypt_blk1_16 (rk, outbuf, inbuf, curr_blks); + stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; + outbuf += curr_blks * 16; + inbuf += curr_blks * 16; + num_blks -= curr_blks; + } + while (num_blks > 0); + + return stack_burn_size; +} + +static unsigned int +sm4_encrypt_blk1_32 (const void *context, byte *out, const byte *in, + unsigned int num_blks) +{ + const SM4_context *ctx = context; + return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_enc); +} + +static unsigned int +sm4_decrypt_blk1_32 (const void *context, byte *out, const byte *in, + unsigned int num_blks) +{ + const SM4_context *ctx = context; + return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_dec); +} + /* Bulk encryption/decryption of complete blocks in XTS mode. 
*/ static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, @@ -1254,13 +1502,13 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); - u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec; - unsigned char tmpbuf[16 * 16]; + unsigned char tmpbuf[32 * 16]; unsigned int tmp_used = 16; size_t nburn; - nburn = bulk_xts_crypt_128(rk, crypt_blk1_16, outbuf, inbuf, nblocks, + nburn = bulk_xts_crypt_128(ctx, encrypt ? sm4_encrypt_blk1_32 + : sm4_decrypt_blk1_32, + outbuf, inbuf, nblocks, tweak, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; @@ -1283,6 +1531,39 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, u64 blkn = c->u_mode.ocb.data_nblocks; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + u64 Ls[32]; + u64 *l; + + if (nblocks >= 32) + { + l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn); + + /* Process data in 32 block chunks. */ + while (nblocks >= 32) + { + blkn += 32; + *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32); + + if (encrypt) + _gcry_sm4_gfni_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf, + inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_sm4_gfni_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf, + inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 32; + outbuf += 32 * 16; + inbuf += 32 * 16; + } + } + } +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { @@ -1379,7 +1660,7 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, /* Process remaining blocks. */ if (nblocks) { - crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec; unsigned char tmpbuf[16 * 16]; unsigned int tmp_used = 16; @@ -1410,6 +1691,33 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) u64 blkn = c->u_mode.ocb.aad_nblocks; int burn_stack_depth = 0; +#ifdef USE_GFNI_AVX512 + if (ctx->use_gfni_avx512) + { + u64 Ls[16]; + u64 *l; + + if (nblocks >= 16) + { + l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + blkn += 16; + *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 16); + + _gcry_sm4_gfni_avx512_ocb_auth (ctx->rkey_enc, abuf, + c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); + + nblocks -= 16; + abuf += 16 * 16; + } + } + } +#endif + #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) { @@ -1494,7 +1802,7 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) /* Process remaining blocks. 
*/ if (nblocks) { - crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx); + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; unsigned char tmpbuf[16 * 16]; unsigned int tmp_used = 16; size_t nburn; diff --git a/configure.ac b/configure.ac index b55510d8..34ec058e 100644 --- a/configure.ac +++ b/configure.ac @@ -2952,6 +2952,7 @@ if test "$found" = "1" ; then GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo" + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx512-amd64.lo" ;; aarch64-*-*) # Build with the assembly implementation -- 2.34.1 From jussi.kivilinna at iki.fi Thu Jul 21 10:10:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 21 Jul 2022 11:10:36 +0300 Subject: [PATCH v2 3/3] Add SM4 ARMv9 SVE CE assembly implementation In-Reply-To: <20220721063217.20034-3-tianjia.zhang@linux.alibaba.com> References: <20220721063217.20034-1-tianjia.zhang@linux.alibaba.com> <20220721063217.20034-3-tianjia.zhang@linux.alibaba.com> Message-ID: <618bc9d1-e169-fddf-61cd-6b8ac695918f@iki.fi> Hello, On 21.7.2022 9.32, Tianjia Zhang via Gcrypt-devel wrote: > * cipher/Makefile.am: Add 'sm4-armv9-aarch64-sve-ce.S'. > * cipher/sm4-armv9-aarch64-sve-ce.S: New. > * cipher/sm4.c (USE_ARM_SVE_CE): New. > (SM4_context) [USE_ARM_SVE_CE]: Add 'use_arm_sve_ce'. > (_gcry_sm4_armv9_sve_ce_crypt, _gcry_sm4_armv9_sve_ce_ctr_enc) > (_gcry_sm4_armv9_sve_ce_cbc_dec, _gcry_sm4_armv9_sve_ce_cfb_dec) > (sm4_armv9_sve_ce_crypt_blk1_16): New. > (sm4_setkey): Enable ARMv9 SVE CE if supported by HW. > (sm4_get_crypt_blk1_16_fn) [USE_ARM_SVE_CE]: Add ARMv9 SVE CE > bulk functions. > (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec) > [USE_ARM_SVE_CE]: Add ARMv9 SVE CE bulk functions. > * configure.ac: Add 'sm4-armv9-aarch64-sve-ce.lo'. > -- > > Signed-off-by: Tianjia Zhang > --- > cipher/Makefile.am | 1 + > cipher/sm4-armv9-aarch64-sve-ce.S | 967 ++++++++++++++++++++++++++++++ > cipher/sm4.c | 86 +++ > configure.ac | 1 + > 4 files changed, 1055 insertions(+) > create mode 100644 cipher/sm4-armv9-aarch64-sve-ce.S Patches applied to master. -Jussi From tianjia.zhang at linux.alibaba.com Thu Jul 21 14:08:49 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Thu, 21 Jul 2022 20:08:49 +0800 Subject: [PATCH 3/3] sm4-arm-sve-ce: use 32 parallel blocks for XTS and CTR32LE In-Reply-To: <20220721080930.3271436-3-jussi.kivilinna@iki.fi> References: <20220721080930.3271436-1-jussi.kivilinna@iki.fi> <20220721080930.3271436-3-jussi.kivilinna@iki.fi> Message-ID: Hi Jussi, On 7/21/22 4:09 PM, Jussi Kivilinna wrote: > * cipher/sm4.c (sm4_crypt_blk1_32) [USE_ARM_SVE_CE]: Add SVE-SM4 code > path. > -- > > Signed-off-by: Jussi Kivilinna > --- > cipher/sm4.c | 7 +++++++ > 1 file changed, 7 insertions(+) > > diff --git a/cipher/sm4.c b/cipher/sm4.c > index f68197c4..b5d4691d 100644 > --- a/cipher/sm4.c > +++ b/cipher/sm4.c > @@ -1462,6 +1462,13 @@ sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, > return _gcry_sm4_gfni_avx512_crypt_blk32 (rk, outbuf, inbuf); > } > #endif > +#ifdef USE_ARM_SVE_CE > + if (ctx->use_arm_sve_ce) > + { > + _gcry_sm4_armv9_sve_ce_crypt (rk, outbuf, inbuf, num_blks); > + return 0; > + } > +#endif > > do > { Looks good to me. Reviewed-by: Tianjia Zhang Unfortunately I don't have a machine to test avx512 at the moment. 
Best regards, Tianjia From jussi.kivilinna at iki.fi Wed Jul 27 22:16:31 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 27 Jul 2022 23:16:31 +0300 Subject: [PATCH 1/2] rijndael-ppc: small speed-up for CBC and CFB encryption Message-ID: <20220727201632.2000015-1-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-common.h (AES_ENCRYPT_ALL): Remove * cipher/rijndael-ppc-functions.h (CFB_ENC_FUNC) (CBC_ENC_FUNC): Removed two block unrolled loop; Optimized single block loop for shorter critical-path. -- Patch gives small ~3% performance increase for CBC and CFB encryption, tested with POWER8. Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-common.h | 25 ------ cipher/rijndael-ppc-functions.h | 131 ++++++++++++++++++-------------- 2 files changed, 74 insertions(+), 82 deletions(-) diff --git a/cipher/rijndael-ppc-common.h b/cipher/rijndael-ppc-common.h index bbbeaac0..3fa9a0b9 100644 --- a/cipher/rijndael-ppc-common.h +++ b/cipher/rijndael-ppc-common.h @@ -158,31 +158,6 @@ typedef union rkeylast = ALIGNED_LOAD (rk, nrounds); \ } while (0) -#define AES_ENCRYPT_ALL(blk, nrounds) \ - do { \ - blk ^= rkey0; \ - blk = asm_cipher_be (blk, rkey1); \ - blk = asm_cipher_be (blk, rkey2); \ - blk = asm_cipher_be (blk, rkey3); \ - blk = asm_cipher_be (blk, rkey4); \ - blk = asm_cipher_be (blk, rkey5); \ - blk = asm_cipher_be (blk, rkey6); \ - blk = asm_cipher_be (blk, rkey7); \ - blk = asm_cipher_be (blk, rkey8); \ - blk = asm_cipher_be (blk, rkey9); \ - if (nrounds >= 12) \ - { \ - blk = asm_cipher_be (blk, rkey10); \ - blk = asm_cipher_be (blk, rkey11); \ - if (rounds > 12) \ - { \ - blk = asm_cipher_be (blk, rkey12); \ - blk = asm_cipher_be (blk, rkey13); \ - } \ - } \ - blk = asm_cipherlast_be (blk, rkeylast); \ - } while (0) - static ASM_FUNC_ATTR_INLINE block asm_aligned_ld(unsigned long offset, const void *ptr) diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h index 72f31852..f95489d9 100644 --- a/cipher/rijndael-ppc-functions.h +++ b/cipher/rijndael-ppc-functions.h @@ -76,43 +76,46 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg, u128_t *out = (u128_t *)outbuf_arg; int rounds = ctx->rounds; ROUND_KEY_VARIABLES_ALL; - block rkeylast_orig; - block iv; + block key0_xor_keylast; + block iv, outiv; iv = VEC_LOAD_BE (iv_arg, 0, bige_const); + outiv = iv; PRELOAD_ROUND_KEYS_ALL (rounds); - rkeylast_orig = rkeylast; - - for (; nblocks >= 2; nblocks -= 2) - { - block in2, iv1; - - rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const); - in2 = VEC_LOAD_BE (in + 1, 0, bige_const); - in += 2; - - AES_ENCRYPT_ALL (iv, rounds); - - iv1 = iv; - rkeylast = rkeylast_orig ^ in2; - - AES_ENCRYPT_ALL (iv, rounds); - - VEC_STORE_BE (out++, 0, iv1, bige_const); - VEC_STORE_BE (out++, 0, iv, bige_const); - } + key0_xor_keylast = rkey0 ^ rkeylast; + iv ^= rkey0; for (; nblocks; nblocks--) { - rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in++, 0, bige_const); - - AES_ENCRYPT_ALL (iv, rounds); + rkeylast = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const); + + iv = asm_cipher_be (iv, rkey1); + iv = asm_cipher_be (iv, rkey2); + iv = asm_cipher_be (iv, rkey3); + iv = asm_cipher_be (iv, rkey4); + iv = asm_cipher_be (iv, rkey5); + iv = asm_cipher_be (iv, rkey6); + iv = asm_cipher_be (iv, rkey7); + iv = asm_cipher_be (iv, rkey8); + iv = asm_cipher_be (iv, rkey9); + if (rounds >= 12) + { + iv = asm_cipher_be (iv, rkey10); + iv = asm_cipher_be (iv, rkey11); + if (rounds > 12) + { + iv = asm_cipher_be (iv, rkey12); + iv = asm_cipher_be (iv, rkey13); + } + } + iv = 
asm_cipherlast_be (iv, rkeylast); - VEC_STORE_BE (out++, 0, iv, bige_const); + outiv = rkey0 ^ iv; + VEC_STORE_BE (out++, 0, outiv, bige_const); } - VEC_STORE_BE (iv_arg, 0, iv, bige_const); + VEC_STORE_BE (iv_arg, 0, outiv, bige_const); } void CFB_DEC_FUNC (void *context, unsigned char *iv_arg, @@ -324,47 +327,61 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg, byte *out = (byte *)outbuf_arg; int rounds = ctx->rounds; ROUND_KEY_VARIABLES_ALL; - block lastiv, b; + block iv, key0_xor_keylast, nextiv, outiv; unsigned int outadd = -(!cbc_mac) & 16; - lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const); - - PRELOAD_ROUND_KEYS_ALL (rounds); - - for (; nblocks >= 2; nblocks -= 2) - { - block in2, lastiv1; - - b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const); - in2 = VEC_LOAD_BE (in + 1, 0, bige_const); - in += 2; - - AES_ENCRYPT_ALL (b, rounds); + if (nblocks == 0) /* CMAC may call with nblocks 0. */ + return; - lastiv1 = b; - b = lastiv1 ^ in2; + iv = VEC_LOAD_BE (iv_arg, 0, bige_const); - AES_ENCRYPT_ALL (b, rounds); + PRELOAD_ROUND_KEYS_ALL (rounds); + key0_xor_keylast = rkey0 ^ rkeylast; - lastiv = b; - VEC_STORE_BE ((u128_t *)out, 0, lastiv1, bige_const); - out += outadd; - VEC_STORE_BE ((u128_t *)out, 0, lastiv, bige_const); - out += outadd; - } + nextiv = VEC_LOAD_BE (in++, 0, bige_const); + iv ^= rkey0 ^ nextiv; - for (; nblocks; nblocks--) + do { - b = lastiv ^ VEC_LOAD_BE (in++, 0, bige_const); - - AES_ENCRYPT_ALL (b, rounds); + if (--nblocks) + { + nextiv = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const); + } - lastiv = b; - VEC_STORE_BE ((u128_t *)out, 0, b, bige_const); + iv = asm_cipher_be (iv, rkey1); + iv = asm_cipher_be (iv, rkey2); + iv = asm_cipher_be (iv, rkey3); + iv = asm_cipher_be (iv, rkey4); + iv = asm_cipher_be (iv, rkey5); + iv = asm_cipher_be (iv, rkey6); + iv = asm_cipher_be (iv, rkey7); + iv = asm_cipher_be (iv, rkey8); + iv = asm_cipher_be (iv, rkey9); + if (rounds >= 12) + { + iv = asm_cipher_be (iv, rkey10); + iv = asm_cipher_be (iv, rkey11); + if (rounds > 12) + { + iv = asm_cipher_be (iv, rkey12); + iv = asm_cipher_be (iv, rkey13); + } + } + outiv = iv; + /* Proper order for following instructions is important for best + * performance on POWER8: the output path vcipherlast needs to be + * last one. */ + __asm__ volatile ("vcipherlast %0, %0, %2\n\t" + "vcipherlast %1, %1, %3\n\t" + : "+v" (iv), "+outiv" (outiv) + : "v" (nextiv), "v" (rkeylast)); + + VEC_STORE_BE ((u128_t *)out, 0, outiv, bige_const); out += outadd; } + while (nblocks); - VEC_STORE_BE (iv_arg, 0, lastiv, bige_const); + VEC_STORE_BE (iv_arg, 0, outiv, bige_const); } void CBC_DEC_FUNC (void *context, unsigned char *iv_arg, -- 2.34.1 From jussi.kivilinna at iki.fi Wed Jul 27 22:16:32 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 27 Jul 2022 23:16:32 +0300 Subject: [PATCH 2/2] Simplify AES key schedule implementation In-Reply-To: <20220727201632.2000015-1-jussi.kivilinna@iki.fi> References: <20220727201632.2000015-1-jussi.kivilinna@iki.fi> Message-ID: <20220727201632.2000015-2-jussi.kivilinna@iki.fi> * cipher/rijndael-armv8-ce.c (_gcry_aes_armv8_ce_setkey): New key schedule with simplified structure and less stack usage. * cipher/rijndael-internal.h (RIJNDAEL_context_s): Add 'keyschedule32b'. (keyschenc32b): New. * cipher/rijndael-ppc-common.h (vec_u32): New. * cipher/rijndael-ppc.c (vec_bswap32_const): Remove. (_gcry_aes_sbox4_ppc8): Optimize for less instructions emitted. (keysched_idx): New. (_gcry_aes_ppc8_setkey): New key schedule with simplified structure. 
* cipher/rijndael-tables.h (rcon): Remove. * cipher/rijndael.c (sbox4): New. (do_setkey): New key schedule with simplified structure and less stack usage. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-armv8-ce.c | 102 +++++----------------- cipher/rijndael-internal.h | 12 +-- cipher/rijndael-ppc-common.h | 1 + cipher/rijndael-ppc.c | 158 +++++++++++------------------------ cipher/rijndael-tables.h | 7 -- cipher/rijndael.c | 118 ++++++++------------------ 6 files changed, 117 insertions(+), 281 deletions(-) diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index e53c940e..10fb58be 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -128,103 +128,47 @@ typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); + void _gcry_aes_armv8_ce_setkey (RIJNDAEL_context *ctx, const byte *key) { - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte data[MAXKC][4]; - u32 data32[MAXKC]; - } tkk[2]; unsigned int rounds = ctx->rounds; - int KC = rounds - 6; - unsigned int keylen = KC * 4; - unsigned int i, r, t; + unsigned int KC = rounds - 6; + u32 *W_u32 = ctx->keyschenc32b; + unsigned int i, j; + u32 W_prev; byte rcon = 1; - int j; -#define k tkk[0].data -#define k_u32 tkk[0].data32 -#define tk tkk[1].data -#define tk_u32 tkk[1].data32 -#define W (ctx->keyschenc) -#define W_u32 (ctx->keyschenc32) - - for (i = 0; i < keylen; i++) - { - k[i >> 2][i & 3] = key[i]; - } - for (j = KC-1; j >= 0; j--) + for (i = 0; i < KC; i += 2) { - tk_u32[j] = k_u32[j]; - } - r = 0; - t = 0; - /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < rounds + 1); ) - { - for (; (j < KC) && (t < 4); j++, t++) - { - W_u32[r][t] = le_bswap32(tk_u32[j]); - } - if (t == 4) - { - r++; - t = 0; - } + W_u32[i + 0] = buf_get_le32(key + i * 4 + 0); + W_u32[i + 1] = buf_get_le32(key + i * 4 + 4); } - while (r < rounds + 1) + for (i = KC, j = KC, W_prev = W_u32[KC - 1]; + i < 4 * (rounds + 1); + i += 2, j += 2) { - tk_u32[0] ^= _gcry_aes_sbox4_armv8_ce(rol(tk_u32[KC - 1], 24)) ^ rcon; + u32 temp0 = W_prev; + u32 temp1; - if (KC != 8) + if (j == KC) { - for (j = 1; j < KC; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } + j = 0; + temp0 = _gcry_aes_sbox4_armv8_ce(rol(temp0, 24)) ^ rcon; + rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff; } - else + else if (KC == 8 && j == 4) { - for (j = 1; j < KC/2; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } - - tk_u32[KC/2] ^= _gcry_aes_sbox4_armv8_ce(tk_u32[KC/2 - 1]); - - for (j = KC/2 + 1; j < KC; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } + temp0 = _gcry_aes_sbox4_armv8_ce(temp0); } - /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < rounds + 1); ) - { - for (; (j < KC) && (t < 4); j++, t++) - { - W_u32[r][t] = le_bswap32(tk_u32[j]); - } - if (t == 4) - { - r++; - t = 0; - } - } + temp1 = W_u32[i - KC + 0]; - rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b); + W_u32[i + 0] = temp0 ^ temp1; + W_u32[i + 1] = W_u32[i - KC + 1] ^ temp0 ^ temp1; + W_prev = W_u32[i + 1]; } - -#undef W -#undef tk -#undef k -#undef W_u32 -#undef tk_u32 -#undef k_u32 - wipememory(&tkk, sizeof(tkk)); } /* Make a decryption key from an encryption key. 
*/ diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h index 30604088..52c892fd 100644 --- a/cipher/rijndael-internal.h +++ b/cipher/rijndael-internal.h @@ -160,6 +160,7 @@ typedef struct RIJNDAEL_context_s PROPERLY_ALIGNED_TYPE dummy; byte keyschedule[MAXROUNDS+1][4][4]; u32 keyschedule32[MAXROUNDS+1][4]; + u32 keyschedule32b[(MAXROUNDS+1)*4]; #ifdef USE_PADLOCK /* The key as passed to the padlock engine. It is only used if the padlock engine is used (USE_PADLOCK, below). */ @@ -195,10 +196,11 @@ typedef struct RIJNDAEL_context_s } RIJNDAEL_context ATTR_ALIGNED_16; /* Macros defining alias for the keyschedules. */ -#define keyschenc u1.keyschedule -#define keyschenc32 u1.keyschedule32 -#define keyschdec u2.keyschedule -#define keyschdec32 u2.keyschedule32 -#define padlockkey u1.padlock_key +#define keyschenc u1.keyschedule +#define keyschenc32 u1.keyschedule32 +#define keyschenc32b u1.keyschedule32b +#define keyschdec u2.keyschedule +#define keyschdec32 u2.keyschedule32 +#define padlockkey u1.padlock_key #endif /* G10_RIJNDAEL_INTERNAL_H */ diff --git a/cipher/rijndael-ppc-common.h b/cipher/rijndael-ppc-common.h index 3fa9a0b9..e4a90934 100644 --- a/cipher/rijndael-ppc-common.h +++ b/cipher/rijndael-ppc-common.h @@ -30,6 +30,7 @@ typedef vector unsigned char block; +typedef vector unsigned int vec_u32; typedef union { diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index f5c32361..6a32271d 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -34,10 +34,7 @@ #include "rijndael-ppc-common.h" -#ifdef WORDS_BIGENDIAN -static const block vec_bswap32_const = - { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; -#else +#ifndef WORDS_BIGENDIAN static const block vec_bswap32_const_neg = { ~3, ~2, ~1, ~0, ~7, ~6, ~5, ~4, ~11, ~10, ~9, ~8, ~15, ~14, ~13, ~12 }; #endif @@ -107,134 +104,81 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr) static ASM_FUNC_ATTR_INLINE u32 _gcry_aes_sbox4_ppc8(u32 fourbytes) { - union - { - PROPERLY_ALIGNED_TYPE dummy; - block data_vec; - u32 data32[4]; - } u; + vec_u32 vec_fourbyte = { fourbytes, fourbytes, fourbytes, fourbytes }; +#ifdef WORDS_BIGENDIAN + return ((vec_u32)vec_sbox_be((block)vec_fourbyte))[1]; +#else + return ((vec_u32)vec_sbox_be((block)vec_fourbyte))[2]; +#endif +} + - u.data32[0] = fourbytes; - u.data_vec = vec_sbox_be(u.data_vec); - return u.data32[0]; +static ASM_FUNC_ATTR_INLINE unsigned int +keysched_idx(unsigned int in) +{ +#ifdef WORDS_BIGENDIAN + return in; +#else + return (in & ~3U) | (3U - (in & 3U)); +#endif } + void _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key) { - const block bige_const = asm_load_be_const(); - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte data[MAXKC][4]; - u32 data32[MAXKC]; - } tkk[2]; + u32 tk_u32[MAXKC]; unsigned int rounds = ctx->rounds; - int KC = rounds - 6; - unsigned int keylen = KC * 4; - u128_t *ekey = (u128_t *)(void *)ctx->keyschenc; - unsigned int i, r, t; + unsigned int KC = rounds - 6; + u32 *W_u32 = ctx->keyschenc32b; + unsigned int i, j; + u32 tk_prev; byte rcon = 1; - int j; -#define k tkk[0].data -#define k_u32 tkk[0].data32 -#define tk tkk[1].data -#define tk_u32 tkk[1].data32 -#define W (ctx->keyschenc) -#define W_u32 (ctx->keyschenc32) - for (i = 0; i < keylen; i++) + for (i = 0; i < KC; i += 2) { - k[i >> 2][i & 3] = key[i]; + unsigned int idx0 = keysched_idx(i + 0); + unsigned int idx1 = keysched_idx(i + 1); + tk_u32[i + 0] = buf_get_le32(key + i * 4 + 0); + tk_u32[i + 1] = buf_get_le32(key + i * 4 + 4); + 
W_u32[idx0] = _gcry_bswap32(tk_u32[i + 0]); + W_u32[idx1] = _gcry_bswap32(tk_u32[i + 1]); } - for (j = KC-1; j >= 0; j--) - { - tk_u32[j] = k_u32[j]; - } - r = 0; - t = 0; - /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < rounds + 1); ) + for (i = KC, j = KC, tk_prev = tk_u32[KC - 1]; + i < 4 * (rounds + 1); + i += 2, j += 2) { - for (; (j < KC) && (t < 4); j++, t++) - { - W_u32[r][t] = le_bswap32(tk_u32[j]); - } - if (t == 4) - { - r++; - t = 0; - } - } - while (r < rounds + 1) - { - tk_u32[0] ^= - le_bswap32( - _gcry_aes_sbox4_ppc8(rol(le_bswap32(tk_u32[KC - 1]), 24)) ^ rcon); + unsigned int idx0 = keysched_idx(i + 0); + unsigned int idx1 = keysched_idx(i + 1); + u32 temp0 = tk_prev; + u32 temp1; - if (KC != 8) + if (j == KC) { - for (j = 1; j < KC; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } + j = 0; + temp0 = _gcry_aes_sbox4_ppc8(rol(temp0, 24)) ^ rcon; + rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff; } - else + else if (KC == 8 && j == 4) { - for (j = 1; j < KC/2; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } - - tk_u32[KC/2] ^= - le_bswap32(_gcry_aes_sbox4_ppc8(le_bswap32(tk_u32[KC/2 - 1]))); - - for (j = KC/2 + 1; j < KC; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } + temp0 = _gcry_aes_sbox4_ppc8(temp0); } - /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < rounds + 1); ) - { - for (; (j < KC) && (t < 4); j++, t++) - { - W_u32[r][t] = le_bswap32(tk_u32[j]); - } - if (t == 4) - { - r++; - t = 0; - } - } + temp1 = tk_u32[j + 0]; - rcon = (rcon << 1) ^ (-(rcon >> 7) & 0x1b); - } + tk_u32[j + 0] = temp0 ^ temp1; + tk_u32[j + 1] ^= temp0 ^ temp1; + tk_prev = tk_u32[j + 1]; - /* Store in big-endian order. */ - for (r = 0; r <= rounds; r++) - { -#ifndef WORDS_BIGENDIAN - VEC_STORE_BE(ekey, r, ALIGNED_LOAD (ekey, r), bige_const); -#else - block rvec = ALIGNED_LOAD (ekey, r); - ALIGNED_STORE (ekey, r, - vec_perm(rvec, rvec, vec_bswap32_const)); - (void)bige_const; -#endif + W_u32[idx0] = _gcry_bswap32(tk_u32[j + 0]); + W_u32[idx1] = _gcry_bswap32(tk_u32[j + 1]); } -#undef W -#undef tk -#undef k -#undef W_u32 -#undef tk_u32 -#undef k_u32 - wipememory(&tkk, sizeof(tkk)); + wipememory(tk_u32, sizeof(tk_u32)); } + void _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx) { diff --git a/cipher/rijndael-tables.h b/cipher/rijndael-tables.h index b54d9593..e46ce08c 100644 --- a/cipher/rijndael-tables.h +++ b/cipher/rijndael-tables.h @@ -218,10 +218,3 @@ static struct #define decT dec_tables.T #define inv_sbox dec_tables.inv_sbox - -static const u32 rcon[30] = - { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, - 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, - 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 - }; diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 7e75ddd2..f3060ea5 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -422,6 +422,17 @@ static void prefetch_dec(void) +static inline u32 +sbox4(u32 inb4) +{ + u32 out; + out = (encT[(inb4 >> 0) & 0xffU] & 0xff00U) >> 8; + out |= (encT[(inb4 >> 8) & 0xffU] & 0xff00U) >> 0; + out |= (encT[(inb4 >> 16) & 0xffU] & 0xff0000U) << 0; + out |= (encT[(inb4 >> 24) & 0xffU] & 0xff0000U) << 8; + return out; +} + /* Perform the key setup. 
*/ static gcry_err_code_t do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, @@ -431,8 +442,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, static const char *selftest_failed = 0; void (*hw_setkey)(RIJNDAEL_context *ctx, const byte *key) = NULL; int rounds; - int i,j, r, t, rconpointer = 0; - int KC; + unsigned int KC; unsigned int hwfeatures; /* The on-the-fly self tests are only run in non-fips mode. In fips @@ -662,101 +672,43 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, } else { - const byte *sbox = ((const byte *)encT) + 1; - union - { - PROPERLY_ALIGNED_TYPE dummy; - byte data[MAXKC][4]; - u32 data32[MAXKC]; - } tkk[2]; -#define k tkk[0].data -#define k_u32 tkk[0].data32 -#define tk tkk[1].data -#define tk_u32 tkk[1].data32 -#define W (ctx->keyschenc) -#define W_u32 (ctx->keyschenc32) + u32 W_prev; + u32 *W_u32 = ctx->keyschenc32b; + byte rcon = 1; + unsigned int i, j; prefetch_enc(); - for (i = 0; i < keylen; i++) + for (i = 0; i < KC; i += 2) { - k[i >> 2][i & 3] = key[i]; + W_u32[i + 0] = buf_get_le32(key + i * 4 + 0); + W_u32[i + 1] = buf_get_le32(key + i * 4 + 4); } - for (j = KC-1; j >= 0; j--) + for (i = KC, j = KC, W_prev = W_u32[KC - 1]; + i < 4 * (rounds + 1); + i += 2, j += 2) { - tk_u32[j] = k_u32[j]; - } - r = 0; - t = 0; - /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < rounds + 1); ) - { - for (; (j < KC) && (t < 4); j++, t++) - { - W_u32[r][t] = le_bswap32(tk_u32[j]); - } - if (t == 4) - { - r++; - t = 0; - } - } + u32 temp0 = W_prev; + u32 temp1; - while (r < rounds + 1) - { - /* While not enough round key material calculated calculate - new values. */ - tk[0][0] ^= sbox[tk[KC-1][1] * 4]; - tk[0][1] ^= sbox[tk[KC-1][2] * 4]; - tk[0][2] ^= sbox[tk[KC-1][3] * 4]; - tk[0][3] ^= sbox[tk[KC-1][0] * 4]; - tk[0][0] ^= rcon[rconpointer++]; - - if (KC != 8) + if (j == KC) { - for (j = 1; j < KC; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } + j = 0; + temp0 = sbox4(rol(temp0, 24)) ^ rcon; + rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff; } - else + else if (KC == 8 && j == 4) { - for (j = 1; j < KC/2; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } - tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4]; - tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4]; - tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4]; - tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4]; - for (j = KC/2 + 1; j < KC; j++) - { - tk_u32[j] ^= tk_u32[j-1]; - } + temp0 = sbox4(temp0); } - /* Copy values into round key array. */ - for (j = 0; (j < KC) && (r < rounds + 1); ) - { - for (; (j < KC) && (t < 4); j++, t++) - { - W_u32[r][t] = le_bswap32(tk_u32[j]); - } - if (t == 4) - { - r++; - t = 0; - } - } + temp1 = W_u32[i - KC + 0]; + + W_u32[i + 0] = temp0 ^ temp1; + W_u32[i + 1] = W_u32[i - KC + 1] ^ temp0 ^ temp1; + W_prev = W_u32[i + 1]; } -#undef W -#undef tk -#undef k -#undef W_u32 -#undef tk_u32 -#undef k_u32 - wipememory(&tkk, sizeof(tkk)); } return 0; -- 2.34.1 From tianjia.zhang at linux.alibaba.com Thu Jul 28 10:26:54 2022 From: tianjia.zhang at linux.alibaba.com (Tianjia Zhang) Date: Thu, 28 Jul 2022 16:26:54 +0800 Subject: [PATCH] sm4: add ARMv8 CE accelerated implementation for XTS mode Message-ID: <20220728082655.47697-1-tianjia.zhang@linux.alibaba.com> * cipher/sm4-armv8-aarch64-ce.S (_gcry_sm4_armv8_ce_xts_crypt): New. * cipher/sm4.c (_gcry_sm4_armv8_ce_xts_crypt): New. (_gcry_sm4_xts_crypt) [USE_ARM_CE]: Add ARMv8 CE implementation for XTS. 
-- Benchmark on T-Head Yitian-710 2.75 GHz: Before: SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz XTS enc | 0.373 ns/B 2560 MiB/s 1.02 c/B 2749 XTS dec | 0.372 ns/B 2562 MiB/s 1.02 c/B 2750 After (1.18x faster): SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz XTS enc | 0.314 ns/B 3038 MiB/s 0.863 c/B 2749 XTS dec | 0.314 ns/B 3037 MiB/s 0.863 c/B 2749 Signed-off-by: Tianjia Zhang --- cipher/sm4-armv8-aarch64-ce.S | 151 ++++++++++++++++++++++++++++++++++ cipher/sm4.c | 18 +++- 2 files changed, 168 insertions(+), 1 deletion(-) diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S index 5fb55947edc1..1a4ff736ad27 100644 --- a/cipher/sm4-armv8-aarch64-ce.S +++ b/cipher/sm4-armv8-aarch64-ce.S @@ -62,6 +62,7 @@ #define RTMP3 v19 #define RIV v20 +#define RMASK v21 /* Helper macros. */ @@ -69,6 +70,20 @@ ld1 {v24.16b-v27.16b}, [ptr], #64; \ ld1 {v28.16b-v31.16b}, [ptr]; +#define SM4_CRYPT_BLK(b0) \ + rev32 b0.16b, b0.16b; \ + sm4e(b0, v24); \ + sm4e(b0, v25); \ + sm4e(b0, v26); \ + sm4e(b0, v27); \ + sm4e(b0, v28); \ + sm4e(b0, v29); \ + sm4e(b0, v30); \ + sm4e(b0, v31); \ + rev64 b0.4s, b0.4s; \ + ext b0.16b, b0.16b, b0.16b, #8; \ + rev32 b0.16b, b0.16b; + #define crypt_blk4(b0, b1, b2, b3) \ rev32 b0.16b, b0.16b; \ rev32 b1.16b, b1.16b; \ @@ -577,4 +592,140 @@ _gcry_sm4_armv8_ce_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;) +.align 3 +.global _gcry_sm4_armv8_ce_xts_crypt +ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;) +_gcry_sm4_armv8_ce_xts_crypt: + /* input: + * x0: round key array, CTX + * x1: dst + * x2: src + * x3: tweak (big endian, 128 bit) + * x4: nblocks + */ + CFI_STARTPROC() + VPUSH_ABI + + load_rkey(x0) + + mov x7, #0x87 + mov x8, #0x1 + mov RMASK.d[0], x7 + mov RMASK.d[1], x8 + + ld1 {RIV.16b}, [x3] + mov v8.16b, RIV.16b + ext RIV.16b, RIV.16b, RIV.16b, #8 + +.Lxts_loop_blk: + sub x4, x4, #8 + tbnz x4, #63, .Lxts_tail8 + +#define tweak_next(vt, vin, RTMP) \ + sshr RTMP.2d, RIV.2d, #63; \ + add vt.2d, vin.2d, vin.2d; \ + and RTMP.16b, RTMP.16b, RMASK.16b; \ + add RIV.2d, RIV.2d, RIV.2d; \ + eor vt.16b, vt.16b, RTMP.16b; + + tweak_next( v9, v8, RTMP0) + tweak_next(v10, v9, RTMP1) + tweak_next(v11, v10, RTMP2) + tweak_next(v12, v11, RTMP3) + tweak_next(v13, v12, RTMP0) + tweak_next(v14, v13, RTMP1) + tweak_next(v15, v14, RTMP2) + + ld1 {v0.16b-v3.16b}, [x2], #64 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + ld1 {v4.16b-v7.16b}, [x2], #64 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + + crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7) + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + st1 {v0.16b-v3.16b}, [x1], #64 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + st1 {v4.16b-v7.16b}, [x1], #64 + + tweak_next(v8, v15, RTMP3) + + cbz x4, .Lxts_end + b .Lxts_loop_blk + +.Lxts_tail8: + add x4, x4, #8 + cmp x4, #4 + blt .Lxts_tail4 + + sub x4, x4, #4 + + tweak_next( v9, v8, RTMP0) + tweak_next(v10, v9, RTMP1) + tweak_next(v11, v10, RTMP2) + + ld1 {v0.16b-v3.16b}, [x2], #64 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + + crypt_blk4(v0, v1, v2, v3); + + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, 
v3.16b, v11.16b + st1 {v0.16b-v3.16b}, [x1], #64 + + tweak_next(v8, v11, RTMP3) + + cbz x4, .Lxts_end + +.Lxts_tail4: + sub x4, x4, #1 + + ld1 {v0.16b}, [x2], #16 + eor v0.16b, v0.16b, v8.16b + + SM4_CRYPT_BLK(v0) + + eor v0.16b, v0.16b, v8.16b + st1 {v0.16b}, [x1], #16 + + tweak_next(v8, v8, RTMP0) + + cbnz x4, .Lxts_tail4 + +.Lxts_end: + /* store new tweak */ + st1 {v8.16b}, [x3] + + CLEAR_REG(v8) + CLEAR_REG(v9) + CLEAR_REG(v10) + CLEAR_REG(v11) + CLEAR_REG(v12) + CLEAR_REG(v13) + CLEAR_REG(v14) + CLEAR_REG(v15) + CLEAR_REG(RIV) + + VPOP_ABI + ret_spec_stop + CFI_ENDPROC() +ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;) + #endif diff --git a/cipher/sm4.c b/cipher/sm4.c index b5d4691ddbcb..4cac3b6c64b0 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -1,6 +1,6 @@ /* sm4.c - SM4 Cipher Algorithm * Copyright (C) 2020 Alibaba Group. - * Copyright (C) 2020 Tianjia Zhang + * Copyright (C) 2020-2022 Tianjia Zhang * Copyright (C) 2020-2022 Jussi Kivilinna * * This file is part of Libgcrypt. @@ -539,6 +539,11 @@ extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out, byte *iv, size_t nblocks); +extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out, + const byte *in, + byte *tweak, + size_t nblocks); + extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out, const byte *in, size_t num_blocks); @@ -1510,6 +1515,17 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 0; +#ifdef USE_ARM_CE + if (ctx->use_arm_ce) + { + /* Process all blocks at a time. */ + _gcry_sm4_armv8_ce_xts_crypt(encrypt ? ctx->rkey_enc : ctx->rkey_dec, + outbuf, inbuf, tweak, nblocks); + + nblocks = 0; + } +#endif + /* Process remaining blocks. */ if (nblocks) { -- 2.24.3 (Apple Git-128)
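The rijndael key-expansion changes above drop the static rcon[30] table from
rijndael-tables.h and instead double the round constant on the fly in GF(2^8)
with the expression rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff, used
identically in do_setkey and _gcry_aes_ppc8_setkey. A minimal standalone C
check (an illustration only, not libgcrypt code) that this recurrence
reproduces the first entries of the removed table:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* First entries of the rcon[] table removed from rijndael-tables.h. */
  static const uint32_t table[] =
    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
  uint8_t rcon = 1;
  size_t i;

  for (i = 0; i < sizeof (table) / sizeof (table[0]); i++)
    {
      if (rcon != table[i])
        {
          printf ("mismatch at index %zu\n", i);
          return 1;
        }
      /* Double in GF(2^8) modulo x^8+x^4+x^3+x+1: shift left and, when the
         top bit was set, reduce with 0x1b -- the same expression as in the
         patches above. */
      rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff;
    }

  printf ("on-the-fly rcon matches the removed table\n");
  return 0;
}

Only the first ten round constants are ever consumed (the AES-128 case), so
the later entries of the removed 30-entry table were dead data anyway.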
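The tweak_next macro in the SM4 XTS patch is the standard XTS (IEEE P1619)
tweak update: the 128-bit tweak is multiplied by x in GF(2^128) and reduced
with the 0x87 constant kept in RMASK, with a signed lane shift of the
half-swapped copy in RIV supplying both the cross-lane carry and the
reduction mask. A scalar C sketch of the same update (an illustration only,
not libgcrypt code; the tweak is assumed to be held as two little-endian
64-bit halves, t[0] low and t[1] high, and xts_tweak_next is a made-up name):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Multiply a 128-bit XTS tweak by x, reducing with x^128 + x^7 + x^2 + x + 1,
   i.e. the 0x87 constant that the assembly loads into RMASK. */
static void
xts_tweak_next (uint64_t t[2])
{
  uint64_t carry_lo = t[0] >> 63;   /* bit 63 carries into the high half */
  uint64_t carry_hi = t[1] >> 63;   /* bit 127 selects the 0x87 reduction */

  t[1] = (t[1] << 1) | carry_lo;
  t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
}

int
main (void)
{
  /* Arbitrary starting tweak with bit 127 set, so the very first doubling
     already exercises the reduction step. */
  uint64_t t[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
  int i;

  for (i = 0; i < 3; i++)
    {
      xts_tweak_next (t);
      printf ("tweak after %d block(s): %016" PRIx64 "%016" PRIx64 "\n",
              i + 1, t[1], t[0]);
    }
  return 0;
}

Calling it once per 16-byte block reproduces the tweak sequence that the
8-block loop in the patch computes up front in v8..v15 before XOR-ing it
around the crypt_blk8 call.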