From jussi.kivilinna at iki.fi Wed Mar 1 10:46:37 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 1 Mar 2023 11:46:37 +0200 Subject: [PATCH 1/2] Improve PPC target function attribute checks Message-ID: <20230301094638.1902141-1-jussi.kivilinna@iki.fi> * configure.ac (gcry_cv_gcc_attribute_ppc_target) (gcry_cv_clang_attribute_ppc_target): Add 'always_inline' function to test. -- With some CFLAG combinations, target attribute fails to work with always_inline functions. Patch adds detection for such configuration and disables target attribute use in such case (and suffer less optimal code generation). Signed-off-by: Jussi Kivilinna --- configure.ac | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index 0d5c9160..44340e49 100644 --- a/configure.ac +++ b/configure.ac @@ -2376,10 +2376,11 @@ AC_CACHE_CHECK([whether compiler supports GCC PowerPC target attributes], else gcry_cv_gcc_attribute_ppc_target=no AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[void __attribute__((target("cpu=power8"))) testfn8(void) {} + [[void __attribute__((always_inline)) inline aifn(void) {} + void __attribute__((target("cpu=power8"))) testfn8(void) {aifn();} void __attribute__((target("cpu=power9"))) testfn9(void) - { testfn8(); } - ]], [ testfn9(); ])], + { testfn8(); aifn(); } + ]], [ testfn9(); aifn(); ])], [gcry_cv_gcc_attribute_ppc_target=yes]) fi]) if test "$gcry_cv_gcc_attribute_ppc_target" = "yes" ; then @@ -2398,10 +2399,11 @@ AC_CACHE_CHECK([whether compiler supports clang PowerPC target attributes], else gcry_cv_clang_attribute_ppc_target=no AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[void __attribute__((target("arch=pwr8"))) testfn8(void) {} + [[void __attribute__((always_inline)) inline aifn(void) {} + void __attribute__((target("arch=pwr8"))) testfn8(void) {aifn();} void __attribute__((target("arch=pwr9"))) testfn9(void) - { testfn8(); } - ]], [ testfn9(); ])], + { testfn8(); aifn(); } + ]], [ testfn9(); aifn(); ])], [gcry_cv_clang_attribute_ppc_target=yes]) fi]) if test "$gcry_cv_clang_attribute_ppc_target" = "yes" ; then -- 2.37.2 From jussi.kivilinna at iki.fi Wed Mar 1 10:46:38 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 1 Mar 2023 11:46:38 +0200 Subject: [PATCH 2/2] Fix "'inline' is not at beginning of declaration" warnings In-Reply-To: <20230301094638.1902141-1-jussi.kivilinna@iki.fi> References: <20230301094638.1902141-1-jussi.kivilinna@iki.fi> Message-ID: <20230301094638.1902141-2-jussi.kivilinna@iki.fi> * cipher/chacha20-ppc.c (chacha20_ppc_blocks1) (chacha20_ppc_blocks4, chacha20_poly1305_ppc_blocks4): Move 'ASM_FUNC_ATTR_INLINE' right after 'static'. * cipher/sha256-ppc.c (sha256_transform_ppc): Likewise. * cipher/sha512-ppc.c (sha512_transform_ppc): Likewise. 
-- Patch fixes these GCC warnings in PowerPC implementations: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Jussi Kivilinna --- cipher/chacha20-ppc.c | 8 ++++---- cipher/sha256-ppc.c | 2 +- cipher/sha512-ppc.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c index 243c12ff..f135a32f 100644 --- a/cipher/chacha20-ppc.c +++ b/cipher/chacha20-ppc.c @@ -136,7 +136,7 @@ vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a) #define ADD_U64(v,a) \ (v = vec_add_ctr_u64(v, a)) -static unsigned int ASM_FUNC_ATTR_INLINE +static ASM_FUNC_ATTR_INLINE unsigned int chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) { vector4x_u32 counter_1 = { 1, 0, 0, 0 }; @@ -282,7 +282,7 @@ chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE(b1, rotate_7); ROTATE(b2, rotate_7); -static unsigned int ASM_FUNC_ATTR_INLINE +static ASM_FUNC_ATTR_INLINE unsigned int chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) { vector4x_u32 counters_0123 = { 0, 1, 2, 3 }; @@ -468,7 +468,7 @@ chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \ } while (0) -static unsigned int ASM_FUNC_ATTR_INLINE +static ASM_FUNC_ATTR_INLINE unsigned int chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src) @@ -641,7 +641,7 @@ chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, #else -static unsigned int ASM_FUNC_ATTR_INLINE +static ASM_FUNC_ATTR_INLINE unsigned int chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src) diff --git a/cipher/sha256-ppc.c b/cipher/sha256-ppc.c index fd69380f..e5839a84 100644 --- a/cipher/sha256-ppc.c +++ b/cipher/sha256-ppc.c @@ -278,7 +278,7 @@ vec_u32_load_be(unsigned long offset, const void *ptr) wlt1; \ }) -static unsigned int ASM_FUNC_ATTR ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 +static ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 unsigned int sha256_transform_ppc(u32 state[8], const unsigned char *data, size_t nblks) { vector4x_u32 h0, h1, h2, h3, h4, h5, h6, h7; diff --git a/cipher/sha512-ppc.c b/cipher/sha512-ppc.c index 6e69ddb9..d213c241 100644 --- a/cipher/sha512-ppc.c +++ b/cipher/sha512-ppc.c @@ -339,7 +339,7 @@ vec_u64_load_be(unsigned long offset, const void *ptr) wlt1; \ }) -static unsigned int ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 +static ASM_FUNC_ATTR_INLINE FUNC_ATTR_OPT_O2 unsigned int sha512_transform_ppc(u64 state[8], const unsigned char *data, size_t nblks) { vector2x_u64 h0, h1, h2, h3, h4, h5, h6, h7; -- 2.37.2 From jussi.kivilinna at iki.fi Thu Mar 2 12:58:59 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 2 Mar 2023 13:58:59 +0200 Subject: [PATCH 2/3] camellia-simd128: faster sbox filtering with uint8 right shift In-Reply-To: <20230302115900.3263821-1-jussi.kivilinna@iki.fi> References: <20230302115900.3263821-1-jussi.kivilinna@iki.fi> Message-ID: <20230302115900.3263821-2-jussi.kivilinna@iki.fi> * cipher/camellia-simd128.h (if_vpsrlb128) (if_not_vpsrlb128): New. (filter_8bit): Use 'vpsrlb128' when available on target architecture (PowerPC and AArch64). 
-- Benchmark on POWER9: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 3.26 ns/B 292.8 MiB/s 7.49 c/B ECB dec | 3.29 ns/B 290.0 MiB/s 7.56 c/B After (~2% faster): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 3.16 ns/B 301.4 MiB/s 7.28 c/B ECB dec | 3.19 ns/B 298.7 MiB/s 7.34 c/B Signed-off-by: Jussi Kivilinna --- cipher/camellia-simd128.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h index 9cb7b987..6b44961f 100644 --- a/cipher/camellia-simd128.h +++ b/cipher/camellia-simd128.h @@ -91,6 +91,8 @@ asm_sbox_be(uint8x16_t b) o = (__m128i)vec_sld((uint8x16_t)a, \ (uint8x16_t)__tmp, (s) & 15);}) +#define if_vpsrlb128(...) __VA_ARGS__ +#define if_not_vpsrlb128(...) /*_*/ #define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o) #define vpsll_byte_128(s, a, o) vpsllb128(s, a, o) @@ -182,6 +184,8 @@ static const uint8x16_t shift_row = o = (__m128i)vextq_u8((uint8x16_t)__tmp, \ (uint8x16_t)a, (16 - (s)) & 15);}) +#define if_vpsrlb128(...) __VA_ARGS__ +#define if_not_vpsrlb128(...) /*_*/ #define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o) #define vpsll_byte_128(s, a, o) vpsllb128(s, a, o) @@ -253,6 +257,8 @@ static const uint8x16_t shift_row = #define vpsrldq128(s, a, o) (o = _mm_srli_si128(a, s)) #define vpslldq128(s, a, o) (o = _mm_slli_si128(a, s)) +#define if_vpsrlb128(...) /*_*/ +#define if_not_vpsrlb128(...) __VA_ARGS__ #define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o) #define vpsll_byte_128(s, a, o) vpslld128(s, a, o) @@ -309,8 +315,9 @@ static const uint8x16_t shift_row = **********************************************************************/ #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand128(x, mask4bit, tmp0); \ - vpandn128(x, mask4bit, x); \ - vpsrl_byte_128(4, x, x); \ + if_vpsrlb128(vpsrlb128(4, x, x)); \ + if_not_vpsrlb128(vpandn128(x, mask4bit, x)); \ + if_not_vpsrlb128(vpsrld128(4, x, x)); \ \ vpshufb128(tmp0, lo_t, tmp0); \ vpshufb128(x, hi_t, x); \ -- 2.37.2 From jussi.kivilinna at iki.fi Thu Mar 2 12:58:58 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 2 Mar 2023 13:58:58 +0200 Subject: [PATCH 1/3] chacha20-ppc: do not generate p9 code when target attr unavailable Message-ID: <20230302115900.3263821-1-jussi.kivilinna@iki.fi> * cipher/chacha20-ppc.c (HAVE_FUNC_ATTR_TARGET): New. (_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4) (_gcry_chacha20_poly1305_ppc8_blocks4): Use inline functions only if HAVE_FUNC_ATTR_TARGET is defined. 
-- Signed-off-by: Jussi Kivilinna --- cipher/chacha20-ppc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c index f135a32f..994b6a01 100644 --- a/cipher/chacha20-ppc.c +++ b/cipher/chacha20-ppc.c @@ -660,12 +660,15 @@ chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, #if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) # define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8"))) # define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9"))) +# define HAVE_FUNC_ATTR_TARGET 1 #elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) # define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) # define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) +# define HAVE_FUNC_ATTR_TARGET 1 #else # define FUNC_ATTR_TARGET_P8 # define FUNC_ATTR_TARGET_P9 +# undef HAVE_FUNC_ATTR_TARGET #endif @@ -693,7 +696,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, poly1305_src); } -#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +#ifdef HAVE_FUNC_ATTR_TARGET /* Functions targetting POWER9. */ unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src, -- 2.37.2 From jussi.kivilinna at iki.fi Thu Mar 2 12:59:00 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 2 Mar 2023 13:59:00 +0200 Subject: [PATCH 3/3] Add PowerPC vector implementation of SM4 In-Reply-To: <20230302115900.3263821-1-jussi.kivilinna@iki.fi> References: <20230302115900.3263821-1-jussi.kivilinna@iki.fi> Message-ID: <20230302115900.3263821-3-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'sm4-ppc.c'. * cipher/sm4-ppc.c: New. * cipher/sm4.c (USE_PPC_CRYPTO): New. (SM4_context): Add 'use_ppc8le' and 'use_ppc9le'. [USE_PPC_CRYPTO] (_gcry_sm4_ppc8le_crypt_blk1_16) (_gcry_sm4_ppc9le_crypt_blk1_16, sm4_ppc8le_crypt_blk1_16) (sm4_ppc9le_crypt_blk1_16): New. (sm4_setkey) [USE_PPC_CRYPTO]: Set use_ppc8le and use_ppc9le based on HW features. (sm4_get_crypt_blk1_16_fn) [USE_PPC_CRYPTO]: Add PowerPC implementation selection. 
-- Benchmark on POWER9: Before: SM4 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 14.47 ns/B 65.89 MiB/s 33.29 c/B ECB dec | 14.47 ns/B 65.89 MiB/s 33.29 c/B CBC enc | 35.09 ns/B 27.18 MiB/s 80.71 c/B CBC dec | 16.69 ns/B 57.13 MiB/s 38.39 c/B CFB enc | 35.09 ns/B 27.18 MiB/s 80.71 c/B CFB dec | 16.76 ns/B 56.90 MiB/s 38.55 c/B CTR enc | 16.88 ns/B 56.50 MiB/s 38.82 c/B CTR dec | 16.88 ns/B 56.50 MiB/s 38.82 c/B After (ECB ~4.4x faster): SM4 | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 3.26 ns/B 292.3 MiB/s 7.50 c/B ECB dec | 3.26 ns/B 292.3 MiB/s 7.50 c/B CBC enc | 35.10 ns/B 27.17 MiB/s 80.72 c/B CBC dec | 3.33 ns/B 286.3 MiB/s 7.66 c/B CFB enc | 35.10 ns/B 27.17 MiB/s 80.74 c/B CFB dec | 3.36 ns/B 283.8 MiB/s 7.73 c/B CTR enc | 3.47 ns/B 275.0 MiB/s 7.98 c/B CTR dec | 3.47 ns/B 275.0 MiB/s 7.98 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 + cipher/sm4-ppc.c | 342 +++++++++++++++++++++++++++++++++++++++++++++ cipher/sm4.c | 48 +++++++ configure.ac | 5 + 4 files changed, 396 insertions(+) create mode 100644 cipher/sm4-ppc.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index dcaa68bb..d8f520dd 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -123,6 +123,7 @@ EXTRA_libcipher_la_SOURCES = \ sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \ sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \ sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \ + sm4-ppc.c \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ diff --git a/cipher/sm4-ppc.c b/cipher/sm4-ppc.c new file mode 100644 index 00000000..bb2c55e0 --- /dev/null +++ b/cipher/sm4-ppc.c @@ -0,0 +1,342 @@ +/* sm4-ppc.c - PowerPC implementation of SM4 cipher + * + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#include + +#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \ + !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4) + +#include +#include "bufhelp.h" + +typedef vector unsigned char vector16x_u8; +typedef vector unsigned int vector4x_u32; +typedef vector unsigned long long vector2x_u64; + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT +#endif + +#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET) +# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9"))) +# define HAVE_FUNC_ATTR_TARGET 1 +#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET) +# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) +# define HAVE_FUNC_ATTR_TARGET 1 +#else +# define FUNC_ATTR_TARGET_P8 +# define FUNC_ATTR_TARGET_P9 +# undef HAVE_FUNC_ATTR_TARGET +#endif + +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NO_INLINE __attribute__((noinline)) +#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function)) + +#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION +#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE +#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE + +#ifdef __clang__ +/* clang has mismatching prototype for vec_sbox_be. */ +static ASM_FUNC_ATTR_INLINE vector16x_u8 +asm_sbox_be(vector16x_u8 b) +{ + vector16x_u8 o; + __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b)); + return o; +} +#undef vec_sbox_be +#define vec_sbox_be asm_sbox_be +#endif /* __clang__ */ + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + t2 = (vector4x_u32)vec_mergel((vector4x_u32)x0, (vector4x_u32)x1); \ + x0 = (vector4x_u32)vec_mergeh((vector4x_u32)x0, (vector4x_u32)x1); \ + \ + t1 = (vector4x_u32)vec_mergeh((vector4x_u32)x2, (vector4x_u32)x3); \ + x2 = (vector4x_u32)vec_mergel((vector4x_u32)x2, (vector4x_u32)x3); \ + \ + x1 = (vector4x_u32)vec_mergel((vector2x_u64)x0, (vector2x_u64)t1); \ + x0 = (vector4x_u32)vec_mergeh((vector2x_u64)x0, (vector2x_u64)t1); \ + \ + x3 = (vector4x_u32)vec_mergel((vector2x_u64)t2, (vector2x_u64)x2); \ + x2 = (vector4x_u32)vec_mergeh((vector2x_u64)t2, (vector2x_u64)x2); + +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) ({ \ + tmp0 = x & mask4bit; \ + x = (vector4x_u32)((vector16x_u8)x >> 4); \ + \ + tmp0 = (vector4x_u32)vec_perm((vector16x_u8)lo_t, (vector16x_u8)lo_t, \ + (vector16x_u8)tmp0); \ + x = (vector4x_u32)vec_perm((vector16x_u8)hi_t, (vector16x_u8)hi_t, \ + (vector16x_u8)x); \ + x = x ^ tmp0; \ + }) + +#define GET_RKEY(round) vec_splat(r4keys, round) + +#define ROUND4(round, s0, s1, s2, s3) ({ \ + vector4x_u32 rkey = GET_RKEY(round); \ + vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \ + filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \ + rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \ + filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \ + s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \ + vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \ + }) + +#define ROUND8(round, s0, s1, s2, s3, r0, r1, r2, r3) ({ \ + vector4x_u32 rkey = GET_RKEY(round); \ + vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \ + vector4x_u32 rx1 = rkey ^ r1 ^ r2 ^ r3; \ + filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \ + filter_8bit(rx1, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \ + rx0 = 
(vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \ + rx1 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx1); \ + filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \ + filter_8bit(rx1, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \ + s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \ + vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \ + r0 ^= rx1 ^ vec_rl(rx1, rotate2) ^ vec_rl(rx1, rotate10) ^ \ + vec_rl(rx1, rotate18) ^ vec_rl(rx1, rotate24); \ + }) + +static const vector4x_u32 mask_0f = + { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f }; +static const vector2x_u64 pre_tf_lo_s = + { 0x9096E3E575730600ULL, 0xC6C0B5B323255056ULL }; +static const vector2x_u64 pre_tf_hi_s = + { 0xE341AA08EA48A301ULL, 0xF153B81AF85AB113ULL }; +static const vector2x_u64 post_tf_lo_s = + { 0x6F53C6FA95A93C00ULL, 0xD9E5704C231F8AB6ULL }; +static const vector2x_u64 post_tf_hi_s = + { 0x9A4635E9479BE834ULL, 0x25F98A56F824578BULL }; +static const vector4x_u32 rotate2 = { 2, 2, 2, 2 }; +static const vector4x_u32 rotate10 = { 10, 10, 10, 10 }; +static const vector4x_u32 rotate18 = { 18, 18, 18, 18 }; +static const vector4x_u32 rotate24 = { 24, 24, 24, 24 }; + +static ASM_FUNC_ATTR_INLINE void +sm4_ppc_crypt_blk16(u32 *rk, byte *out, const byte *in) +{ + vector4x_u32 ra0, ra1, ra2, ra3; + vector4x_u32 rb0, rb1, rb2, rb3; + vector4x_u32 rc0, rc1, rc2, rc3; + vector4x_u32 rd0, rd1, rd2, rd3; + vector4x_u32 tmp0, tmp1; + u32 *rk_end; + + ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16)); + ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16)); + ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16)); + ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16)); + rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16)); + rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16)); + rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16)); + rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16)); + in += 8 * 16; + rc0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16)); + rc1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16)); + rc2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16)); + rc3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16)); + rd0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16)); + rd1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16)); + rd2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16)); + rd3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16)); + + transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1); + transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1); + transpose_4x4(rc0, rc1, rc2, rc3, tmp0, tmp1); + transpose_4x4(rd0, rd1, rd2, rd3, tmp0, tmp1); + + for (rk_end = rk + 32; rk < rk_end; rk += 4) + { + vector4x_u32 r4keys = vec_xl(0, rk); + ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3); + ROUND8(0, rc0, rc1, rc2, rc3, rd0, rd1, rd2, rd3); + ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0); + ROUND8(1, rc1, rc2, rc3, rc0, rd1, rd2, rd3, rd0); + ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1); + ROUND8(2, rc2, rc3, rc0, rc1, rd2, rd3, rd0, rd1); + ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2); + ROUND8(3, rc3, rc0, rc1, rc2, rd3, rd0, rd1, rd2); + } + + transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1); + transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1); + transpose_4x4(rc3, rc2, rc1, rc0, tmp0, tmp1); + transpose_4x4(rd3, rd2, rd1, rd0, tmp0, tmp1); + + vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16); + vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16); + vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16); + vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16); + vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 
16); + vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16); + vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16); + vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16); + out += 8 * 16; + vec_xst((vector16x_u8)vec_revb(rc3), 0, out + 0 * 16); + vec_xst((vector16x_u8)vec_revb(rc2), 0, out + 1 * 16); + vec_xst((vector16x_u8)vec_revb(rc1), 0, out + 2 * 16); + vec_xst((vector16x_u8)vec_revb(rc0), 0, out + 3 * 16); + vec_xst((vector16x_u8)vec_revb(rd3), 0, out + 4 * 16); + vec_xst((vector16x_u8)vec_revb(rd2), 0, out + 5 * 16); + vec_xst((vector16x_u8)vec_revb(rd1), 0, out + 6 * 16); + vec_xst((vector16x_u8)vec_revb(rd0), 0, out + 7 * 16); +} + +static ASM_FUNC_ATTR_INLINE void +sm4_ppc_crypt_blk8(u32 *rk, byte *out, const byte *in) +{ + vector4x_u32 ra0, ra1, ra2, ra3; + vector4x_u32 rb0, rb1, rb2, rb3; + vector4x_u32 tmp0, tmp1; + u32 *rk_end; + + ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16)); + ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16)); + ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16)); + ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16)); + rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16)); + rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16)); + rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16)); + rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16)); + + transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1); + transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1); + + for (rk_end = rk + 32; rk < rk_end; rk += 4) + { + vector4x_u32 r4keys = vec_xl(0, rk); + ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3); + ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0); + ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1); + ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2); + } + + transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1); + transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1); + + vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16); + vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16); + vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16); + vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16); + vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16); + vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16); + vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16); + vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16); +} + +static ASM_FUNC_ATTR_INLINE void +sm4_ppc_crypt_blk1_4(u32 *rk, byte *out, const byte *in, size_t nblks) +{ + vector4x_u32 ra0, ra1, ra2, ra3; + vector4x_u32 tmp0, tmp1; + u32 *rk_end; + + ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16)); + ra1 = ra0; + ra2 = ra0; + ra3 = ra0; + if (LIKELY(nblks > 1)) + ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16)); + if (LIKELY(nblks > 2)) + ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16)); + if (LIKELY(nblks > 3)) + ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16)); + + transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1); + + for (rk_end = rk + 32; rk < rk_end; rk += 4) + { + vector4x_u32 r4keys = vec_xl(0, rk); + ROUND4(0, ra0, ra1, ra2, ra3); + ROUND4(1, ra1, ra2, ra3, ra0); + ROUND4(2, ra2, ra3, ra0, ra1); + ROUND4(3, ra3, ra0, ra1, ra2); + } + + transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1); + + vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16); + if (LIKELY(nblks > 1)) + vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16); + if (LIKELY(nblks > 2)) + vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16); + if (LIKELY(nblks > 3)) + vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16); +} + +static ASM_FUNC_ATTR_INLINE void +sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks) 
+{ + if (nblks >= 16) + { + sm4_ppc_crypt_blk16(rk, out, in); + return; + } + + while (nblks >= 8) + { + sm4_ppc_crypt_blk8(rk, out, in); + in += 8 * 16; + out += 8 * 16; + nblks -= 8; + } + + while (nblks) + { + size_t currblks = nblks > 4 ? 4 : nblks; + sm4_ppc_crypt_blk1_4(rk, out, in, currblks); + in += currblks * 16; + out += currblks * 16; + nblks -= currblks; + } +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 void +_gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, + size_t nblks) +{ + sm4_ppc_crypt_blk1_16(rk, out, in, nblks); +} + +ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 void +_gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, + size_t nblks) +{ +#ifdef HAVE_FUNC_ATTR_TARGET + /* Inline for POWER9 target optimization. */ + sm4_ppc_crypt_blk1_16(rk, out, in, nblks); +#else + /* Target selecting not working, just call the other noinline function. */ + _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks); +#endif +} + +#endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/cipher/sm4.c b/cipher/sm4.c index b0402b64..06b843f8 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -115,6 +115,14 @@ # endif #endif +#undef USE_PPC_CRYPTO +#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \ + defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \ + !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4) +# define USE_PPC_CRYPTO 1 +#endif + static const char *sm4_selftest (void); static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr, @@ -169,6 +177,10 @@ typedef struct #ifdef USE_ARM_SVE_CE unsigned int use_arm_sve_ce:1; #endif +#ifdef USE_PPC_CRYPTO + unsigned int use_ppc8le:1; + unsigned int use_ppc9le:1; +#endif } SM4_context; static const u32 fk[4] = @@ -598,6 +610,28 @@ sm4_armv9_sve_ce_crypt_blk1_16(void *rk, byte *out, const byte *in, extern unsigned int _gcry_sm4_armv9_sve_get_vl(void); #endif /* USE_ARM_SVE_CE */ +#ifdef USE_PPC_CRYPTO +extern void _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, + size_t num_blks); + +extern void _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in, + size_t num_blks); + +static inline unsigned int +sm4_ppc8le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks) +{ + _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, num_blks); + return 0; +} + +static inline unsigned int +sm4_ppc9le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks) +{ + _gcry_sm4_ppc9le_crypt_blk1_16(rk, out, in, num_blks); + return 0; +} +#endif /* USE_PPC_CRYPTO */ + static inline void prefetch_sbox_table(void) { const volatile byte *vtab = (void *)&sbox_table; @@ -775,6 +809,10 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVE2) && (hwf & HWF_ARM_SVESM4) && _gcry_sm4_armv9_sve_get_vl() > 16; #endif +#ifdef USE_PPC_CRYPTO + ctx->use_ppc8le = (hwf & HWF_PPC_VCRYPTO) != 0; + ctx->use_ppc9le = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00); +#endif #ifdef USE_GFNI_AVX2 if (ctx->use_gfni_avx2) @@ -1008,6 +1046,16 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx) { return &sm4_aarch64_crypt_blk1_16; } +#endif +#ifdef USE_PPC_CRYPTO + else if (ctx->use_ppc9le) + { + return &sm4_ppc9le_crypt_blk1_16; + } + else if (ctx->use_ppc8le) + { + return &sm4_ppc8le_crypt_blk1_16; + } #endif else { diff --git a/configure.ac b/configure.ac index 44340e49..60fb1f75 100644 --- a/configure.ac +++ b/configure.ac @@ -3191,6 +3191,11 @@ if test "$found" = "1" ; then GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aarch64.lo" 
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv8-aarch64-ce.lo" GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-armv9-aarch64-sve-ce.lo" + ;; + powerpc64le-*-*) + # Build with the ppc64le vector implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-ppc.lo" + ;; esac fi -- 2.37.2 From jussi.kivilinna at iki.fi Mon Mar 6 20:36:29 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 6 Mar 2023 21:36:29 +0200 Subject: [PATCH] rijndael-ppc: use vector registers for key schedule calculations Message-ID: <20230306193629.829897-1-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc.c (_gcry_aes_sbox4_ppc8): Remove. (bcast_u32_to_vec, u32_from_vec): New. (_gcry_aes_ppc8_setkey): Use vectors for round key calculation variables. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc.c | 68 +++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index 7530209d..055b00c0 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -116,25 +116,32 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr) } -static ASM_FUNC_ATTR_INLINE u32 -_gcry_aes_sbox4_ppc8(u32 fourbytes) +static ASM_FUNC_ATTR_INLINE unsigned int +keysched_idx(unsigned int in) { - vec_u32 vec_fourbyte = { fourbytes, fourbytes, fourbytes, fourbytes }; #ifdef WORDS_BIGENDIAN - return ((vec_u32)asm_sbox_be((block)vec_fourbyte))[1]; + return in; #else - return ((vec_u32)asm_sbox_be((block)vec_fourbyte))[2]; + return (in & ~3U) | (3U - (in & 3U)); #endif } -static ASM_FUNC_ATTR_INLINE unsigned int -keysched_idx(unsigned int in) +static ASM_FUNC_ATTR_INLINE vec_u32 +bcast_u32_to_vec(u32 x) +{ + vec_u32 v = { x, x, x, x }; + return v; +} + + +static ASM_FUNC_ATTR_INLINE u32 +u32_from_vec(vec_u32 x) { #ifdef WORDS_BIGENDIAN - return in; + return x[1]; #else - return (in & ~3U) | (3U - (in & 3U)); + return x[2]; #endif } @@ -142,55 +149,58 @@ keysched_idx(unsigned int in) void PPC_OPT_ATTR _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key) { - u32 tk_u32[MAXKC]; + static const vec_u32 rotate24 = { 24, 24, 24, 24 }; + static const vec_u32 rcon_const = { 0x1b, 0x1b, 0x1b, 0x1b }; + vec_u32 tk_vu32[MAXKC]; unsigned int rounds = ctx->rounds; unsigned int KC = rounds - 6; u32 *W_u32 = ctx->keyschenc32b; unsigned int i, j; - u32 tk_prev; - byte rcon = 1; + vec_u32 tk_prev; + vec_u32 rcon = { 1, 1, 1, 1 }; for (i = 0; i < KC; i += 2) { unsigned int idx0 = keysched_idx(i + 0); unsigned int idx1 = keysched_idx(i + 1); - tk_u32[i + 0] = buf_get_le32(key + i * 4 + 0); - tk_u32[i + 1] = buf_get_le32(key + i * 4 + 4); - W_u32[idx0] = _gcry_bswap32(tk_u32[i + 0]); - W_u32[idx1] = _gcry_bswap32(tk_u32[i + 1]); + tk_vu32[i + 0] = bcast_u32_to_vec(buf_get_le32(key + i * 4 + 0)); + tk_vu32[i + 1] = bcast_u32_to_vec(buf_get_le32(key + i * 4 + 4)); + W_u32[idx0] = u32_from_vec(vec_revb(tk_vu32[i + 0])); + W_u32[idx1] = u32_from_vec(vec_revb(tk_vu32[i + 1])); } - for (i = KC, j = KC, tk_prev = tk_u32[KC - 1]; + for (i = KC, j = KC, tk_prev = tk_vu32[KC - 1]; i < 4 * (rounds + 1); i += 2, j += 2) { unsigned int idx0 = keysched_idx(i + 0); unsigned int idx1 = keysched_idx(i + 1); - u32 temp0 = tk_prev; - u32 temp1; + vec_u32 temp0 = tk_prev; + vec_u32 temp1; if (j == KC) { j = 0; - temp0 = _gcry_aes_sbox4_ppc8(rol(temp0, 24)) ^ rcon; - rcon = ((rcon << 1) ^ (-(rcon >> 7) & 0x1b)) & 0xff; + temp0 = (vec_u32)(asm_sbox_be((block)vec_rl(temp0, rotate24))) ^ rcon; + rcon = (vec_u32)(((block)rcon << 1) + ^ (-((block)rcon >> 7) & 
(block)rcon_const)); } else if (KC == 8 && j == 4) { - temp0 = _gcry_aes_sbox4_ppc8(temp0); + temp0 = (vec_u32)asm_sbox_be((block)temp0); } - temp1 = tk_u32[j + 0]; + temp1 = tk_vu32[j + 0]; - tk_u32[j + 0] = temp0 ^ temp1; - tk_u32[j + 1] ^= temp0 ^ temp1; - tk_prev = tk_u32[j + 1]; + tk_vu32[j + 0] = temp0 ^ temp1; + tk_vu32[j + 1] ^= temp0 ^ temp1; + tk_prev = tk_vu32[j + 1]; - W_u32[idx0] = _gcry_bswap32(tk_u32[j + 0]); - W_u32[idx1] = _gcry_bswap32(tk_u32[j + 1]); + W_u32[idx0] = u32_from_vec(vec_revb(tk_vu32[j + 0])); + W_u32[idx1] = u32_from_vec(vec_revb(tk_vu32[j + 1])); } - wipememory(tk_u32, sizeof(tk_u32)); + wipememory(tk_vu32, sizeof(tk_vu32)); } -- 2.37.2 From jussi.kivilinna at iki.fi Mon Mar 13 18:54:31 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 13 Mar 2023 19:54:31 +0200 Subject: [PATCH] camellia-gfni: use GFNI for uint8 right shift in FLS Message-ID: <20230313175431.186874-1-jussi.kivilinna@iki.fi> * cipher/camellia-gfni-avx512-amd64.S (clear_regs): Don't clear %k1. (rol32_1_64): Use vgf2p8affineqb for uint8 right shift by 7. (fls64): Adjust for rol32_1_64 changes. (.Lbyte_ones): Remove. (.Lright_shift_by_7): New. (_gcry_camellia_gfni_avx512_ctr_enc): Clear %k1 after use. -- Benchmark on Intel Core i3-1115G4: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.194 ns/B 4920 MiB/s 0.794 c/B 4096?4 ECB dec | 0.194 ns/B 4916 MiB/s 0.793 c/B 4089 After (~1.7% faster) CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.190 ns/B 5008 MiB/s 0.780 c/B 4096?3 ECB dec | 0.191 ns/B 5002 MiB/s 0.781 c/B 4096?3 Signed-off-by: Jussi Kivilinna --- cipher/camellia-gfni-avx512-amd64.S | 37 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index b676379f..643eed3e 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -105,7 +105,6 @@ clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31) #define clear_regs() \ - kxorq %k1, %k1, %k1; \ vzeroall; \ clear_zmm16_zmm31() @@ -307,22 +306,18 @@ * v0..3: (IN << 1) * t0, t1, t2, zero: (IN >> 7) */ -#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \ - vpcmpltb zero, v0, %k1; \ +#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, t3, right_shift_by_7) \ + vgf2p8affineqb $0, right_shift_by_7, v0, t0; \ vpaddb v0, v0, v0; \ - vpaddb one, zero, t0{%k1}{z}; \ \ - vpcmpltb zero, v1, %k1; \ + vgf2p8affineqb $0, right_shift_by_7, v1, t1; \ vpaddb v1, v1, v1; \ - vpaddb one, zero, t1{%k1}{z}; \ \ - vpcmpltb zero, v2, %k1; \ + vgf2p8affineqb $0, right_shift_by_7, v2, t2; \ vpaddb v2, v2, v2; \ - vpaddb one, zero, t2{%k1}{z}; \ \ - vpcmpltb zero, v3, %k1; \ - vpaddb v3, v3, v3; \ - vpaddb one, zero, zero{%k1}{z}; + vgf2p8affineqb $0, right_shift_by_7, v3, t3; \ + vpaddb v3, v3, v3; /* * IN: @@ -338,8 +333,7 @@ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ - vpbroadcastq .Lbyte_ones rRIP, tmp; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ + vpbroadcastq .Lright_shift_by_7 rRIP, tmp; \ vpbroadcastb 0+kll, t3; \ vpbroadcastb 1+kll, t2; \ vpbroadcastb 2+kll, t1; \ @@ -360,7 +354,6 @@ vmovdqu64 l6, l##_6; \ vpternlogq $0x96, tt3, t3, l7; \ vmovdqu64 l7, l##_7; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t2 = krr; \ @@ -399,7 +392,6 @@ vpternlogq $0x96, tt1, t1, r##_5; \ vpternlogq $0x96, tt0, t2, r##_6; \ vpternlogq $0x96, tt3, t3, r##_7; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t0 = klr; \ @@ -596,9 +588,6 @@ ELF(.type 
_gcry_camellia_gfni_avx512__constants, at object;) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lbyte_ones: - .quad 0x0101010101010101 - /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. @@ -663,6 +652,17 @@ ELF(.type _gcry_camellia_gfni_avx512__constants, at object;) BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +/* Bit-matrix for right shifting uint8_t values in vector by 7. */ +.Lright_shift_by_7: + .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0)) + /* CTR byte addition constants */ .align 64 .Lbige_addb_0_1: @@ -904,6 +904,7 @@ _gcry_camellia_gfni_avx512_ctr_enc: add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */ add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */ add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */ + kxorq %k1, %k1, %k1; .align 4 .Lload_ctr_done: -- 2.37.2 From jussi.kivilinna at iki.fi Tue Mar 14 18:27:25 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 14 Mar 2023 19:27:25 +0200 Subject: [PATCH] camellia-simd128: use 8-bit right shift for rotate function Message-ID: <20230314172725.125163-1-jussi.kivilinna@iki.fi> * cipher/camellia-simd128.h (rol32_1_16): Use vpsrlb128 for uint8 right shift by 7 if available. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-simd128.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h index 6b44961f..ed26afb7 100644 --- a/cipher/camellia-simd128.h +++ b/cipher/camellia-simd128.h @@ -593,23 +593,27 @@ static const uint8x16_t shift_row = * v0..3: (IN <<< 1) */ #define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ - vpcmpgtb128(v0, zero, t0); \ + if_vpsrlb128(vpsrlb128(7, v0, t0)); \ + if_not_vpsrlb128(vpcmpgtb128(v0, zero, t0)); \ vpaddb128(v0, v0, v0); \ - vpabsb128(t0, t0); \ + if_not_vpsrlb128(vpabsb128(t0, t0)); \ \ - vpcmpgtb128(v1, zero, t1); \ + if_vpsrlb128(vpsrlb128(7, v1, t1)); \ + if_not_vpsrlb128(vpcmpgtb128(v1, zero, t1)); \ vpaddb128(v1, v1, v1); \ - vpabsb128(t1, t1); \ + if_not_vpsrlb128(vpabsb128(t1, t1)); \ \ - vpcmpgtb128(v2, zero, t2); \ + if_vpsrlb128(vpsrlb128(7, v2, t2)); \ + if_not_vpsrlb128(vpcmpgtb128(v2, zero, t2)); \ vpaddb128(v2, v2, v2); \ - vpabsb128(t2, t2); \ + if_not_vpsrlb128(vpabsb128(t2, t2)); \ \ vpor128(t0, v1, v1); \ \ - vpcmpgtb128(v3, zero, t0); \ + if_vpsrlb128(vpsrlb128(7, v3, t0)); \ + if_not_vpsrlb128(vpcmpgtb128(v3, zero, t0)); \ vpaddb128(v3, v3, v3); \ - vpabsb128(t0, t0); \ + if_not_vpsrlb128(vpabsb128(t0, t0)); \ \ vpor128(t1, v2, v2); \ vpor128(t2, v3, v3); \ -- 2.37.2 From jussi.kivilinna at iki.fi Tue Mar 14 18:28:55 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 14 Mar 2023 19:28:55 +0200 Subject: [PATCH v2] camellia-gfni: use GFNI for uint8 right shift in FLS Message-ID: <20230314172855.125273-1-jussi.kivilinna@iki.fi> * cipher/camellia-aesni-avx2-amd64.h (IF_GFNI, IF_NOT_GFNI): New. [CAMELLIA_GFNI_BUILD] (rol32_1_32): Add GFNI variant which uses vgf2p8affineqb for uint8 right shift by 7. (fls32): Load 'right shift by 7' bit-matrix on GFNI build. [CAMELLIA_GFNI_BUILD] (.Lright_shift_by_7): New. * cipher/camellia-gfni-avx512-amd64.S (clear_regs): Don't clear %k1. 
(rol32_1_64): Use vgf2p8affineqb for uint8 right shift by 7. (fls64): Adjust for rol32_1_64 changes. (.Lbyte_ones): Remove. (.Lright_shift_by_7): New. (_gcry_camellia_gfni_avx512_ctr_enc): Clear %k1 after use. -- Benchmark on Intel Core i3-1115G4: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.194 ns/B 4920 MiB/s 0.794 c/B 4096?4 ECB dec | 0.194 ns/B 4916 MiB/s 0.793 c/B 4089 After (~1.7% faster) CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.190 ns/B 5008 MiB/s 0.780 c/B 4096?3 ECB dec | 0.191 ns/B 5002 MiB/s 0.781 c/B 4096?3 [v2]: Do same optimization for GFNI build of "cipher/camellia-aesni-avx2-amd64.h". Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx2-amd64.h | 43 ++++++++++++++++++++++++++++- cipher/camellia-gfni-avx512-amd64.S | 37 +++++++++++++------------ 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 003c4496..dff8b386 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -73,6 +73,14 @@ # define IF_VAES(...) #endif +#ifdef CAMELLIA_GFNI_BUILD +# define IF_GFNI(...) __VA_ARGS__ +# define IF_NOT_GFNI(...) +#else +# define IF_GFNI(...) +# define IF_NOT_GFNI(...) __VA_ARGS__ +#endif + /********************************************************************** GFNI helper macros and constants **********************************************************************/ @@ -459,6 +467,26 @@ * OUT: * v0..3: (IN <<< 1) */ +#ifdef CAMELLIA_GFNI_BUILD +#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, right_shift_by_7) \ + vgf2p8affineqb $0, right_shift_by_7, v0, t0; \ + vpaddb v0, v0, v0; \ + \ + vgf2p8affineqb $0, right_shift_by_7, v1, t1; \ + vpaddb v1, v1, v1; \ + \ + vgf2p8affineqb $0, right_shift_by_7, v2, t2; \ + vpaddb v2, v2, v2; \ + \ + vpor t0, v1, v1; \ + \ + vgf2p8affineqb $0, right_shift_by_7, v3, t0; \ + vpaddb v3, v3, v3; \ + \ + vpor t1, v2, v2; \ + vpor t2, v3, v3; \ + vpor t0, v0, v0; +#else #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ vpcmpgtb v0, zero, t0; \ vpaddb v0, v0, v0; \ @@ -481,6 +509,7 @@ vpor t1, v2, v2; \ vpor t2, v3, v3; \ vpor t0, v0, v0; +#endif /* * IN: @@ -496,7 +525,8 @@ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ - vpxor tt0, tt0, tt0; \ + IF_NOT_GFNI(vpxor tt0, tt0, tt0); \ + IF_GFNI(vpbroadcastq .Lright_shift_by_7 rRIP, tt0); \ vpbroadcastb 0+kll, t3; \ vpbroadcastb 1+kll, t2; \ vpbroadcastb 2+kll, t1; \ @@ -867,6 +897,17 @@ ELF(.type FUNC_NAME(_constants), at object;) BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +/* Bit-matrix for right shifting uint8_t values in vector by 7. 
*/ +.Lright_shift_by_7: + .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0)) + #else /* CAMELLIA_GFNI_BUILD */ /* diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index b676379f..643eed3e 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -105,7 +105,6 @@ clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31) #define clear_regs() \ - kxorq %k1, %k1, %k1; \ vzeroall; \ clear_zmm16_zmm31() @@ -307,22 +306,18 @@ * v0..3: (IN << 1) * t0, t1, t2, zero: (IN >> 7) */ -#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \ - vpcmpltb zero, v0, %k1; \ +#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, t3, right_shift_by_7) \ + vgf2p8affineqb $0, right_shift_by_7, v0, t0; \ vpaddb v0, v0, v0; \ - vpaddb one, zero, t0{%k1}{z}; \ \ - vpcmpltb zero, v1, %k1; \ + vgf2p8affineqb $0, right_shift_by_7, v1, t1; \ vpaddb v1, v1, v1; \ - vpaddb one, zero, t1{%k1}{z}; \ \ - vpcmpltb zero, v2, %k1; \ + vgf2p8affineqb $0, right_shift_by_7, v2, t2; \ vpaddb v2, v2, v2; \ - vpaddb one, zero, t2{%k1}{z}; \ \ - vpcmpltb zero, v3, %k1; \ - vpaddb v3, v3, v3; \ - vpaddb one, zero, zero{%k1}{z}; + vgf2p8affineqb $0, right_shift_by_7, v3, t3; \ + vpaddb v3, v3, v3; /* * IN: @@ -338,8 +333,7 @@ * t0 &= ll; \ * lr ^= rol32(t0, 1); \ */ \ - vpbroadcastq .Lbyte_ones rRIP, tmp; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ + vpbroadcastq .Lright_shift_by_7 rRIP, tmp; \ vpbroadcastb 0+kll, t3; \ vpbroadcastb 1+kll, t2; \ vpbroadcastb 2+kll, t1; \ @@ -360,7 +354,6 @@ vmovdqu64 l6, l##_6; \ vpternlogq $0x96, tt3, t3, l7; \ vmovdqu64 l7, l##_7; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t2 = krr; \ @@ -399,7 +392,6 @@ vpternlogq $0x96, tt1, t1, r##_5; \ vpternlogq $0x96, tt0, t2, r##_6; \ vpternlogq $0x96, tt3, t3, r##_7; \ - vpxor tt3##_y, tt3##_y, tt3##_y; \ \ /* \ * t0 = klr; \ @@ -596,9 +588,6 @@ ELF(.type _gcry_camellia_gfni_avx512__constants, at object;) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -.Lbyte_ones: - .quad 0x0101010101010101 - /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 * and s4. * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. @@ -663,6 +652,17 @@ ELF(.type _gcry_camellia_gfni_avx512__constants, at object;) BV8(0, 0, 0, 1, 1, 1, 0, 0), BV8(0, 0, 0, 0, 0, 0, 0, 1)) +/* Bit-matrix for right shifting uint8_t values in vector by 7. */ +.Lright_shift_by_7: + .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 0)) + /* CTR byte addition constants */ .align 64 .Lbige_addb_0_1: @@ -904,6 +904,7 @@ _gcry_camellia_gfni_avx512_ctr_enc: add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */ add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */ add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */ + kxorq %k1, %k1, %k1; .align 4 .Lload_ctr_done: -- 2.37.2
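
The two GFNI patches above replace a sign-compare/add sequence with a single vgf2p8affineqb against an 8x8 bit-matrix constant to obtain a per-byte logical right shift by 7. As a quick sanity check of that bit-matrix (not part of the patches themselves), below is a small scalar C model of the affine byte transform as documented for GF2P8AFFINEQB with imm8 = 0. It assumes the BV8/BM8X8 macros lay out matrix row 0 in the most significant byte of the qword, so .Lright_shift_by_7 is taken to expand to 0x8000000000000000; the program checks all 256 byte values against a plain ">> 7".

#include <stdio.h>
#include <stdint.h>

/* Parity (XOR-fold) of the 8 bits of x. */
static uint8_t parity8(uint8_t x)
{
  x ^= x >> 4;
  x ^= x >> 2;
  x ^= x >> 1;
  return x & 1;
}

/* Scalar model of one byte lane of GF2P8AFFINEQB: result bit i is the
 * parity of (matrix byte [7 - i] AND source byte), XORed with bit i of
 * imm8. */
static uint8_t gf2p8affine_byte(uint64_t matrix, uint8_t src, uint8_t imm8)
{
  uint8_t out = 0;
  int i;

  for (i = 0; i < 8; i++)
    {
      uint8_t row = (uint8_t)(matrix >> ((7 - i) * 8));
      out |= (uint8_t)((parity8(row & src) ^ ((imm8 >> i) & 1)) << i);
    }

  return out;
}

int main(void)
{
  /* Assumed expansion of .Lright_shift_by_7 (row 0 = 0x80, rows 1..7 = 0). */
  const uint64_t right_shift_by_7 = 0x8000000000000000ULL;
  unsigned int b;

  for (b = 0; b < 256; b++)
    {
      uint8_t got = gf2p8affine_byte(right_shift_by_7, (uint8_t)b, 0);
      uint8_t want = (uint8_t)(b >> 7);

      if (got != want)
        {
          printf("mismatch at %u: got %u, want %u\n", b, got, want);
          return 1;
        }
    }

  printf("bit-matrix implements uint8 '>> 7' for all byte values\n");
  return 0;
}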