[PATCH] Add SM3 ARMv8/AArch64/CE assembly implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Apr 4 21:56:40 CEST 2022
Hello,
Applied to master. Thanks.
-Jussi
On 1.4.2022 12.17, Tianjia Zhang via Gcrypt-devel wrote:
> * cipher/Makefile.am: Add 'sm3-armv8-aarch64-ce.S'.
> * cipher/sm3-armv8-aarch64-ce.S: New.
> * cipher/sm3.c (USE_ARM_CE): New.
> [USE_ARM_CE] (_gcry_sm3_transform_armv8_ce)
> (do_sm3_transform_armv8_ce): New.
> (sm3_init) [USE_ARM_CE]: New.
> * configure.ac: Add 'sm3-armv8-aarch64-ce.lo'.
> --
>
> Benchmark on T-Head Yitian-710 2.75 GHz:
>
> Before:
>                 |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>  SM3            |      2.84 ns/B     335.3 MiB/s      7.82 c/B      2749
>
> After (~55% faster):
>                 |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>  SM3            |      1.84 ns/B     518.1 MiB/s      5.06 c/B      2749
>
> Signed-off-by: Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> ---
> cipher/Makefile.am | 2 +-
> cipher/sm3-armv8-aarch64-ce.S | 218 ++++++++++++++++++++++++++++++++++
> cipher/sm3.c | 28 +++++
> configure.ac | 1 +
> 4 files changed, 248 insertions(+), 1 deletion(-)
> create mode 100644 cipher/sm3-armv8-aarch64-ce.S
>
> diff --git a/cipher/Makefile.am b/cipher/Makefile.am
> index 1ac1923b7ce5..30be9f982883 100644
> --- a/cipher/Makefile.am
> +++ b/cipher/Makefile.am
> @@ -130,7 +130,7 @@ EXTRA_libcipher_la_SOURCES = \
> sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
> sha512-armv7-neon.S sha512-arm.S \
> sha512-ppc.c sha512-ssse3-i386.c \
> - sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S \
> + sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
> keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
> stribog.c \
> tiger.c \
> diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S
> new file mode 100644
> index 000000000000..0900b84fe2bf
> --- /dev/null
> +++ b/cipher/sm3-armv8-aarch64-ce.S
> @@ -0,0 +1,218 @@
> +/* sm3-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM3 cipher
> + *
> + * Copyright (C) 2022 Alibaba Group.
> + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> + *
> + * This file is part of Libgcrypt.
> + *
> + * Libgcrypt is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU Lesser General Public License as
> + * published by the Free Software Foundation; either version 2.1 of
> + * the License, or (at your option) any later version.
> + *
> + * Libgcrypt is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "asm-common-aarch64.h"
> +
> +#if defined(__AARCH64EL__) && \
> + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
> + defined(USE_SM3)
> +
> +.cpu generic+simd+crypto
> +
> +/* Must be consistent with register macros */
> +#define vecnum_v0 0
> +#define vecnum_v1 1
> +#define vecnum_v2 2
> +#define vecnum_v3 3
> +#define vecnum_v4 4
> +#define vecnum_CTX1 16
> +#define vecnum_CTX2 17
> +#define vecnum_SS1 18
> +#define vecnum_WT 19
> +#define vecnum_K0 20
> +#define vecnum_K1 21
> +#define vecnum_K2 22
> +#define vecnum_K3 23
> +#define vecnum_RTMP0 24
> +#define vecnum_RTMP1 25
> +
> +#define sm3partw1(vd, vn, vm) \
> + .inst (0xce60c000 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3partw2(vd, vn, vm) \
> + .inst (0xce60c400 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3ss1(vd, vn, vm, va) \
> + .inst (0xce400000 | (vecnum_##vm << 16) | (vecnum_##va << 10) \
> + | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt1a(vd, vn, vm, imm2) \
> + .inst (0xce408000 | (vecnum_##vm << 16) | imm2 << 12 \
> + | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt1b(vd, vn, vm, imm2) \
> + .inst (0xce408400 | (vecnum_##vm << 16) | imm2 << 12 \
> + | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt2a(vd, vn, vm, imm2) \
> + .inst (0xce408800 | (vecnum_##vm << 16) | imm2 << 12 \
> + | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt2b(vd, vn, vm, imm2) \
> + .inst (0xce408c00 | (vecnum_##vm << 16) | imm2 << 12 \
> + | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +/* Constants */
> +
> +.text
> +.align 4
> +ELF(.type _gcry_sm3_armv8_ce_consts,@object)
> +_gcry_sm3_armv8_ce_consts:
> +.Lsm3_Ktable:
> + .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
> + .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
> + .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
> + .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
> + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
> + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
> + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
> + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
> + .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
> + .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
> + .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
> + .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
> + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
> + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
> + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
> + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
> +ELF(.size _gcry_sm3_armv8_ce_consts,.-_gcry_sm3_armv8_ce_consts)
> +
> +/* Register macros */
> +
> +/* Must be consistent with vecnum_ macros */
> +#define CTX1 v16
> +#define CTX2 v17
> +#define SS1 v18
> +#define WT v19
> +
> +#define K0 v20
> +#define K1 v21
> +#define K2 v22
> +#define K3 v23
> +
> +#define RTMP0 v24
> +#define RTMP1 v25
> +
> +/* Helper macros. */
> +
> +#define _(...) /*_*/
> +
> +#define SCHED_W_1(s0, s1, s2, s3, s4) ext s4.16b, s1.16b, s2.16b, #12
> +#define SCHED_W_2(s0, s1, s2, s3, s4) ext RTMP0.16b, s0.16b, s1.16b, #12
> +#define SCHED_W_3(s0, s1, s2, s3, s4) ext RTMP1.16b, s2.16b, s3.16b, #8
> +#define SCHED_W_4(s0, s1, s2, s3, s4) sm3partw1(s4, s0, s3)
> +#define SCHED_W_5(s0, s1, s2, s3, s4) sm3partw2(s4, RTMP1, RTMP0)
> +
> +#define SCHED_W(n, s0, s1, s2, s3, s4) SCHED_W_##n(s0, s1, s2, s3, s4)
> +
> +#define R(ab, s0, s1, s2, s3, s4, IOP) \
> + ld4 {K0.s, K1.s, K2.s, K3.s}[3], [x3], #16; \
> + eor WT.16b, s0.16b, s1.16b; \
> + \
> + sm3ss1(SS1, CTX1, CTX2, K0); \
> + IOP(1, s0, s1, s2, s3, s4); \
> + sm3tt1##ab(CTX1, SS1, WT, 0); \
> + sm3tt2##ab(CTX2, SS1, s0, 0); \
> + \
> + IOP(2, s0, s1, s2, s3, s4); \
> + sm3ss1(SS1, CTX1, CTX2, K1); \
> + IOP(3, s0, s1, s2, s3, s4); \
> + sm3tt1##ab(CTX1, SS1, WT, 1); \
> + sm3tt2##ab(CTX2, SS1, s0, 1); \
> + \
> + sm3ss1(SS1, CTX1, CTX2, K2); \
> + IOP(4, s0, s1, s2, s3, s4); \
> + sm3tt1##ab(CTX1, SS1, WT, 2); \
> + sm3tt2##ab(CTX2, SS1, s0, 2); \
> + \
> + sm3ss1(SS1, CTX1, CTX2, K3); \
> + IOP(5, s0, s1, s2, s3, s4); \
> + sm3tt1##ab(CTX1, SS1, WT, 3); \
> + sm3tt2##ab(CTX2, SS1, s0, 3);
> +
> +#define R1(s0, s1, s2, s3, s4, IOP) R(a, s0, s1, s2, s3, s4, IOP)
> +#define R2(s0, s1, s2, s3, s4, IOP) R(b, s0, s1, s2, s3, s4, IOP)
> +
> +.align 3
> +.global _gcry_sm3_transform_armv8_ce
> +ELF(.type _gcry_sm3_transform_armv8_ce,%function;)
> +_gcry_sm3_transform_armv8_ce:
> + /* input:
> + * x0: CTX
> + * x1: data
> + * x2: nblocks
> + */
> + CFI_STARTPROC();
> +
> + ld1 {CTX1.4s, CTX2.4s}, [x0];
> + rev64 CTX1.4s, CTX1.4s;
> + rev64 CTX2.4s, CTX2.4s;
> + ext CTX1.16b, CTX1.16b, CTX1.16b, #8;
> + ext CTX2.16b, CTX2.16b, CTX2.16b, #8;
> +
> +.Lloop:
> + GET_DATA_POINTER(x3, .Lsm3_Ktable);
> + ld1 {v0.16b-v3.16b}, [x1], #64;
> + sub x2, x2, #1;
> +
> + mov v6.16b, CTX1.16b;
> + mov v7.16b, CTX2.16b;
> +
> + rev32 v0.16b, v0.16b;
> + rev32 v1.16b, v1.16b;
> + rev32 v2.16b, v2.16b;
> + rev32 v3.16b, v3.16b;
> +
> + R1(v0, v1, v2, v3, v4, SCHED_W);
> + R1(v1, v2, v3, v4, v0, SCHED_W);
> + R1(v2, v3, v4, v0, v1, SCHED_W);
> + R1(v3, v4, v0, v1, v2, SCHED_W);
> + R2(v4, v0, v1, v2, v3, SCHED_W);
> + R2(v0, v1, v2, v3, v4, SCHED_W);
> + R2(v1, v2, v3, v4, v0, SCHED_W);
> + R2(v2, v3, v4, v0, v1, SCHED_W);
> + R2(v3, v4, v0, v1, v2, SCHED_W);
> + R2(v4, v0, v1, v2, v3, SCHED_W);
> + R2(v0, v1, v2, v3, v4, SCHED_W);
> + R2(v1, v2, v3, v4, v0, SCHED_W);
> + R2(v2, v3, v4, v0, v1, SCHED_W);
> + R2(v3, v4, v0, v1, v2, _);
> + R2(v4, v0, v1, v2, v3, _);
> + R2(v0, v1, v2, v3, v4, _);
> +
> + eor CTX1.16b, CTX1.16b, v6.16b;
> + eor CTX2.16b, CTX2.16b, v7.16b;
> +
> + cbnz x2, .Lloop;
> +
> + /* save state */
> + rev64 CTX1.4s, CTX1.4s;
> + rev64 CTX2.4s, CTX2.4s;
> + ext CTX1.16b, CTX1.16b, CTX1.16b, #8;
> + ext CTX2.16b, CTX2.16b, CTX2.16b, #8;
> + st1 {CTX1.4s, CTX2.4s}, [x0];
> +
> + ret_spec_stop;
> + CFI_ENDPROC();
> +ELF(.size _gcry_sm3_transform_armv8_ce, .-_gcry_sm3_transform_armv8_ce;)
> +
> +#endif
> diff --git a/cipher/sm3.c b/cipher/sm3.c
> index 0ab5f5067edb..bfe9f4c25225 100644
> --- a/cipher/sm3.c
> +++ b/cipher/sm3.c
> @@ -67,6 +67,16 @@
> # endif
> #endif
>
> +/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension code. */
> +#undef USE_ARM_CE
> +#ifdef ENABLE_ARM_CRYPTO_SUPPORT
> +# if defined(__AARCH64EL__) && \
> + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
> +# define USE_ARM_CE 1
> +# endif
> +#endif
> +
>
> typedef struct {
> gcry_md_block_ctx_t bctx;
> @@ -117,6 +127,20 @@ do_sm3_transform_aarch64(void *context, const unsigned char *data, size_t nblks)
> }
> #endif /* USE_AARCH64_SIMD */
>
> +#ifdef USE_ARM_CE
> +void _gcry_sm3_transform_armv8_ce(void *state, const void *input_data,
> + size_t num_blks);
> +
> +static unsigned int
> +do_sm3_transform_armv8_ce(void *context, const unsigned char *data,
> + size_t nblks)
> +{
> + SM3_CONTEXT *hd = context;
> + _gcry_sm3_transform_armv8_ce (hd->h, data, nblks);
> + return 0;
> +}
> +#endif /* USE_ARM_CE */
> +
>
> static unsigned int
> transform (void *c, const unsigned char *data, size_t nblks);
> @@ -153,6 +177,10 @@ sm3_init (void *context, unsigned int flags)
> if (features & HWF_ARM_NEON)
> hd->bctx.bwrite = do_sm3_transform_aarch64;
> #endif
> +#ifdef USE_ARM_CE
> + if (features & HWF_ARM_SM3)
> + hd->bctx.bwrite = do_sm3_transform_armv8_ce;
> +#endif
>
> (void)features;
> }
> diff --git a/configure.ac b/configure.ac
> index e214082b2603..fc49bb86fc2b 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -3049,6 +3049,7 @@ if test "$found" = "1" ; then
> aarch64-*-*)
> # Build with the assembly implementation
> GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo"
> + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo"
> ;;
> esac
> fi
More information about the Gcrypt-devel
mailing list