[PATCH] Add SM3 ARMv8/AArch64/CE assembly implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Apr 4 21:56:40 CEST 2022


Hello,

Applied to master. Thanks.

-Jussi

On 1.4.2022 12.17, Tianjia Zhang via Gcrypt-devel wrote:
> * cipher/Makefile.am: Add 'sm3-armv8-aarch64-ce.S'.
> * cipher/sm3-armv8-aarch64-ce.S: New.
> * cipher/sm3.c (USE_ARM_CE): New.
> [USE_ARM_CE] (_gcry_sm3_transform_armv8_ce)
> (do_sm3_transform_armv8_ce): New.
> (sm3_init) [USE_ARM_CE]: New.
> * configure.ac: Add 'sm3-armv8-aarch64-ce.lo'.
> --
> 
> Benchmark on T-Head Yitian-710 2.75 GHz:
> 
> Before:
>                  |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>   SM3            |      2.84 ns/B     335.3 MiB/s      7.82 c/B      2749
> 
> After (~55% faster):
>                  |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>   SM3            |      1.84 ns/B     518.1 MiB/s      5.06 c/B      2749
> 
> Signed-off-by: Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> ---
>   cipher/Makefile.am            |   2 +-
>   cipher/sm3-armv8-aarch64-ce.S | 218 ++++++++++++++++++++++++++++++++++
>   cipher/sm3.c                  |  28 +++++
>   configure.ac                  |   1 +
>   4 files changed, 248 insertions(+), 1 deletion(-)
>   create mode 100644 cipher/sm3-armv8-aarch64-ce.S
> 
> diff --git a/cipher/Makefile.am b/cipher/Makefile.am
> index 1ac1923b7ce5..30be9f982883 100644
> --- a/cipher/Makefile.am
> +++ b/cipher/Makefile.am
> @@ -130,7 +130,7 @@ EXTRA_libcipher_la_SOURCES = \
>   	sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
>   	sha512-armv7-neon.S sha512-arm.S \
>   	sha512-ppc.c sha512-ssse3-i386.c \
> -	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S \
> +	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
>   	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
>   	stribog.c \
>   	tiger.c \
> diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S
> new file mode 100644
> index 000000000000..0900b84fe2bf
> --- /dev/null
> +++ b/cipher/sm3-armv8-aarch64-ce.S
> @@ -0,0 +1,218 @@
> +/* sm3-armv8-aarch64-ce.S  -  ARMv8/AArch64/CE accelerated SM3 hash
> + *
> + * Copyright (C) 2022 Alibaba Group.
> + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> + *
> + * This file is part of Libgcrypt.
> + *
> + * Libgcrypt is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU Lesser General Public License as
> + * published by the Free Software Foundation; either version 2.1 of
> + * the License, or (at your option) any later version.
> + *
> + * Libgcrypt is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "asm-common-aarch64.h"
> +
> +#if defined(__AARCH64EL__) && \
> +    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> +    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
> +    defined(USE_SM3)
> +
> +.cpu generic+simd+crypto
> +
> +/* Must be consistent with register macros */
> +#define vecnum_v0       0
> +#define vecnum_v1       1
> +#define vecnum_v2       2
> +#define vecnum_v3       3
> +#define vecnum_v4       4
> +#define vecnum_CTX1     16
> +#define vecnum_CTX2     17
> +#define vecnum_SS1      18
> +#define vecnum_WT       19
> +#define vecnum_K0       20
> +#define vecnum_K1       21
> +#define vecnum_K2       22
> +#define vecnum_K3       23
> +#define vecnum_RTMP0    24
> +#define vecnum_RTMP1    25
> +
> +#define sm3partw1(vd, vn, vm) \
> +    .inst (0xce60c000 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3partw2(vd, vn, vm) \
> +    .inst (0xce60c400 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3ss1(vd, vn, vm, va) \
> +    .inst (0xce400000 | (vecnum_##vm << 16) | (vecnum_##va << 10) \
> +            | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt1a(vd, vn, vm, imm2) \
> +    .inst (0xce408000 | (vecnum_##vm << 16) | imm2 << 12 \
> +            | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt1b(vd, vn, vm, imm2) \
> +    .inst (0xce408400 | (vecnum_##vm << 16) | imm2 << 12 \
> +            | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt2a(vd, vn, vm, imm2) \
> +    .inst (0xce408800 | (vecnum_##vm << 16) | imm2 << 12 \
> +            | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +#define sm3tt2b(vd, vn, vm, imm2) \
> +    .inst (0xce408c00 | (vecnum_##vm << 16) | imm2 << 12 \
> +            | (vecnum_##vn << 5) | vecnum_##vd)
> +
> +/* Constants */
> +
> +.text
> +.align 4
> +ELF(.type _gcry_sm3_armv8_ce_consts,@object)
> +_gcry_sm3_armv8_ce_consts:
> +.Lsm3_Ktable:
> +    .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
> +    .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
> +    .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
> +    .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
> +    .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
> +    .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
> +    .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
> +    .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
> +    .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
> +    .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
> +    .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
> +    .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
> +    .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
> +    .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
> +    .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
> +    .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
> +ELF(.size _gcry_sm3_armv8_ce_consts,.-_gcry_sm3_armv8_ce_consts)
> +
> +/* Register macros */
> +
> +/* Must be consistent with vecnum_ macros */
> +#define CTX1    v16
> +#define CTX2    v17
> +#define SS1     v18
> +#define WT      v19
> +
> +#define K0      v20
> +#define K1      v21
> +#define K2      v22
> +#define K3      v23
> +
> +#define RTMP0   v24
> +#define RTMP1   v25
> +
> +/* Helper macros. */
> +
> +#define _(...) /*_*/
> +
> +#define SCHED_W_1(s0, s1, s2, s3, s4) ext       s4.16b, s1.16b, s2.16b, #12
> +#define SCHED_W_2(s0, s1, s2, s3, s4) ext       RTMP0.16b, s0.16b, s1.16b, #12
> +#define SCHED_W_3(s0, s1, s2, s3, s4) ext       RTMP1.16b, s2.16b, s3.16b, #8
> +#define SCHED_W_4(s0, s1, s2, s3, s4) sm3partw1(s4, s0, s3)
> +#define SCHED_W_5(s0, s1, s2, s3, s4) sm3partw2(s4, RTMP1, RTMP0)
> +
> +#define SCHED_W(n, s0, s1, s2, s3, s4) SCHED_W_##n(s0, s1, s2, s3, s4)
> +
> +#define R(ab, s0, s1, s2, s3, s4, IOP)                  \
> +        ld4     {K0.s, K1.s, K2.s, K3.s}[3], [x3], #16; \
> +        eor     WT.16b, s0.16b, s1.16b;                 \
> +                                                        \
> +        sm3ss1(SS1, CTX1, CTX2, K0);                    \
> +      IOP(1, s0, s1, s2, s3, s4);                       \
> +        sm3tt1##ab(CTX1, SS1, WT, 0);                   \
> +        sm3tt2##ab(CTX2, SS1, s0, 0);                   \
> +                                                        \
> +      IOP(2, s0, s1, s2, s3, s4);                       \
> +        sm3ss1(SS1, CTX1, CTX2, K1);                    \
> +      IOP(3, s0, s1, s2, s3, s4);                       \
> +        sm3tt1##ab(CTX1, SS1, WT, 1);                   \
> +        sm3tt2##ab(CTX2, SS1, s0, 1);                   \
> +                                                        \
> +        sm3ss1(SS1, CTX1, CTX2, K2);                    \
> +      IOP(4, s0, s1, s2, s3, s4);                       \
> +        sm3tt1##ab(CTX1, SS1, WT, 2);                   \
> +        sm3tt2##ab(CTX2, SS1, s0, 2);                   \
> +                                                        \
> +        sm3ss1(SS1, CTX1, CTX2, K3);                    \
> +      IOP(5, s0, s1, s2, s3, s4);                       \
> +        sm3tt1##ab(CTX1, SS1, WT, 3);                   \
> +        sm3tt2##ab(CTX2, SS1, s0, 3);
> +
> +#define R1(s0, s1, s2, s3, s4, IOP)  R(a, s0, s1, s2, s3, s4, IOP)
> +#define R2(s0, s1, s2, s3, s4, IOP)  R(b, s0, s1, s2, s3, s4, IOP)
> +
> +.align 3
> +.global _gcry_sm3_transform_armv8_ce
> +ELF(.type _gcry_sm3_transform_armv8_ce,%function;)
> +_gcry_sm3_transform_armv8_ce:
> +    /* input:
> +     *   x0: CTX
> +     *   x1: data
> +     *   x2: nblocks
> +     */
> +    CFI_STARTPROC();
> +
> +    ld1         {CTX1.4s, CTX2.4s}, [x0];
> +    rev64       CTX1.4s, CTX1.4s;
> +    rev64       CTX2.4s, CTX2.4s;
> +    ext         CTX1.16b, CTX1.16b, CTX1.16b, #8;
> +    ext         CTX2.16b, CTX2.16b, CTX2.16b, #8;
> +
> +.Lloop:
> +    GET_DATA_POINTER(x3, .Lsm3_Ktable);
> +    ld1         {v0.16b-v3.16b}, [x1], #64;
> +    sub         x2, x2, #1;
> +
> +    mov         v6.16b, CTX1.16b;
> +    mov         v7.16b, CTX2.16b;
> +
> +    rev32       v0.16b, v0.16b;
> +    rev32       v1.16b, v1.16b;
> +    rev32       v2.16b, v2.16b;
> +    rev32       v3.16b, v3.16b;
> +
> +    R1(v0, v1, v2, v3, v4, SCHED_W);
> +    R1(v1, v2, v3, v4, v0, SCHED_W);
> +    R1(v2, v3, v4, v0, v1, SCHED_W);
> +    R1(v3, v4, v0, v1, v2, SCHED_W);
> +    R2(v4, v0, v1, v2, v3, SCHED_W);
> +    R2(v0, v1, v2, v3, v4, SCHED_W);
> +    R2(v1, v2, v3, v4, v0, SCHED_W);
> +    R2(v2, v3, v4, v0, v1, SCHED_W);
> +    R2(v3, v4, v0, v1, v2, SCHED_W);
> +    R2(v4, v0, v1, v2, v3, SCHED_W);
> +    R2(v0, v1, v2, v3, v4, SCHED_W);
> +    R2(v1, v2, v3, v4, v0, SCHED_W);
> +    R2(v2, v3, v4, v0, v1, SCHED_W);
> +    R2(v3, v4, v0, v1, v2, _);
> +    R2(v4, v0, v1, v2, v3, _);
> +    R2(v0, v1, v2, v3, v4, _);
> +
> +    eor         CTX1.16b, CTX1.16b, v6.16b;
> +    eor         CTX2.16b, CTX2.16b, v7.16b;
> +
> +    cbnz        x2, .Lloop;
> +
> +    /* save state */
> +    rev64       CTX1.4s, CTX1.4s;
> +    rev64       CTX2.4s, CTX2.4s;
> +    ext         CTX1.16b, CTX1.16b, CTX1.16b, #8;
> +    ext         CTX2.16b, CTX2.16b, CTX2.16b, #8;
> +    st1         {CTX1.4s, CTX2.4s}, [x0];
> +
> +    ret_spec_stop;
> +    CFI_ENDPROC();
> +ELF(.size _gcry_sm3_transform_armv8_ce, .-_gcry_sm3_transform_armv8_ce;)
> +
> +#endif
> diff --git a/cipher/sm3.c b/cipher/sm3.c
> index 0ab5f5067edb..bfe9f4c25225 100644
> --- a/cipher/sm3.c
> +++ b/cipher/sm3.c
> @@ -67,6 +67,16 @@
>   # endif
>   #endif
>   
> +/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension code. */
> +#undef USE_ARM_CE
> +#ifdef ENABLE_ARM_CRYPTO_SUPPORT
> +# if defined(__AARCH64EL__) && \
> +     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
> +     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
> +#   define USE_ARM_CE 1
> +# endif
> +#endif
> +
>   
>   typedef struct {
>     gcry_md_block_ctx_t bctx;
> @@ -117,6 +127,20 @@ do_sm3_transform_aarch64(void *context, const unsigned char *data, size_t nblks)
>   }
>   #endif /* USE_AARCH64_SIMD */
>   
> +#ifdef USE_ARM_CE
> +void _gcry_sm3_transform_armv8_ce(void *state, const void *input_data,
> +                                    size_t num_blks);
> +
> +static unsigned int
> +do_sm3_transform_armv8_ce(void *context, const unsigned char *data,
> +                            size_t nblks)
> +{
> +  SM3_CONTEXT *hd = context;
> +  _gcry_sm3_transform_armv8_ce (hd->h, data, nblks);
> +  return 0;
> +}
> +#endif /* USE_ARM_CE */
> +
>   
>   static unsigned int
>   transform (void *c, const unsigned char *data, size_t nblks);
> @@ -153,6 +177,10 @@ sm3_init (void *context, unsigned int flags)
>     if (features & HWF_ARM_NEON)
>       hd->bctx.bwrite = do_sm3_transform_aarch64;
>   #endif
> +#ifdef USE_ARM_CE
> +  if (features & HWF_ARM_SM3)
> +    hd->bctx.bwrite = do_sm3_transform_armv8_ce;
> +#endif
>   
>     (void)features;
>   }
> diff --git a/configure.ac b/configure.ac
> index e214082b2603..fc49bb86fc2b 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -3049,6 +3049,7 @@ if test "$found" = "1" ; then
>        aarch64-*-*)
>           # Build with the assembly implementation
>           GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo"
> +        GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo"
>        ;;
>      esac
>   fi




More information about the Gcrypt-devel mailing list