[PATCH] sm4: add ARMv8 CE accelerated implementation for XTS mode

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Aug 1 16:25:34 CEST 2022


Hello,

Patch applied to master, thanks.

-Jussi

On 28.7.2022 11.26, Tianjia Zhang via Gcrypt-devel wrote:
> * cipher/sm4-armv8-aarch64-ce.S (_gcry_sm4_armv8_ce_xts_crypt): New.
> * cipher/sm4.c (_gcry_sm4_armv8_ce_xts_crypt): New.
> (_gcry_sm4_xts_crypt) [USE_ARM_CE]: Add ARMv8 CE implementation for XTS.
> --
> 
> Benchmark on T-Head Yitian-710 2.75 GHz:
> 
> Before:
>   SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>          XTS enc |     0.373 ns/B      2560 MiB/s      1.02 c/B      2749
>          XTS dec |     0.372 ns/B      2562 MiB/s      1.02 c/B      2750
> 
> After (1.18x faster):
>   SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
>          XTS enc |     0.314 ns/B      3038 MiB/s     0.863 c/B      2749
>          XTS dec |     0.314 ns/B      3037 MiB/s     0.863 c/B      2749
> 
> Signed-off-by: Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> ---
>   cipher/sm4-armv8-aarch64-ce.S | 151 ++++++++++++++++++++++++++++++++++
>   cipher/sm4.c                  |  18 +++-
>   2 files changed, 168 insertions(+), 1 deletion(-)
> 
> diff --git a/cipher/sm4-armv8-aarch64-ce.S b/cipher/sm4-armv8-aarch64-ce.S
> index 5fb55947edc1..1a4ff736ad27 100644
> --- a/cipher/sm4-armv8-aarch64-ce.S
> +++ b/cipher/sm4-armv8-aarch64-ce.S
> @@ -62,6 +62,7 @@
>   #define RTMP3   v19
>   
>   #define RIV     v20
> +#define RMASK   v21
>   
>   /* Helper macros. */
>   
> @@ -69,6 +70,20 @@
>           ld1 {v24.16b-v27.16b}, [ptr], #64; \
>           ld1 {v28.16b-v31.16b}, [ptr];
>   
> +#define SM4_CRYPT_BLK(b0)                       \
> +        rev32       b0.16b, b0.16b;             \
> +        sm4e(b0, v24);                          \
> +        sm4e(b0, v25);                          \
> +        sm4e(b0, v26);                          \
> +        sm4e(b0, v27);                          \
> +        sm4e(b0, v28);                          \
> +        sm4e(b0, v29);                          \
> +        sm4e(b0, v30);                          \
> +        sm4e(b0, v31);                          \
> +        rev64       b0.4s, b0.4s;               \
> +        ext         b0.16b, b0.16b, b0.16b, #8; \
> +        rev32       b0.16b, b0.16b;
> +
>   #define crypt_blk4(b0, b1, b2, b3)         \
>           rev32 b0.16b, b0.16b;              \
>           rev32 b1.16b, b1.16b;              \
> @@ -577,4 +592,140 @@ _gcry_sm4_armv8_ce_ctr_enc:
>       CFI_ENDPROC();
>   ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)
>   
> +.align 3
> +.global _gcry_sm4_armv8_ce_xts_crypt
> +ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
> +_gcry_sm4_armv8_ce_xts_crypt:
> +    /* input:
> +     *   x0: round key array, CTX
> +     *   x1: dst
> +     *   x2: src
> +     *   x3: tweak (big endian, 128 bit)
> +     *   x4: nblocks
> +     */
> +    CFI_STARTPROC()
> +    VPUSH_ABI
> +
> +    load_rkey(x0)
> +
> +    mov         x7, #0x87
> +    mov         x8, #0x1
> +    mov         RMASK.d[0], x7
> +    mov         RMASK.d[1], x8
> +
> +    ld1         {RIV.16b}, [x3]
> +    mov         v8.16b, RIV.16b
> +    ext         RIV.16b, RIV.16b, RIV.16b, #8
> +
> +.Lxts_loop_blk:
> +    sub         x4, x4, #8
> +    tbnz        x4, #63, .Lxts_tail8
> +
> +#define tweak_next(vt, vin, RTMP)                       \
> +        sshr        RTMP.2d, RIV.2d, #63;               \
> +        add         vt.2d, vin.2d, vin.2d;              \
> +        and         RTMP.16b, RTMP.16b, RMASK.16b;      \
> +        add         RIV.2d, RIV.2d, RIV.2d;             \
> +        eor         vt.16b, vt.16b, RTMP.16b;
> +
> +    tweak_next( v9,  v8, RTMP0)
> +    tweak_next(v10,  v9, RTMP1)
> +    tweak_next(v11, v10, RTMP2)
> +    tweak_next(v12, v11, RTMP3)
> +    tweak_next(v13, v12, RTMP0)
> +    tweak_next(v14, v13, RTMP1)
> +    tweak_next(v15, v14, RTMP2)
> +
> +    ld1         {v0.16b-v3.16b}, [x2], #64
> +    eor         v0.16b, v0.16b,  v8.16b
> +    eor         v1.16b, v1.16b,  v9.16b
> +    eor         v2.16b, v2.16b, v10.16b
> +    eor         v3.16b, v3.16b, v11.16b
> +    ld1         {v4.16b-v7.16b}, [x2], #64
> +    eor         v4.16b, v4.16b, v12.16b
> +    eor         v5.16b, v5.16b, v13.16b
> +    eor         v6.16b, v6.16b, v14.16b
> +    eor         v7.16b, v7.16b, v15.16b
> +
> +    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)
> +
> +    eor         v0.16b, v0.16b,  v8.16b
> +    eor         v1.16b, v1.16b,  v9.16b
> +    eor         v2.16b, v2.16b, v10.16b
> +    eor         v3.16b, v3.16b, v11.16b
> +    st1         {v0.16b-v3.16b}, [x1], #64
> +    eor         v4.16b, v4.16b, v12.16b
> +    eor         v5.16b, v5.16b, v13.16b
> +    eor         v6.16b, v6.16b, v14.16b
> +    eor         v7.16b, v7.16b, v15.16b
> +    st1         {v4.16b-v7.16b}, [x1], #64
> +
> +    tweak_next(v8, v15, RTMP3)
> +
> +    cbz         x4, .Lxts_end
> +    b           .Lxts_loop_blk
> +
> +.Lxts_tail8:
> +    add         x4, x4, #8
> +    cmp         x4, #4
> +    blt         .Lxts_tail4
> +
> +    sub         x4, x4, #4
> +
> +    tweak_next( v9,  v8, RTMP0)
> +    tweak_next(v10,  v9, RTMP1)
> +    tweak_next(v11, v10, RTMP2)
> +
> +    ld1         {v0.16b-v3.16b}, [x2], #64
> +    eor         v0.16b, v0.16b,  v8.16b
> +    eor         v1.16b, v1.16b,  v9.16b
> +    eor         v2.16b, v2.16b, v10.16b
> +    eor         v3.16b, v3.16b, v11.16b
> +
> +    crypt_blk4(v0, v1, v2, v3);
> +
> +    eor         v0.16b, v0.16b,  v8.16b
> +    eor         v1.16b, v1.16b,  v9.16b
> +    eor         v2.16b, v2.16b, v10.16b
> +    eor         v3.16b, v3.16b, v11.16b
> +    st1         {v0.16b-v3.16b}, [x1], #64
> +
> +    tweak_next(v8, v11, RTMP3)
> +
> +    cbz         x4, .Lxts_end
> +
> +.Lxts_tail4:
> +    sub         x4, x4, #1
> +
> +    ld1         {v0.16b}, [x2], #16
> +    eor         v0.16b, v0.16b, v8.16b
> +
> +    SM4_CRYPT_BLK(v0)
> +
> +    eor         v0.16b, v0.16b, v8.16b
> +    st1         {v0.16b}, [x1], #16
> +
> +    tweak_next(v8, v8, RTMP0)
> +
> +    cbnz        x4, .Lxts_tail4
> +
> +.Lxts_end:
> +    /* store new tweak */
> +    st1         {v8.16b}, [x3]
> +
> +    CLEAR_REG(v8)
> +    CLEAR_REG(v9)
> +    CLEAR_REG(v10)
> +    CLEAR_REG(v11)
> +    CLEAR_REG(v12)
> +    CLEAR_REG(v13)
> +    CLEAR_REG(v14)
> +    CLEAR_REG(v15)
> +    CLEAR_REG(RIV)
> +
> +    VPOP_ABI
> +    ret_spec_stop
> +    CFI_ENDPROC()
> +ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)
> +
>   #endif
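
For reference, the tweak_next macro above is the standard XTS tweak update: multiply the 128-bit tweak by x in GF(2^128), reducing with the constant 0x87. A minimal C sketch of the same computation, assuming the tweak is held as two little-endian 64-bit halves (illustrative code and names, not part of the patch):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } xts_tweak_t;

/* Double the tweak: T' = x * T in GF(2^128) with polynomial
 * x^128 + x^7 + x^2 + x + 1 (low-order constant 0x87). */
static xts_tweak_t xts_tweak_next (xts_tweak_t t)
{
  uint64_t carry_lo = t.lo >> 63;   /* carry from low half into high half */
  uint64_t carry_hi = t.hi >> 63;   /* carry out of the 128-bit value */
  xts_tweak_t r;

  r.lo = (t.lo << 1) ^ ((0 - carry_hi) & 0x87);  /* fold the reduction */
  r.hi = (t.hi << 1) ^ carry_lo;
  return r;
}

The vector version handles both carries with a single sshr: RIV keeps a halves-swapped copy of the tweak, so lane 0 of the shifted mask selects 0x87 (reduction into the low half) and lane 1 selects 1 (carry into the high half), matching the RMASK constant loaded at function entry.
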
> diff --git a/cipher/sm4.c b/cipher/sm4.c
> index b5d4691ddbcb..4cac3b6c64b0 100644
> --- a/cipher/sm4.c
> +++ b/cipher/sm4.c
> @@ -1,6 +1,6 @@
>   /* sm4.c  -  SM4 Cipher Algorithm
>    * Copyright (C) 2020 Alibaba Group.
> - * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
> + * Copyright (C) 2020-2022 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
>    * Copyright (C) 2020-2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
>    *
>    * This file is part of Libgcrypt.
> @@ -539,6 +539,11 @@ extern void _gcry_sm4_armv8_ce_cfb_dec(const u32 *rk_enc, byte *out,
>   				       byte *iv,
>   				       size_t nblocks);
>   
> +extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out,
> +					 const byte *in,
> +					 byte *tweak,
> +					 size_t nblocks);
> +
>   extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
>   					    const byte *in,
>   					    size_t num_blocks);
> @@ -1510,6 +1515,17 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
>     const unsigned char *inbuf = inbuf_arg;
>     int burn_stack_depth = 0;
>   
> +#ifdef USE_ARM_CE
> +  if (ctx->use_arm_ce)
> +    {
> +      /* Process all blocks at once. */
> +      _gcry_sm4_armv8_ce_xts_crypt(encrypt ? ctx->rkey_enc : ctx->rkey_dec,
> +                                   outbuf, inbuf, tweak, nblocks);
> +
> +      nblocks = 0;
> +    }
> +#endif
> +
>     /* Process remaining blocks. */
>     if (nblocks)
>       {
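
The accelerated routine is reached through the normal XTS dispatch above whenever ctx->use_arm_ce is set. A minimal usage sketch through the public libgcrypt API (placeholder key/tweak values, error handling omitted; XTS takes a double-length key, i.e. 32 bytes for SM4):

#include <gcrypt.h>

int main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[32] = { 0 };    /* K1 || K2, placeholder */
  unsigned char tweak[16] = { 0 };  /* data-unit tweak, placeholder */
  unsigned char buf[4096] = { 0 };  /* one data unit, encrypted in place */

  gcry_check_version (NULL);        /* initialize libgcrypt */

  gcry_cipher_open (&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_XTS, 0);
  gcry_cipher_setkey (hd, key, sizeof key);
  gcry_cipher_setiv (hd, tweak, sizeof tweak);
  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
  gcry_cipher_close (hd);
  return 0;
}

On hardware with the SM4 Crypto Extensions this single call covers all blocks in the accelerated routine, which is why nblocks is set to 0 before the generic per-block fallback.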



