[PATCH] Add SM3 ARMv8/AArch64/CE assembly implementation
Tianjia Zhang
tianjia.zhang at linux.alibaba.com
Fri Apr 1 11:17:36 CEST 2022
* cipher/Makefile.am: Add 'sm3-armv8-aarch64-ce.S'.
* cipher/sm3-armv8-aarch64-ce.S: New.
* cipher/sm3.c (USE_ARM_CE): New.
[USE_ARM_CE] (_gcry_sm3_transform_armv8_ce)
(do_sm3_transform_armv8_ce): New.
(sm3_init) [USE_ARM_CE]: New.
* configure.ac: Add 'sm3-armv8-aarch64-ce.lo'.
--
Benchmark on T-Head Yitian-710 2.75 GHz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SM3 | 2.84 ns/B 335.3 MiB/s 7.82 c/B 2749
After (~55% faster):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SM3 | 1.84 ns/B 518.1 MiB/s 5.06 c/B 2749
Signed-off-by: Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
---
cipher/Makefile.am | 2 +-
cipher/sm3-armv8-aarch64-ce.S | 218 ++++++++++++++++++++++++++++++++++
cipher/sm3.c | 28 +++++
configure.ac | 1 +
4 files changed, 248 insertions(+), 1 deletion(-)
create mode 100644 cipher/sm3-armv8-aarch64-ce.S
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 1ac1923b7ce5..30be9f982883 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -130,7 +130,7 @@ EXTRA_libcipher_la_SOURCES = \
sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
sha512-armv7-neon.S sha512-arm.S \
sha512-ppc.c sha512-ssse3-i386.c \
- sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S \
+ sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
stribog.c \
tiger.c \
diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S
new file mode 100644
index 000000000000..0900b84fe2bf
--- /dev/null
+++ b/cipher/sm3-armv8-aarch64-ce.S
@@ -0,0 +1,218 @@
+/* sm3-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM3 hash function
+ *
+ * Copyright (C) 2022 Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+ defined(USE_SM3)
+
+.cpu generic+simd+crypto
+
+/* Register numbers pasted (vecnum_##reg) into the .inst encodings below. Must be consistent with register macros */
+#define vecnum_v0 0 /* v0-v4 are passed as s0..s4 to the round and schedule macros */
+#define vecnum_v1 1
+#define vecnum_v2 2
+#define vecnum_v3 3
+#define vecnum_v4 4
+#define vecnum_CTX1 16 /* from here on: N must match the vN chosen by the register macro of the same name */
+#define vecnum_CTX2 17
+#define vecnum_SS1 18
+#define vecnum_WT 19
+#define vecnum_K0 20
+#define vecnum_K1 21
+#define vecnum_K2 22
+#define vecnum_K3 23
+#define vecnum_RTMP0 24
+#define vecnum_RTMP1 25
+
+#define sm3partw1(vd, vn, vm) \
+ .inst (0xce60c000 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd) /* raw word: vm at bit 16, vn at bit 5, vd at bit 0; presumably hand-encoded so assemblers without SM3 mnemonics still build - confirm */
+
+#define sm3partw2(vd, vn, vm) \
+ .inst (0xce60c400 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd) /* same field layout as sm3partw1, different base opcode */
+
+#define sm3ss1(vd, vn, vm, va) \
+ .inst (0xce400000 | (vecnum_##vm << 16) | (vecnum_##va << 10) \
+ | (vecnum_##vn << 5) | vecnum_##vd) /* four-register form: the extra operand va is placed at bit 10 */
+
+#define sm3tt1a(vd, vn, vm, imm2) \
+ .inst (0xce408000 | (vecnum_##vm << 16) | imm2 << 12 \
+ | (vecnum_##vn << 5) | vecnum_##vd) /* imm2 goes to bits 12-13; NOTE(review): imm2 is not parenthesized, callers in this file pass the literals 0..3 */
+
+#define sm3tt1b(vd, vn, vm, imm2) \
+ .inst (0xce408400 | (vecnum_##vm << 16) | imm2 << 12 \
+ | (vecnum_##vn << 5) | vecnum_##vd) /* "b" variant of sm3tt1a; selected by R2() */
+
+#define sm3tt2a(vd, vn, vm, imm2) \
+ .inst (0xce408800 | (vecnum_##vm << 16) | imm2 << 12 \
+ | (vecnum_##vn << 5) | vecnum_##vd) /* "a" variant; selected by R1() */
+
+#define sm3tt2b(vd, vn, vm, imm2) \
+ .inst (0xce408c00 | (vecnum_##vm << 16) | imm2 << 12 \
+ | (vecnum_##vn << 5) | vecnum_##vd) /* "b" variant of sm3tt2a; selected by R2() */
+
+/* Constants: 64 SM3 round constants; R() reads them through x3, four words (16 bytes) per invocation */
+
+.text
+.align 4
+ELF(.type _gcry_sm3_armv8_ce_consts,@object)
+_gcry_sm3_armv8_ce_consts:
+.Lsm3_Ktable:
+ .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb /* rounds 0-15: 0x79cc4519 rotated left by the round number */
+ .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
+ .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
+ .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
+ .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c /* rounds 16-63: 0x7a879d8a rotated left by (round mod 32) */
+ .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
+ .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
+ .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+ .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53 /* round 32: rotation count wraps to 0 */
+ .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
+ .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
+ .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
+ .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c /* rounds 48-63 repeat the values of rounds 16-31 */
+ .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
+ .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
+ .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+ELF(.size _gcry_sm3_armv8_ce_consts,.-_gcry_sm3_armv8_ce_consts)
+
+/* Register macros: symbolic names for the vector registers used by the round code */
+
+/* Must be consistent with vecnum_ macros (the .inst encoders need the same numbers) */
+#define CTX1 v16 /* first 16 bytes of the hash state, word order reversed on load */
+#define CTX2 v17 /* second 16 bytes of the hash state, word order reversed on load */
+#define SS1 v18 /* destination of sm3ss1, then an input to sm3tt1 and sm3tt2 */
+#define WT v19 /* s0 ^ s1, the word operand of sm3tt1 */
+
+#define K0 v20 /* K0..K3: lane 3 of each receives one round constant per R() */
+#define K1 v21
+#define K2 v22
+#define K3 v23
+
+#define RTMP0 v24 /* scratch for SCHED_W_2 / SCHED_W_5 */
+#define RTMP1 v25 /* scratch for SCHED_W_3 / SCHED_W_5 */
+
+/* Helper macros. */
+
+#define _(...) /*_*/ /* no-op IOP: expands to nothing; used for the last three R2() calls, which do no scheduling */
+
+#define SCHED_W_1(s0, s1, s2, s3, s4) ext s4.16b, s1.16b, s2.16b, #12 /* s4 = { s1[3], s2[0], s2[1], s2[2] } as 32-bit words */
+#define SCHED_W_2(s0, s1, s2, s3, s4) ext RTMP0.16b, s0.16b, s1.16b, #12 /* RTMP0 = { s0[3], s1[0], s1[1], s1[2] } */
+#define SCHED_W_3(s0, s1, s2, s3, s4) ext RTMP1.16b, s2.16b, s3.16b, #8 /* RTMP1 = { s2[2], s2[3], s3[0], s3[1] } */
+#define SCHED_W_4(s0, s1, s2, s3, s4) sm3partw1(s4, s0, s3) /* first half of the schedule update of s4 */
+#define SCHED_W_5(s0, s1, s2, s3, s4) sm3partw2(s4, RTMP1, RTMP0) /* second half; s4 is then used as s3 by the next R() call */
+
+#define SCHED_W(n, s0, s1, s2, s3, s4) SCHED_W_##n(s0, s1, s2, s3, s4) /* dispatch on step number n = 1..5 so R() can interleave the steps */
+
+#define R(ab, s0, s1, s2, s3, s4, IOP) /* four rounds; ab selects the a/b instruction variants, IOP(n, ...) is spliced in between */ \
+ ld4 {K0.s, K1.s, K2.s, K3.s}[3], [x3], #16; /* next four table words into lane 3 of K0..K3, x3 += 16 */ \
+ eor WT.16b, s0.16b, s1.16b; /* WT = s0 ^ s1 */ \
+ \
+ sm3ss1(SS1, CTX1, CTX2, K0); /* first of four rounds: constant K0, word index 0 */ \
+ IOP(1, s0, s1, s2, s3, s4); \
+ sm3tt1##ab(CTX1, SS1, WT, 0); /* updates CTX1 from SS1 and word 0 of WT */ \
+ sm3tt2##ab(CTX2, SS1, s0, 0); /* updates CTX2 from SS1 and word 0 of s0 */ \
+ \
+ IOP(2, s0, s1, s2, s3, s4); \
+ sm3ss1(SS1, CTX1, CTX2, K1); /* second round: constant K1, word index 1 */ \
+ IOP(3, s0, s1, s2, s3, s4); \
+ sm3tt1##ab(CTX1, SS1, WT, 1); \
+ sm3tt2##ab(CTX2, SS1, s0, 1); \
+ \
+ sm3ss1(SS1, CTX1, CTX2, K2); /* third round: constant K2, word index 2 */ \
+ IOP(4, s0, s1, s2, s3, s4); \
+ sm3tt1##ab(CTX1, SS1, WT, 2); \
+ sm3tt2##ab(CTX2, SS1, s0, 2); \
+ \
+ sm3ss1(SS1, CTX1, CTX2, K3); /* fourth round: constant K3, word index 3 */ \
+ IOP(5, s0, s1, s2, s3, s4); \
+ sm3tt1##ab(CTX1, SS1, WT, 3); \
+ sm3tt2##ab(CTX2, SS1, s0, 3);
+
+#define R1(s0, s1, s2, s3, s4, IOP) R(a, s0, s1, s2, s3, s4, IOP) /* sm3tt1a / sm3tt2a; used for the first 4 invocations (16 rounds) */
+#define R2(s0, s1, s2, s3, s4, IOP) R(b, s0, s1, s2, s3, s4, IOP) /* sm3tt1b / sm3tt2b; used for the remaining 12 invocations (48 rounds) */
+
+.align 3
+.global _gcry_sm3_transform_armv8_ce
+ELF(.type _gcry_sm3_transform_armv8_ce,%function;)
+_gcry_sm3_transform_armv8_ce:
+ /* input:
+ * x0: CTX (32 bytes of hash state; loaded here and stored back before returning)
+ * x1: data (read in 64-byte blocks, pointer advanced per block)
+ * x2: nblocks (decremented before it is tested, so the caller must pass at least 1)
+ */
+ CFI_STARTPROC();
+
+ ld1 {CTX1.4s, CTX2.4s}, [x0]; /* load the eight 32-bit state words */
+ rev64 CTX1.4s, CTX1.4s; /* swap the two words inside each 64-bit half ... */
+ rev64 CTX2.4s, CTX2.4s;
+ ext CTX1.16b, CTX1.16b, CTX1.16b, #8; /* ... then swap the halves: word order is now fully reversed */
+ ext CTX2.16b, CTX2.16b, CTX2.16b, #8;
+
+.Lloop:
+ GET_DATA_POINTER(x3, .Lsm3_Ktable); /* x3 -> start of constant table; reset per block because R() post-increments x3 */
+ ld1 {v0.16b-v3.16b}, [x1], #64; /* load one 64-byte block and advance the data pointer */
+ sub x2, x2, #1; /* one block fewer remaining */
+
+ mov v6.16b, CTX1.16b; /* keep a copy of the incoming state for the XOR after the rounds */
+ mov v7.16b, CTX2.16b;
+
+ rev32 v0.16b, v0.16b; /* byte-swap every 32-bit message word (code is built only for __AARCH64EL__) */
+ rev32 v1.16b, v1.16b;
+ rev32 v2.16b, v2.16b;
+ rev32 v3.16b, v3.16b;
+
+ R1(v0, v1, v2, v3, v4, SCHED_W); /* 16 invocations x 4 rounds = 64 rounds; registers rotate so each s4 feeds later calls */
+ R1(v1, v2, v3, v4, v0, SCHED_W);
+ R1(v2, v3, v4, v0, v1, SCHED_W);
+ R1(v3, v4, v0, v1, v2, SCHED_W);
+ R2(v4, v0, v1, v2, v3, SCHED_W); /* from here on the "b" instruction variants are used */
+ R2(v0, v1, v2, v3, v4, SCHED_W);
+ R2(v1, v2, v3, v4, v0, SCHED_W);
+ R2(v2, v3, v4, v0, v1, SCHED_W);
+ R2(v3, v4, v0, v1, v2, SCHED_W);
+ R2(v4, v0, v1, v2, v3, SCHED_W);
+ R2(v0, v1, v2, v3, v4, SCHED_W);
+ R2(v1, v2, v3, v4, v0, SCHED_W);
+ R2(v2, v3, v4, v0, v1, SCHED_W);
+ R2(v3, v4, v0, v1, v2, _); /* last three invocations: no further message scheduling is emitted */
+ R2(v4, v0, v1, v2, v3, _);
+ R2(v0, v1, v2, v3, v4, _);
+
+ eor CTX1.16b, CTX1.16b, v6.16b; /* XOR the round output with the state saved at the top of the loop */
+ eor CTX2.16b, CTX2.16b, v7.16b;
+
+ cbnz x2, .Lloop; /* process the next block while any remain */
+
+ /* save state: undo the word reversal applied on load, then write back to CTX */
+ rev64 CTX1.4s, CTX1.4s;
+ rev64 CTX2.4s, CTX2.4s;
+ ext CTX1.16b, CTX1.16b, CTX1.16b, #8;
+ ext CTX2.16b, CTX2.16b, CTX2.16b, #8;
+ st1 {CTX1.4s, CTX2.4s}, [x0];
+
+ ret_spec_stop; /* return macro, presumably from asm-common-aarch64.h; NOTE(review): vector registers are not cleared before returning */
+ CFI_ENDPROC();
+ELF(.size _gcry_sm3_transform_armv8_ce, .-_gcry_sm3_transform_armv8_ce;)
+
+#endif
diff --git a/cipher/sm3.c b/cipher/sm3.c
index 0ab5f5067edb..bfe9f4c25225 100644
--- a/cipher/sm3.c
+++ b/cipher/sm3.c
@@ -67,6 +67,16 @@
# endif
#endif
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension code (little-endian AArch64 only). */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) && \
+ defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+# define USE_ARM_CE 1
+# endif /* little-endian AArch64 with a usable assembler and crypto support */
+#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
+
typedef struct {
gcry_md_block_ctx_t bctx;
@@ -117,6 +127,20 @@ do_sm3_transform_aarch64(void *context, const unsigned char *data, size_t nblks)
}
#endif /* USE_AARCH64_SIMD */
+#ifdef USE_ARM_CE
+void _gcry_sm3_transform_armv8_ce(void *state, const void *input_data,
+ size_t num_blks); /* assembly routine; updates STATE in place */
+
+static unsigned int
+do_sm3_transform_armv8_ce(void *context, const unsigned char *data,
+ size_t nblks) /* block-write hook: sm3_init stores it in hd->bctx.bwrite */
+{
+ SM3_CONTEXT *hd = context;
+ _gcry_sm3_transform_armv8_ce (hd->h, data, nblks); /* hash NBLKS blocks from DATA into hd->h */
+ return 0; /* always 0; presumably the stack-burn depth expected by the caller - confirm */
+}
+#endif /* USE_ARM_CE */
+
static unsigned int
transform (void *c, const unsigned char *data, size_t nblks);
@@ -153,6 +177,10 @@ sm3_init (void *context, unsigned int flags)
if (features & HWF_ARM_NEON)
hd->bctx.bwrite = do_sm3_transform_aarch64;
#endif
+#ifdef USE_ARM_CE
+ if (features & HWF_ARM_SM3)
+ hd->bctx.bwrite = do_sm3_transform_armv8_ce;
+#endif
(void)features;
}
diff --git a/configure.ac b/configure.ac
index e214082b2603..fc49bb86fc2b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3049,6 +3049,7 @@ if test "$found" = "1" ; then
aarch64-*-*)
# Build with the assembly implementation
GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo"
+ GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo"
;;
esac
fi
--
2.24.3 (Apple Git-128)
More information about the Gcrypt-devel
mailing list