[PATCH] sha512: add AArch64 crypto/SHA512 extension implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu Jul 21 10:25:46 CEST 2022


* cipher/Makefile.am: Add 'sha512-armv8-aarch64-ce.S'.
* cipher/sha512-armv8-aarch64-ce.S: New.
* cipher/sha512.c (ATTR_ALIGNED_64, USE_ARM64_SHA512): New.
(k): Make array aligned to 64 bytes.
[USE_ARM64_SHA512] (_gcry_sha512_transform_armv8_ce): New.
[USE_ARM64_SHA512] (do_sha512_transform_armv8_ce): New.
(sha512_init_common) [USE_ARM64_SHA512]: Use ARMv8-SHA512 accelerated
implementation if the HW feature is available.
* configure.ac: Add 'sha512-armv8-aarch64-ce.lo'.
(gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4)
(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4): New.
--

Benchmark on AWS Graviton3:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA512         |      2.36 ns/B     404.2 MiB/s      6.13 c/B      2600

After (2.4x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA512         |     0.977 ns/B     976.6 MiB/s      2.54 c/B      2600
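
(The numbers above come from libgcrypt's bench-slope utility; as a rough
sketch, assuming a default in-tree build, a similar measurement can be
re-run with

  $ tests/bench-slope hash sha512

where the cycles/byte column is derived from the detected or user-supplied
CPU frequency.)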

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am               |   2 +-
 cipher/sha512-armv8-aarch64-ce.S | 383 +++++++++++++++++++++++++++++++
 cipher/sha512.c                  |  40 +++-
 configure.ac                     |  54 +++++
 4 files changed, 477 insertions(+), 2 deletions(-)
 create mode 100644 cipher/sha512-armv8-aarch64-ce.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 97823cb4..e27bb0bc 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -131,7 +131,7 @@ EXTRA_libcipher_la_SOURCES = \
 	sha256-intel-shaext.c sha256-ppc.c \
 	sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
 	sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
-	sha512-armv7-neon.S sha512-arm.S \
+	sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
 	sha512-ppc.c sha512-ssse3-i386.c \
 	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
 	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
diff --git a/cipher/sha512-armv8-aarch64-ce.S b/cipher/sha512-armv8-aarch64-ce.S
new file mode 100644
index 00000000..73fe7ced
--- /dev/null
+++ b/cipher/sha512-armv8-aarch64-ce.S
@@ -0,0 +1,383 @@
+/* sha512-armv8-aarch64-ce.S - ARM/CE accelerated SHA-512 transform function
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4) && \
+    defined(USE_SHA512)
+
+.arch armv8.2-a+sha3+sm4
+
+.text
+
+
+/* Register macros */
+
+#define Qv0 q0
+#define Qv1 q1
+#define Qv2 q2
+#define Qv3 q3
+#define Qv4 q4
+
+#define vT0 v5
+#define vT1 v6
+#define QvT1 q6
+#define vT2 v7
+#define vT3 v16
+
+#define vH01 v17
+#define vH23 v18
+#define vH45 v19
+#define vH67 v20
+
+#define vW0 v21
+#define vW1 v22
+#define vW2 v23
+#define vW3 v24
+#define vW4 v25
+#define vW5 v26
+#define vW6 v27
+#define vW7 v28
+
+#define vK0 v29
+#define vK1 v30
+#define vK2 v31
+
+
+/* Round macros */
+
+#define _(...) /*_*/
+
+#define do_add(a, b) add a.2d, a.2d, b.2d;
+
+#define load_k_3() ld1 {vK0.2d-vK2.2d}, [x3], #48;
+#define load_k_last() ld1 {vK0.2d}, [x3];
+
+#define load_msg1(...) \
+        ld1 {vW0.16b-vW3.16b}, [x1], #64;
+
+#define load_msg2(...) \
+        rev64 vW0.16b, vW0.16b;
+
+#define load_msg3(...) \
+        rev64 vW1.16b, vW1.16b;
+
+#define load_msg4(...) \
+        ld1 {vW4.16b-vW7.16b}, [x1], #64;
+
+#define load_msg5(...) \
+        rev64 vW2.16b, vW2.16b;
+
+#define load_msg6(...) \
+        rev64 vW3.16b, vW3.16b;
+
+#define load_msg7(...) \
+        rev64 vW4.16b, vW4.16b;
+
+#define load_msg8(...) \
+        rev64 vW5.16b, vW5.16b;
+
+#define load_msg9(...) \
+        rev64 vW6.16b, vW6.16b;
+
+#define load_msg10(...) \
+        rev64 vW7.16b, vW7.16b;
+
+#define schedule1(w0, w1, w2, w3, w4, w5, w6, w7) \
+        sha512su0 w0.2d, w1.2d; \
+
+#define schedule2(w0, w1, w2, w3, w4, w5, w6, w7) \
+        ext vT2.16b, w4.16b, w5.16b, #8; \
+        sha512su1 w0.2d, w7.2d, vT2.2d;
+
+#define do_round2(ab, cd, ef, gh, cd_out, \
+                  load_nextk_op, k, \
+                  sched_op1, sched_op2, w0, w1, w2, w3, w4, w5, w6, w7) \
+        add vT3.2d, k.2d, w0.2d; \
+            load_nextk_op(); \
+        ext vT1.16b, ef.16b, gh.16b, #8; \
+        ext vT3.16b, vT3.16b, vT3.16b, #8; \
+        ext vT0.16b, cd.16b, ef.16b, #8; \
+        add gh.2d, gh.2d, vT3.2d; \
+            sched_op1(w0, w1, w2, w3, w4, w5, w6, w7); \
+        sha512h Q##gh, Q##vT1, vT0.2d; \
+            sched_op2(w0, w1, w2, w3, w4, w5, w6, w7); \
+        add cd_out.2d, gh.2d, cd.2d; \
+        sha512h2 Q##gh, Q##cd, ab.2d; \
+
+
+/* Other functional macros */
+
+#undef CLEAR_REG
+#define CLEAR_REG(reg, ...) movi reg.16b, #0;
+
+
+/*
+ * unsigned int
+ * _gcry_sha512_transform_armv8_ce (u64 state[8], const void *input_data,
+ *                                  size_t num_blks, const u64 k[80])
+ */
+.align 3
+.globl _gcry_sha512_transform_armv8_ce
+ELF(.type  _gcry_sha512_transform_armv8_ce,%function;)
+_gcry_sha512_transform_armv8_ce:
+  /* input:
+   *	x0: ctx, CTX
+   *	x1: data (128*nblks bytes)
+   *	x2: nblks
+   *	x3: k table
+   */
+  CFI_STARTPROC()
+
+  cbz x2, .Ldo_nothing
+
+  mov x4, x3
+
+  ld1 {vH01.2d-vH67.2d}, [x0]  /* load state */
+
+  load_msg1()
+  mov v0.16b, vH01.16b
+  mov v1.16b, vH23.16b
+  load_k_3()
+  load_msg2()
+  load_msg3()
+  load_msg4()
+  mov v2.16b, vH45.16b
+  mov v3.16b, vH67.16b
+  load_msg5()
+  load_msg6()
+  load_msg7()
+  load_msg8()
+  load_msg9()
+  load_msg10()
+
+.Loop:
+  sub x2, x2, #1
+
+  # rounds 1-16
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK0,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK1,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  # rounds 17-32
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_3, vK2,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK0,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK1,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v3, v0, v4, v2, v1,
+            load_k_3, vK2,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK0,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK1,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v1, v4, v3, v0, v2,
+            load_k_3, vK2,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK0,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  # rounds 33-48
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK1,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_3, vK2,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  # rounds 49-64
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK0,
+            schedule1, schedule2, vW0, vW1, vW2, vW3, vW4, vW5, vW6, vW7)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK1,
+            schedule1, schedule2, vW1, vW2, vW3, vW4, vW5, vW6, vW7, vW0)
+  do_round2(v3, v0, v4, v2, v1,
+            load_k_3, vK2,
+            schedule1, schedule2, vW2, vW3, vW4, vW5, vW6, vW7, vW0, vW1)
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK0,
+            schedule1, schedule2, vW3, vW4, vW5, vW6, vW7, vW0, vW1, vW2)
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK1,
+            schedule1, schedule2, vW4, vW5, vW6, vW7, vW0, vW1, vW2, vW3)
+  do_round2(v1, v4, v3, v0, v2,
+            load_k_3, vK2,
+            schedule1, schedule2, vW5, vW6, vW7, vW0, vW1, vW2, vW3, vW4)
+  do_round2(v0, v1, v2, v3, v4,
+            _,        vK0,
+            schedule1, schedule2, vW6, vW7, vW0, vW1, vW2, vW3, vW4, vW5)
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK1,
+            schedule1, schedule2, vW7, vW0, vW1, vW2, vW3, vW4, vW5, vW6)
+
+  cbz x2, .Lend
+
+  # rounds 65-80
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            _, _, vW0, , , , , , , )
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            _, _, vW1, , , , , , , )
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            _, _, vW2, , , , , , , )
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            _, _, vW3, , , , , , , )
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            load_msg1, _, vW4, , , , , , , )
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            load_msg2, _, vW5, , , , , , , )
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_last, vK2,
+            load_msg3, _, vW6, , , , , , , )
+  mov x3, x4
+  do_round2(v1, v4, v3, v0, v2,
+            load_k_3,    vK0,
+            load_msg4, load_msg5, vW7, , , , , , , )
+
+  load_msg6()
+  load_msg7()
+
+  add vH01.2d, vH01.2d, v0.2d
+  add vH23.2d, vH23.2d, v1.2d
+  add vH45.2d, vH45.2d, v2.2d
+  add vH67.2d, vH67.2d, v3.2d
+  load_msg8()
+  load_msg9()
+  load_msg10()
+  mov v0.16b, vH01.16b
+  mov v1.16b, vH23.16b
+  mov v2.16b, vH45.16b
+  mov v3.16b, vH67.16b
+
+  b .Loop
+
+.Lend:
+
+  # rounds 65-80
+  do_round2(v2, v3, v1, v4, v0,
+            load_k_3, vK2,
+            CLEAR_REG, _, vW0, , , , , , , )
+  do_round2(v4, v2, v0, v1, v3,
+            _,        vK0,
+            CLEAR_REG, _, vW1, , , , , , , )
+  do_round2(v1, v4, v3, v0, v2,
+            _,        vK1,
+            CLEAR_REG, _, vW2, , , , , , , )
+  do_round2(v0, v1, v2, v3, v4,
+            load_k_3, vK2,
+            CLEAR_REG, _, vW3, , , , , , , )
+  do_round2(v3, v0, v4, v2, v1,
+            _,        vK0,
+            CLEAR_REG, _, vW4, , , , , , , )
+  do_round2(v2, v3, v1, v4, v0,
+            _,        vK1,
+            CLEAR_REG, _, vW5, , , , , , , )
+  CLEAR_REG(vK1)
+  do_round2(v4, v2, v0, v1, v3,
+            load_k_last, vK2,
+            CLEAR_REG, _, vW6, , , , , , , )
+  CLEAR_REG(vK2)
+  do_round2(v1, v4, v3, v0, v2,
+            _,           vK0,
+            CLEAR_REG, _, vW7, , , , , , , )
+  CLEAR_REG(vK0)
+
+  CLEAR_REG(v4)
+  add vH01.2d, vH01.2d, v0.2d
+  CLEAR_REG(v0)
+  add vH23.2d, vH23.2d, v1.2d
+  CLEAR_REG(v1)
+  add vH45.2d, vH45.2d, v2.2d
+  CLEAR_REG(v2)
+  add vH67.2d, vH67.2d, v3.2d
+  CLEAR_REG(v3)
+  CLEAR_REG(vT0)
+  CLEAR_REG(vT1)
+  CLEAR_REG(vT2)
+  CLEAR_REG(vT3)
+
+  st1 {vH01.2d-vH67.2d}, [x0] /* store state */
+
+  CLEAR_REG(vH01)
+  CLEAR_REG(vH23)
+  CLEAR_REG(vH45)
+  CLEAR_REG(vH67)
+
+.Ldo_nothing:
+  mov x0, #0
+  ret_spec_stop
+  CFI_ENDPROC()
+ELF(.size _gcry_sha512_transform_armv8_ce,.-_gcry_sha512_transform_armv8_ce;)
+
+#endif
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 42eaf1fe..9ac412b3 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -55,6 +55,14 @@
 #include "hash-common.h"
 
 
+/* Helper macro to force alignment to 64 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+
 /* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
 #undef USE_ARM_NEON_ASM
 #ifdef ENABLE_NEON_SUPPORT
@@ -72,6 +80,17 @@
 # define USE_ARM_ASM 1
 #endif
 
+/* USE_ARM64_SHA512 indicates whether to enable ARMv8 SHA512 extension assembly
+ * code. */
+#undef USE_ARM64_SHA512
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4)
+#  define USE_ARM64_SHA512 1
+# endif
+#endif
+
 
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
@@ -158,7 +177,7 @@ typedef struct
 } SHA512_CONTEXT;
 
 
-static const u64 k[] =
+static ATTR_ALIGNED_64 const u64 k[] =
   {
     U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
     U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
@@ -219,6 +238,21 @@ static const u64 k[] =
 #endif
 
 
+#ifdef USE_ARM64_SHA512
+unsigned int _gcry_sha512_transform_armv8_ce (u64 state[8],
+                                              const unsigned char *data,
+                                              size_t num_blks,
+                                              const u64 k[]);
+
+static unsigned int
+do_sha512_transform_armv8_ce(void *ctx, const unsigned char *data,
+                             size_t nblks)
+{
+  SHA512_CONTEXT *hd = ctx;
+  return _gcry_sha512_transform_armv8_ce (hd->state.h, data, nblks, k);
+}
+#endif
+
 #ifdef USE_ARM_NEON_ASM
 unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
                                                 const unsigned char *data,
@@ -415,6 +449,10 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
   if ((features & HWF_ARM_NEON) != 0)
     ctx->bctx.bwrite = do_sha512_transform_armv7_neon;
 #endif
+#ifdef USE_ARM64_SHA512
+  if ((features & HWF_ARM_NEON) && (features & HWF_ARM_SHA512))
+    ctx->bctx.bwrite = do_sha512_transform_armv8_ce;
+#endif
 #ifdef USE_SSSE3
   if ((features & HWF_INTEL_SSSE3) != 0)
     ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3;
diff --git a/configure.ac b/configure.ac
index b55510d8..ddba42c0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2054,6 +2054,56 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_sve2" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions],
+       [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4],
+       [if test "$mpi_cpu_arch" != "aarch64" ||
+           test "$try_asm_modules" != "yes" ; then
+          gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4="n/a"
+        else
+          gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=no
+          AC_LINK_IFELSE([AC_LANG_PROGRAM(
+          [[__asm__(
+                ".arch armv8.2-a+sha3+sm4\n\t"
+                ".text\n\t"
+                "testfn:\n\t"
+
+                /* Test for SHA512 instructions */
+                "sha512h q0, q0, v0.2d;\n\t"
+                "sha512h2 q0, q0, v0.2d;\n\t"
+                "sha512su0 v0.2d, v0.2d;\n\t"
+                "sha512su1 v0.2d, v0.2d, v31.2d;\n\t"
+
+                /* Test for SHA3 instructions */
+                "bcax v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+                "eor3 v0.16b, v1.16b, v2.16b, v3.16b;\n\t"
+                "rax1 v0.2d, v1.2d, v2.2d;\n\t"
+                "xar v0.2d, v1.2d, v2.2d, \#1;\n\t"
+
+                /* Test for SM3 instructions */
+                "sm3partw1 v0.4s, v1.4s, v2.4s;\n\t"
+                "sm3partw2 v0.4s, v1.4s, v2.4s;\n\t"
+                "sm3ss1 v0.4s, v1.4s, v2.4s, v3.4s;\n\t"
+                "sm3tt1a v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt1b v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt2a v0.4s, v1.4s, v2.s[0];\n\t"
+                "sm3tt2b v0.4s, v1.4s, v2.s[0];\n\t"
+
+                /* Test for SM4 instructions */
+                "sm4e v0.4s, v1.4s;\n\t"
+                "sm4ekey v0.4s, v1.4s, v2.4s;\n\t"
+                );
+            ]], [ testfn(); ])],
+          [gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_aarch64_sha3_sha512_sm3_sm4" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_AARCH64_SHA3_SHA512_SM3_SM4,1,
+     [Defined if inline assembler supports AArch64 SHA3/SHA512/SM3/SM4 instructions])
+fi
+
+
 #
 # Check whether PowerPC AltiVec/VSX intrinsics
 #
@@ -3123,6 +3173,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-arm.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-armv8-aarch64-ce.lo"
+      ;;
       powerpc64le-*-*)
          # Build with the crypto extension implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sha512-ppc.lo"
-- 
2.34.1