[PATCH 1/2] Add Intel SHA Extensions accelerated SHA1 implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sat Feb 17 11:03:55 CET 2018
* cipher/Makefile.am: Add 'sha1-intel-shaext.c'.
* cipher/sha1-intel-shaext.c: New.
* cipher/sha1.c (USE_SHAEXT, _gcry_sha1_transform_intel_shaext): New.
(sha1_init) [USE_SHAEXT]: Use shaext implementation if supported.
(transform) [USE_SHAEXT]: Use shaext if enabled.
(transform): Only add ASM_EXTRA_STACK if returned burn length is not
zero.
* cipher/sha1.h (SHA1_CONTEXT): Add 'use_shaext'.
* configure.ac: Add 'sha1-intel-shaext.lo'.
(shaextsupport, gcry_cv_gcc_inline_asm_shaext): New.
* src/g10lib.h: Add HWF_INTEL_SHAEXT and reorder HWF flags.
* src/hwf-x86.c (detect_x86_gnuc): Detect SHA Extensions.
* src/hwfeatures.c (hwflist): Add 'intel-shaext'.
--
Benchmark on Intel Celeron J3455 (1500 MHz, no turbo):
Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHA1           |      4.50 ns/B     211.7 MiB/s      6.76 c/B

After (4.0x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHA1           |      1.11 ns/B     858.1 MiB/s      1.67 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
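Note for reviewers (not part of the commit message): the new file keeps
the SHA-1 state in the register layout that 'sha1rnds4'/'sha1nexte'
expect, i.e. ABCD word-reversed in one XMM register and E in the top
dword of another, and undoes that layout again in the final "Save
state" block.  The small SSE2-only sketch below just demonstrates that
load/store layout with intrinsics; it is illustrative only and not
taken from the patch (the patch itself sticks to inline asm so that the
XMM registers can be wiped in shaext_cleanup without the compiler
spilling sensitive values in between, as the comment above the macros
explains).

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <emmintrin.h>          /* SSE2 intrinsics */

  int
  main (void)
  {
    uint32_t h[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
                      0x10325476, 0xc3d2e1f0 };
    uint32_t out[5];

    /* Load: ABCD word-reversed (pshufd $0x1b), E shifted into the
     * high dword of its register (pslldq $12). */
    __m128i abcd = _mm_loadu_si128 ((const __m128i *) h);
    __m128i e    = _mm_cvtsi32_si128 ((int) h[4]);
    abcd = _mm_shuffle_epi32 (abcd, 0x1b);
    e    = _mm_slli_si128 (e, 12);

    /* ... the 80 rounds would run here (see the ROUND macro) ... */

    /* Store: undo the layout, as the "Save state" asm block does. */
    abcd = _mm_shuffle_epi32 (abcd, 0x1b);
    e    = _mm_srli_si128 (e, 12);
    _mm_storeu_si128 ((__m128i *) out, abcd);
    out[4] = (uint32_t) _mm_cvtsi128_si32 (e);

    printf ("%s\n", memcmp (h, out, sizeof h) == 0
                    ? "round-trip ok" : "mismatch");
    return 0;
  }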
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 625a0ef69..110a48b2c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -92,6 +92,7 @@ seed.c \
serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S serpent-armv7-neon.S \
sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
+ sha1-intel-shaext.c \
sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
diff --git a/cipher/sha1-intel-shaext.c b/cipher/sha1-intel-shaext.c
new file mode 100644
index 000000000..5a2349e1e
--- /dev/null
+++ b/cipher/sha1-intel-shaext.c
@@ -0,0 +1,281 @@
+/* sha1-intel-shaext.c - SHAEXT accelerated SHA-1 transform function
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+ defined(HAVE_GCC_INLINE_ASM_SSE41) && defined(USE_SHA1) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+
+/* Two macros to be called before and after the use of SHA-EXT
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare() \
+ do { asm volatile ("movdqu %%xmm6, (%0)\n" \
+ "movdqu %%xmm7, (%1)\n" \
+ : \
+ : "r" (&win64tmp[0]), "r" (&win64tmp[16]) \
+ : "memory"); \
+ } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("movdqu (%0), %%xmm6\n" \
+ "movdqu (%1), %%xmm7\n" \
+ "pxor %%xmm0, %%xmm0\n" \
+ "pxor %%xmm1, %%xmm1\n" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "movdqa %%xmm0, (%2)\n\t" \
+ "movdqa %%xmm0, (%3)\n\t" \
+ : \
+ : "r" (&win64tmp[0]), "r" (&win64tmp[16]), \
+ "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1) \
+ do { asm volatile ("pxor %%xmm0, %%xmm0\n" \
+ "pxor %%xmm1, %%xmm1\n" \
+ "pxor %%xmm2, %%xmm2\n" \
+ "pxor %%xmm3, %%xmm3\n" \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n" \
+ "pxor %%xmm6, %%xmm6\n" \
+ "pxor %%xmm7, %%xmm7\n" \
+ "movdqa %%xmm0, (%0)\n\t" \
+ "movdqa %%xmm0, (%1)\n\t" \
+ : \
+ : "r" (tmp0), "r" (tmp1) \
+ : "memory"); \
+ } while (0)
+#endif
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ */
+unsigned int
+_gcry_sha1_transform_intel_shaext(void *state, const unsigned char *data,
+ size_t nblks)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ char save_buf[2 * 16 + 15];
+ char *abcd_save;
+ char *e_save;
+ shaext_prepare_variable;
+
+ if (nblks == 0)
+ return 0;
+
+ shaext_prepare ();
+
+ asm volatile ("" : "=r" (abcd_save) : "0" (save_buf) : "memory");
+ abcd_save = abcd_save + (-(uintptr_t)abcd_save & 15);
+ e_save = abcd_save + 16;
+
+ /* byteswap mask => XMM7 */
+ asm volatile ("movdqa %[mask], %%xmm7\n\t" /* Preload mask */
+ :
+ : [mask] "m" (*be_mask)
+ : "memory");
+
+ /* Load state.. ABCD => XMM4, E => XMM5 */
+ asm volatile ("movd 16(%[state]), %%xmm5\n\t"
+ "movdqu (%[state]), %%xmm4\n\t"
+ "pslldq $12, %%xmm5\n\t"
+ "pshufd $0x1b, %%xmm4, %%xmm4\n\t"
+ "movdqa %%xmm5, (%[e_save])\n\t"
+ "movdqa %%xmm4, (%[abcd_save])\n\t"
+ :
+ : [state] "r" (state), [abcd_save] "r" (abcd_save),
+ [e_save] "r" (e_save)
+ : "memory" );
+
+ /* DATA => XMM[0..3] */
+ asm volatile ("movdqu 0(%[data]), %%xmm0\n\t"
+ "movdqu 16(%[data]), %%xmm1\n\t"
+ "movdqu 32(%[data]), %%xmm2\n\t"
+ "movdqu 48(%[data]), %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+ data += 64;
+
+ while (1)
+ {
+ /* Round 0..3 */
+ asm volatile ("paddd %%xmm0, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t" /* ABCD => E1 */
+ "sha1rnds4 $0, %%xmm5, %%xmm4\n\t"
+ ::: "memory" );
+
+ /* Round 4..7 */
+ asm volatile ("sha1nexte %%xmm1, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1rnds4 $0, %%xmm6, %%xmm4\n\t"
+ "sha1msg1 %%xmm1, %%xmm0\n\t"
+ ::: "memory" );
+
+ /* Round 8..11 */
+ asm volatile ("sha1nexte %%xmm2, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t"
+ "sha1rnds4 $0, %%xmm5, %%xmm4\n\t"
+ "sha1msg1 %%xmm2, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ ::: "memory" );
+
+#define ROUND(imm, E0, E1, MSG0, MSG1, MSG2, MSG3) \
+ asm volatile ("sha1nexte %%"MSG0", %%"E0"\n\t" \
+ "movdqa %%xmm4, %%"E1"\n\t" \
+ "sha1msg2 %%"MSG0", %%"MSG1"\n\t" \
+ "sha1rnds4 $"imm", %%"E0", %%xmm4\n\t" \
+ "sha1msg1 %%"MSG0", %%"MSG3"\n\t" \
+ "pxor %%"MSG0", %%"MSG2"\n\t" \
+ ::: "memory" )
+
+ /* Rounds 12..15 to 64..67 */
+ ROUND("0", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("0", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+ ROUND("1", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+ ROUND("1", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+ ROUND("1", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("1", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+ ROUND("1", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+ ROUND("2", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+ ROUND("2", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("2", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+ ROUND("2", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+ ROUND("2", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+ ROUND("3", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+ ROUND("3", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+
+ if (--nblks == 0)
+ break;
+
+ /* Round 68..71 */
+ asm volatile ("movdqu 0(%[data]), %%xmm0\n\t"
+ "sha1nexte %%xmm1, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1msg2 %%xmm1, %%xmm2\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Round 72..75 */
+ asm volatile ("movdqu 16(%[data]), %%xmm1\n\t"
+ "sha1nexte %%xmm2, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t"
+ "sha1msg2 %%xmm2, %%xmm3\n\t"
+ "sha1rnds4 $3, %%xmm5, %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Round 76..79 */
+ asm volatile ("movdqu 32(%[data]), %%xmm2\n\t"
+ "sha1nexte %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ :
+ : [data] "r" (data)
+ : "memory" );
+
+ /* Merge states, store current. */
+ asm volatile ("movdqu 48(%[data]), %%xmm3\n\t"
+ "sha1nexte (%[e_save]), %%xmm5\n\t"
+ "paddd (%[abcd_save]), %%xmm4\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ "movdqa %%xmm5, (%[e_save])\n\t"
+ "movdqa %%xmm4, (%[abcd_save])\n\t"
+ :
+ : [abcd_save] "r" (abcd_save), [e_save] "r" (e_save),
+ [data] "r" (data)
+ : "memory" );
+
+ data += 64;
+ }
+
+ /* Round 68..71 */
+ asm volatile ("sha1nexte %%xmm1, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1msg2 %%xmm1, %%xmm2\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm3\n\t"
+ ::: "memory" );
+
+ /* Round 72..75 */
+ asm volatile ("sha1nexte %%xmm2, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm6\n\t"
+ "sha1msg2 %%xmm2, %%xmm3\n\t"
+ "sha1rnds4 $3, %%xmm5, %%xmm4\n\t"
+ ::: "memory" );
+
+ /* Round 76..79 */
+ asm volatile ("sha1nexte %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+ ::: "memory" );
+
+ /* Merge states. */
+ asm volatile ("sha1nexte (%[e_save]), %%xmm5\n\t"
+ "paddd (%[abcd_save]), %%xmm4\n\t"
+ :
+ : [abcd_save] "r" (abcd_save), [e_save] "r" (e_save)
+ : "memory" );
+
+ /* Save state */
+ asm volatile ("pshufd $0x1b, %%xmm4, %%xmm4\n\t"
+ "psrldq $12, %%xmm5\n\t"
+ "movdqu %%xmm4, (%[state])\n\t"
+ "movd %%xmm5, 16(%[state])\n\t"
+ :
+ : [state] "r" (state)
+ : "memory" );
+
+ shaext_cleanup (abcd_save, e_save);
+ return 0;
+}
+
+#endif /* HAVE_GCC_INLINE_ASM_SHAEXT */
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 78b172f24..09868aa3f 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -68,6 +68,14 @@
# define USE_BMI2 1
#endif
+/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
+#undef USE_SHAEXT
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+ defined(HAVE_GCC_INLINE_ASM_SSE41) && \
+ defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHAEXT 1
+#endif
+
/* USE_NEON indicates whether to enable ARM NEON assembly code. */
#undef USE_NEON
#ifdef ENABLE_NEON_SUPPORT
@@ -138,6 +146,10 @@ sha1_init (void *context, unsigned int flags)
#ifdef USE_BMI2
hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2);
#endif
+#ifdef USE_SHAEXT
+ hd->use_shaext = (features & HWF_INTEL_SHAEXT)
+ && (features & HWF_INTEL_SSE4_1);
+#endif
#ifdef USE_NEON
hd->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
@@ -311,7 +323,8 @@ transform_blk (void *ctx, const unsigned char *data)
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
#undef ASM_EXTRA_STACK
-#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2)
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \
+ defined(USE_SHAEXT)
# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define ASM_FUNC_ABI __attribute__((sysv_abi))
# define ASM_EXTRA_STACK (10 * 16)
@@ -340,6 +353,13 @@ _gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data,
size_t nblks) ASM_FUNC_ABI;
#endif
+#ifdef USE_SHAEXT
+/* Does not need ASM_FUNC_ABI */
+unsigned int
+_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data,
+ size_t nblks);
+#endif
+
static unsigned int
transform (void *ctx, const unsigned char *data, size_t nblks)
@@ -347,29 +367,53 @@ transform (void *ctx, const unsigned char *data, size_t nblks)
SHA1_CONTEXT *hd = ctx;
unsigned int burn;
+#ifdef USE_SHAEXT
+ if (hd->use_shaext)
+ {
+ burn = _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks);
+ burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+ return burn;
+ }
+#endif
#ifdef USE_BMI2
if (hd->use_bmi2)
- return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks)
- + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+ {
+ burn = _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks);
+ burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+ return burn;
+ }
#endif
#ifdef USE_AVX
if (hd->use_avx)
- return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks)
- + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+ {
+ burn = _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks);
+ burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+ return burn;
+ }
#endif
#ifdef USE_SSSE3
if (hd->use_ssse3)
- return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks)
- + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+ {
+ burn = _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks);
+ burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+ return burn;
+ }
#endif
#ifdef USE_ARM_CE
if (hd->use_arm_ce)
- return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks);
+ {
+ burn = _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks);
+ burn += burn ? 4 * sizeof(void*) : 0;
+ return burn;
+ }
#endif
#ifdef USE_NEON
if (hd->use_neon)
- return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks)
- + 4 * sizeof(void*);
+ {
+ burn = _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks);
+ burn += burn ? 4 * sizeof(void*) : 0;
+ return burn;
+ }
#endif
do
diff --git a/cipher/sha1.h b/cipher/sha1.h
index d448fcac8..93ce79b5c 100644
--- a/cipher/sha1.h
+++ b/cipher/sha1.h
@@ -29,6 +29,7 @@ typedef struct
unsigned int use_ssse3:1;
unsigned int use_avx:1;
unsigned int use_bmi2:1;
+ unsigned int use_shaext:1;
unsigned int use_neon:1;
unsigned int use_arm_ce:1;
} SHA1_CONTEXT;
diff --git a/configure.ac b/configure.ac
index 305b19f7e..4ae7667b3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -588,6 +588,14 @@ AC_ARG_ENABLE(aesni-support,
aesnisupport=$enableval,aesnisupport=yes)
AC_MSG_RESULT($aesnisupport)
+# Implementation of the --disable-shaext-support switch.
+AC_MSG_CHECKING([whether SHAEXT support is requested])
+AC_ARG_ENABLE(shaext-support,
+ AC_HELP_STRING([--disable-shaext-support],
+ [Disable support for the Intel SHAEXT instructions]),
+ shaextsupport=$enableval,shaextsupport=yes)
+AC_MSG_RESULT($shaextsupport)
+
# Implementation of the --disable-pclmul-support switch.
AC_MSG_CHECKING([whether PCLMUL support is requested])
AC_ARG_ENABLE(pclmul-support,
@@ -1175,6 +1183,7 @@ AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes)
# Reset non applicable feature flags.
if test "$mpi_cpu_arch" != "x86" ; then
aesnisupport="n/a"
+ shaextsupport="n/a"
pclmulsupport="n/a"
sse41support="n/a"
avxsupport="n/a"
@@ -1329,6 +1338,34 @@ if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then
[Defined if inline assembler supports PCLMUL instructions])
fi
+
+#
+# Check whether GCC inline assembler supports SHA Extensions instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions],
+ [gcry_cv_gcc_inline_asm_shaext],
+ [if test "$mpi_cpu_arch" != "x86" ; then
+ gcry_cv_gcc_inline_asm_shaext="n/a"
+ else
+ gcry_cv_gcc_inline_asm_shaext=no
+ AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+ [[void a(void) {
+ __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc");
+ __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc");
+ __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc");
+ __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc");
+ __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc");
+ __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc");
+ __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc");
+ }]])],
+ [gcry_cv_gcc_inline_asm_shaext=yes])
+ fi])
+if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then
+ AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1,
+ [Defined if inline assembler supports SHA Extensions instructions])
+fi
+
+
#
# Check whether GCC inline assembler supports SSE4.1 instructions.
#
@@ -1921,6 +1958,11 @@ if test x"$aesnisupport" = xyes ; then
aesnisupport="no (unsupported by compiler)"
fi
fi
+if test x"$shaextsupport" = xyes ; then
+ if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then
+ shaextsupport="no (unsupported by compiler)"
+ fi
+fi
if test x"$pclmulsupport" = xyes ; then
if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
pclmulsupport="no (unsupported by compiler)"
@@ -1960,6 +2002,10 @@ if test x"$aesnisupport" = xyes ; then
AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
[Enable support for Intel AES-NI instructions.])
fi
+if test x"$shaextsupport" = xyes ; then
+ AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1,
+ [Enable support for Intel SHAEXT instructions.])
+fi
if test x"$pclmulsupport" = xyes ; then
AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
[Enable support for Intel PCLMUL instructions.])
@@ -2449,6 +2495,13 @@ case "${host}" in
;;
esac
+case "$mpi_cpu_arch" in
+ x86)
+ # Build with the SHAEXT implementation
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-intel-shaext.lo"
+ ;;
+esac
+
LIST_MEMBER(sm3, $enabled_digests)
if test "$found" = "1" ; then
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
@@ -2634,6 +2687,7 @@ GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport])
+GCRY_MSG_SHOW([Try using Intel SHAEXT: ],[$shaextsupport])
GCRY_MSG_SHOW([Try using Intel PCLMUL: ],[$pclmulsupport])
GCRY_MSG_SHOW([Try using Intel SSE4.1: ],[$sse41support])
GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport])
diff --git a/src/g10lib.h b/src/g10lib.h
index 961b51541..d41fa0cf7 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -224,14 +224,14 @@ char **_gcry_strtokenize (const char *string, const char *delim);
#define HWF_INTEL_AVX (1 << 12)
#define HWF_INTEL_AVX2 (1 << 13)
#define HWF_INTEL_FAST_VPGATHER (1 << 14)
-
-#define HWF_ARM_NEON (1 << 15)
-#define HWF_ARM_AES (1 << 16)
-#define HWF_ARM_SHA1 (1 << 17)
-#define HWF_ARM_SHA2 (1 << 18)
-#define HWF_ARM_PMULL (1 << 19)
-
-#define HWF_INTEL_RDTSC (1 << 20)
+#define HWF_INTEL_RDTSC (1 << 15)
+#define HWF_INTEL_SHAEXT (1 << 16)
+
+#define HWF_ARM_NEON (1 << 17)
+#define HWF_ARM_AES (1 << 18)
+#define HWF_ARM_SHA1 (1 << 19)
+#define HWF_ARM_SHA2 (1 << 20)
+#define HWF_ARM_PMULL (1 << 21)
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 0d3a1f40e..b644eda1f 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -357,6 +357,10 @@ detect_x86_gnuc (void)
if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
result |= HWF_INTEL_FAST_VPGATHER;
#endif /*ENABLE_AVX_SUPPORT*/
+
+ /* Test bit 29 for SHA Extensions. */
+ if (features & (1 << 29))
+ result |= HWF_INTEL_SHAEXT;
}
return result;
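[For reference, the bit tested above is CPUID.(EAX=07H,ECX=0):EBX[29].
A standalone check outside of libgcrypt could look like the sketch
below; it is illustrative only, not part of the patch, and relies on
the __get_cpuid_count() helper from GCC/clang <cpuid.h>, which recent
compilers provide.

  #include <stdio.h>
  #include <cpuid.h>

  static int
  has_sha_ext (void)
  {
    unsigned int eax, ebx, ecx, edx;

    /* __get_cpuid_count() checks the maximum supported leaf first
     * and returns 0 if leaf 7 is not available. */
    if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
      return 0;

    return (ebx >> 29) & 1;
  }

  int
  main (void)
  {
    printf ("SHA extensions: %s\n", has_sha_ext () ? "yes" : "no");
    return 0;
  }
]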
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 1cad546d2..e08166945 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -58,6 +58,7 @@ static struct
{ HWF_INTEL_AVX2, "intel-avx2" },
{ HWF_INTEL_FAST_VPGATHER, "intel-fast-vpgather" },
{ HWF_INTEL_RDTSC, "intel-rdtsc" },
+ { HWF_INTEL_SHAEXT, "intel-shaext" },
{ HWF_ARM_NEON, "arm-neon" },
{ HWF_ARM_AES, "arm-aes" },
{ HWF_ARM_SHA1, "arm-sha1" },
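[With the flag exported here, "intel-shaext" can be masked at runtime
like any other hardware feature, which is handy for comparing against
the non-accelerated code paths.  A minimal sketch, assuming the usual
GCRYCTL_DISABLE_HWF semantics (the control must be issued before the
library is initialized); illustrative only, not part of the patch:

  #include <stdio.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    unsigned char digest[20];

    /* Mask the new feature flag before initialization. */
    gcry_control (GCRYCTL_DISABLE_HWF, "intel-shaext", NULL);

    if (!gcry_check_version (GCRYPT_VERSION))
      return 1;
    gcry_control (GCRYCTL_DISABLE_SECMEM, 0);
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

    /* SHA-1 now falls back to the next best implementation. */
    gcry_md_hash_buffer (GCRY_MD_SHA1, digest, "abc", 3);
    for (int i = 0; i < 20; i++)
      printf ("%02x", digest[i]);
    putchar ('\n');
    return 0;
  }
]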