[git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-45-g0b3ec35

by Jussi Kivilinna cvs at cvs.gnupg.org
Sun Feb 18 16:12:58 CET 2018


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  0b3ec359e2279c3b46b171372b1b7733bba20cd7 (commit)
       via  d02958bd300d2c80bc92b1e072103e95e256b297 (commit)
       via  da58a62ac1b7a8d97b0895dcb41d15af531e45e5 (commit)
       via  af7fc732f9a7af7a70276f1e8364d2132db314f1 (commit)
      from  ffdc6f3623a0bcb41324d562340b2cd1c288e387 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 0b3ec359e2279c3b46b171372b1b7733bba20cd7
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu Feb 15 22:13:28 2018 +0200

    Add Intel SHA Extensions accelerated SHA256 implementation
    
    * cipher/Makefile.am: Add 'sha256-intel-shaext.c'.
    * cipher/sha256-intel-shaext.c: New.
    * cipher/sha256.c (USE_SHAEXT)
    (_gcry_sha256_transform_intel_shaext): New.
    (SHA256_CONTEXT): Add 'use_shaext'.
    (sha256_init, sha224_init) [USE_SHAEXT]: Use shaext if supported.
    (transform) [USE_SHAEXT]: Use shaext if enabled.
    (transform): Only add ASM_EXTRA_STACK if returned burn length is not
    zero.
    * configure.ac: Add 'sha256-intel-shaext.lo'.
    --
    
    Benchmark on Intel Celeron J3455 (1500 Mhz, no turbo):
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHA256         |     10.07 ns/B     94.72 MiB/s     15.10 c/B
    
    After (3.7x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHA256         |      2.70 ns/B     353.8 MiB/s      4.04 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 110a48b..599e3c1 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -94,7 +94,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
   sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
   sha1-intel-shaext.c \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
-  sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
+  sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S sha256-intel-shaext.c \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
   sha512-armv7-neon.S sha512-arm.S \
 sm3.c \
diff --git a/cipher/sha256-intel-shaext.c b/cipher/sha256-intel-shaext.c
new file mode 100644
index 0000000..0c107bb
--- /dev/null
+++ b/cipher/sha256-intel-shaext.c
@@ -0,0 +1,352 @@
+/* sha256-intel-shaext.S - SHAEXT accelerated SHA-256 transform function
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+    defined(HAVE_GCC_INLINE_ASM_SSE41) && defined(USE_SHA256) && \
+    defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+#  pragma GCC target("no-sse")
+#endif
+
+/* Two macros to be called prior and after the use of SHA-EXT
+   instructions.  There should be no external function calls between
+   the use of these macros.  There purpose is to make sure that the
+   SSE regsiters are cleared and won't reveal any information about
+   the key or the data.  */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare()                                               \
+   do { asm volatile ("movdqu %%xmm6, (%0)\n"                           \
+                      "movdqu %%xmm7, (%1)\n"                           \
+                      :                                                 \
+                      : "r" (&win64tmp[0]), "r" (&win64tmp[16])         \
+                      : "memory");                                      \
+   } while (0)
+# define shaext_cleanup(tmp0,tmp1)                                      \
+   do { asm volatile ("movdqu (%0), %%xmm6\n"                           \
+                      "movdqu (%1), %%xmm7\n"                           \
+                      "pxor %%xmm0, %%xmm0\n"                           \
+                      "pxor %%xmm1, %%xmm1\n"                           \
+                      "pxor %%xmm2, %%xmm2\n"                           \
+                      "pxor %%xmm3, %%xmm3\n"                           \
+                      "pxor %%xmm4, %%xmm4\n"                           \
+                      "pxor %%xmm5, %%xmm5\n"                           \
+                      "movdqa %%xmm0, (%2)\n\t"                         \
+                      "movdqa %%xmm0, (%3)\n\t"                         \
+                      :                                                 \
+                      : "r" (&win64tmp[0]), "r" (&win64tmp[16]),        \
+                        "r" (tmp0), "r" (tmp1)                          \
+                      : "memory");                                      \
+   } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1)                                      \
+   do { asm volatile ("pxor %%xmm0, %%xmm0\n"                           \
+                      "pxor %%xmm1, %%xmm1\n"                           \
+                      "pxor %%xmm2, %%xmm2\n"                           \
+                      "pxor %%xmm3, %%xmm3\n"                           \
+                      "pxor %%xmm4, %%xmm4\n"                           \
+                      "pxor %%xmm5, %%xmm5\n"                           \
+                      "pxor %%xmm6, %%xmm6\n"                           \
+                      "pxor %%xmm7, %%xmm7\n"                           \
+                      "movdqa %%xmm0, (%0)\n\t"                         \
+                      "movdqa %%xmm0, (%1)\n\t"                         \
+                      :                                                 \
+                      : "r" (tmp0), "r" (tmp1)                          \
+                      : "memory");                                      \
+   } while (0)
+#endif
+
+typedef struct u128_s
+{
+  u32 a, b, c, d;
+} u128_t;
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ */
+unsigned int
+_gcry_sha256_transform_intel_shaext(u32 state[8], const unsigned char *data,
+                                    size_t nblks)
+{
+  static const unsigned char bshuf_mask[16] __attribute__ ((aligned (16))) =
+    { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  static const u128_t K[16] __attribute__ ((aligned (16))) =
+  {
+    { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 },
+    { 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 },
+    { 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 },
+    { 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 },
+    { 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc },
+    { 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da },
+    { 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 },
+    { 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 },
+    { 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 },
+    { 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 },
+    { 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 },
+    { 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 },
+    { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 },
+    { 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 },
+    { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 },
+    { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }
+  };
+  char save_buf[2 * 16 + 15];
+  char *abef_save;
+  char *cdgh_save;
+  shaext_prepare_variable;
+
+  if (nblks == 0)
+    return 0;
+
+  shaext_prepare ();
+
+  asm volatile ("" : "=r" (abef_save) : "0" (save_buf) : "memory");
+  abef_save = abef_save + (-(uintptr_t)abef_save & 15);
+  cdgh_save = abef_save + 16;
+
+  /* byteswap mask => XMM7 */
+  asm volatile ("movdqa %[mask], %%xmm7\n\t" /* Preload mask */
+                :
+                : [mask] "m" (*bshuf_mask)
+                : "memory");
+
+  /* Load state.. ABEF_SAVE => STATE0 XMM1, CDGH_STATE => STATE1 XMM2 */
+  asm volatile ("movups 16(%[state]), %%xmm1\n\t" /* HGFE (xmm=EFGH) */
+                "movups  0(%[state]), %%xmm0\n\t" /* DCBA (xmm=ABCD) */
+                "movaps %%xmm1, %%xmm2\n\t"
+                "shufps $0x11, %%xmm0, %%xmm1\n\t" /* ABEF (xmm=FEBA) */
+                "shufps $0xbb, %%xmm0, %%xmm2\n\t" /* CDGH (xmm=HGDC) */
+                :
+                : [state] "r" (state)
+                : "memory" );
+
+  /* Load message */
+  asm volatile ("movdqu 0*16(%[data]), %%xmm3\n\t"
+                "movdqu 1*16(%[data]), %%xmm4\n\t"
+                "movdqu 2*16(%[data]), %%xmm5\n\t"
+                "movdqu 3*16(%[data]), %%xmm6\n\t"
+                "pshufb %%xmm7, %%xmm3\n\t"
+                "pshufb %%xmm7, %%xmm4\n\t"
+                "pshufb %%xmm7, %%xmm5\n\t"
+                "pshufb %%xmm7, %%xmm6\n\t"
+                :
+                : [data] "r" (data)
+                : "memory" );
+  data += 64;
+
+  do
+    {
+      /* Save state */
+      asm volatile ("movdqa %%xmm1, (%[abef_save])\n\t"
+                    "movdqa %%xmm2, (%[cdgh_save])\n\t"
+                    :
+                    : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+                    : "memory" );
+
+      /* Round 0..3 */
+      asm volatile ("movdqa %%xmm3, %%xmm0\n\t"
+                      "paddd %[constants], %%xmm0\n\t"
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                      "psrldq $8, %%xmm0\n\t"
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                    :
+                    : [constants] "m" (K[0].a)
+                    : "memory" );
+
+      /* Round 4..7 */
+      asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+                      "paddd %[constants], %%xmm0\n\t"
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                      "psrldq $8, %%xmm0\n\t"
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                    "sha256msg1 %%xmm4, %%xmm3\n\t"
+                    :
+                    : [constants] "m" (K[1].a)
+                    : "memory" );
+
+      /* Round 8..11 */
+      asm volatile ("movdqa %%xmm5, %%xmm0\n\t"
+                      "paddd %[constants], %%xmm0\n\t"
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                      "psrldq $8, %%xmm0\n\t"
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                    "sha256msg1 %%xmm5, %%xmm4\n\t"
+                    :
+                    : [constants] "m" (K[2].a)
+                    : "memory" );
+
+#define ROUND(k, MSG0, MSG1, MSG2, MSG3) \
+      asm volatile ("movdqa %%"MSG0", %%xmm0\n\t" \
+                      "paddd %[constants], %%xmm0\n\t" \
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t" \
+                    "movdqa %%"MSG0", %%xmm7\n\t" \
+                    "palignr $4, %%"MSG3", %%xmm7\n\t" \
+                    "paddd %%xmm7, %%"MSG1"\n\t" \
+                    "sha256msg2 %%"MSG0", %%"MSG1"\n\t" \
+                      "psrldq $8, %%xmm0\n\t" \
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t" \
+                    "sha256msg1 %%"MSG0", %%"MSG3"\n\t" \
+                    : \
+                    : [constants] "m" (K[k].a) \
+                    : "memory" )
+
+      /* Rounds 12..15 to 48..51 */
+      ROUND(3, "xmm6", "xmm3", "xmm4", "xmm5");
+      ROUND(4, "xmm3", "xmm4", "xmm5", "xmm6");
+      ROUND(5, "xmm4", "xmm5", "xmm6", "xmm3");
+      ROUND(6, "xmm5", "xmm6", "xmm3", "xmm4");
+      ROUND(7, "xmm6", "xmm3", "xmm4", "xmm5");
+      ROUND(8, "xmm3", "xmm4", "xmm5", "xmm6");
+      ROUND(9, "xmm4", "xmm5", "xmm6", "xmm3");
+      ROUND(10, "xmm5", "xmm6", "xmm3", "xmm4");
+      ROUND(11, "xmm6", "xmm3", "xmm4", "xmm5");
+      ROUND(12, "xmm3", "xmm4", "xmm5", "xmm6");
+
+      if (--nblks == 0)
+        break;
+
+      /* Round 52..55 */
+      asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+                      "paddd %[constants], %%xmm0\n\t"
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                    "movdqa %%xmm4, %%xmm7\n\t"
+                    "palignr $4, %%xmm3, %%xmm7\n\t"
+                    "movdqu 0*16(%[data]), %%xmm3\n\t"
+                    "paddd %%xmm7, %%xmm5\n\t"
+                    "sha256msg2 %%xmm4, %%xmm5\n\t"
+                      "psrldq $8, %%xmm0\n\t"
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                    :
+                    : [constants] "m" (K[13].a), [data] "r" (data)
+                    : "memory" );
+
+      /* Round 56..59 */
+      asm volatile ("movdqa %%xmm5, %%xmm0\n\t"
+                      "paddd %[constants], %%xmm0\n\t"
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                    "movdqa %%xmm5, %%xmm7\n\t"
+                    "palignr $4, %%xmm4, %%xmm7\n\t"
+                    "movdqu 1*16(%[data]), %%xmm4\n\t"
+                    "paddd %%xmm7, %%xmm6\n\t"
+                    "movdqa %[mask], %%xmm7\n\t" /* Reload mask */
+                    "sha256msg2 %%xmm5, %%xmm6\n\t"
+                    "movdqu 2*16(%[data]), %%xmm5\n\t"
+                      "psrldq $8, %%xmm0\n\t"
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                    :
+                    : [constants] "m" (K[14].a), [mask] "m" (*bshuf_mask),
+                      [data] "r" (data)
+                    : "memory" );
+
+      /* Round 60..63 */
+      asm volatile ("movdqa %%xmm6, %%xmm0\n\t"
+                    "pshufb %%xmm7, %%xmm3\n\t"
+                    "movdqu 3*16(%[data]), %%xmm6\n\t"
+                      "paddd %[constants], %%xmm0\n\t"
+                    "pshufb %%xmm7, %%xmm4\n\t"
+                      "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                      "psrldq $8, %%xmm0\n\t"
+                    "pshufb %%xmm7, %%xmm5\n\t"
+                      "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                    :
+                    : [constants] "m" (K[15].a), [data] "r" (data)
+                    : "memory" );
+      data += 64;
+
+      /* Merge states */
+      asm volatile ("paddd (%[abef_save]), %%xmm1\n\t"
+                    "paddd (%[cdgh_save]), %%xmm2\n\t"
+                    "pshufb %%xmm7, %%xmm6\n\t"
+                    :
+                    : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+                    : "memory" );
+    }
+  while (1);
+
+  /* Round 52..55 */
+  asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
+                  "paddd %[constants], %%xmm0\n\t"
+                  "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                "movdqa %%xmm4, %%xmm7\n\t"
+                "palignr $4, %%xmm3, %%xmm7\n\t"
+                "paddd %%xmm7, %%xmm5\n\t"
+                "sha256msg2 %%xmm4, %%xmm5\n\t"
+                  "psrldq $8, %%xmm0\n\t"
+                  "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                :
+                : [constants] "m" (K[13].a)
+                : "memory" );
+
+  /* Round 56..59 */
+  asm volatile ("movdqa %%xmm5, %%xmm0\n\t"
+                  "paddd %[constants], %%xmm0\n\t"
+                  "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                "movdqa %%xmm5, %%xmm7\n\t"
+                "palignr $4, %%xmm4, %%xmm7\n\t"
+                "paddd %%xmm7, %%xmm6\n\t"
+                "movdqa %[mask], %%xmm7\n\t" /* Reload mask */
+                "sha256msg2 %%xmm5, %%xmm6\n\t"
+                  "psrldq $8, %%xmm0\n\t"
+                  "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                :
+                : [constants] "m" (K[14].a), [mask] "m" (*bshuf_mask)
+                : "memory" );
+
+  /* Round 60..63 */
+  asm volatile ("movdqa %%xmm6, %%xmm0\n\t"
+                  "paddd %[constants], %%xmm0\n\t"
+                  "sha256rnds2 %%xmm1, %%xmm2\n\t"
+                  "psrldq $8, %%xmm0\n\t"
+                  "sha256rnds2 %%xmm2, %%xmm1\n\t"
+                :
+                : [constants] "m" (K[15].a)
+                : "memory" );
+
+  /* Merge states */
+  asm volatile ("paddd (%[abef_save]), %%xmm1\n\t"
+                "paddd (%[cdgh_save]), %%xmm2\n\t"
+                :
+                : [abef_save] "r" (abef_save), [cdgh_save] "r" (cdgh_save)
+                : "memory" );
+
+  /* Save state (XMM1=FEBA, XMM2=HGDC) */
+  asm volatile ("movaps %%xmm1, %%xmm0\n\t"
+                "shufps $0x11, %%xmm2, %%xmm1\n\t" /* xmm=ABCD */
+                "shufps $0xbb, %%xmm2, %%xmm0\n\t" /* xmm=EFGH */
+                "movups %%xmm1, 16(%[state])\n\t"
+                "movups %%xmm0,  0(%[state])\n\t"
+                :
+                : [state] "r" (state)
+                : "memory" );
+
+  shaext_cleanup (abef_save, cdgh_save);
+  return 0;
+}
+
+#endif /* HAVE_GCC_INLINE_ASM_SHA_EXT */
diff --git a/cipher/sha256.c b/cipher/sha256.c
index d174321..cb6a860 100644
--- a/cipher/sha256.c
+++ b/cipher/sha256.c
@@ -75,6 +75,14 @@
 # define USE_AVX2 1
 #endif
 
+/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
+#undef USE_SHAEXT
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+    defined(HAVE_GCC_INLINE_ASM_SSE41) && \
+    defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHAEXT 1
+#endif
+
 /* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
  * code. */
 #undef USE_ARM_CE
@@ -103,6 +111,9 @@ typedef struct {
 #ifdef USE_AVX2
   unsigned int use_avx2:1;
 #endif
+#ifdef USE_SHAEXT
+  unsigned int use_shaext:1;
+#endif
 #ifdef USE_ARM_CE
   unsigned int use_arm_ce:1;
 #endif
@@ -147,6 +158,10 @@ sha256_init (void *context, unsigned int flags)
 #ifdef USE_AVX2
   hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2);
 #endif
+#ifdef USE_SHAEXT
+  hd->use_shaext = (features & HWF_INTEL_SHAEXT)
+                   && (features & HWF_INTEL_SSE4_1);
+#endif
 #ifdef USE_ARM_CE
   hd->use_arm_ce = (features & HWF_ARM_SHA2) != 0;
 #endif
@@ -188,6 +203,10 @@ sha224_init (void *context, unsigned int flags)
 #ifdef USE_AVX2
   hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2);
 #endif
+#ifdef USE_SHAEXT
+  hd->use_shaext = (features & HWF_INTEL_SHAEXT)
+                   && (features & HWF_INTEL_SSE4_1);
+#endif
 #ifdef USE_ARM_CE
   hd->use_arm_ce = (features & HWF_ARM_SHA2) != 0;
 #endif
@@ -350,7 +369,8 @@ transform_blk (void *ctx, const unsigned char *data)
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #undef ASM_EXTRA_STACK
-#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2)
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \
+    defined(USE_SHAEXT)
 # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 #  define ASM_FUNC_ABI __attribute__((sysv_abi))
 #  define ASM_EXTRA_STACK (10 * 16)
@@ -379,6 +399,14 @@ unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data,
                                                size_t num_blks) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_SHAEXT
+/* Does not need ASM_FUNC_ABI */
+unsigned int
+_gcry_sha256_transform_intel_shaext(u32 state[8],
+                                    const unsigned char *input_data,
+                                    size_t num_blks);
+#endif
+
 #ifdef USE_ARM_CE
 unsigned int _gcry_sha256_transform_armv8_ce(u32 state[8],
                                              const void *input_data,
@@ -391,27 +419,49 @@ transform (void *ctx, const unsigned char *data, size_t nblks)
   SHA256_CONTEXT *hd = ctx;
   unsigned int burn;
 
+#ifdef USE_SHAEXT
+  if (hd->use_shaext)
+    {
+      burn = _gcry_sha256_transform_intel_shaext (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
+#endif
+
 #ifdef USE_AVX2
   if (hd->use_avx2)
-    return _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks)
-           + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+    {
+      burn = _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
 #endif
 
 #ifdef USE_AVX
   if (hd->use_avx)
-    return _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks)
-           + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+    {
+      burn = _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
 #endif
 
 #ifdef USE_SSSE3
   if (hd->use_ssse3)
-    return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks)
-           + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+    {
+      burn = _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
 #endif
 
 #ifdef USE_ARM_CE
   if (hd->use_arm_ce)
-    return _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks);
+    {
+      burn = _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) : 0;
+      return burn;
+    }
 #endif
 
   do
diff --git a/configure.ac b/configure.ac
index 4ae7667..b5d7211 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2395,6 +2395,13 @@ if test "$found" = "1" ; then
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-armv8-aarch64-ce.lo"
       ;;
    esac
+
+   case "$mpi_cpu_arch" in
+     x86)
+       # Build with the SHAEXT implementation
+       GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-intel-shaext.lo"
+     ;;
+   esac
 fi
 
 LIST_MEMBER(sha512, $enabled_digests)

commit d02958bd300d2c80bc92b1e072103e95e256b297
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Tue Feb 13 20:22:41 2018 +0200

    Add Intel SHA Extensions accelerated SHA1 implementation
    
    * cipher/Makefile.am: Add 'sha1-intel-shaext.c'.
    * cipher/sha1-intel-shaext.c: New.
    * cipher/sha1.c (USE_SHAEXT, _gcry_sha1_transform_intel_shaext): New.
    (sha1_init) [USE_SHAEXT]: Use shaext implementation is supported.
    (transform) [USE_SHAEXT]: Use shaext if enabled.
    (transform): Only add ASM_EXTRA_STACK if returned burn length is not
    zero.
    * cipher/sha1.h (SHA1_CONTEXT): Add 'use_shaext'.
    * configure.ac: Add 'sha1-intel-shaext.lo'.
    (shaextsupport, gcry_cv_gcc_inline_asm_shaext): New.
    * src/g10lib.h: Add HWF_INTEL_SHAEXT and reorder HWF flags.
    * src/hwf-x86.c (detect_x86_gnuc): Detect SHA Extensions.
    * src/hwfeatures.c (hwflist): Add 'intel-shaext'.
    --
    
    Benchmark on Intel Celeron J3455 (1500 Mhz, no turbo):
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHA1           |      4.50 ns/B     211.7 MiB/s      6.76 c/B
    
    After (4.0x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHA1           |      1.11 ns/B     858.1 MiB/s      1.67 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 625a0ef..110a48b 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -92,6 +92,7 @@ seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S serpent-armv7-neon.S \
 sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
   sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
+  sha1-intel-shaext.c \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
   sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
diff --git a/cipher/sha1-intel-shaext.c b/cipher/sha1-intel-shaext.c
new file mode 100644
index 0000000..5a2349e
--- /dev/null
+++ b/cipher/sha1-intel-shaext.c
@@ -0,0 +1,281 @@
+/* sha1-intel-shaext.S - SHAEXT accelerated SHA-1 transform function
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+    defined(HAVE_GCC_INLINE_ASM_SSE41) && defined(USE_SHA1) && \
+    defined(ENABLE_SHAEXT_SUPPORT)
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+#  pragma GCC target("no-sse")
+#endif
+
+/* Two macros to be called prior and after the use of SHA-EXT
+   instructions.  There should be no external function calls between
+   the use of these macros.  There purpose is to make sure that the
+   SSE regsiters are cleared and won't reveal any information about
+   the key or the data.  */
+#ifdef __WIN64__
+/* XMM6-XMM15 are callee-saved registers on WIN64. */
+# define shaext_prepare_variable char win64tmp[2*16]
+# define shaext_prepare_variable_size sizeof(win64tmp)
+# define shaext_prepare()                                               \
+   do { asm volatile ("movdqu %%xmm6, (%0)\n"                           \
+                      "movdqu %%xmm7, (%1)\n"                           \
+                      :                                                 \
+                      : "r" (&win64tmp[0]), "r" (&win64tmp[16])         \
+                      : "memory");                                      \
+   } while (0)
+# define shaext_cleanup(tmp0,tmp1)                                      \
+   do { asm volatile ("movdqu (%0), %%xmm6\n"                           \
+                      "movdqu (%1), %%xmm7\n"                           \
+                      "pxor %%xmm0, %%xmm0\n"                           \
+                      "pxor %%xmm1, %%xmm1\n"                           \
+                      "pxor %%xmm2, %%xmm2\n"                           \
+                      "pxor %%xmm3, %%xmm3\n"                           \
+                      "pxor %%xmm4, %%xmm4\n"                           \
+                      "pxor %%xmm5, %%xmm5\n"                           \
+                      "movdqa %%xmm0, (%2)\n\t"                         \
+                      "movdqa %%xmm0, (%3)\n\t"                         \
+                      :                                                 \
+                      : "r" (&win64tmp[0]), "r" (&win64tmp[16]),        \
+                        "r" (tmp0), "r" (tmp1)                          \
+                      : "memory");                                      \
+   } while (0)
+#else
+# define shaext_prepare_variable
+# define shaext_prepare_variable_size 0
+# define shaext_prepare() do { } while (0)
+# define shaext_cleanup(tmp0,tmp1)                                      \
+   do { asm volatile ("pxor %%xmm0, %%xmm0\n"                           \
+                      "pxor %%xmm1, %%xmm1\n"                           \
+                      "pxor %%xmm2, %%xmm2\n"                           \
+                      "pxor %%xmm3, %%xmm3\n"                           \
+                      "pxor %%xmm4, %%xmm4\n"                           \
+                      "pxor %%xmm5, %%xmm5\n"                           \
+                      "pxor %%xmm6, %%xmm6\n"                           \
+                      "pxor %%xmm7, %%xmm7\n"                           \
+                      "movdqa %%xmm0, (%0)\n\t"                         \
+                      "movdqa %%xmm0, (%1)\n\t"                         \
+                      :                                                 \
+                      : "r" (tmp0), "r" (tmp1)                          \
+                      : "memory");                                      \
+   } while (0)
+#endif
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ */
+unsigned int
+_gcry_sha1_transform_intel_shaext(void *state, const unsigned char *data,
+                                  size_t nblks)
+{
+  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  char save_buf[2 * 16 + 15];
+  char *abcd_save;
+  char *e_save;
+  shaext_prepare_variable;
+
+  if (nblks == 0)
+    return 0;
+
+  shaext_prepare ();
+
+  asm volatile ("" : "=r" (abcd_save) : "0" (save_buf) : "memory");
+  abcd_save = abcd_save + (-(uintptr_t)abcd_save & 15);
+  e_save = abcd_save + 16;
+
+  /* byteswap mask => XMM7 */
+  asm volatile ("movdqa %[mask], %%xmm7\n\t" /* Preload mask */
+                :
+                : [mask] "m" (*be_mask)
+                : "memory");
+
+  /* Load state.. ABCD => XMM4, E => XMM5 */
+  asm volatile ("movd 16(%[state]), %%xmm5\n\t"
+                "movdqu (%[state]), %%xmm4\n\t"
+                "pslldq $12, %%xmm5\n\t"
+                "pshufd $0x1b, %%xmm4, %%xmm4\n\t"
+                "movdqa %%xmm5, (%[e_save])\n\t"
+                "movdqa %%xmm4, (%[abcd_save])\n\t"
+                :
+                : [state] "r" (state), [abcd_save] "r" (abcd_save),
+                  [e_save] "r" (e_save)
+                : "memory" );
+
+  /* DATA => XMM[0..4] */
+  asm volatile ("movdqu 0(%[data]), %%xmm0\n\t"
+                "movdqu 16(%[data]), %%xmm1\n\t"
+                "movdqu 32(%[data]), %%xmm2\n\t"
+                "movdqu 48(%[data]), %%xmm3\n\t"
+                "pshufb %%xmm7, %%xmm0\n\t"
+                "pshufb %%xmm7, %%xmm1\n\t"
+                "pshufb %%xmm7, %%xmm2\n\t"
+                "pshufb %%xmm7, %%xmm3\n\t"
+                :
+                : [data] "r" (data)
+                : "memory" );
+  data += 64;
+
+  while (1)
+    {
+      /* Round 0..3 */
+      asm volatile ("paddd %%xmm0, %%xmm5\n\t"
+                    "movdqa %%xmm4, %%xmm6\n\t" /* ABCD => E1 */
+                    "sha1rnds4 $0, %%xmm5, %%xmm4\n\t"
+                    ::: "memory" );
+
+      /* Round 4..7 */
+      asm volatile ("sha1nexte %%xmm1, %%xmm6\n\t"
+                    "movdqa %%xmm4, %%xmm5\n\t"
+                    "sha1rnds4 $0, %%xmm6, %%xmm4\n\t"
+                    "sha1msg1 %%xmm1, %%xmm0\n\t"
+                    ::: "memory" );
+
+      /* Round 8..11 */
+      asm volatile ("sha1nexte %%xmm2, %%xmm5\n\t"
+                    "movdqa %%xmm4, %%xmm6\n\t"
+                    "sha1rnds4 $0, %%xmm5, %%xmm4\n\t"
+                    "sha1msg1 %%xmm2, %%xmm1\n\t"
+                    "pxor %%xmm2, %%xmm0\n\t"
+                    ::: "memory" );
+
+#define ROUND(imm, E0, E1, MSG0, MSG1, MSG2, MSG3) \
+      asm volatile ("sha1nexte %%"MSG0", %%"E0"\n\t" \
+                    "movdqa %%xmm4, %%"E1"\n\t" \
+                    "sha1msg2 %%"MSG0", %%"MSG1"\n\t" \
+                    "sha1rnds4 $"imm", %%"E0", %%xmm4\n\t" \
+                    "sha1msg1 %%"MSG0", %%"MSG3"\n\t" \
+                    "pxor %%"MSG0", %%"MSG2"\n\t" \
+                    ::: "memory" )
+
+      /* Rounds 12..15 to 64..67 */
+      ROUND("0", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+      ROUND("0", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+      ROUND("1", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+      ROUND("1", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+      ROUND("1", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+      ROUND("1", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+      ROUND("1", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+      ROUND("2", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+      ROUND("2", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+      ROUND("2", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+      ROUND("2", "xmm6", "xmm5", "xmm1", "xmm2", "xmm3", "xmm0");
+      ROUND("2", "xmm5", "xmm6", "xmm2", "xmm3", "xmm0", "xmm1");
+      ROUND("3", "xmm6", "xmm5", "xmm3", "xmm0", "xmm1", "xmm2");
+      ROUND("3", "xmm5", "xmm6", "xmm0", "xmm1", "xmm2", "xmm3");
+
+      if (--nblks == 0)
+        break;
+
+      /* Round 68..71 */
+      asm volatile ("movdqu 0(%[data]), %%xmm0\n\t"
+                    "sha1nexte %%xmm1, %%xmm6\n\t"
+                    "movdqa %%xmm4, %%xmm5\n\t"
+                    "sha1msg2 %%xmm1, %%xmm2\n\t"
+                    "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+                    "pxor %%xmm1, %%xmm3\n\t"
+                    "pshufb %%xmm7, %%xmm0\n\t"
+                    :
+                    : [data] "r" (data)
+                    : "memory" );
+
+      /* Round 72..75 */
+      asm volatile ("movdqu 16(%[data]), %%xmm1\n\t"
+                    "sha1nexte %%xmm2, %%xmm5\n\t"
+                    "movdqa %%xmm4, %%xmm6\n\t"
+                    "sha1msg2 %%xmm2, %%xmm3\n\t"
+                    "sha1rnds4 $3, %%xmm5, %%xmm4\n\t"
+                    "pshufb %%xmm7, %%xmm1\n\t"
+                    :
+                    : [data] "r" (data)
+                    : "memory" );
+
+      /* Round 76..79 */
+      asm volatile ("movdqu 32(%[data]), %%xmm2\n\t"
+                    "sha1nexte %%xmm3, %%xmm6\n\t"
+                    "movdqa %%xmm4, %%xmm5\n\t"
+                    "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+                    "pshufb %%xmm7, %%xmm2\n\t"
+                    :
+                    : [data] "r" (data)
+                    : "memory" );
+
+      /* Merge states, store current. */
+      asm volatile ("movdqu 48(%[data]), %%xmm3\n\t"
+                    "sha1nexte (%[e_save]), %%xmm5\n\t"
+                    "paddd (%[abcd_save]), %%xmm4\n\t"
+                    "pshufb %%xmm7, %%xmm3\n\t"
+                    "movdqa %%xmm5, (%[e_save])\n\t"
+                    "movdqa %%xmm4, (%[abcd_save])\n\t"
+                    :
+                    : [abcd_save] "r" (abcd_save), [e_save] "r" (e_save),
+                      [data] "r" (data)
+                    : "memory" );
+
+      data += 64;
+    }
+
+  /* Round 68..71 */
+  asm volatile ("sha1nexte %%xmm1, %%xmm6\n\t"
+                "movdqa %%xmm4, %%xmm5\n\t"
+                "sha1msg2 %%xmm1, %%xmm2\n\t"
+                "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+                "pxor %%xmm1, %%xmm3\n\t"
+                ::: "memory" );
+
+  /* Round 72..75 */
+  asm volatile ("sha1nexte %%xmm2, %%xmm5\n\t"
+                "movdqa %%xmm4, %%xmm6\n\t"
+                "sha1msg2 %%xmm2, %%xmm3\n\t"
+                "sha1rnds4 $3, %%xmm5, %%xmm4\n\t"
+                ::: "memory" );
+
+  /* Round 76..79 */
+  asm volatile ("sha1nexte %%xmm3, %%xmm6\n\t"
+                "movdqa %%xmm4, %%xmm5\n\t"
+                "sha1rnds4 $3, %%xmm6, %%xmm4\n\t"
+                ::: "memory" );
+
+  /* Merge states. */
+  asm volatile ("sha1nexte (%[e_save]), %%xmm5\n\t"
+                "paddd (%[abcd_save]), %%xmm4\n\t"
+                :
+                : [abcd_save] "r" (abcd_save), [e_save] "r" (e_save)
+                : "memory" );
+
+  /* Save state */
+  asm volatile ("pshufd $0x1b, %%xmm4, %%xmm4\n\t"
+                "psrldq $12, %%xmm5\n\t"
+                "movdqu %%xmm4, (%[state])\n\t"
+                "movd %%xmm5, 16(%[state])\n\t"
+                :
+                : [state] "r" (state)
+                : "memory" );
+
+  shaext_cleanup (abcd_save, e_save);
+  return 0;
+}
+
+#endif /* HAVE_GCC_INLINE_ASM_SHA_EXT */
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 78b172f..09868aa 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -68,6 +68,14 @@
 # define USE_BMI2 1
 #endif
 
+/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
+#undef USE_SHAEXT
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+    defined(HAVE_GCC_INLINE_ASM_SSE41) && \
+    defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHAEXT 1
+#endif
+
 /* USE_NEON indicates whether to enable ARM NEON assembly code. */
 #undef USE_NEON
 #ifdef ENABLE_NEON_SUPPORT
@@ -138,6 +146,10 @@ sha1_init (void *context, unsigned int flags)
 #ifdef USE_BMI2
   hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2);
 #endif
+#ifdef USE_SHAEXT
+  hd->use_shaext = (features & HWF_INTEL_SHAEXT)
+                   && (features & HWF_INTEL_SSE4_1);
+#endif
 #ifdef USE_NEON
   hd->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
@@ -311,7 +323,8 @@ transform_blk (void *ctx, const unsigned char *data)
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #undef ASM_EXTRA_STACK
-#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2)
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_BMI2) || \
+    defined(USE_SHAEXT)
 # ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 #  define ASM_FUNC_ABI __attribute__((sysv_abi))
 #  define ASM_EXTRA_STACK (10 * 16)
@@ -340,6 +353,13 @@ _gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data,
                                      size_t nblks) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_SHAEXT
+/* Does not need ASM_FUNC_ABI */
+unsigned int
+_gcry_sha1_transform_intel_shaext (void *state, const unsigned char *data,
+                                   size_t nblks);
+#endif
+
 
 static unsigned int
 transform (void *ctx, const unsigned char *data, size_t nblks)
@@ -347,29 +367,53 @@ transform (void *ctx, const unsigned char *data, size_t nblks)
   SHA1_CONTEXT *hd = ctx;
   unsigned int burn;
 
+#ifdef USE_SHAEXT
+  if (hd->use_shaext)
+    {
+      burn = _gcry_sha1_transform_intel_shaext (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
+#endif
 #ifdef USE_BMI2
   if (hd->use_bmi2)
-    return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks)
-           + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+    {
+      burn = _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
 #endif
 #ifdef USE_AVX
   if (hd->use_avx)
-    return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks)
-           + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+    {
+      burn = _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
 #endif
 #ifdef USE_SSSE3
   if (hd->use_ssse3)
-    return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks)
-           + 4 * sizeof(void*) + ASM_EXTRA_STACK;
+    {
+      burn = _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) + ASM_EXTRA_STACK : 0;
+      return burn;
+    }
 #endif
 #ifdef USE_ARM_CE
   if (hd->use_arm_ce)
-    return _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks);
+    {
+      burn = _gcry_sha1_transform_armv8_ce (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) : 0;
+      return burn;
+    }
 #endif
 #ifdef USE_NEON
   if (hd->use_neon)
-    return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks)
-           + 4 * sizeof(void*);
+    {
+      burn = _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks);
+      burn += burn ? 4 * sizeof(void*) : 0;
+      return burn;
+    }
 #endif
 
   do
diff --git a/cipher/sha1.h b/cipher/sha1.h
index d448fca..93ce79b 100644
--- a/cipher/sha1.h
+++ b/cipher/sha1.h
@@ -29,6 +29,7 @@ typedef struct
   unsigned int use_ssse3:1;
   unsigned int use_avx:1;
   unsigned int use_bmi2:1;
+  unsigned int use_shaext:1;
   unsigned int use_neon:1;
   unsigned int use_arm_ce:1;
 } SHA1_CONTEXT;
diff --git a/configure.ac b/configure.ac
index 305b19f..4ae7667 100644
--- a/configure.ac
+++ b/configure.ac
@@ -588,6 +588,14 @@ AC_ARG_ENABLE(aesni-support,
 	      aesnisupport=$enableval,aesnisupport=yes)
 AC_MSG_RESULT($aesnisupport)
 
+# Implementation of the --disable-shaext-support switch.
+AC_MSG_CHECKING([whether SHAEXT support is requested])
+AC_ARG_ENABLE(shaext-support,
+              AC_HELP_STRING([--disable-shaext-support],
+                 [Disable support for the Intel SHAEXT instructions]),
+              shaextsupport=$enableval,shaextsupport=yes)
+AC_MSG_RESULT($shaextsupport)
+
 # Implementation of the --disable-pclmul-support switch.
 AC_MSG_CHECKING([whether PCLMUL support is requested])
 AC_ARG_ENABLE(pclmul-support,
@@ -1175,6 +1183,7 @@ AM_CONDITIONAL(MPI_MOD_C_UDIV_QRNND, test "$mpi_mod_c_udiv_qrnnd" = yes)
 # Reset non applicable feature flags.
 if test "$mpi_cpu_arch" != "x86" ; then
    aesnisupport="n/a"
+   shaextsupport="n/a"
    pclmulsupport="n/a"
    sse41support="n/a"
    avxsupport="n/a"
@@ -1329,6 +1338,34 @@ if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then
      [Defined if inline assembler supports PCLMUL instructions])
 fi
 
+
+#
+# Check whether GCC inline assembler supports SHA Extensions instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports SHA Extensions instructions],
+       [gcry_cv_gcc_inline_asm_shaext],
+       [if test "$mpi_cpu_arch" != "x86" ; then
+          gcry_cv_gcc_inline_asm_shaext="n/a"
+        else
+          gcry_cv_gcc_inline_asm_shaext=no
+          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[void a(void) {
+              __asm__("sha1rnds4 \$0, %%xmm1, %%xmm3\n\t":::"cc");
+              __asm__("sha1nexte %%xmm1, %%xmm3\n\t":::"cc");
+              __asm__("sha1msg1 %%xmm1, %%xmm3\n\t":::"cc");
+              __asm__("sha1msg2 %%xmm1, %%xmm3\n\t":::"cc");
+              __asm__("sha256rnds2 %%xmm0, %%xmm1, %%xmm3\n\t":::"cc");
+              __asm__("sha256msg1 %%xmm1, %%xmm3\n\t":::"cc");
+              __asm__("sha256msg2 %%xmm1, %%xmm3\n\t":::"cc");
+            }]])],
+          [gcry_cv_gcc_inline_asm_shaext=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_shaext" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_SHAEXT,1,
+     [Defined if inline assembler supports SHA Extensions instructions])
+fi
+
+
 #
 # Check whether GCC inline assembler supports SSE4.1 instructions.
 #
@@ -1921,6 +1958,11 @@ if test x"$aesnisupport" = xyes ; then
     aesnisupport="no (unsupported by compiler)"
   fi
 fi
+if test x"$shaextsupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_shaext" != "yes" ; then
+    shaextsupport="no (unsupported by compiler)"
+  fi
+fi
 if test x"$pclmulsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
     pclmulsupport="no (unsupported by compiler)"
@@ -1960,6 +2002,10 @@ if test x"$aesnisupport" = xyes ; then
   AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
             [Enable support for Intel AES-NI instructions.])
 fi
+if test x"$shaextsupport" = xyes ; then
+  AC_DEFINE(ENABLE_SHAEXT_SUPPORT, 1,
+            [Enable support for Intel SHAEXT instructions.])
+fi
 if test x"$pclmulsupport" = xyes ; then
   AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
             [Enable support for Intel PCLMUL instructions.])
@@ -2449,6 +2495,13 @@ case "${host}" in
   ;;
 esac
 
+case "$mpi_cpu_arch" in
+  x86)
+    # Build with the SHAEXT implementation
+    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-intel-shaext.lo"
+  ;;
+esac
+
 LIST_MEMBER(sm3, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
@@ -2634,6 +2687,7 @@ GCRY_MSG_SHOW([Try using jitter entropy: ],[$jentsupport])
 GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
 GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
 GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
+GCRY_MSG_SHOW([Try using Intel SHAEXT:   ],[$shaextsupport])
 GCRY_MSG_SHOW([Try using Intel PCLMUL:   ],[$pclmulsupport])
 GCRY_MSG_SHOW([Try using Intel SSE4.1:   ],[$sse41support])
 GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
diff --git a/src/g10lib.h b/src/g10lib.h
index 961b515..d41fa0c 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -224,14 +224,14 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 #define HWF_INTEL_AVX           (1 << 12)
 #define HWF_INTEL_AVX2          (1 << 13)
 #define HWF_INTEL_FAST_VPGATHER (1 << 14)
-
-#define HWF_ARM_NEON            (1 << 15)
-#define HWF_ARM_AES             (1 << 16)
-#define HWF_ARM_SHA1            (1 << 17)
-#define HWF_ARM_SHA2            (1 << 18)
-#define HWF_ARM_PMULL           (1 << 19)
-
-#define HWF_INTEL_RDTSC         (1 << 20)
+#define HWF_INTEL_RDTSC         (1 << 15)
+#define HWF_INTEL_SHAEXT        (1 << 16)
+
+#define HWF_ARM_NEON            (1 << 17)
+#define HWF_ARM_AES             (1 << 18)
+#define HWF_ARM_SHA1            (1 << 19)
+#define HWF_ARM_SHA2            (1 << 20)
+#define HWF_ARM_PMULL           (1 << 21)
 
 
 
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 0d3a1f4..b644eda 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -357,6 +357,10 @@ detect_x86_gnuc (void)
       if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
         result |= HWF_INTEL_FAST_VPGATHER;
 #endif /*ENABLE_AVX_SUPPORT*/
+
+      /* Test bit 29 for SHA Extensions. */
+      if (features & (1 << 29))
+          result |= HWF_INTEL_SHAEXT;
     }
 
   return result;
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 1cad546..e081669 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -58,6 +58,7 @@ static struct
     { HWF_INTEL_AVX2,          "intel-avx2" },
     { HWF_INTEL_FAST_VPGATHER, "intel-fast-vpgather" },
     { HWF_INTEL_RDTSC,         "intel-rdtsc" },
+    { HWF_INTEL_SHAEXT,        "intel-shaext" },
     { HWF_ARM_NEON,            "arm-neon" },
     { HWF_ARM_AES,             "arm-aes" },
     { HWF_ARM_SHA1,            "arm-sha1" },

commit da58a62ac1b7a8d97b0895dcb41d15af531e45e5
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu Feb 8 19:45:10 2018 +0200

    AVX implementation of BLAKE2s
    
    * cipher/Makefile.am: Add 'blake2s-amd64-avx.S'.
    * cipher/blake2.c (USE_AVX, _gry_blake2s_transform_amd64_avx): New.
    (BLAKE2S_CONTEXT) [USE_AVX]: Add 'use_avx'.
    (blake2s_transform): Rename to ...
    (blake2s_transform_generic): ... this.
    (blake2s_transform): New.
    (blake2s_final): Pass 'ctx' pointer to transform function instead of
    'S'.
    (blake2s_init_ctx): Check HW features and enable AVX implementation
    if supported.
    * cipher/blake2s-amd64-avx.S: New.
    * configure.ac: Add 'blake2s-amd64-avx.lo'.
    --
    
    Benchmark on Intel Core i7-4790K (4.0 Ghz, no turbo):
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     BLAKE2S_256    |      1.77 ns/B     538.2 MiB/s      7.09 c/B
    
    After (~1.3x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     BLAKE2S_256    |      1.34 ns/B     711.4 MiB/s      5.36 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b0ee158..625a0ef 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -107,7 +107,7 @@ rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
   camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
 blake2.c \
-  blake2b-amd64-avx2.S
+  blake2b-amd64-avx2.S blake2s-amd64-avx.S
 
 gost28147.lo: gost-sb.h
 gost-sb.h: gost-s-box
diff --git a/cipher/blake2.c b/cipher/blake2.c
index f830c79..0f7494f 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -30,6 +30,14 @@
 #include "cipher.h"
 #include "hash-common.h"
 
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
 /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
 #undef USE_AVX2
 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
@@ -121,6 +129,9 @@ typedef struct BLAKE2S_CONTEXT_S
   byte buf[BLAKE2S_BLOCKBYTES];
   size_t buflen;
   size_t outlen;
+#ifdef USE_AVX
+  unsigned int use_avx:1;
+#endif
 } BLAKE2S_CONTEXT;
 
 typedef unsigned int (*blake2_transform_t)(void *S, const void *inblk,
@@ -479,8 +490,9 @@ static inline void blake2s_increment_counter(BLAKE2S_STATE *S, const int inc)
   S->t[1] += (S->t[0] < (u32)inc) - (inc < 0);
 }
 
-static unsigned int blake2s_transform(void *vS, const void *inblks,
-				      size_t nblks)
+static unsigned int blake2s_transform_generic(BLAKE2S_STATE *S,
+                                              const void *inblks,
+                                              size_t nblks)
 {
   static const byte blake2s_sigma[10][16] =
   {
@@ -495,7 +507,6 @@ static unsigned int blake2s_transform(void *vS, const void *inblks,
     {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
     { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 },
   };
-  BLAKE2S_STATE *S = vS;
   unsigned int burn = 0;
   const byte* in = inblks;
   u32 m[16];
@@ -594,6 +605,33 @@ static unsigned int blake2s_transform(void *vS, const void *inblks,
   return burn;
 }
 
+#ifdef USE_AVX
+unsigned int _gcry_blake2s_transform_amd64_avx(BLAKE2S_STATE *S,
+                                               const void *inblks,
+                                               size_t nblks) ASM_FUNC_ABI;
+#endif
+
+static unsigned int blake2s_transform(void *ctx, const void *inblks,
+                                      size_t nblks)
+{
+  BLAKE2S_CONTEXT *c = ctx;
+  unsigned int nburn;
+
+  if (0)
+    {}
+#ifdef USE_AVX
+  if (c->use_avx)
+    nburn = _gcry_blake2s_transform_amd64_avx(&c->state, inblks, nblks);
+#endif
+  else
+    nburn = blake2s_transform_generic(&c->state, inblks, nblks);
+
+  if (nburn)
+    nburn += ASM_EXTRA_STACK;
+
+  return nburn;
+}
+
 static void blake2s_final(void *ctx)
 {
   BLAKE2S_CONTEXT *c = ctx;
@@ -609,7 +647,7 @@ static void blake2s_final(void *ctx)
     memset (c->buf + c->buflen, 0, BLAKE2S_BLOCKBYTES - c->buflen); /* Padding */
   blake2s_set_lastblock (S);
   blake2s_increment_counter (S, (int)c->buflen - BLAKE2S_BLOCKBYTES);
-  burn = blake2s_transform (S, c->buf, 1);
+  burn = blake2s_transform (ctx, c->buf, 1);
 
   /* Output full hash to buffer */
   for (i = 0; i < 8; ++i)
@@ -685,11 +723,17 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
 					unsigned int dbits)
 {
   BLAKE2S_CONTEXT *c = ctx;
+  unsigned int features = _gcry_get_hw_features ();
 
+  (void)features;
   (void)flags;
 
   memset (c, 0, sizeof (*c));
 
+#ifdef USE_AVX
+  c->use_avx = !!(features & HWF_INTEL_AVX);
+#endif
+
   c->outlen = dbits / 8;
   c->buflen = 0;
   return blake2s_init(c, key, keylen);
diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S
new file mode 100644
index 0000000..f7312db
--- /dev/null
+++ b/cipher/blake2s-amd64-avx.S
@@ -0,0 +1,276 @@
+/* blake2s-amd64-avx.S  -  AVX implementation of BLAKE2s
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves at dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 4)
+#define STATE_F (STATE_T + 2 * 4)
+
+/* vector registers */
+#define ROW1  %xmm0
+#define ROW2  %xmm1
+#define ROW3  %xmm2
+#define ROW4  %xmm3
+#define TMP1  %xmm4
+#define TMP1x %xmm4
+#define R16   %xmm5
+#define R8    %xmm6
+
+#define MA1   %xmm8
+#define MA2   %xmm9
+#define MA3   %xmm10
+#define MA4   %xmm11
+
+#define MB1   %xmm12
+#define MB2   %xmm13
+#define MB3   %xmm14
+#define MB4   %xmm15
+
+/**********************************************************************
+  blake2s/AVX
+ **********************************************************************/
+
+#define GATHER_MSG(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4)
+
+#define ROR_16(in, out) vpshufb R16, in, out;
+
+#define ROR_8(in, out)  vpshufb R8, in, out;
+
+#define ROR_12(in, out) \
+        vpsrld $12, in, TMP1; \
+        vpslld $(32 - 12), in, out; \
+        vpxor TMP1, out, out;
+
+#define ROR_7(in, out) \
+        vpsrld $7, in, TMP1; \
+        vpslld $(32 - 7), in, out; \
+        vpxor TMP1, out, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddd m, r1, r1; \
+        vpaddd r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddd r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_12);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_8, ROR_7);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+blake2s_data:
+.align 16
+.Liv:
+        .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+        .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+.Lshuf_ror16:
+        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.Lshuf_ror8:
+        .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12
+
+.align 64
+.globl _gcry_blake2s_transform_amd64_avx
+ELF(.type _gcry_blake2s_transform_amd64_avx, at function;)
+
+_gcry_blake2s_transform_amd64_avx:
+        /* input:
+         *	%rdi: state
+         *	%rsi: blks
+         *	%rdx: num_blks
+         */
+
+        vzeroupper;
+
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        vmovdqa .Lshuf_ror16 (RIP), R16;
+        vmovdqa .Lshuf_ror8 (RIP), R8;
+
+        vmovdqa .Liv+(0 * 4) (RIP), ROW3;
+        vmovdqa .Liv+(4 * 4) (RIP), ROW4;
+
+        vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 64(RINBLKS), RINBLKS;
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1;
+        vpxor ROW4, ROW2, ROW2;
+
+        vmovdqa .Liv+(0 * 4) (RIP), ROW3;
+        vmovdqa .Liv+(4 * 4) (RIP), ROW4;
+
+        vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.Loop_end:
+        ROUND(8, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1;
+        vpxor ROW4, ROW2, ROW2;
+        vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        xor %eax, %eax;
+        vzeroall;
+        ret;
+ELF(.size _gcry_blake2s_transform_amd64_avx,
+    .-_gcry_blake2s_transform_amd64_avx;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/configure.ac b/configure.ac
index 300c520..305b19f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2421,6 +2421,7 @@ if test "$found" = "1" ; then
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2b-amd64-avx2.lo"
+         GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2s-amd64-avx.lo"
       ;;
    esac
 fi

commit af7fc732f9a7af7a70276f1e8364d2132db314f1
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Jan 14 16:48:17 2018 +0200

    AVX2 implementation of BLAKE2b
    
    * cipher/Makefile.am: Add 'blake2b-amd64-avx2.S'.
    * cipher/blake2.c (USE_AVX2, ASM_FUNC_ABI, ASM_EXTRA_STACK)
    (_gry_blake2b_transform_amd64_avx2): New.
    (BLAKE2B_CONTEXT) [USE_AVX2]: Add 'use_avx2'.
    (blake2b_transform): Rename to ...
    (blake2b_transform_generic): ... this.
    (blake2b_transform): New.
    (blake2b_final): Pass 'ctx' pointer to transform function instead of
    'S'.
    (blake2b_init_ctx): Check HW features and enable AVX2 implementation
    if supported.
    * cipher/blake2b-amd64-avx2.S: New.
    * configure.ac: Add 'blake2b-amd64-avx2.lo'.
    --
    
    Benchmark on Intel Core i7-4790K (4.0 Ghz, no turbo):
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     BLAKE2B_512    |      1.07 ns/B     887.8 MiB/s      4.30 c/B
    
    After (~1.4x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     BLAKE2B_512    |     0.771 ns/B    1236.8 MiB/s      3.08 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 6e6c5ac..b0ee158 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -106,7 +106,8 @@ twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
   camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
-blake2.c
+blake2.c \
+  blake2b-amd64-avx2.S
 
 gost28147.lo: gost-sb.h
 gost-sb.h: gost-s-box
diff --git a/cipher/blake2.c b/cipher/blake2.c
index 0e4cf9b..f830c79 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -30,6 +30,26 @@
 #include "cipher.h"
 #include "hash-common.h"
 
+/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
 #define BLAKE2B_BLOCKBYTES 128
 #define BLAKE2B_OUTBYTES 64
 #define BLAKE2B_KEYBYTES 64
@@ -67,6 +87,9 @@ typedef struct BLAKE2B_CONTEXT_S
   byte buf[BLAKE2B_BLOCKBYTES];
   size_t buflen;
   size_t outlen;
+#ifdef USE_AVX2
+  unsigned int use_avx2:1;
+#endif
 } BLAKE2B_CONTEXT;
 
 typedef struct
@@ -188,8 +211,9 @@ static inline u64 rotr64(u64 x, u64 n)
   return ((x >> (n & 63)) | (x << ((64 - n) & 63)));
 }
 
-static unsigned int blake2b_transform(void *vS, const void *inblks,
-				      size_t nblks)
+static unsigned int blake2b_transform_generic(BLAKE2B_STATE *S,
+                                              const void *inblks,
+                                              size_t nblks)
 {
   static const byte blake2b_sigma[12][16] =
   {
@@ -206,7 +230,6 @@ static unsigned int blake2b_transform(void *vS, const void *inblks,
     {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
     { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
   };
-  BLAKE2B_STATE *S = vS;
   const byte* in = inblks;
   u64 m[16];
   u64 v[16];
@@ -306,6 +329,33 @@ static unsigned int blake2b_transform(void *vS, const void *inblks,
   return sizeof(void *) * 4 + sizeof(u64) * 16 * 2;
 }
 
+#ifdef USE_AVX2
+unsigned int _gcry_blake2b_transform_amd64_avx2(BLAKE2B_STATE *S,
+                                                const void *inblks,
+                                                size_t nblks) ASM_FUNC_ABI;
+#endif
+
+static unsigned int blake2b_transform(void *ctx, const void *inblks,
+                                      size_t nblks)
+{
+  BLAKE2B_CONTEXT *c = ctx;
+  unsigned int nburn;
+
+  if (0)
+    {}
+#ifdef USE_AVX2
+  if (c->use_avx2)
+    nburn = _gcry_blake2b_transform_amd64_avx2(&c->state, inblks, nblks);
+#endif
+  else
+    nburn = blake2b_transform_generic(&c->state, inblks, nblks);
+
+  if (nburn)
+    nburn += ASM_EXTRA_STACK;
+
+  return nburn;
+}
+
 static void blake2b_final(void *ctx)
 {
   BLAKE2B_CONTEXT *c = ctx;
@@ -321,7 +371,7 @@ static void blake2b_final(void *ctx)
     memset (c->buf + c->buflen, 0, BLAKE2B_BLOCKBYTES - c->buflen); /* Padding */
   blake2b_set_lastblock (S);
   blake2b_increment_counter (S, (int)c->buflen - BLAKE2B_BLOCKBYTES);
-  burn = blake2b_transform (S, c->buf, 1);
+  burn = blake2b_transform (ctx, c->buf, 1);
 
   /* Output full hash to buffer */
   for (i = 0; i < 8; ++i)
@@ -397,11 +447,17 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
 					unsigned int dbits)
 {
   BLAKE2B_CONTEXT *c = ctx;
+  unsigned int features = _gcry_get_hw_features ();
 
+  (void)features;
   (void)flags;
 
   memset (c, 0, sizeof (*c));
 
+#ifdef USE_AVX2
+  c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+#endif
+
   c->outlen = dbits / 8;
   c->buflen = 0;
   return blake2b_init(c, key, keylen);
diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S
new file mode 100644
index 0000000..6bcc565
--- /dev/null
+++ b/cipher/blake2b-amd64-avx2.S
@@ -0,0 +1,298 @@
+/* blake2b-amd64-avx2.S  -  AVX2 implementation of BLAKE2b
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves at dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 8)
+#define STATE_F (STATE_T + 2 * 8)
+
+/* vector registers */
+#define ROW1  %ymm0
+#define ROW2  %ymm1
+#define ROW3  %ymm2
+#define ROW4  %ymm3
+#define TMP1  %ymm4
+#define TMP1x %xmm4
+#define R16   %ymm5
+#define R24   %ymm6
+
+#define MA1   %ymm8
+#define MA2   %ymm9
+#define MA3   %ymm10
+#define MA4   %ymm11
+#define MA1x  %xmm8
+#define MA2x  %xmm9
+#define MA3x  %xmm10
+#define MA4x  %xmm11
+
+#define MB1   %ymm12
+#define MB2   %ymm13
+#define MB3   %ymm14
+#define MB4   %ymm15
+#define MB1x  %xmm12
+#define MB2x  %xmm13
+#define MB3x  %xmm14
+#define MB4x  %xmm15
+
+/**********************************************************************
+  blake2b/AVX2
+ **********************************************************************/
+
+#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+        vmovq (s4)*8(RINBLKS), TMP1x; \
+        vpinsrq $1, (s2)*8(RINBLKS), m1x, m1x; \
+        vpinsrq $1, (s6)*8(RINBLKS), TMP1x, TMP1x; \
+        vinserti128 $1, TMP1x, m1, m1; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+          vmovq (s5)*8(RINBLKS), TMP1x; \
+          vpinsrq $1, (s3)*8(RINBLKS), m2x, m2x; \
+          vpinsrq $1, (s7)*8(RINBLKS), TMP1x, TMP1x; \
+          vinserti128 $1, TMP1x, m2, m2; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+            vmovq (s12)*8(RINBLKS), TMP1x; \
+            vpinsrq $1, (s10)*8(RINBLKS), m3x, m3x; \
+            vpinsrq $1, (s14)*8(RINBLKS), TMP1x, TMP1x; \
+            vinserti128 $1, TMP1x, m3, m3; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+              vmovq (s13)*8(RINBLKS), TMP1x; \
+              vpinsrq $1, (s11)*8(RINBLKS), m4x, m4x; \
+              vpinsrq $1, (s15)*8(RINBLKS), TMP1x, TMP1x; \
+              vinserti128 $1, TMP1x, m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) \
+        LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x)
+
+#define ROR_32(in, out) vpshufd $0xb1, in, out;
+
+#define ROR_24(in, out) vpshufb R24, in, out;
+
+#define ROR_16(in, out) vpshufb R16, in, out;
+
+#define ROR_63(in, out) \
+        vpsrlq $63, in, TMP1; \
+        vpaddq in, in, out; \
+        vpxor  TMP1, out, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddq m, r1, r1; \
+        vpaddq r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddq r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_32, ROR_24);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_63);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+blake2b_data:
+.align 32
+.Liv:
+        .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+        .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+        .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+        .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+.Lshuf_ror16:
+        .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9
+.Lshuf_ror24:
+        .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10
+
+.align 64
+.globl _gcry_blake2b_transform_amd64_avx2
+ELF(.type _gcry_blake2b_transform_amd64_avx2, at function;)
+
+_gcry_blake2b_transform_amd64_avx2:
+        /* input:
+         *	%rdi: state
+         *	%rsi: blks
+         *	%rdx: num_blks
+         */
+
+        vzeroupper;
+
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        vbroadcasti128 .Lshuf_ror16 (RIP), R16;
+        vbroadcasti128 .Lshuf_ror24 (RIP), R24;
+
+        vmovdqa .Liv+(0 * 8) (RIP), ROW3;
+        vmovdqa .Liv+(4 * 8) (RIP), ROW4;
+
+        vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(10, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(11, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 128(RINBLKS), RINBLKS;
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        ROUND(10, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1;
+        vpxor ROW4, ROW2, ROW2;
+
+        vmovdqa .Liv+(0 * 8) (RIP), ROW3;
+        vmovdqa .Liv+(4 * 8) (RIP), ROW4;
+
+        vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.Loop_end:
+        ROUND(10, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1;
+        vpxor ROW4, ROW2, ROW2;
+        vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        xor %eax, %eax;
+        vzeroall;
+        ret;
+ELF(.size _gcry_blake2b_transform_amd64_avx2,
+    .-_gcry_blake2b_transform_amd64_avx2;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/configure.ac b/configure.ac
index aaf3c82..300c520 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2416,6 +2416,13 @@ LIST_MEMBER(blake2, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2.lo"
    AC_DEFINE(USE_BLAKE2, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_DIGESTS="$GCRYPT_DIGESTS blake2b-amd64-avx2.lo"
+      ;;
+   esac
 fi
 
 # SHA-1 needs to be included always for example because it is used by

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am           |   6 +-
 cipher/blake2.c              | 116 +++++++++++++-
 cipher/blake2b-amd64-avx2.S  | 298 ++++++++++++++++++++++++++++++++++++
 cipher/blake2s-amd64-avx.S   | 276 +++++++++++++++++++++++++++++++++
 cipher/sha1-intel-shaext.c   | 281 ++++++++++++++++++++++++++++++++++
 cipher/sha1.c                |  64 ++++++--
 cipher/sha1.h                |   1 +
 cipher/sha256-intel-shaext.c | 352 +++++++++++++++++++++++++++++++++++++++++++
 cipher/sha256.c              |  66 +++++++-
 configure.ac                 |  69 +++++++++
 src/g10lib.h                 |  16 +-
 src/hwf-x86.c                |   4 +
 src/hwfeatures.c             |   1 +
 13 files changed, 1514 insertions(+), 36 deletions(-)
 create mode 100644 cipher/blake2b-amd64-avx2.S
 create mode 100644 cipher/blake2s-amd64-avx.S
 create mode 100644 cipher/sha1-intel-shaext.c
 create mode 100644 cipher/sha256-intel-shaext.c


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org




More information about the Gnupg-commits mailing list