[PATCH 04/13] Add Intel PCLMUL acceleration for GCM

Jussi Kivilinna jussi.kivilinna at iki.fi
Wed Nov 20 18:00:17 CET 2013


* cipher/cipher-gcm.c (fillM): Rename...
(do_fillM): ...to this.
(ghash): Remove.
(fillM): New macro.
(GHASH): Use 'do_ghash' instead of 'ghash'.
[GCM_USE_INTEL_PCLMUL] (do_ghash_pclmul): New.
(ghash): New.
(setupM): New.
(_gcry_cipher_gcm_encrypt, _gcry_cipher_gcm_decrypt)
(_gcry_cipher_gcm_authenticate, _gcry_cipher_gcm_setiv)
(_gcry_cipher_gcm_tag): Use 'ghash' instead of 'GHASH' and
'c->u_mode.gcm.u_tag.tag' instead of 'c->u_tag.tag'.
* cipher/cipher-internal.h (GCM_USE_INTEL_PCLMUL): New.
(gcry_cipher_handle): Move 'u_tag' and 'gcm_table' under
'u_mode.gcm'.
* configure.ac (pclmulsupport, gcry_cv_gcc_inline_asm_pclmul): New.
* src/g10lib.h (HWF_INTEL_PCLMUL): New.
* src/global.c: Add "intel-pclmul".
* src/hwf-x86.c (detect_x86_gnuc): Add check for Intel PCLMUL.
--

Speed up GCM for Intel CPUs.

Intel Haswell (x86-64):
Old:
AES     GCM enc |      5.17 ns/B     184.4 MiB/s     16.55 c/B
        GCM dec |      4.38 ns/B     218.0 MiB/s     14.00 c/B
       GCM auth |      3.17 ns/B     300.4 MiB/s     10.16 c/B
New:
AES     GCM enc |      3.01 ns/B     317.2 MiB/s      9.62 c/B
        GCM dec |      1.96 ns/B     486.9 MiB/s      6.27 c/B
       GCM auth |     0.848 ns/B    1124.8 MiB/s      2.71 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/cipher-gcm.c      |  205 +++++++++++++++++++++++++++++++++++++++-------
 cipher/cipher-internal.h |   55 +++++++++---
 configure.ac             |   35 ++++++++
 src/g10lib.h             |    1 +
 src/global.c             |    1 +
 src/hwf-x86.c            |    5 +
 6 files changed, 256 insertions(+), 46 deletions(-)
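
For readers without the white paper at hand, the scheme implemented by
the new do_ghash_pclmul below can be sketched with intrinsics as
follows (clmul_256 is a hypothetical name, not part of this patch, and
the reduction modulo x^128 + x^7 + x^2 + x + 1 is omitted).
PCLMULQDQ's immediate byte selects the 64-bit halves to multiply: 0x00
takes low*low, 0x11 high*high; Karatsuba derives the middle term from
one more multiply on the XORed halves:

  #include <wmmintrin.h>  /* PCLMUL intrinsics; build with -mpclmul */

  /* Carry-less 128x128 -> 256 bit multiply via Karatsuba, mirroring
     the pshufd $78 / pclmulqdq $0 / pclmulqdq $17 steps in the
     assembler below.  Sketch only; the real code also performs the
     two-phase reduction and the byte-order fixups.  */
  static void
  clmul_256 (__m128i a, __m128i b, __m128i *lo, __m128i *hi)
  {
    __m128i a0b0 = _mm_clmulepi64_si128 (a, b, 0x00); /* a0 * b0 */
    __m128i a1b1 = _mm_clmulepi64_si128 (a, b, 0x11); /* a1 * b1 */
    /* _mm_shuffle_epi32 with 0x4e (= $78) swaps the 64-bit halves. */
    __m128i as = _mm_xor_si128 (a, _mm_shuffle_epi32 (a, 0x4e));
    __m128i bs = _mm_xor_si128 (b, _mm_shuffle_epi32 (b, 0x4e));
    __m128i mid = _mm_clmulepi64_si128 (as, bs, 0x00); /* (a0^a1)*(b0^b1) */

    /* Karatsuba: a0*b1 ^ a1*b0 = mid ^ a0*b0 ^ a1*b1.  */
    mid = _mm_xor_si128 (mid, _mm_xor_si128 (a0b0, a1b1));
    *lo = _mm_xor_si128 (a0b0, _mm_slli_si128 (mid, 8));
    *hi = _mm_xor_si128 (a1b1, _mm_srli_si128 (mid, 8));
  }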

diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index d7fc0d8..b93f0fa 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -1,5 +1,6 @@
 /* cipher-gcm.c  - Generic Galois Counter Mode implementation
  * Copyright (C) 2013 Dmitry Eremin-Solenikov
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -81,7 +82,7 @@ bshift (u64 * b0, u64 * b1)
 }
 
 static void
-fillM (unsigned char *h, u64 * M)
+do_fillM (unsigned char *h, u64 *M)
 {
   int i, j;
 
@@ -179,7 +180,7 @@ bshift (u32 * M, int i)
 }
 
 static void
-fillM (unsigned char *h, u32 * M)
+do_fillM (unsigned char *h, u32 *M)
 {
   int i, j;
 
@@ -269,15 +270,10 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 * gcmM)
   buf_put_be32 (result + 8, tmp[2]);
   buf_put_be32 (result + 12, tmp[3]);
 }
-#endif
-
-static void
-ghash (unsigned char *result, const unsigned char *buf, const void *gcmM)
-{
-  do_ghash (result, buf, gcmM);
-}
+#endif /* !HAVE_U64_TYPEDEF || SIZEOF_UNSIGNED_LONG != 8 */
 
-#define GHASH(c, result, buf) ghash (result, buf, c->gcm_table);
+#define fillM(c, h) do_fillM (h, c->u_mode.gcm.gcm_table)
+#define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table)
 
 #else
 
@@ -296,7 +292,7 @@ bshift (unsigned long *b)
 }
 
 static void
-ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
+do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
 {
   unsigned long V[4];
   int i, j;
@@ -339,10 +335,161 @@ ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
 #endif
 }
 
-#define fillM(h, M) do { } while (0)
+#define fillM(c, h) do { } while (0)
+#define GHASH(c, result, buf) do_ghash (c->u_iv.iv, result, buf)
+
+#endif /* !GCM_USE_TABLES */
+
+
+#ifdef GCM_USE_INTEL_PCLMUL
+/*
+ Intel PCLMUL ghash based on the white paper:
+  "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
+   GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
+ */
+static void
+do_ghash_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf)
+{
+  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+  asm volatile ("movdqu (%[result]), %%xmm1\n\t"
+                "movdqu %[buf], %%xmm2\n\t"
+                "movdqa %[hsub], %%xmm0\n\t"
+                "pxor %%xmm2, %%xmm1\n\t" /* big endian */
+
+                /* be => le */
+                "pshufb %[be_mask], %%xmm1\n\t"
+
+                /* gfmul, xmm0 has operand a and xmm1 has operand b. */
+                "pshufd $78, %%xmm0, %%xmm2\n\t"
+                "pshufd $78, %%xmm1, %%xmm4\n\t"
+                "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
+                "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
+
+                "movdqa %%xmm0, %%xmm3\n\t"
+                "pclmulqdq $0, %%xmm1, %%xmm3\n\t"  /* xmm3 holds a0*b0 */
+                "movdqa %%xmm0, %%xmm6\n\t"
+                "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "pclmulqdq $0, %%xmm2, %%xmm4\n\t"  /* xmm4 holds (a0+a1)*(b0+b1) */
+
+                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+                "movdqa %%xmm4, %%xmm5\n\t"
+                "psrldq $8, %%xmm4\n\t"
+                "pslldq $8, %%xmm5\n\t"
+                "pxor %%xmm5, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+                                             carry-less multiplication of xmm0
+                                             by xmm1 */
+
+                /* shift the result by one bit position to the left to
+                   cope with the fact that bits are reversed */
+                "movdqa %%xmm3, %%xmm7\n\t"
+                "movdqa %%xmm6, %%xmm0\n\t"
+                "pslld $1, %%xmm3\n\t"
+                "pslld $1, %%xmm6\n\t"
+                "psrld $31, %%xmm7\n\t"
+                "psrld $31, %%xmm0\n\t"
+                "movdqa %%xmm7, %%xmm1\n\t"
+                "pslldq $4, %%xmm0\n\t"
+                "pslldq $4, %%xmm7\n\t"
+                "psrldq $12, %%xmm1\n\t"
+                "por %%xmm7, %%xmm3\n\t"
+                "por %%xmm0, %%xmm6\n\t"
+                "por %%xmm1, %%xmm6\n\t"
+
+                /* first phase of the reduction */
+                "movdqa %%xmm3, %%xmm7\n\t"
+                "movdqa %%xmm3, %%xmm0\n\t"
+                "pslld $31, %%xmm7\n\t"  /* packed right shifting << 31 */
+                "movdqa %%xmm3, %%xmm1\n\t"
+                "pslld $30, %%xmm0\n\t"  /* packed right shifting shift << 30 */
+                "pslld $25, %%xmm1\n\t"  /* packed right shifting shift << 25 */
+                "pxor %%xmm0, %%xmm7\n\t" /* xor the shifted versions */
+                "pxor %%xmm1, %%xmm7\n\t"
+                "movdqa %%xmm7, %%xmm0\n\t"
+                "pslldq $12, %%xmm7\n\t"
+                "psrldq $4, %%xmm0\n\t"
+                "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction
+                                             complete */
+
+                /* second phase of the reduction */
+                "movdqa %%xmm3, %%xmm2\n\t"
+                "movdqa %%xmm3, %%xmm4\n\t"
+                "psrld $1, %%xmm2\n\t"    /* packed left shifting >> 1 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "psrld $2, %%xmm4\n\t"    /* packed left shifting >> 2 */
+                "psrld $7, %%xmm5\n\t"    /* packed left shifting >> 7 */
+                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
+                "pxor %%xmm5, %%xmm2\n\t"
+                "pxor %%xmm0, %%xmm2\n\t"
+                "pxor %%xmm2, %%xmm3\n\t"
+                "pxor %%xmm3, %%xmm6\n\t" /* the result is in xmm6 */
+
+                /* le => be */
+                "pshufb %[be_mask], %%xmm6\n\t"
+
+                "movdqu %%xmm6, (%[result])\n\t" /* store the result */
+                :
+                : [result] "r" (result), [buf] "m" (*buf),
+                  [hsub] "m" (*c->u_iv.iv), [be_mask] "m" (*be_mask)
+                : "memory" );
+}
+
+#endif /*GCM_USE_INTEL_PCLMUL*/
+
+
+static void
+ghash (gcry_cipher_hd_t c, unsigned char *result, const unsigned char *buf)
+{
+  if (0)
+    ;
+#ifdef GCM_USE_INTEL_PCLMUL
+  else if (c->u_mode.gcm.use_intel_pclmul)
+    {
+      /* TODO: Loop structure, use bit-reflection and add faster bulk
+               processing (four blocks in parallel). */
+      do_ghash_pclmul (c, result, buf);
+
+      /* Clear used registers. */
+      asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+                    "pxor %%xmm1, %%xmm1\n\t"
+                    "pxor %%xmm2, %%xmm2\n\t"
+                    "pxor %%xmm3, %%xmm3\n\t"
+                    "pxor %%xmm4, %%xmm4\n\t"
+                    "pxor %%xmm5, %%xmm5\n\t"
+                    "pxor %%xmm6, %%xmm6\n\t"
+                    "pxor %%xmm7, %%xmm7\n\t"
+                    ::: "cc" );
+    }
+#endif
+  else
+    GHASH (c, result, buf);
+}
+
+static void
+setupM (gcry_cipher_hd_t c, byte *h)
+{
+  if (0)
+    ;
+#ifdef GCM_USE_INTEL_PCLMUL
+  else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL)
+    {
+      u64 tmp[2];
+
+      c->u_mode.gcm.use_intel_pclmul = 1;
 
-#define GHASH(c, result, buf) ghash (c->u_iv.iv, result, buf);
+      /* Swap endianness of hsub. */
+      tmp[0] = buf_get_be64(c->u_iv.iv + 8);
+      tmp[1] = buf_get_be64(c->u_iv.iv + 0);
+      buf_cpy (c->u_iv.iv, tmp, 16);
+    }
 #endif
+  else
+    fillM (c, h);
+}
 
 
 gcry_err_code_t
@@ -389,12 +536,12 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
         {
           buf_xor_2dst (outbuf, tmp, inbuf, n);
           memset (tmp + n, 0, blocksize - n);
-          GHASH (c, c->u_tag.tag, tmp);
+          ghash (c, c->u_mode.gcm.u_tag.tag, tmp);
         }
       else
         {
           buf_xor (outbuf, tmp, inbuf, n);
-          GHASH (c, c->u_tag.tag, outbuf);
+          ghash (c, c->u_mode.gcm.u_tag.tag, outbuf);
         }
 
       inbuflen -= n;
@@ -442,11 +589,11 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
         {
           memcpy (tmp, inbuf, n);
           memset (tmp + n, 0, blocksize - n);
-          GHASH (c, c->u_tag.tag, tmp);
+          ghash (c, c->u_mode.gcm.u_tag.tag, tmp);
         }
       else
         {
-          GHASH (c, c->u_tag.tag, inbuf);
+          ghash (c, c->u_mode.gcm.u_tag.tag, inbuf);
         }
 
       i = blocksize - 1;
@@ -490,7 +637,7 @@ _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
 
   while (aadbuflen >= blocksize)
     {
-      GHASH (c, c->u_tag.tag, aadbuf);
+      ghash (c, c->u_mode.gcm.u_tag.tag, aadbuf);
 
       aadbuflen -= blocksize;
       aadbuf += blocksize;
@@ -501,7 +648,7 @@ _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
       memcpy (tmp, aadbuf, aadbuflen);
       memset (tmp + aadbuflen, 0, blocksize - aadbuflen);
 
-      GHASH (c, c->u_tag.tag, tmp);
+      ghash (c, c->u_mode.gcm.u_tag.tag, tmp);
     }
 
   return 0;
@@ -512,10 +659,10 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
                         const byte * iv, unsigned int ivlen)
 {
   memset (c->length, 0, 16);
-  memset (c->u_tag.tag, 0, 16);
-  c->spec->encrypt (&c->context.c, c->u_iv.iv, c->u_tag.tag);
+  memset (c->u_mode.gcm.u_tag.tag, 0, 16);
+  c->spec->encrypt (&c->context.c, c->u_iv.iv, c->u_mode.gcm.u_tag.tag);
 
-  fillM (c->u_iv.iv, c->gcm_table);
+  setupM (c, c->u_iv.iv);
 
   if (ivlen != 16 - 4)
     {
@@ -523,12 +670,12 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
       unsigned n;
       memset (c->u_ctr.ctr, 0, 16);
       for (n = ivlen; n >= 16; n -= 16, iv += 16)
-        GHASH (c, c->u_ctr.ctr, iv);
+        ghash (c, c->u_ctr.ctr, iv);
       if (n != 0)
         {
           memcpy (tmp, iv, n);
           memset (tmp + n, 0, 16 - n);
-          GHASH (c, c->u_ctr.ctr, tmp);
+          ghash (c, c->u_ctr.ctr, tmp);
         }
       memset (tmp, 0, 16);
       n = 16;
@@ -537,7 +684,7 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
       n--;
       for (; n > 0; n--, ivlen >>= 8)
         tmp[n - 1] = ivlen & 0xff;
-      GHASH (c, c->u_ctr.ctr, tmp);
+      ghash (c, c->u_ctr.ctr, tmp);
     }
   else
     {
@@ -560,19 +707,19 @@ _gcry_cipher_gcm_tag (gcry_cipher_hd_t c,
 
   if (!c->marks.tag)
     {
-      GHASH (c, c->u_tag.tag, c->length);
-      buf_xor (c->u_tag.tag, c->lastiv, c->u_tag.tag, 16);
+      ghash (c, c->u_mode.gcm.u_tag.tag, c->length);
+      buf_xor (c->u_mode.gcm.u_tag.tag, c->lastiv, c->u_mode.gcm.u_tag.tag, 16);
       c->marks.tag = 1;
     }
 
   if (!check)
     {
-      memcpy (outbuf, c->u_tag.tag, outbuflen);
+      memcpy (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen);
       return GPG_ERR_NO_ERROR;
     }
   else
     {
-      return buf_eq_const(outbuf, c->u_tag.tag, outbuflen) ?
+      return buf_eq_const(outbuf, c->u_mode.gcm.u_tag.tag, outbuflen) ?
                GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
     }
 
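
As a cross-check for the assembler, the multiply that all of these
paths compute is Algorithm 1 of NIST SP 800-38D.  A minimal,
deliberately slow reference (gf128_mul is a hypothetical helper, not
part of this patch):

  #include <string.h>

  /* Multiply X by Y in GF(2^128) as defined for GHASH, MSB-first
     within each byte per GCM's bit ordering.  Z accumulates the
     product while V holds Y times successive powers of x.  */
  static void
  gf128_mul (unsigned char r[16],
             const unsigned char x[16], const unsigned char y[16])
  {
    unsigned char z[16] = { 0 };
    unsigned char v[16];
    int i, j, k;

    memcpy (v, y, 16);

    for (i = 0; i < 16; i++)
      for (j = 7; j >= 0; j--)
        {
          int lsb = v[15] & 1;

          /* Z ^= V for each set bit of X. */
          if ((x[i] >> j) & 1)
            for (k = 0; k < 16; k++)
              z[k] ^= v[k];

          /* V >>= 1 across the whole block... */
          for (k = 15; k > 0; k--)
            v[k] = (v[k] >> 1) | (v[k - 1] << 7);
          v[0] >>= 1;

          /* ...conditionally reduced by R = 11100001 || 0^120. */
          if (lsb)
            v[0] ^= 0xe1;
        }

    memcpy (r, z, 16);
  }

Feeding it the hash subkey and an input block should reproduce what
the table-driven path computes per block.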
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index b4d0ff9..a6e6271 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -39,6 +39,18 @@
 #define GCM_USE_TABLES 1
 
 
+/* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL
+   code.  */
+#undef GCM_USE_INTEL_PCLMUL
+#if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES)
+# if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+#  if __GNUC__ >= 4
+#   define GCM_USE_INTEL_PCLMUL 1
+#  endif
+# endif
+#endif /* GCM_USE_INTEL_PCLMUL */
+
+
 /* A VIA processor with the Padlock engine as well as the Intel AES_NI
    instructions require an alignment of most data on a 16 byte
    boundary.  Because we trick out the compiler while allocating the
@@ -118,26 +130,10 @@ struct gcry_cipher_handle
     unsigned char ctr[MAX_BLOCKSIZE];
   } u_ctr;
 
-  /* The interim tag for GCM mode.  */
-  union {
-    cipher_context_alignment_t iv_align;
-    unsigned char tag[MAX_BLOCKSIZE];
-  } u_tag;
-
   /* Space to save an IV or CTR for chaining operations.  */
   unsigned char lastiv[MAX_BLOCKSIZE];
   int unused;  /* Number of unused bytes in LASTIV. */
   unsigned char length[MAX_BLOCKSIZE]; /* bit counters for GCM */
-#ifdef GCM_USE_TABLES
- #if defined(HAVE_U64_TYPEDEF) && \
-     (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__))
-  #define GCM_TABLES_USE_U64 1
-  u64 gcm_table[2 * 16]; /* pre-calculated table for GCM */
- #else
-  #undef GCM_TABLES_USE_U64
-  u32 gcm_table[4 * 16]; /* pre-calculated table for GCM */
- #endif
-#endif
 
   union {
     /* Mode specific storage for CCM mode. */
@@ -156,6 +152,7 @@ struct gcry_cipher_handle
       unsigned int lengths:1; /* Set to 1 if CCM length parameters has been
                                  processed.  */
     } ccm;
+
     /* Mode specific storage for CMAC mode. */
     struct {
       unsigned int tag:1; /* Set to 1 if tag has been finalized.  */
@@ -163,8 +160,32 @@ struct gcry_cipher_handle
       /* Subkeys for tag creation, not cleared by gcry_cipher_reset. */
       unsigned char subkeys[2][MAX_BLOCKSIZE];
     } cmac;
-  } u_mode;
 
+    /* Mode specific storage for GCM mode. */
+    struct {
+      /* The interim tag for GCM mode.  */
+      union {
+        cipher_context_alignment_t iv_align;
+        unsigned char tag[MAX_BLOCKSIZE];
+      } u_tag;
+
+      /* Pre-calculated table for GCM. */
+#ifdef GCM_USE_TABLES
+ #if defined(HAVE_U64_TYPEDEF) && (SIZEOF_UNSIGNED_LONG == 8 \
+                                   || defined(__x86_64__))
+      #define GCM_TABLES_USE_U64 1
+      u64 gcm_table[2 * 16];
+ #else
+      #undef GCM_TABLES_USE_U64
+      u32 gcm_table[4 * 16];
+ #endif
+#endif
+
+#ifdef GCM_USE_INTEL_PCLMUL
+      unsigned int use_intel_pclmul:1;
+#endif
+    } gcm;
+  } u_mode;
 
   /* What follows are two contexts of the cipher in use.  The first
      one needs to be aligned well enough for the cipher operation
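
The point of folding the GCM state into u_mode is the usual one for
per-mode unions: a handle is only ever in one mode, so the members may
overlap and the union costs only as much as its largest member.  A toy
illustration (hypothetical struct, sizes are examples only):

  #include <stdio.h>

  /* Per-mode state in a union: non-GCM handles no longer pay for
     GCM's tag and 256/128-byte table.  */
  struct toy_handle
  {
    union
    {
      struct { unsigned char nonce[16]; } ccm;
      struct { unsigned char subkeys[2][16]; } cmac;
      struct { unsigned char tag[16];
               unsigned long long gcm_table[2 * 16]; } gcm;
    } u_mode;
  };

  int
  main (void)
  {
    /* Prints the size of the largest member (gcm here). */
    printf ("%u\n", (unsigned) sizeof (struct toy_handle));
    return 0;
  }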
diff --git a/configure.ac b/configure.ac
index c4f8776..6d40343 100644
--- a/configure.ac
+++ b/configure.ac
@@ -567,6 +567,14 @@ AC_ARG_ENABLE(aesni-support,
 	      aesnisupport=$enableval,aesnisupport=yes)
 AC_MSG_RESULT($aesnisupport)
 
+# Implementation of the --disable-pclmul-support switch.
+AC_MSG_CHECKING([whether PCLMUL support is requested])
+AC_ARG_ENABLE(pclmul-support,
+              AC_HELP_STRING([--disable-pclmul-support],
+                 [Disable support for the Intel PCLMUL instructions]),
+	      pclmulsupport=$enableval,pclmulsupport=yes)
+AC_MSG_RESULT($pclmulsupport)
+
 # Implementation of the --disable-drng-support switch.
 AC_MSG_CHECKING([whether DRNG support is requested])
 AC_ARG_ENABLE(drng-support,
@@ -991,6 +999,23 @@ fi
 
 
 #
+# Check whether GCC inline assembler supports PCLMUL instructions.
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports PCLMUL instructions],
+       [gcry_cv_gcc_inline_asm_pclmul],
+       [gcry_cv_gcc_inline_asm_pclmul=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[void a(void) {
+              __asm__("pclmulqdq \$0, %%xmm1, %%xmm3\n\t":::"cc");
+            }]])],
+          [gcry_cv_gcc_inline_asm_pclmul=yes])])
+if test "$gcry_cv_gcc_inline_asm_pclmul" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_PCLMUL,1,
+     [Defined if inline assembler supports PCLMUL instructions])
+fi
+
+
+#
 # Check whether GCC inline assembler supports AVX instructions
 #
 AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions],
@@ -1369,6 +1394,11 @@ if test x"$aesnisupport" = xyes ; then
     aesnisupport="no (unsupported by compiler)"
   fi
 fi
+if test x"$pclmulsupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_pclmul" != "yes" ; then
+    pclmulsupport="no (unsupported by compiler)"
+  fi
+fi
 if test x"$avxsupport" = xyes ; then
   if test "$gcry_cv_gcc_inline_asm_avx" != "yes" ; then
     avxsupport="no (unsupported by compiler)"
@@ -1389,6 +1419,10 @@ if test x"$aesnisupport" = xyes ; then
   AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
             [Enable support for Intel AES-NI instructions.])
 fi
+if test x"$pclmulsupport" = xyes ; then
+  AC_DEFINE(ENABLE_PCLMUL_SUPPORT, 1,
+            [Enable support for Intel PCLMUL instructions.])
+fi
 if test x"$avxsupport" = xyes ; then
   AC_DEFINE(ENABLE_AVX_SUPPORT,1,
             [Enable support for Intel AVX instructions.])
@@ -1826,6 +1860,7 @@ GCRY_MSG_SHOW([Random number generator:  ],[$random])
 GCRY_MSG_SHOW([Using linux capabilities: ],[$use_capabilities])
 GCRY_MSG_SHOW([Try using Padlock crypto: ],[$padlocksupport])
 GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
+GCRY_MSG_SHOW([Try using Intel PCLMUL:   ],[$pclmulsupport])
 GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
 GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
 GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
diff --git a/src/g10lib.h b/src/g10lib.h
index ae4502c..0612cbc 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -167,6 +167,7 @@ int _gcry_log_verbosity( int level );
 #define HWF_PADLOCK_SHA  4
 #define HWF_PADLOCK_MMUL 8
 
+#define HWF_INTEL_PCLMUL 128
 #define HWF_INTEL_AESNI  256
 #define HWF_INTEL_RDRAND 512
 #define HWF_INTEL_AVX    1024
diff --git a/src/global.c b/src/global.c
index 44667cf..841f188 100644
--- a/src/global.c
+++ b/src/global.c
@@ -66,6 +66,7 @@ static struct
     { HWF_PADLOCK_AES, "padlock-aes" },
     { HWF_PADLOCK_SHA, "padlock-sha" },
     { HWF_PADLOCK_MMUL,"padlock-mmul"},
+    { HWF_INTEL_PCLMUL,"intel-pclmul" },
     { HWF_INTEL_AESNI, "intel-aesni" },
     { HWF_INTEL_RDRAND,"intel-rdrand" },
     { HWF_INTEL_AVX,   "intel-avx" },
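
With the "intel-pclmul" name registered, the feature can also be
switched off at runtime by name, which is handy for A/B benchmarking
the two GHASH paths.  A sketch, assuming the GCRYCTL_DISABLE_HWF
control and that it runs before initialization is finished:

  #include <gcrypt.h>

  int
  main (void)
  {
    /* Mask the PCLMUL feature bit before init; GCM then falls back
       to the table-driven GHASH.  The string matches the hwflist
       entry added above.  */
    gcry_control (GCRYCTL_DISABLE_HWF, "intel-pclmul", NULL);

    if (!gcry_check_version (GCRYPT_VERSION))
      return 1;
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

    /* ... run GCM benchmarks here ... */
    return 0;
  }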
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 2ceb04c..784fe2a 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -201,6 +201,11 @@ detect_x86_gnuc (void)
   /* Get CPU info and Intel feature flags (ECX).  */
   get_cpuid(1, NULL, NULL, &features, NULL);
 
+#ifdef ENABLE_PCLMUL_SUPPORT
+  /* Test bit 1 for PCLMUL.  */
+  if (features & 0x00000002)
+     result |= HWF_INTEL_PCLMUL;
+#endif
 #ifdef ENABLE_AESNI_SUPPORT
   /* Test bit 25 for AES-NI.  */
   if (features & 0x02000000)
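
Bit 1 of CPUID leaf 1's ECX is the PCLMULQDQ flag per Intel's manuals,
matching the 0x00000002 mask above.  The same test as a standalone
sketch, using GCC's <cpuid.h> instead of hwf-x86.c's own get_cpuid
wrapper:

  #include <cpuid.h>
  #include <stdio.h>

  int
  main (void)
  {
    unsigned int eax, ebx, ecx, edx;

    /* Leaf 1 holds the feature flags; __get_cpuid returns 0 if the
       leaf is unsupported.  */
    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
      return 1;

    printf ("pclmul: %s\n", (ecx & (1u << 1))  ? "yes" : "no");
    printf ("aes-ni: %s\n", (ecx & (1u << 25)) ? "yes" : "no");
    return 0;
  }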
