[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-134-g4f46374

From: Jussi Kivilinna <cvs at cvs.gnupg.org>
Date: Fri Dec 12 23:50:34 CET 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  4f46374502eb988d701b904f83819e2cf7b1755c (commit)
       via  4a0795af021305f9240f23626a3796157db46bd7 (commit)
      from  cbf4c8cb6bbda15eea61885279f2a6f1d4bcedfd (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 4f46374502eb988d701b904f83819e2cf7b1755c
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Dec 6 15:09:13 2014 +0200

    rijndael: further optimizations for AES-NI accelerated CBC and CFB bulk modes
    
    * cipher/rijndael-aesni.c (do_aesni_enc, do_aesni_dec): Pass
    input/output through SSE register XMM0.
    (do_aesni_cfb): Remove.
    (_gcry_aes_aesni_encrypt, _gcry_aes_aesni_decrypt): Add loading/storing
    input/output to/from XMM0.
    (_gcry_aes_aesni_cfb_enc, _gcry_aes_aesni_cbc_enc)
    (_gcry_aes_aesni_cfb_dec): Update to use renewed 'do_aesni_enc' and
    move IV loading/storing outside loop.
    (_gcry_aes_aesni_cbc_dec): Update to use renewed 'do_aesni_dec'.
    --
    
    CBC encryption speed is improved by ~16% on Intel Haswell and CFB encryption speed by ~8%.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
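
To see the shape of the change without reading the inline assembly below: the reworked do_aesni_enc/do_aesni_dec now take the block in XMM0 and leave the result there, which lets the bulk CBC and CFB loops keep the chaining value in an XMM register across the whole loop and touch the IV buffer only once before and once after it.  A rough C-level sketch of that structure for CBC encryption, using hypothetical block128/block_cipher_fn stand-ins rather than libgcrypt's own types:

    /* Sketch only, not libgcrypt code.  The hypothetical block_cipher_fn
     * stands in for the AES-NI round sequence that operates on XMM0. */
    #include <stddef.h>
    #include <string.h>

    typedef struct { unsigned char b[16]; } block128;
    typedef block128 (*block_cipher_fn) (const void *key, block128 in);

    static void
    cbc_enc_sketch (block_cipher_fn enc, const void *key,
                    unsigned char *out, const unsigned char *in,
                    unsigned char *iv, size_t nblocks)
    {
      block128 chain;
      int i;

      memcpy (chain.b, iv, 16);                /* load IV once, before the loop */
      for (; nblocks; nblocks--)
        {
          block128 m;
          memcpy (m.b, in, 16);
          for (i = 0; i < 16; i++)
            m.b[i] ^= chain.b[i];              /* plaintext XOR chaining value */
          chain = enc (key, m);                /* C_i = E_K(P_i ^ C_{i-1}) */
          memcpy (out, chain.b, 16);
          in += 16;
          out += 16;
        }
      memcpy (iv, chain.b, 16);                /* store IV once, after the loop */
    }

The earlier code instead tracked a last_iv pointer (for CBC) or reloaded and stored the IV inside do_aesni_cfb on every block; hoisting those per-block IV loads and stores out of the loop is the optimization the commit message describes.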

diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index e6c1051..3c367ce 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -340,33 +340,14 @@ _gcry_aes_aesni_prepare_decryption (RIJNDAEL_context *ctx)
 }
 
 
-/* Encrypt one block using the Intel AES-NI instructions.  A and B may
-   be the same.
-
-   Our problem here is that gcc does not allow the "x" constraint for
-   SSE registers in asm unless you compile with -msse.  The common
-   wisdom is to use a separate file for SSE instructions and build it
-   separately.  This would require a lot of extra build system stuff,
-   similar to what we do in mpi/ for the asm stuff.  What we do
-   instead is to use standard registers and a bit more of plain asm
-   which copies the data and key stuff to the SSE registers and later
-   back.  If we decide to implement some block modes with parallelized
-   AES instructions, it might indeed be better to use plain asm ala
-   mpi/.  */
+/* Encrypt one block using the Intel AES-NI instructions.  Block is input
+ * and output through SSE register xmm0. */
 static inline void
-do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
-              const unsigned char *a)
+do_aesni_enc (const RIJNDAEL_context *ctx)
 {
 #define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
 #define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
-  /* Note: For now we relax the alignment requirement for A and B: It
-     does not make much difference because in many case we would need
-     to memcpy them to an extra buffer; using the movdqu is much faster
-     that memcpy and movdqa.  For CFB we know that the IV is properly
-     aligned but that is a special case.  We should better implement
-     CFB direct in asm.  */
-  asm volatile ("movdqu %[src], %%xmm0\n\t"     /* xmm0 := *a     */
-                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
+  asm volatile ("movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
                 "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
                 "movdqa 0x10(%[key]), %%xmm1\n\t"
                 aesenc_xmm1_xmm0
@@ -402,10 +383,9 @@ do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
 
                 ".Lenclast%=:\n\t"
                 aesenclast_xmm1_xmm0
-                "movdqu %%xmm0, %[dst]\n"
-                : [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [key] "r" (ctx->keyschenc),
+                "\n"
+                :
+                : [key] "r" (ctx->keyschenc),
                   [rounds] "r" (ctx->rounds)
                 : "cc", "memory");
 #undef aesenc_xmm1_xmm0
@@ -413,14 +393,14 @@ do_aesni_enc (const RIJNDAEL_context *ctx, unsigned char *b,
 }
 
 
+/* Decrypt one block using the Intel AES-NI instructions.  Block is input
+ * and output through SSE register xmm0. */
 static inline void
-do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
-              const unsigned char *a)
+do_aesni_dec (const RIJNDAEL_context *ctx)
 {
 #define aesdec_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xde, 0xc1\n\t"
 #define aesdeclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc1\n\t"
-  asm volatile ("movdqu %[src], %%xmm0\n\t"     /* xmm0 := *a     */
-                "movdqa (%[key]), %%xmm1\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm1\n\t"
                 "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
                 "movdqa 0x10(%[key]), %%xmm1\n\t"
                 aesdec_xmm1_xmm0
@@ -456,10 +436,9 @@ do_aesni_dec (const RIJNDAEL_context *ctx, unsigned char *b,
 
                 ".Ldeclast%=:\n\t"
                 aesdeclast_xmm1_xmm0
-                "movdqu %%xmm0, %[dst]\n"
-                : [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [key] "r" (ctx->keyschdec),
+                "\n"
+                :
+                : [key] "r" (ctx->keyschdec),
                   [rounds] "r" (ctx->rounds)
                 : "cc", "memory");
 #undef aesdec_xmm1_xmm0
@@ -685,74 +664,6 @@ do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
 }
 
 
-/* Perform a CFB encryption or decryption round using the
-   initialization vector IV and the input block A.  Write the result
-   to the output block B and update IV.  IV needs to be 16 byte
-   aligned.  */
-static inline void
-do_aesni_cfb (const RIJNDAEL_context *ctx, int decrypt_flag,
-              unsigned char *iv, unsigned char *b, const unsigned char *a)
-{
-#define aesenc_xmm1_xmm0      ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t"
-#define aesenclast_xmm1_xmm0  ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t"
-  asm volatile ("movdqa %[iv], %%xmm0\n\t"      /* xmm0 := IV     */
-                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0] */
-                "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0] */
-                "movdqa 0x10(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x20(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x30(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x40(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x50(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x60(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x70(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x80(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0x90(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xa0(%[key]), %%xmm1\n\t"
-                "cmpl $10, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xb0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xc0(%[key]), %%xmm1\n\t"
-                "cmpl $12, %[rounds]\n\t"
-                "jz .Lenclast%=\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xd0(%[key]), %%xmm1\n\t"
-                aesenc_xmm1_xmm0
-                "movdqa 0xe0(%[key]), %%xmm1\n"
-
-                ".Lenclast%=:\n\t"
-                aesenclast_xmm1_xmm0
-                "movdqu %[src], %%xmm1\n\t"      /* Save input.  */
-                "pxor %%xmm1, %%xmm0\n\t"        /* xmm0 = input ^ IV  */
-
-                "cmpl $1, %[decrypt]\n\t"
-                "jz .Ldecrypt_%=\n\t"
-                "movdqa %%xmm0, %[iv]\n\t"       /* [encrypt] Store IV.  */
-                "jmp .Lleave_%=\n"
-                ".Ldecrypt_%=:\n\t"
-                "movdqa %%xmm1, %[iv]\n"         /* [decrypt] Store IV.  */
-                ".Lleave_%=:\n\t"
-                "movdqu %%xmm0, %[dst]\n"        /* Store output.   */
-                : [iv] "+m" (*iv), [dst] "=m" (*b)
-                : [src] "m" (*a),
-                  [key] "r" (ctx->keyschenc),
-                  [rounds] "g" (ctx->rounds),
-                  [decrypt] "m" (decrypt_flag)
-                : "cc", "memory");
-#undef aesenc_xmm1_xmm0
-#undef aesenclast_xmm1_xmm0
-}
-
 /* Perform a CTR encryption round using the counter CTR and the input
    block A.  Write the result to the output block B and update CTR.
    CTR needs to be a 16 byte aligned little-endian value.  */
@@ -1026,7 +937,15 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                          const unsigned char *src)
 {
   aesni_prepare ();
-  do_aesni_enc (ctx, dst, src);
+  asm volatile ("movdqu %[src], %%xmm0\n\t"
+                :
+                : [src] "m" (*src)
+                : "memory" );
+  do_aesni_enc (ctx);
+  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+                : [dst] "=m" (*dst)
+                :
+                : "memory" );
   aesni_cleanup ();
   return 0;
 }
@@ -1038,12 +957,32 @@ _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                          size_t nblocks)
 {
   aesni_prepare ();
+
+  asm volatile ("movdqu %[iv], %%xmm0\n\t"
+                : /* No output */
+                : [iv] "m" (*iv)
+                : "memory" );
+
   for ( ;nblocks; nblocks-- )
     {
-      do_aesni_cfb (ctx, 0, iv, outbuf, inbuf);
+      do_aesni_enc (ctx);
+
+      asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
+                    "pxor %%xmm1, %%xmm0\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    : [inbuf] "m" (*inbuf)
+                    : "memory" );
+
       outbuf += BLOCKSIZE;
       inbuf  += BLOCKSIZE;
     }
+
+  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+                : [iv] "=m" (*iv)
+                :
+                : "memory" );
+
   aesni_cleanup ();
 }
 
@@ -1053,45 +992,41 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                          const unsigned char *inbuf, unsigned char *iv,
                          size_t nblocks, int cbc_mac)
 {
-  unsigned char *last_iv;
-
   aesni_prepare ();
 
-  last_iv = iv;
+  asm volatile ("movdqu %[iv], %%xmm5\n\t"
+                : /* No output */
+                : [iv] "m" (*iv)
+                : "memory" );
 
   for ( ;nblocks; nblocks-- )
     {
-      /* ~35% speed up on Sandy-Bridge when doing xoring and copying with
-          SSE registers.  */
-      asm volatile ("movdqu %[iv], %%xmm0\n\t"
-                    "movdqu %[inbuf], %%xmm1\n\t"
-                    "pxor %%xmm0, %%xmm1\n\t"
-                    "movdqu %%xmm1, %[outbuf]\n\t"
+      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor %%xmm5, %%xmm0\n\t"
                     : /* No output */
-                    : [iv] "m" (*last_iv),
-                      [inbuf] "m" (*inbuf),
-                      [outbuf] "m" (*outbuf)
+                    : [inbuf] "m" (*inbuf)
                     : "memory" );
 
-      do_aesni_enc (ctx, outbuf, outbuf);
+      do_aesni_enc (ctx);
+
+      asm volatile ("movdqa %%xmm0, %%xmm5\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
 
-      last_iv = outbuf;
       inbuf += BLOCKSIZE;
       if (!cbc_mac)
         outbuf += BLOCKSIZE;
     }
 
-  if (last_iv != iv)
-    {
-      asm volatile ("movdqu %[last], %%xmm0\n\t"
-                    "movdqu %%xmm0, %[iv]\n\t"
-                    : /* No output */
-                    : [last] "m" (*last_iv),
-                      [iv] "m" (*iv)
-                    : "memory" );
-    }
+  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+                : [iv] "=m" (*iv)
+                :
+                : "memory" );
 
   aesni_cleanup ();
+  aesni_cleanup_2_6 ();
 }
 
 
@@ -1134,7 +1069,15 @@ _gcry_aes_aesni_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                          const unsigned char *src)
 {
   aesni_prepare ();
-  do_aesni_dec (ctx, dst, src);
+  asm volatile ("movdqu %[src], %%xmm0\n\t"
+                :
+                : [src] "m" (*src)
+                : "memory" );
+  do_aesni_dec (ctx);
+  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+                : [dst] "=m" (*dst)
+                :
+                : "memory" );
   aesni_cleanup ();
   return 0;
 }
@@ -1147,19 +1090,23 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
 {
   aesni_prepare ();
 
+  asm volatile ("movdqu %[iv], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*iv)
+                : "memory" );
+
   /* CFB decryption can be parallelized */
   for ( ;nblocks >= 4; nblocks -= 4)
     {
       asm volatile
-        ("movdqu (%[iv]),        %%xmm1\n\t" /* load input blocks */
+        ("movdqu %%xmm6,         %%xmm1\n\t" /* load input blocks */
          "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
          "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
          "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
 
-         "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
-         "movdqu %%xmm0,         (%[iv])\n\t"
+         "movdqu 3*16(%[inbuf]), %%xmm6\n\t" /* update IV */
          : /* No output */
-         : [inbuf] "r" (inbuf), [iv] "r" (iv)
+         : [inbuf] "r" (inbuf)
          : "memory");
 
       do_aesni_enc_vec4 (ctx);
@@ -1190,12 +1137,29 @@ _gcry_aes_aesni_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
       inbuf  += 4*BLOCKSIZE;
     }
 
+  asm volatile ("movdqu %%xmm6, %%xmm0\n\t" ::: "cc");
+
   for ( ;nblocks; nblocks-- )
     {
-      do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
+      do_aesni_enc (ctx);
+
+      asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm6\n\t"
+                    "movdqu %%xmm6, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    : [inbuf] "m" (*inbuf)
+                    : "memory" );
+
       outbuf += BLOCKSIZE;
       inbuf  += BLOCKSIZE;
     }
+
+  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+                : [iv] "=m" (*iv)
+                :
+                : "memory" );
+
   aesni_cleanup ();
   aesni_cleanup_2_6 ();
 }
@@ -1256,21 +1220,21 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
   for ( ;nblocks; nblocks-- )
     {
       asm volatile
-        ("movdqu %[inbuf], %%xmm2\n\t"	/* use xmm2 as savebuf */
+        ("movdqu %[inbuf], %%xmm0\n\t"
+         "movdqa %%xmm0, %%xmm2\n\t"    /* use xmm2 as savebuf */
          : /* No output */
          : [inbuf] "m" (*inbuf)
          : "memory");
 
       /* uses only xmm0 and xmm1 */
-      do_aesni_dec (ctx, outbuf, inbuf);
+      do_aesni_dec (ctx);
 
       asm volatile
-        ("movdqu %[outbuf], %%xmm0\n\t"
-         "pxor %%xmm5, %%xmm0\n\t"	/* xor IV with output */
+        ("pxor %%xmm5, %%xmm0\n\t"	/* xor IV with output */
          "movdqu %%xmm0, %[outbuf]\n\t"
          "movdqu %%xmm2, %%xmm5\n\t"	/* store savebuf as new IV */
-         : /* No output */
-         : [outbuf] "m" (*outbuf)
+         : [outbuf] "=m" (*outbuf)
+         :
          : "memory");
 
       outbuf += BLOCKSIZE;
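
A note on why the four-way block in _gcry_aes_aesni_cfb_dec above uses the encryption helpers (do_aesni_enc_vec4, do_aesni_enc) even though it implements decryption: in CFB mode the keystream for block i is E_K(C_{i-1}), so decryption only ever runs the cipher forward on ciphertext that is already available, which is what makes it parallelizable; CFB encryption stays serial because each ciphertext block feeds the next keystream block.  A sketch of the per-block data flow, reusing the hypothetical block128/block_cipher_fn types from the CBC sketch above:

    static void
    cfb_dec_sketch (block_cipher_fn enc, const void *key,
                    unsigned char *out, const unsigned char *in,
                    unsigned char *iv, size_t nblocks)
    {
      block128 prev;
      int i;

      memcpy (prev.b, iv, 16);                 /* "C_0" is the IV */
      for (; nblocks; nblocks--)
        {
          block128 ks = enc (key, prev);       /* keystream = E_K(C_{i-1}) */
          memcpy (prev.b, in, 16);             /* remember C_i for the next block */
          for (i = 0; i < 16; i++)
            out[i] = in[i] ^ ks.b[i];          /* P_i = C_i ^ keystream */
          in += 16;
          out += 16;
        }
      memcpy (iv, prev.b, 16);                 /* last ciphertext becomes the new IV */
    }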

commit 4a0795af021305f9240f23626a3796157db46bd7
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Dec 6 10:38:36 2014 +0200

    GCM: move Intel PCLMUL accelerated implementation to separate file
    
    * cipher/Makefile.am: Add 'cipher-gcm-intel-pclmul.c'.
    * cipher/cipher-gcm-intel-pclmul.c: New.
    * cipher/cipher-gcm.c [GCM_USE_INTEL_PCLMUL]
    (_gcry_ghash_setup_intel_pclmul, _gcry_ghash_intel_pclmul): New
    prototypes.
    [GCM_USE_INTEL_PCLMUL] (gfmul_pclmul, gfmul_pclmul_aggr4): Move
    to 'cipher-gcm-intel-pclmul.c'.
    (ghash): Rename to...
    (ghash_internal): ...this and move GCM_USE_INTEL_PCLMUL part to new
    function in 'cipher-gcm-intel-pclmul.c'.
    (setupM): Move GCM_USE_INTEL_PCLMUL part to new function in
    'cipher-gcm-intel-pclmul.c'; Add selection of ghash function based
    on available HW acceleration.
    (do_ghash_buf): Change use of 'ghash' to 'c->u_mode.gcm.ghash_fn'.
    * cipher/cipher-internal.h (ghash_fn_t): New.
    (gcry_cipher_handle): Remove 'use_intel_pclmul'; Add 'ghash_fn'.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
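
The net effect of this refactoring is a small function-pointer dispatch: the GHASH implementation is chosen once at key setup and stored in the handle, so the per-call feature test disappears from the hashing path.  A minimal sketch of the pattern with simplified names (not the actual libgcrypt types, which use gcry_cipher_hd_t):

    #include <stddef.h>

    typedef unsigned int (*ghash_fn_t) (void *c, unsigned char *result,
                                        const unsigned char *buf,
                                        size_t nblocks);

    struct gcm_sketch
    {
      ghash_fn_t ghash_fn;      /* implementation selected at setup time */
    };

    /* Trivial stand-ins for the table-driven and PCLMUL implementations. */
    static unsigned int
    ghash_generic_sketch (void *c, unsigned char *r, const unsigned char *b,
                          size_t n)
    { (void)c; (void)r; (void)b; (void)n; return 0; }

    static unsigned int
    ghash_pclmul_sketch (void *c, unsigned char *r, const unsigned char *b,
                         size_t n)
    { (void)c; (void)r; (void)b; (void)n; return 0; }

    static void
    setup_sketch (struct gcm_sketch *c, int have_pclmul)
    {
      c->ghash_fn = have_pclmul ? ghash_pclmul_sketch : ghash_generic_sketch;
    }

    static unsigned int
    hash_blocks_sketch (struct gcm_sketch *c, unsigned char *hash,
                        const unsigned char *buf, size_t nblocks)
    {
      /* Callers go through the stored pointer, as do_ghash_buf now does
         with c->u_mode.gcm.ghash_fn. */
      return c->ghash_fn (c, hash, buf, nblocks);
    }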

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index d7e7773..98142ed 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -40,8 +40,8 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES)
 libcipher_la_SOURCES = \
 cipher.c cipher-internal.h \
 cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
-cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-poly1305.c \
-cipher-selftest.c cipher-selftest.h \
+cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \
+cipher-poly1305.c cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h pubkey-util.c \
 md.c \
 mac.c mac-internal.h \
diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
new file mode 100644
index 0000000..02e7701
--- /dev/null
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -0,0 +1,395 @@
+/* cipher-gcm-intel-pclmul.c  -  Intel PCLMUL accelerated Galois Counter Mode
+ *                               implementation
+ * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+
+#ifdef GCM_USE_INTEL_PCLMUL
+
+/*
+ Intel PCLMUL ghash based on white paper:
+  "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
+   GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
+ */
+static inline void gfmul_pclmul(void)
+{
+  /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
+     Input must be converted to little-endian.
+   */
+  asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */
+                "pshufd $78, %%xmm0, %%xmm2\n\t"
+                "pshufd $78, %%xmm1, %%xmm4\n\t"
+                "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
+                "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
+
+                "movdqa %%xmm0, %%xmm3\n\t"
+                "pclmulqdq $0, %%xmm1, %%xmm3\n\t"  /* xmm3 holds a0*b0 */
+                "movdqa %%xmm0, %%xmm6\n\t"
+                "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "pclmulqdq $0, %%xmm2, %%xmm4\n\t"  /* xmm4 holds (a0+a1)*(b0+b1) */
+
+                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+                "movdqa %%xmm4, %%xmm5\n\t"
+                "psrldq $8, %%xmm4\n\t"
+                "pslldq $8, %%xmm5\n\t"
+                "pxor %%xmm5, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+                                             carry-less multiplication of xmm0
+                                             by xmm1 */
+
+                /* shift the result by one bit position to the left cope for
+                   the fact that bits are reversed */
+                "movdqa %%xmm3, %%xmm4\n\t"
+                "movdqa %%xmm6, %%xmm5\n\t"
+                "pslld $1, %%xmm3\n\t"
+                "pslld $1, %%xmm6\n\t"
+                "psrld $31, %%xmm4\n\t"
+                "psrld $31, %%xmm5\n\t"
+                "movdqa %%xmm4, %%xmm1\n\t"
+                "pslldq $4, %%xmm5\n\t"
+                "pslldq $4, %%xmm4\n\t"
+                "psrldq $12, %%xmm1\n\t"
+                "por %%xmm4, %%xmm3\n\t"
+                "por %%xmm5, %%xmm6\n\t"
+                "por %%xmm6, %%xmm1\n\t"
+
+                /* first phase of the reduction */
+                "movdqa %%xmm3, %%xmm6\n\t"
+                "movdqa %%xmm3, %%xmm7\n\t"
+                "pslld $31, %%xmm6\n\t"  /* packed right shifting << 31 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "pslld $30, %%xmm7\n\t"  /* packed right shifting shift << 30 */
+                "pslld $25, %%xmm5\n\t"  /* packed right shifting shift << 25 */
+                "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
+                "pxor %%xmm5, %%xmm6\n\t"
+                "movdqa %%xmm6, %%xmm7\n\t"
+                "pslldq $12, %%xmm6\n\t"
+                "psrldq $4, %%xmm7\n\t"
+                "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
+                                             complete */
+
+                /* second phase of the reduction */
+                "movdqa %%xmm3, %%xmm2\n\t"
+                "movdqa %%xmm3, %%xmm4\n\t"
+                "psrld $1, %%xmm2\n\t"    /* packed left shifting >> 1 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "psrld $2, %%xmm4\n\t"    /* packed left shifting >> 2 */
+                "psrld $7, %%xmm5\n\t"    /* packed left shifting >> 7 */
+                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
+                "pxor %%xmm5, %%xmm2\n\t"
+                "pxor %%xmm7, %%xmm2\n\t"
+                "pxor %%xmm2, %%xmm3\n\t"
+                "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
+                ::: "cc" );
+}
+
+
+#ifdef __x86_64__
+static inline void gfmul_pclmul_aggr4(void)
+{
+  /* Input:
+      H¹: XMM0          X_i            : XMM6
+      H²: XMM8          X_(i-1)        : XMM3
+      H³: XMM9          X_(i-2)        : XMM2
+      H⁴: XMM10         X_(i-3)⊕Y_(i-4): XMM1
+     Output:
+      Y_i: XMM1
+     Inputs XMM0 stays unmodified.
+     Input must be converted to little-endian.
+   */
+  asm volatile (/* perform clmul and merge results... */
+                "pshufd $78, %%xmm10, %%xmm11\n\t"
+                "pshufd $78, %%xmm1, %%xmm12\n\t"
+                "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
+                "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
+
+                "pshufd $78, %%xmm9, %%xmm13\n\t"
+                "pshufd $78, %%xmm2, %%xmm14\n\t"
+                "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
+                "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
+
+                "pshufd $78, %%xmm8, %%xmm5\n\t"
+                "pshufd $78, %%xmm3, %%xmm15\n\t"
+                "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */
+                "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */
+
+                "movdqa %%xmm10, %%xmm4\n\t"
+                "movdqa %%xmm9, %%xmm7\n\t"
+                "pclmulqdq $0, %%xmm1, %%xmm4\n\t"   /* xmm4 holds 4:a0*b0 */
+                "pclmulqdq $0, %%xmm2, %%xmm7\n\t"   /* xmm7 holds 3:a0*b0 */
+                "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
+                "pclmulqdq $17, %%xmm9, %%xmm2\n\t"  /* xmm9 holds 3:a1*b1 */
+                "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
+                "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
+
+                "pshufd $78, %%xmm0, %%xmm10\n\t"
+                "pshufd $78, %%xmm6, %%xmm11\n\t"
+                "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
+                "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
+
+                "pxor %%xmm4, %%xmm7\n\t"   /* xmm7 holds 3+4:a0*b0 */
+                "pxor %%xmm2, %%xmm1\n\t"   /* xmm1 holds 3+4:a1*b1 */
+                "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
+
+                "movdqa %%xmm8, %%xmm13\n\t"
+                "pclmulqdq $0, %%xmm3, %%xmm13\n\t"  /* xmm13 holds 2:a0*b0 */
+                "pclmulqdq $17, %%xmm8, %%xmm3\n\t"  /* xmm3 holds 2:a1*b1 */
+                "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
+
+                "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
+                "pxor %%xmm3, %%xmm1\n\t"  /* xmm1 holds 2+3+4:a1*b1 */
+                "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
+
+                "movdqa %%xmm0, %%xmm3\n\t"
+                "pclmulqdq $0, %%xmm6, %%xmm3\n\t"  /* xmm3 holds 1:a0*b0 */
+                "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
+                "movdqa %%xmm11, %%xmm4\n\t"
+                "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
+
+                "pxor %%xmm7, %%xmm3\n\t"  /* xmm3 holds 1+2+3+4:a0*b0 */
+                "pxor %%xmm1, %%xmm6\n\t"  /* xmm6 holds 1+2+3+4:a1*b1 */
+                "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
+
+                /* aggregated reduction... */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+                "movdqa %%xmm4, %%xmm5\n\t"
+                "psrldq $8, %%xmm4\n\t"
+                "pslldq $8, %%xmm5\n\t"
+                "pxor %%xmm5, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+                                             carry-less multiplication of xmm0
+                                             by xmm1 */
+
+                /* shift the result by one bit position to the left cope for
+                   the fact that bits are reversed */
+                "movdqa %%xmm3, %%xmm4\n\t"
+                "movdqa %%xmm6, %%xmm5\n\t"
+                "pslld $1, %%xmm3\n\t"
+                "pslld $1, %%xmm6\n\t"
+                "psrld $31, %%xmm4\n\t"
+                "psrld $31, %%xmm5\n\t"
+                "movdqa %%xmm4, %%xmm1\n\t"
+                "pslldq $4, %%xmm5\n\t"
+                "pslldq $4, %%xmm4\n\t"
+                "psrldq $12, %%xmm1\n\t"
+                "por %%xmm4, %%xmm3\n\t"
+                "por %%xmm5, %%xmm6\n\t"
+                "por %%xmm6, %%xmm1\n\t"
+
+                /* first phase of the reduction */
+                "movdqa %%xmm3, %%xmm6\n\t"
+                "movdqa %%xmm3, %%xmm7\n\t"
+                "pslld $31, %%xmm6\n\t"  /* packed right shifting << 31 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "pslld $30, %%xmm7\n\t"  /* packed right shifting shift << 30 */
+                "pslld $25, %%xmm5\n\t"  /* packed right shifting shift << 25 */
+                "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
+                "pxor %%xmm5, %%xmm6\n\t"
+                "movdqa %%xmm6, %%xmm7\n\t"
+                "pslldq $12, %%xmm6\n\t"
+                "psrldq $4, %%xmm7\n\t"
+                "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
+                                             complete */
+
+                /* second phase of the reduction */
+                "movdqa %%xmm3, %%xmm2\n\t"
+                "movdqa %%xmm3, %%xmm4\n\t"
+                "psrld $1, %%xmm2\n\t"    /* packed left shifting >> 1 */
+                "movdqa %%xmm3, %%xmm5\n\t"
+                "psrld $2, %%xmm4\n\t"    /* packed left shifting >> 2 */
+                "psrld $7, %%xmm5\n\t"    /* packed left shifting >> 7 */
+                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
+                "pxor %%xmm5, %%xmm2\n\t"
+                "pxor %%xmm7, %%xmm2\n\t"
+                "pxor %%xmm2, %%xmm3\n\t"
+                "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
+                :::"cc");
+}
+#endif
+
+
+void
+_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h)
+{
+  u64 tmp[2];
+
+  /* Swap endianness of hsub. */
+  tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8);
+  tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0);
+  buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN);
+
+#ifdef __x86_64__
+  asm volatile ("movdqu %[h_1], %%xmm0\n\t"
+                "movdqa %%xmm0, %%xmm1\n\t"
+                :
+                : [h_1] "m" (*tmp));
+
+  gfmul_pclmul (); /* H•H => H² */
+
+  asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
+                "movdqa %%xmm1, %%xmm8\n\t"
+                :
+                : [h_234] "r" (c->u_mode.gcm.gcm_table)
+                : "memory");
+
+  gfmul_pclmul (); /* H•H² => H³ */
+
+  asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
+                "movdqu %%xmm1, 1*16(%[h_234])\n\t"
+                "movdqa %%xmm8, %%xmm1\n\t"
+                :
+                : [h_234] "r" (c->u_mode.gcm.gcm_table)
+                : "memory");
+
+  gfmul_pclmul (); /* H²•H² => H⁴ */
+
+  asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
+                :
+                : [h_234] "r" (c->u_mode.gcm.gcm_table)
+                : "memory");
+
+  /* Clear used registers. */
+  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+                "pxor %%xmm1, %%xmm1\n\t"
+                "pxor %%xmm2, %%xmm2\n\t"
+                "pxor %%xmm3, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm4\n\t"
+                "pxor %%xmm5, %%xmm5\n\t"
+                "pxor %%xmm6, %%xmm6\n\t"
+                "pxor %%xmm7, %%xmm7\n\t"
+                "pxor %%xmm8, %%xmm8\n\t"
+                ::: "cc" );
+#endif
+
+  wipememory (tmp, sizeof(tmp));
+}
+
+
+unsigned int
+_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+                          size_t nblocks)
+{
+  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
+
+  if (nblocks == 0)
+    return 0;
+
+  /* Preload hash and H1. */
+  asm volatile ("movdqu %[hash], %%xmm1\n\t"
+                "movdqa %[hsub], %%xmm0\n\t"
+                "pshufb %[be_mask], %%xmm1\n\t" /* be => le */
+                :
+                : [hash] "m" (*result), [be_mask] "m" (*be_mask),
+                  [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key));
+
+#ifdef __x86_64__
+  if (nblocks >= 4)
+    {
+      do
+        {
+          asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
+                        "movdqu 0*16(%[buf]), %%xmm5\n\t"
+                        "movdqu 1*16(%[buf]), %%xmm2\n\t"
+                        "movdqu 2*16(%[buf]), %%xmm3\n\t"
+                        "movdqu 3*16(%[buf]), %%xmm6\n\t"
+                        "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
+
+                        /* Load H2, H3, H4. */
+                        "movdqu 2*16(%[h_234]), %%xmm10\n\t"
+                        "movdqu 1*16(%[h_234]), %%xmm9\n\t"
+                        "movdqu 0*16(%[h_234]), %%xmm8\n\t"
+
+                        "pxor %%xmm5, %%xmm1\n\t"
+                        "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
+                        "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
+                        "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
+                        :
+                        : [buf] "r" (buf), [be_mask] "m" (*be_mask),
+                          [h_234] "r" (c->u_mode.gcm.gcm_table));
+
+          gfmul_pclmul_aggr4 ();
+
+          buf += 4 * blocksize;
+          nblocks -= 4;
+        }
+      while (nblocks >= 4);
+
+      /* Clear used x86-64/XMM registers. */
+      asm volatile( "pxor %%xmm8, %%xmm8\n\t"
+                    "pxor %%xmm9, %%xmm9\n\t"
+                    "pxor %%xmm10, %%xmm10\n\t"
+                    "pxor %%xmm11, %%xmm11\n\t"
+                    "pxor %%xmm12, %%xmm12\n\t"
+                    "pxor %%xmm13, %%xmm13\n\t"
+                    "pxor %%xmm14, %%xmm14\n\t"
+                    "pxor %%xmm15, %%xmm15\n\t"
+                    ::: "cc" );
+    }
+#endif
+
+  while (nblocks--)
+    {
+      asm volatile ("movdqu %[buf], %%xmm2\n\t"
+                    "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
+                    "pxor %%xmm2, %%xmm1\n\t"
+                    :
+                    : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
+
+      gfmul_pclmul ();
+
+      buf += blocksize;
+    }
+
+  /* Store hash. */
+  asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
+                "movdqu %%xmm1, %[hash]\n\t"
+                : [hash] "=m" (*result)
+                : [be_mask] "m" (*be_mask));
+
+  /* Clear used registers. */
+  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+                "pxor %%xmm1, %%xmm1\n\t"
+                "pxor %%xmm2, %%xmm2\n\t"
+                "pxor %%xmm3, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm4\n\t"
+                "pxor %%xmm5, %%xmm5\n\t"
+                "pxor %%xmm6, %%xmm6\n\t"
+                "pxor %%xmm7, %%xmm7\n\t"
+                ::: "cc" );
+
+  return 0;
+}
+
+#endif /* GCM_USE_INTEL_PCLMUL */
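
For readers skimming gfmul_pclmul above: the three pclmulqdq operations are the usual Karatsuba split of a 128x128-bit carry-less multiply from the referenced white paper.  Writing a = a_1 x^{64} + a_0 and b = b_1 x^{64} + b_0 as polynomials over GF(2), where addition is XOR:

    a \cdot b = a_1 b_1 \, x^{128}
              + \left[ (a_0 + a_1)(b_0 + b_1) + a_0 b_0 + a_1 b_1 \right] x^{64}
              + a_0 b_0

The two "phase of the reduction" sections then fold the 256-bit product back to 128 bits modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1.
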
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 0534761..f89b81e 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -29,6 +29,15 @@
 #include "bufhelp.h"
 #include "./cipher-internal.h"
 
+
+#ifdef GCM_USE_INTEL_PCLMUL
+extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h);
+
+extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
+                                              const byte *buf, size_t nblocks);
+#endif
+
+
 #ifdef GCM_USE_TABLES
 static const u16 gcmR[256] = {
   0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e,
@@ -348,325 +357,18 @@ do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
 #endif /* !GCM_USE_TABLES */
 
 
-#ifdef GCM_USE_INTEL_PCLMUL
-/*
- Intel PCLMUL ghash based on white paper:
-  "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
-   GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
- */
-static inline void gfmul_pclmul(void)
-{
-  /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
-     Input must be converted to little-endian.
-   */
-  asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */
-                "pshufd $78, %%xmm0, %%xmm2\n\t"
-                "pshufd $78, %%xmm1, %%xmm4\n\t"
-                "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
-                "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
-
-                "movdqa %%xmm0, %%xmm3\n\t"
-                "pclmulqdq $0, %%xmm1, %%xmm3\n\t"  /* xmm3 holds a0*b0 */
-                "movdqa %%xmm0, %%xmm6\n\t"
-                "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm4\n\t"  /* xmm4 holds (a0+a1)*(b0+b1) */
-
-                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
-                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
-                "movdqa %%xmm4, %%xmm5\n\t"
-                "psrldq $8, %%xmm4\n\t"
-                "pslldq $8, %%xmm5\n\t"
-                "pxor %%xmm5, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
-                                             carry-less multiplication of xmm0
-                                             by xmm1 */
-
-                /* shift the result by one bit position to the left cope for
-                   the fact that bits are reversed */
-                "movdqa %%xmm3, %%xmm4\n\t"
-                "movdqa %%xmm6, %%xmm5\n\t"
-                "pslld $1, %%xmm3\n\t"
-                "pslld $1, %%xmm6\n\t"
-                "psrld $31, %%xmm4\n\t"
-                "psrld $31, %%xmm5\n\t"
-                "movdqa %%xmm4, %%xmm1\n\t"
-                "pslldq $4, %%xmm5\n\t"
-                "pslldq $4, %%xmm4\n\t"
-                "psrldq $12, %%xmm1\n\t"
-                "por %%xmm4, %%xmm3\n\t"
-                "por %%xmm5, %%xmm6\n\t"
-                "por %%xmm6, %%xmm1\n\t"
-
-                /* first phase of the reduction */
-                "movdqa %%xmm3, %%xmm6\n\t"
-                "movdqa %%xmm3, %%xmm7\n\t"
-                "pslld $31, %%xmm6\n\t"  /* packed right shifting << 31 */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "pslld $30, %%xmm7\n\t"  /* packed right shifting shift << 30 */
-                "pslld $25, %%xmm5\n\t"  /* packed right shifting shift << 25 */
-                "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
-                "pxor %%xmm5, %%xmm6\n\t"
-                "movdqa %%xmm6, %%xmm7\n\t"
-                "pslldq $12, %%xmm6\n\t"
-                "psrldq $4, %%xmm7\n\t"
-                "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
-                                             complete */
-
-                /* second phase of the reduction */
-                "movdqa %%xmm3, %%xmm2\n\t"
-                "movdqa %%xmm3, %%xmm4\n\t"
-                "psrld $1, %%xmm2\n\t"    /* packed left shifting >> 1 */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "psrld $2, %%xmm4\n\t"    /* packed left shifting >> 2 */
-                "psrld $7, %%xmm5\n\t"    /* packed left shifting >> 7 */
-                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
-                "pxor %%xmm5, %%xmm2\n\t"
-                "pxor %%xmm7, %%xmm2\n\t"
-                "pxor %%xmm2, %%xmm3\n\t"
-                "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
-                ::: "cc" );
-}
-
-#ifdef __x86_64__
-static inline void gfmul_pclmul_aggr4(void)
-{
-  /* Input:
-      H¹: XMM0		X_i            : XMM6
-      H²: XMM8		X_(i-1)        : XMM3
-      H³: XMM9		X_(i-2)        : XMM2
-      H⁴: XMM10		X_(i-3)⊕Y_(i-4): XMM1
-     Output:
-      Y_i: XMM1
-     Inputs XMM0 stays unmodified.
-     Input must be converted to little-endian.
-   */
-  asm volatile (/* perform clmul and merge results... */
-                "pshufd $78, %%xmm10, %%xmm11\n\t"
-                "pshufd $78, %%xmm1, %%xmm12\n\t"
-                "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
-                "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
-
-                "pshufd $78, %%xmm9, %%xmm13\n\t"
-                "pshufd $78, %%xmm2, %%xmm14\n\t"
-                "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
-                "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
-
-                "pshufd $78, %%xmm8, %%xmm5\n\t"
-                "pshufd $78, %%xmm3, %%xmm15\n\t"
-                "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */
-                "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */
-
-                "movdqa %%xmm10, %%xmm4\n\t"
-                "movdqa %%xmm9, %%xmm7\n\t"
-                "pclmulqdq $0, %%xmm1, %%xmm4\n\t"   /* xmm4 holds 4:a0*b0 */
-                "pclmulqdq $0, %%xmm2, %%xmm7\n\t"   /* xmm7 holds 3:a0*b0 */
-                "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
-                "pclmulqdq $17, %%xmm9, %%xmm2\n\t"  /* xmm9 holds 3:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
-                "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
-
-                "pshufd $78, %%xmm0, %%xmm10\n\t"
-                "pshufd $78, %%xmm6, %%xmm11\n\t"
-                "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
-                "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
-
-                "pxor %%xmm4, %%xmm7\n\t"   /* xmm7 holds 3+4:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t"   /* xmm1 holds 3+4:a1*b1 */
-                "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
-
-                "movdqa %%xmm8, %%xmm13\n\t"
-                "pclmulqdq $0, %%xmm3, %%xmm13\n\t"  /* xmm13 holds 2:a0*b0 */
-                "pclmulqdq $17, %%xmm8, %%xmm3\n\t"  /* xmm3 holds 2:a1*b1 */
-                "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
-                "pxor %%xmm3, %%xmm1\n\t"  /* xmm1 holds 2+3+4:a1*b1 */
-                "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
-
-                "movdqa %%xmm0, %%xmm3\n\t"
-                "pclmulqdq $0, %%xmm6, %%xmm3\n\t"  /* xmm3 holds 1:a0*b0 */
-                "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
-                "movdqa %%xmm11, %%xmm4\n\t"
-                "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm7, %%xmm3\n\t"  /* xmm3 holds 1+2+3+4:a0*b0 */
-                "pxor %%xmm1, %%xmm6\n\t"  /* xmm6 holds 1+2+3+4:a1*b1 */
-                "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
-
-                /* aggregated reduction... */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
-                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
-                "movdqa %%xmm4, %%xmm5\n\t"
-                "psrldq $8, %%xmm4\n\t"
-                "pslldq $8, %%xmm5\n\t"
-                "pxor %%xmm5, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
-                                             carry-less multiplication of xmm0
-                                             by xmm1 */
-
-                /* shift the result by one bit position to the left cope for
-                   the fact that bits are reversed */
-                "movdqa %%xmm3, %%xmm4\n\t"
-                "movdqa %%xmm6, %%xmm5\n\t"
-                "pslld $1, %%xmm3\n\t"
-                "pslld $1, %%xmm6\n\t"
-                "psrld $31, %%xmm4\n\t"
-                "psrld $31, %%xmm5\n\t"
-                "movdqa %%xmm4, %%xmm1\n\t"
-                "pslldq $4, %%xmm5\n\t"
-                "pslldq $4, %%xmm4\n\t"
-                "psrldq $12, %%xmm1\n\t"
-                "por %%xmm4, %%xmm3\n\t"
-                "por %%xmm5, %%xmm6\n\t"
-                "por %%xmm6, %%xmm1\n\t"
-
-                /* first phase of the reduction */
-                "movdqa %%xmm3, %%xmm6\n\t"
-                "movdqa %%xmm3, %%xmm7\n\t"
-                "pslld $31, %%xmm6\n\t"  /* packed right shifting << 31 */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "pslld $30, %%xmm7\n\t"  /* packed right shifting shift << 30 */
-                "pslld $25, %%xmm5\n\t"  /* packed right shifting shift << 25 */
-                "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
-                "pxor %%xmm5, %%xmm6\n\t"
-                "movdqa %%xmm6, %%xmm7\n\t"
-                "pslldq $12, %%xmm6\n\t"
-                "psrldq $4, %%xmm7\n\t"
-                "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
-                                             complete */
-
-                /* second phase of the reduction */
-                "movdqa %%xmm3, %%xmm2\n\t"
-                "movdqa %%xmm3, %%xmm4\n\t"
-                "psrld $1, %%xmm2\n\t"    /* packed left shifting >> 1 */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "psrld $2, %%xmm4\n\t"    /* packed left shifting >> 2 */
-                "psrld $7, %%xmm5\n\t"    /* packed left shifting >> 7 */
-                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
-                "pxor %%xmm5, %%xmm2\n\t"
-                "pxor %%xmm7, %%xmm2\n\t"
-                "pxor %%xmm2, %%xmm3\n\t"
-                "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
-                :::"cc");
-}
-#endif
-
-#endif /*GCM_USE_INTEL_PCLMUL*/
-
-
 static unsigned int
-ghash (gcry_cipher_hd_t c, byte *result, const byte *buf,
-       size_t nblocks)
+ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf,
+                size_t nblocks)
 {
   const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
-  unsigned int burn;
-
-  if (nblocks == 0)
-    return 0;
-
-  if (0)
-    ;
-#ifdef GCM_USE_INTEL_PCLMUL
-  else if (c->u_mode.gcm.use_intel_pclmul)
-    {
-      static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
-        { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-
-      /* Preload hash and H1. */
-      asm volatile ("movdqu %[hash], %%xmm1\n\t"
-                    "movdqa %[hsub], %%xmm0\n\t"
-                    "pshufb %[be_mask], %%xmm1\n\t" /* be => le */
-                    :
-                    : [hash] "m" (*result), [be_mask] "m" (*be_mask),
-                      [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key));
-
-#ifdef __x86_64__
-      if (nblocks >= 4)
-        {
-          do
-            {
-              asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
-                            "movdqu 0*16(%[buf]), %%xmm5\n\t"
-                            "movdqu 1*16(%[buf]), %%xmm2\n\t"
-                            "movdqu 2*16(%[buf]), %%xmm3\n\t"
-                            "movdqu 3*16(%[buf]), %%xmm6\n\t"
-                            "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
-
-                            /* Load H2, H3, H4. */
-                            "movdqu 2*16(%[h_234]), %%xmm10\n\t"
-                            "movdqu 1*16(%[h_234]), %%xmm9\n\t"
-                            "movdqu 0*16(%[h_234]), %%xmm8\n\t"
-
-                            "pxor %%xmm5, %%xmm1\n\t"
-                            "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
-                            "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
-                            "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
-                            :
-                            : [buf] "r" (buf), [be_mask] "m" (*be_mask),
-                              [h_234] "r" (c->u_mode.gcm.gcm_table));
-
-              gfmul_pclmul_aggr4 ();
-
-              buf += 4 * blocksize;
-              nblocks -= 4;
-            }
-          while (nblocks >= 4);
-
-          /* Clear used x86-64/XMM registers. */
-          asm volatile( "pxor %%xmm8, %%xmm8\n\t"
-                        "pxor %%xmm9, %%xmm9\n\t"
-                        "pxor %%xmm10, %%xmm10\n\t"
-                        "pxor %%xmm11, %%xmm11\n\t"
-                        "pxor %%xmm12, %%xmm12\n\t"
-                        "pxor %%xmm13, %%xmm13\n\t"
-                        "pxor %%xmm14, %%xmm14\n\t"
-                        "pxor %%xmm15, %%xmm15\n\t"
-                        ::: "cc" );
-        }
-#endif
-
-      while (nblocks--)
-        {
-          asm volatile ("movdqu %[buf], %%xmm2\n\t"
-                        "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
-                        "pxor %%xmm2, %%xmm1\n\t"
-                        :
-                        : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
-
-          gfmul_pclmul ();
-
-          buf += blocksize;
-        }
+  unsigned int burn = 0;
 
-      /* Store hash. */
-      asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
-                    "movdqu %%xmm1, %[hash]\n\t"
-                    : [hash] "=m" (*result)
-                    : [be_mask] "m" (*be_mask));
-
-      /* Clear used registers. */
-      asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                    "pxor %%xmm1, %%xmm1\n\t"
-                    "pxor %%xmm2, %%xmm2\n\t"
-                    "pxor %%xmm3, %%xmm3\n\t"
-                    "pxor %%xmm4, %%xmm4\n\t"
-                    "pxor %%xmm5, %%xmm5\n\t"
-                    "pxor %%xmm6, %%xmm6\n\t"
-                    "pxor %%xmm7, %%xmm7\n\t"
-                    ::: "cc" );
-      burn = 0;
-    }
-#endif
-  else
+  while (nblocks)
     {
-      while (nblocks)
-        {
-          burn = GHASH (c, result, buf);
-          buf += blocksize;
-          nblocks--;
-        }
+      burn = GHASH (c, result, buf);
+      buf += blocksize;
+      nblocks--;
     }
 
   return burn + (burn ? 5*sizeof(void*) : 0);
@@ -681,63 +383,15 @@ setupM (gcry_cipher_hd_t c, byte *h)
 #ifdef GCM_USE_INTEL_PCLMUL
   else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL)
     {
-      u64 tmp[2];
-
-      c->u_mode.gcm.use_intel_pclmul = 1;
-
-      /* Swap endianness of hsub. */
-      tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8);
-      tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0);
-      buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN);
-
-#ifdef __x86_64__
-      asm volatile ("movdqu %[h_1], %%xmm0\n\t"
-                    "movdqa %%xmm0, %%xmm1\n\t"
-                    :
-                    : [h_1] "m" (*tmp));
-
-      gfmul_pclmul (); /* H•H => H² */
-
-      asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
-                    "movdqa %%xmm1, %%xmm8\n\t"
-                    :
-                    : [h_234] "r" (c->u_mode.gcm.gcm_table)
-                    : "memory");
-
-      gfmul_pclmul (); /* H•H² => H³ */
-
-      asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
-                    "movdqu %%xmm1, 1*16(%[h_234])\n\t"
-                    "movdqa %%xmm8, %%xmm1\n\t"
-                    :
-                    : [h_234] "r" (c->u_mode.gcm.gcm_table)
-                    : "memory");
-
-      gfmul_pclmul (); /* H²•H² => H⁴ */
-
-      asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
-                    :
-                    : [h_234] "r" (c->u_mode.gcm.gcm_table)
-                    : "memory");
-
-      /* Clear used registers. */
-      asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                    "pxor %%xmm1, %%xmm1\n\t"
-                    "pxor %%xmm2, %%xmm2\n\t"
-                    "pxor %%xmm3, %%xmm3\n\t"
-                    "pxor %%xmm4, %%xmm4\n\t"
-                    "pxor %%xmm5, %%xmm5\n\t"
-                    "pxor %%xmm6, %%xmm6\n\t"
-                    "pxor %%xmm7, %%xmm7\n\t"
-                    "pxor %%xmm8, %%xmm8\n\t"
-                    ::: "cc" );
-#endif
-
-      wipememory (tmp, sizeof(tmp));
+      c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+      _gcry_ghash_setup_intel_pclmul(c, h);
     }
 #endif
   else
-    fillM (c, h);
+    {
+      c->u_mode.gcm.ghash_fn = ghash_internal;
+      fillM (c, h);
+    }
 }
 
 
@@ -810,6 +464,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
 {
   unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
   unsigned int unused = c->u_mode.gcm.mac_unused;
+  ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn;
   size_t nblocks, n;
   unsigned int burn = 0;
 
@@ -843,7 +498,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
           gcry_assert (unused == blocksize);
 
           /* Process one block from macbuf.  */
-          burn = ghash (c, hash, c->u_mode.gcm.macbuf, 1);
+          burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1);
           unused = 0;
         }
 
@@ -851,7 +506,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
 
       if (nblocks)
         {
-          burn = ghash (c, hash, buf, nblocks);
+          burn = ghash_fn (c, hash, buf, nblocks);
           buf += blocksize * nblocks;
           buflen -= blocksize * nblocks;
         }
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index f6bda66..fef0ecb 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -42,7 +42,7 @@
 #define GCM_USE_TABLES 1
 
 
-/* GCM_USE_INTEL_PCLMUL inidicates whether to compile GCM with Intel PCLMUL
+/* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL
    code.  */
 #undef GCM_USE_INTEL_PCLMUL
 #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES)
@@ -54,6 +54,10 @@
 #endif /* GCM_USE_INTEL_PCLMUL */
 
 
+typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result,
+                                    const byte *buf, size_t nblocks);
+
+
 /* A VIA processor with the Padlock engine as well as the Intel AES_NI
    instructions require an alignment of most data on a 16 byte
    boundary.  Because we trick out the compiler while allocating the
@@ -188,6 +192,7 @@ struct gcry_cipher_handle
       unsigned char macbuf[GCRY_CCM_BLOCK_LEN];
       int mac_unused;  /* Number of unprocessed bytes in MACBUF. */
 
+
       /* byte counters for GCM */
       u32 aadlen[2];
       u32 datalen[2];
@@ -209,10 +214,8 @@ struct gcry_cipher_handle
         unsigned char key[MAX_BLOCKSIZE];
       } u_ghash_key;
 
-#ifdef GCM_USE_INTEL_PCLMUL
-      /* Use Intel PCLMUL instructions for accelerated GHASH. */
-      unsigned int use_intel_pclmul:1;
-#endif
+      /* GHASH implementation in use. */
+      ghash_fn_t ghash_fn;
 
       /* Pre-calculated table for GCM. */
 #ifdef GCM_USE_TABLES

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am               |    4 +-
 cipher/cipher-gcm-intel-pclmul.c |  395 ++++++++++++++++++++++++++++++++++++++
 cipher/cipher-gcm.c              |  395 +++-----------------------------------
 cipher/cipher-internal.h         |   13 +-
 cipher/rijndael-aesni.c          |  244 ++++++++++-------------
 5 files changed, 534 insertions(+), 517 deletions(-)
 create mode 100644 cipher/cipher-gcm-intel-pclmul.c


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org



