[PATCH] Add intel-pclmul accelerated POLYVAL for GCM-SIV

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Tue Nov 2 20:44:15 CET 2021


* cipher/cipher-gcm-intel-pclmul.c (gfmul_pclmul_aggr4)
(gfmul_pclmul_aggr8): Move assembly to new GFMUL_AGGRx_ASM* macros.
(GFMUL_AGGR4_ASM_1, GFMUL_AGGR4_ASM_2, gfmul_pclmul_aggr4_le)
(GFMUL_AGGR8_ASM, gfmul_pclmul_aggr8_le)
(_gcry_polyval_intel_pclmul): New.
* cipher/cipher-gcm-siv.c (do_polyval_buf): Use polyval function
if available.
* cipher/cipher-gcm.c (_gcry_polyval_intel_pclmul): New.
(setupM): Setup 'c->u_mode.gcm.polyval_fn' with accelerated polyval
function if available.
* cipher/cipher-internal.h (gcry_cipher_handle): Add member
'u_mode.gcm.polyval_fn'.
--

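POLYVAL (RFC 8452) is the little-endian counterpart of GHASH, so until now
GCM-SIV emulated it through the GHASH routine by byte-swapping every input
block (with the hash key adjusted accordingly at setup time).  Roughly the
following; a simplified sketch of the do_polyval_buf fallback path with a
hypothetical helper name, not code from this patch:

  static unsigned int
  polyval_via_ghash (gcry_cipher_hd_t c, ghash_fn_t ghash_fn,
                     byte *hash, const byte *buf, size_t nblocks)
  {
    byte tmp[GCRY_SIV_BLOCK_LEN];
    unsigned int burn = 0, nburn;
    unsigned int i;

    while (nblocks--)
      {
        /* POLYVAL input is little-endian; GHASH wants big-endian,
           so swap each 16-byte block before hashing it. */
        for (i = 0; i < GCRY_SIV_BLOCK_LEN; i++)
          tmp[i] = buf[GCRY_SIV_BLOCK_LEN - 1 - i];

        nburn = ghash_fn (c, hash, tmp, 1);
        burn = nburn > burn ? nburn : burn;
        buf += GCRY_SIV_BLOCK_LEN;
      }

    return burn;
  }

The new _gcry_polyval_intel_pclmul entry point drops that per-block swap:
the *_le variants of the aggregated multiply routines are the same PCLMUL
code with the 'pshufb' be=>le conversion compiled out (see the be_to_le /
le_to_le macros), which is where the speedup below comes from.
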
Benchmark on AMD Ryzen 7 5800X:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
    GCM-SIV enc |     0.150 ns/B      6337 MiB/s     0.730 c/B      4849
    GCM-SIV dec |     0.163 ns/B      5862 MiB/s     0.789 c/B      4850
   GCM-SIV auth |     0.119 ns/B      8022 MiB/s     0.577 c/B      4850

After (enc ~28% and dec ~27% faster, auth ~43% faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
    GCM-SIV enc |     0.117 ns/B      8138 MiB/s     0.568 c/B      4850
    GCM-SIV dec |     0.128 ns/B      7429 MiB/s     0.623 c/B      4850
   GCM-SIV auth |     0.083 ns/B     11507 MiB/s     0.402 c/B      4851

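No caller-side changes are needed: GCM-SIV users pick up the acceleration
automatically through the new 'u_mode.gcm.polyval_fn' hook.  For reference,
a minimal usage sketch (error handling omitted; decryption would use
gcry_cipher_set_decryption_tag before decrypting):

  gcry_cipher_hd_t hd;
  unsigned char tag[16];

  gcry_cipher_open (&hd, GCRY_CIPHER_AES256, GCRY_CIPHER_MODE_GCM_SIV, 0);
  gcry_cipher_setkey (hd, key, 32);
  gcry_cipher_setiv (hd, nonce, 12);           /* GCM-SIV uses a 96-bit nonce */
  gcry_cipher_authenticate (hd, aad, aadlen);  /* optional AAD */
  gcry_cipher_encrypt (hd, buf, buflen, NULL, 0);  /* in-place, single shot */
  gcry_cipher_gettag (hd, tag, sizeof(tag));
  gcry_cipher_close (hd);
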
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/cipher-gcm-intel-pclmul.c | 642 ++++++++++++++++++++-----------
 cipher/cipher-gcm-siv.c          |  35 +-
 cipher/cipher-gcm.c              |   7 +
 cipher/cipher-internal.h         |   3 +
 4 files changed, 459 insertions(+), 228 deletions(-)

diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index 334c89cd..daf807d0 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -114,6 +114,91 @@ static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void)
   reduction();
 }
 
+#define GFMUL_AGGR4_ASM_1(be_to_le)                                            \
+    /* perform clmul and merge results... */                                   \
+    "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */                        \
+    "movdqu 0*16(%[buf]), %%xmm5\n\t"                                          \
+    be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */                   \
+    "pxor %%xmm5, %%xmm1\n\t"                                                  \
+                                                                               \
+    "pshufd $78, %%xmm2, %%xmm5\n\t"                                           \
+    "pshufd $78, %%xmm1, %%xmm4\n\t"                                           \
+    "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */                         \
+    "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */                         \
+    "movdqa %%xmm2, %%xmm3\n\t"                                                \
+    "pclmulqdq $0, %%xmm1, %%xmm3\n\t"   /* xmm3 holds 4:a0*b0 */              \
+    "pclmulqdq $17, %%xmm2, %%xmm1\n\t"  /* xmm1 holds 4:a1*b1 */              \
+    "pclmulqdq $0, %%xmm5, %%xmm4\n\t"   /* xmm4 holds 4:(a0+a1)*(b0+b1) */    \
+                                                                               \
+    "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */                        \
+    "movdqu 1*16(%[buf]), %%xmm2\n\t"                                          \
+    be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */                   \
+                                                                               \
+    "pshufd $78, %%xmm5, %%xmm0\n\t"                                           \
+    "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
+    "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */                         \
+    "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */                         \
+    "movdqa %%xmm5, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */               \
+    "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */               \
+    "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */      \
+                                                                               \
+    "movdqu 2*16(%[buf]), %%xmm5\n\t"                                          \
+    be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */                   \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */                       \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */                       \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */             \
+                                                                               \
+    "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */                        \
+                                                                               \
+    "pshufd $78, %%xmm2, %%xmm0\n\t"                                           \
+    "pshufd $78, %%xmm5, %%xmm7\n\t"                                           \
+    "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */                         \
+    "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */                         \
+    "movdqa %%xmm2, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"  /* xmm6 holds 2:a0*b0 */               \
+    "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */               \
+    "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */      \
+                                                                               \
+    "movdqu 3*16(%[buf]), %%xmm2\n\t"                                          \
+    be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */                   \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */                     \
+    "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */                     \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */
+
+#define GFMUL_AGGR4_ASM_2()                                                    \
+    "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */                                  \
+                                                                               \
+    "pshufd $78, %%xmm5, %%xmm0\n\t"                                           \
+    "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
+    "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */                         \
+    "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */                         \
+    "movdqa %%xmm5, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 1:a0*b0 */               \
+    "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */               \
+    "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */      \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */                   \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */                   \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */         \
+                                                                               \
+    /* aggregated reduction... */                                              \
+    "movdqa %%xmm3, %%xmm5\n\t"                                                \
+    "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */                     \
+    "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */     \
+    "movdqa %%xmm4, %%xmm5\n\t"                                                \
+    "psrldq $8, %%xmm4\n\t"                                                    \
+    "pslldq $8, %%xmm5\n\t"                                                    \
+    "pxor %%xmm5, %%xmm3\n\t"                                                  \
+    "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the           \
+                                  carry-less multiplication of xmm0            \
+                                  by xmm1 */
+
+#define be_to_le(...) __VA_ARGS__
+#define le_to_le(...) /*_*/
+
 static ASM_FUNC_ATTR_INLINE void
 gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table,
 		   const unsigned char *be_mask)
@@ -123,90 +208,36 @@ gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table,
      Output:
       Hash: XMM1
    */
-  asm volatile (/* perform clmul and merge results... */
-                "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */
-                "movdqu 0*16(%[buf]), %%xmm5\n\t"
-                "pshufb %[be_mask], %%xmm5\n\t" /* be => le */
-                "pxor %%xmm5, %%xmm1\n\t"
-
-                "pshufd $78, %%xmm2, %%xmm5\n\t"
-                "pshufd $78, %%xmm1, %%xmm4\n\t"
-                "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */
-                "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */
-                "movdqa %%xmm2, %%xmm3\n\t"
-                "pclmulqdq $0, %%xmm1, %%xmm3\n\t"   /* xmm3 holds 4:a0*b0 */
-                "pclmulqdq $17, %%xmm2, %%xmm1\n\t"  /* xmm1 holds 4:a1*b1 */
-                "pclmulqdq $0, %%xmm5, %%xmm4\n\t"   /* xmm4 holds 4:(a0+a1)*(b0+b1) */
-
-                "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */
-                "movdqu 1*16(%[buf]), %%xmm2\n\t"
-                "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
-
-                "pshufd $78, %%xmm5, %%xmm0\n\t"
-                "pshufd $78, %%xmm2, %%xmm7\n\t"
-                "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */
-                "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */
-                "movdqa %%xmm5, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */
-                "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */
-                "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */
-
-                "movdqu 2*16(%[buf]), %%xmm5\n\t"
-                "pshufb %[be_mask], %%xmm5\n\t" /* be => le */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */
-
-                "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */
-
-                "pshufd $78, %%xmm2, %%xmm0\n\t"
-                "pshufd $78, %%xmm5, %%xmm7\n\t"
-                "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */
-                "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */
-                "movdqa %%xmm2, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm5, %%xmm6\n\t"  /* xmm6 holds 2:a0*b0 */
-                "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */
-                "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */
-
-                "movdqu 3*16(%[buf]), %%xmm2\n\t"
-                "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
+  asm volatile (GFMUL_AGGR4_ASM_1(be_to_le)
                 :
                 : [buf] "r" (buf),
                   [h_table] "r" (h_table),
                   [be_mask] "m" (*be_mask)
                 : "memory" );
 
-  asm volatile ("pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */
-                "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */
-
-                "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */
+  asm volatile (GFMUL_AGGR4_ASM_2()
+                :
+                : [h_1] "m" (*(const unsigned char *)h_1)
+                : "memory" );
 
-                "pshufd $78, %%xmm5, %%xmm0\n\t"
-                "pshufd $78, %%xmm2, %%xmm7\n\t"
-                "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */
-                "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */
-                "movdqa %%xmm5, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 1:a0*b0 */
-                "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */
-                "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */
+  reduction();
+}
 
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
+static ASM_FUNC_ATTR_INLINE void
+gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
+{
+  /* Input:
+      Hash: XMM1
+     Output:
+      Hash: XMM1
+   */
+  asm volatile (GFMUL_AGGR4_ASM_1(le_to_le)
+                :
+                : [buf] "r" (buf),
+                  [h_table] "r" (h_table)
+                : "memory" );
 
-                /* aggregated reduction... */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
-                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
-                "movdqa %%xmm4, %%xmm5\n\t"
-                "psrldq $8, %%xmm4\n\t"
-                "pslldq $8, %%xmm5\n\t"
-                "pxor %%xmm5, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
-                                             carry-less multiplication of xmm0
-                                             by xmm1 */
+  asm volatile (GFMUL_AGGR4_ASM_2()
                 :
                 : [h_1] "m" (*(const unsigned char *)h_1)
                 : "memory" );
@@ -215,6 +246,154 @@ gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table,
 }
 
 #ifdef __x86_64__
+
+#define GFMUL_AGGR8_ASM(be_to_le)                                              \
+    /* Load H6, H7, H8. */                                                     \
+    "movdqu 6*16(%[h_table]), %%xmm10\n\t"                                     \
+    "movdqu 5*16(%[h_table]), %%xmm9\n\t"                                      \
+    "movdqu 4*16(%[h_table]), %%xmm8\n\t"                                      \
+                                                                               \
+    /* perform clmul and merge results... */                                   \
+    "movdqu 0*16(%[buf]), %%xmm5\n\t"                                          \
+    "movdqu 1*16(%[buf]), %%xmm2\n\t"                                          \
+    be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */                      \
+    be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */                      \
+    "pxor %%xmm5, %%xmm1\n\t"                                                  \
+                                                                               \
+    "pshufd $78, %%xmm10, %%xmm5\n\t"                                          \
+    "pshufd $78, %%xmm1, %%xmm4\n\t"                                           \
+    "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */                        \
+    "pxor %%xmm1, %%xmm4\n\t"  /* xmm4 holds 8:b0+b1 */                        \
+    "movdqa %%xmm10, %%xmm3\n\t"                                               \
+    "pclmulqdq $0, %%xmm1, %%xmm3\n\t"   /* xmm3 holds 8:a0*b0 */              \
+    "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */              \
+    "pclmulqdq $0, %%xmm5, %%xmm4\n\t"   /* xmm4 holds 8:(a0+a1)*(b0+b1) */    \
+                                                                               \
+    "pshufd $78, %%xmm9, %%xmm11\n\t"                                          \
+    "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
+    "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */                       \
+    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 7:b0+b1 */                        \
+    "movdqa %%xmm9, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 7:a0*b0 */               \
+    "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */     \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */                       \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */                       \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */             \
+                                                                               \
+    "movdqu 2*16(%[buf]), %%xmm5\n\t"                                          \
+    "movdqu 3*16(%[buf]), %%xmm2\n\t"                                          \
+    be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */                      \
+    be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */                      \
+                                                                               \
+    "pshufd $78, %%xmm8, %%xmm11\n\t"                                          \
+    "pshufd $78, %%xmm5, %%xmm7\n\t"                                           \
+    "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */                       \
+    "pxor %%xmm5, %%xmm7\n\t"  /* xmm7 holds 6:b0+b1 */                        \
+    "movdqa %%xmm8, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"  /* xmm6 holds 6:a0*b0 */               \
+    "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */     \
+                                                                               \
+    /* Load H3, H4, H5. */                                                     \
+    "movdqu 3*16(%[h_table]), %%xmm10\n\t"                                     \
+    "movdqu 2*16(%[h_table]), %%xmm9\n\t"                                      \
+    "movdqu 1*16(%[h_table]), %%xmm8\n\t"                                      \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */                     \
+    "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */                     \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */           \
+                                                                               \
+    "pshufd $78, %%xmm10, %%xmm11\n\t"                                         \
+    "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
+    "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */                      \
+    "pxor %%xmm2, %%xmm7\n\t"   /* xmm7 holds 5:b0+b1 */                       \
+    "movdqa %%xmm10, %%xmm6\n\t"                                               \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"   /* xmm6 holds 5:a0*b0 */              \
+    "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */              \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 5:(a0+a1)*(b0+b1) */    \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */                   \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */                   \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */         \
+                                                                               \
+    "movdqu 4*16(%[buf]), %%xmm5\n\t"                                          \
+    "movdqu 5*16(%[buf]), %%xmm2\n\t"                                          \
+    be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */                      \
+    be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */                      \
+                                                                               \
+    "pshufd $78, %%xmm9, %%xmm11\n\t"                                          \
+    "pshufd $78, %%xmm5, %%xmm7\n\t"                                           \
+    "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */                       \
+    "pxor %%xmm5, %%xmm7\n\t"  /* xmm7 holds 4:b0+b1 */                        \
+    "movdqa %%xmm9, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"  /* xmm6 holds 4:a0*b0 */               \
+    "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */     \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */                 \
+    "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */                 \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */       \
+                                                                               \
+    "pshufd $78, %%xmm8, %%xmm11\n\t"                                          \
+    "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
+    "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */                       \
+    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 3:b0+b1 */                        \
+    "movdqa %%xmm8, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */               \
+    "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */     \
+                                                                               \
+    "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */                        \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */               \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */               \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */     \
+                                                                               \
+    "movdqu 6*16(%[buf]), %%xmm5\n\t"                                          \
+    "movdqu 7*16(%[buf]), %%xmm2\n\t"                                          \
+    be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */                      \
+    be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */                      \
+                                                                               \
+    "pshufd $78, %%xmm8, %%xmm11\n\t"                                          \
+    "pshufd $78, %%xmm5, %%xmm7\n\t"                                           \
+    "pxor %%xmm8, %%xmm11\n\t"  /* xmm11 holds 4:a0+a1 */                      \
+    "pxor %%xmm5, %%xmm7\n\t"   /* xmm7 holds 4:b0+b1 */                       \
+    "movdqa %%xmm8, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"   /* xmm6 holds 4:a0*b0 */              \
+    "pclmulqdq $17, %%xmm8, %%xmm5\n\t"  /* xmm5 holds 4:a1*b1 */              \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 4:(a0+a1)*(b0+b1) */    \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */             \
+    "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */             \
+    "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */   \
+                                                                               \
+    "pshufd $78, %%xmm0, %%xmm11\n\t"                                          \
+    "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
+    "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */                       \
+    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 3:b0+b1 */                        \
+    "movdqa %%xmm0, %%xmm6\n\t"                                                \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */               \
+    "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */     \
+                                                                               \
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */         \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */         \
+    "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\
+                                                                               \
+    /* aggregated reduction... */                                              \
+    "movdqa %%xmm3, %%xmm5\n\t"                                                \
+    "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */                     \
+    "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */     \
+    "movdqa %%xmm4, %%xmm5\n\t"                                                \
+    "psrldq $8, %%xmm4\n\t"                                                    \
+    "pslldq $8, %%xmm5\n\t"                                                    \
+    "pxor %%xmm5, %%xmm3\n\t"                                                  \
+    "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the           \
+                                  carry-less multiplication of xmm0            \
+                                  by xmm1 */
+
 static ASM_FUNC_ATTR_INLINE void
 gfmul_pclmul_aggr8(const void *buf, const void *h_table)
 {
@@ -226,151 +405,26 @@ gfmul_pclmul_aggr8(const void *buf, const void *h_table)
       Hash: XMM1
      Inputs XMM0 and XMM15 stays unmodified.
    */
-  asm volatile (/* Load H6, H7, H8. */
-                "movdqu 6*16(%[h_table]), %%xmm10\n\t"
-                "movdqu 5*16(%[h_table]), %%xmm9\n\t"
-                "movdqu 4*16(%[h_table]), %%xmm8\n\t"
-
-                /* perform clmul and merge results... */
-                "movdqu 0*16(%[buf]), %%xmm5\n\t"
-                "movdqu 1*16(%[buf]), %%xmm2\n\t"
-                "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
-                "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
-                "pxor %%xmm5, %%xmm1\n\t"
-
-                "pshufd $78, %%xmm10, %%xmm5\n\t"
-                "pshufd $78, %%xmm1, %%xmm4\n\t"
-                "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */
-                "pxor %%xmm1, %%xmm4\n\t"  /* xmm4 holds 8:b0+b1 */
-                "movdqa %%xmm10, %%xmm3\n\t"
-                "pclmulqdq $0, %%xmm1, %%xmm3\n\t"   /* xmm3 holds 8:a0*b0 */
-                "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */
-                "pclmulqdq $0, %%xmm5, %%xmm4\n\t"   /* xmm4 holds 8:(a0+a1)*(b0+b1) */
-
-                "pshufd $78, %%xmm9, %%xmm11\n\t"
-                "pshufd $78, %%xmm2, %%xmm7\n\t"
-                "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */
-                "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 7:b0+b1 */
-                "movdqa %%xmm9, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 7:a0*b0 */
-                "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */
-
-                "movdqu 2*16(%[buf]), %%xmm5\n\t"
-                "movdqu 3*16(%[buf]), %%xmm2\n\t"
-                "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
-                "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
-
-                "pshufd $78, %%xmm8, %%xmm11\n\t"
-                "pshufd $78, %%xmm5, %%xmm7\n\t"
-                "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */
-                "pxor %%xmm5, %%xmm7\n\t"  /* xmm7 holds 6:b0+b1 */
-                "movdqa %%xmm8, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm5, %%xmm6\n\t"  /* xmm6 holds 6:a0*b0 */
-                "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */
-
-                /* Load H3, H4, H5. */
-                "movdqu 3*16(%[h_table]), %%xmm10\n\t"
-                "movdqu 2*16(%[h_table]), %%xmm9\n\t"
-                "movdqu 1*16(%[h_table]), %%xmm8\n\t"
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */
-                "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */
-
-                "pshufd $78, %%xmm10, %%xmm11\n\t"
-                "pshufd $78, %%xmm2, %%xmm7\n\t"
-                "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */
-                "pxor %%xmm2, %%xmm7\n\t"   /* xmm7 holds 5:b0+b1 */
-                "movdqa %%xmm10, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm6\n\t"   /* xmm6 holds 5:a0*b0 */
-                "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 5:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */
-
-                "movdqu 4*16(%[buf]), %%xmm5\n\t"
-                "movdqu 5*16(%[buf]), %%xmm2\n\t"
-                "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
-                "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
-
-                "pshufd $78, %%xmm9, %%xmm11\n\t"
-                "pshufd $78, %%xmm5, %%xmm7\n\t"
-                "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
-                "pxor %%xmm5, %%xmm7\n\t"  /* xmm7 holds 4:b0+b1 */
-                "movdqa %%xmm9, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm5, %%xmm6\n\t"  /* xmm6 holds 4:a0*b0 */
-                "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */
-                "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */
-
-                "pshufd $78, %%xmm8, %%xmm11\n\t"
-                "pshufd $78, %%xmm2, %%xmm7\n\t"
-                "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */
-                "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 3:b0+b1 */
-                "movdqa %%xmm8, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */
-                "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */
-
-                "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */
-
-                "movdqu 6*16(%[buf]), %%xmm5\n\t"
-                "movdqu 7*16(%[buf]), %%xmm2\n\t"
-                "pshufb %%xmm15, %%xmm5\n\t" /* be => le */
-                "pshufb %%xmm15, %%xmm2\n\t" /* be => le */
-
-                "pshufd $78, %%xmm8, %%xmm11\n\t"
-                "pshufd $78, %%xmm5, %%xmm7\n\t"
-                "pxor %%xmm8, %%xmm11\n\t"  /* xmm11 holds 4:a0+a1 */
-                "pxor %%xmm5, %%xmm7\n\t"   /* xmm7 holds 4:b0+b1 */
-                "movdqa %%xmm8, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm5, %%xmm6\n\t"   /* xmm6 holds 4:a0*b0 */
-                "pclmulqdq $17, %%xmm8, %%xmm5\n\t"  /* xmm5 holds 4:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 4:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */
-                "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */
-
-                "pshufd $78, %%xmm0, %%xmm11\n\t"
-                "pshufd $78, %%xmm2, %%xmm7\n\t"
-                "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */
-                "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 3:b0+b1 */
-                "movdqa %%xmm0, %%xmm6\n\t"
-                "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */
-                "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */
-                "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */
-
-                "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */
-                "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */
-                "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */
-
-                /* aggregated reduction... */
-                "movdqa %%xmm3, %%xmm5\n\t"
-                "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
-                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
-                "movdqa %%xmm4, %%xmm5\n\t"
-                "psrldq $8, %%xmm4\n\t"
-                "pslldq $8, %%xmm5\n\t"
-                "pxor %%xmm5, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
-                                             carry-less multiplication of xmm0
-                                             by xmm1 */
+  asm volatile (GFMUL_AGGR8_ASM(be_to_le)
+                :
+                : [buf] "r" (buf),
+                  [h_table] "r" (h_table)
+                : "memory" );
+
+  reduction();
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_pclmul_aggr8_le(const void *buf, const void *h_table)
+{
+  /* Input:
+      H¹: XMM0
+      Hash: XMM1
+     Output:
+      Hash: XMM1
+     Inputs XMM0 and XMM15 stay unmodified.
+   */
+  asm volatile (GFMUL_AGGR8_ASM(le_to_le)
                 :
                 : [buf] "r" (buf),
                   [h_table] "r" (h_table)
@@ -705,6 +759,154 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
   return 0;
 }
 
+unsigned int ASM_FUNC_ATTR
+_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+                            size_t nblocks)
+{
+  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
+#if defined(__x86_64__) && defined(__WIN64__)
+  char win64tmp[10 * 16];
+#endif
+
+  if (nblocks == 0)
+    return 0;
+
+#if defined(__x86_64__) && defined(__WIN64__)
+  /* XMM6-XMM15 need to be restored after use. */
+  asm volatile ("movdqu %%xmm6,  0*16(%0)\n\t"
+                "movdqu %%xmm7,  1*16(%0)\n\t"
+                "movdqu %%xmm8,  2*16(%0)\n\t"
+                "movdqu %%xmm9,  3*16(%0)\n\t"
+                "movdqu %%xmm10, 4*16(%0)\n\t"
+                "movdqu %%xmm11, 5*16(%0)\n\t"
+                "movdqu %%xmm12, 6*16(%0)\n\t"
+                "movdqu %%xmm13, 7*16(%0)\n\t"
+                "movdqu %%xmm14, 8*16(%0)\n\t"
+                "movdqu %%xmm15, 9*16(%0)\n\t"
+                :
+                : "r" (win64tmp)
+                : "memory" );
+#endif
+
+  /* Preload hash. */
+  asm volatile ("pxor %%xmm7, %%xmm7\n\t"
+                "movdqu %[hash], %%xmm1\n\t"
+                "pshufb %[be_mask], %%xmm1\n\t" /* be => le */
+                :
+                : [hash] "m" (*result),
+                  [be_mask] "m" (*be_mask)
+                : "memory" );
+
+#ifdef __x86_64__
+  if (nblocks >= 8)
+    {
+      /* Preload H1. */
+      asm volatile ("pxor %%xmm15, %%xmm15\n\t"
+                    "movdqa %[h_1], %%xmm0\n\t"
+                    :
+                    : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                    : "memory" );
+
+      while (nblocks >= 8)
+        {
+          gfmul_pclmul_aggr8_le (buf, c->u_mode.gcm.gcm_table);
+
+          buf += 8 * blocksize;
+          nblocks -= 8;
+        }
+#ifndef __WIN64__
+      /* Clear used x86-64/XMM registers. */
+      asm volatile( "pxor %%xmm8, %%xmm8\n\t"
+                    "pxor %%xmm9, %%xmm9\n\t"
+                    "pxor %%xmm10, %%xmm10\n\t"
+                    "pxor %%xmm11, %%xmm11\n\t"
+                    "pxor %%xmm12, %%xmm12\n\t"
+                    "pxor %%xmm13, %%xmm13\n\t"
+                    "pxor %%xmm14, %%xmm14\n\t"
+                    "pxor %%xmm15, %%xmm15\n\t"
+                    ::: "memory" );
+#endif
+    }
+#endif
+
+  while (nblocks >= 4)
+    {
+      gfmul_pclmul_aggr4_le (buf, c->u_mode.gcm.u_ghash_key.key,
+                             c->u_mode.gcm.gcm_table);
+
+      buf += 4 * blocksize;
+      nblocks -= 4;
+    }
+
+  if (nblocks)
+    {
+      /* Preload H1. */
+      asm volatile ("movdqa %[h_1], %%xmm0\n\t"
+                    :
+                    : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
+                    : "memory" );
+
+      while (nblocks)
+        {
+          asm volatile ("movdqu %[buf], %%xmm2\n\t"
+                        "pxor %%xmm2, %%xmm1\n\t"
+                        :
+                        : [buf] "m" (*buf)
+                        : "memory" );
+
+          gfmul_pclmul ();
+
+          buf += blocksize;
+          nblocks--;
+        }
+    }
+
+  /* Store hash. */
+  asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
+                "movdqu %%xmm1, %[hash]\n\t"
+                : [hash] "=m" (*result)
+                : [be_mask] "m" (*be_mask)
+                : "memory" );
+
+#if defined(__x86_64__) && defined(__WIN64__)
+  /* Clear/restore used registers. */
+  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+                "pxor %%xmm1, %%xmm1\n\t"
+                "pxor %%xmm2, %%xmm2\n\t"
+                "pxor %%xmm3, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm4\n\t"
+                "pxor %%xmm5, %%xmm5\n\t"
+                "movdqu 0*16(%0), %%xmm6\n\t"
+                "movdqu 1*16(%0), %%xmm7\n\t"
+                "movdqu 2*16(%0), %%xmm8\n\t"
+                "movdqu 3*16(%0), %%xmm9\n\t"
+                "movdqu 4*16(%0), %%xmm10\n\t"
+                "movdqu 5*16(%0), %%xmm11\n\t"
+                "movdqu 6*16(%0), %%xmm12\n\t"
+                "movdqu 7*16(%0), %%xmm13\n\t"
+                "movdqu 8*16(%0), %%xmm14\n\t"
+                "movdqu 9*16(%0), %%xmm15\n\t"
+                :
+                : "r" (win64tmp)
+                : "memory" );
+#else
+  /* Clear used registers. */
+  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+                "pxor %%xmm1, %%xmm1\n\t"
+                "pxor %%xmm2, %%xmm2\n\t"
+                "pxor %%xmm3, %%xmm3\n\t"
+                "pxor %%xmm4, %%xmm4\n\t"
+                "pxor %%xmm5, %%xmm5\n\t"
+                "pxor %%xmm6, %%xmm6\n\t"
+                "pxor %%xmm7, %%xmm7\n\t"
+                ::: "memory" );
+#endif
+
+  return 0;
+}
+
 #if __clang__
 #  pragma clang attribute pop
 #endif
diff --git a/cipher/cipher-gcm-siv.c b/cipher/cipher-gcm-siv.c
index 813cf579..9ebc0036 100644
--- a/cipher/cipher-gcm-siv.c
+++ b/cipher/cipher-gcm-siv.c
@@ -96,6 +96,7 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
   unsigned int blocksize = GCRY_SIV_BLOCK_LEN;
   unsigned int unused = c->u_mode.gcm.mac_unused;
   ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn;
+  ghash_fn_t polyval_fn = c->u_mode.gcm.polyval_fn;
   byte tmp_blocks[16][GCRY_SIV_BLOCK_LEN];
   size_t nblocks, n;
   unsigned int burn = 0, nburn;
@@ -137,9 +138,17 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
           gcry_assert (unused == blocksize);
 
           /* Process one block from macbuf.  */
-          cipher_block_bswap (c->u_mode.gcm.macbuf, c->u_mode.gcm.macbuf,
-			      blocksize);
-          nburn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1);
+          if (polyval_fn)
+            {
+              nburn = polyval_fn (c, hash, c->u_mode.gcm.macbuf, 1);
+            }
+          else
+            {
+              cipher_block_bswap (c->u_mode.gcm.macbuf, c->u_mode.gcm.macbuf,
+                                  blocksize);
+              nburn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1);
+            }
+
           burn = nburn > burn ? nburn : burn;
           unused = 0;
         }
@@ -148,12 +157,22 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
 
       while (nblocks)
         {
-	  for (n = 0; n < (nblocks > 16 ? 16 : nblocks); n++)
-	    cipher_block_bswap (tmp_blocks[n], buf + n * blocksize, blocksize);
-
-	  num_blks_used = n > num_blks_used ? n : num_blks_used;
+          if (polyval_fn)
+            {
+              n = nblocks;
+              nburn = polyval_fn (c, hash, buf, n);
+            }
+          else
+            {
+              for (n = 0; n < (nblocks > 16 ? 16 : nblocks); n++)
+                cipher_block_bswap (tmp_blocks[n], buf + n * blocksize,
+                                    blocksize);
+
+              num_blks_used = n > num_blks_used ? n : num_blks_used;
+
+              nburn = ghash_fn (c, hash, tmp_blocks[0], n);
+            }
 
-          nburn = ghash_fn (c, hash, tmp_blocks[0], n);
           burn = nburn > burn ? nburn : burn;
           buf += n * blocksize;
           buflen -= n * blocksize;
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 6b14cce7..d3ed9cf6 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -43,6 +43,11 @@ extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c);
 
 extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
                                               const byte *buf, size_t nblocks);
+
+extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c,
+                                                byte *result,
+                                                const byte *buf,
+                                                size_t nblocks);
 #endif
 
 #ifdef GCM_USE_ARM_PMULL
@@ -570,6 +575,7 @@ setupM (gcry_cipher_hd_t c)
 #endif
 
   c->u_mode.gcm.ghash_fn = NULL;
+  c->u_mode.gcm.polyval_fn = NULL;
 
   if (0)
     { }
@@ -577,6 +583,7 @@ setupM (gcry_cipher_hd_t c)
   else if (features & HWF_INTEL_PCLMUL)
     {
       c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+      c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
       _gcry_ghash_setup_intel_pclmul (c);
     }
 #endif
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 0bc85b1a..edb29628 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -350,6 +350,9 @@ struct gcry_cipher_handle
       /* GHASH implementation in use. */
       ghash_fn_t ghash_fn;
 
+      /* POLYVAL implementation in use (GCM-SIV). */
+      ghash_fn_t polyval_fn;
+
       /* Key length used for GCM-SIV key generating key. */
       unsigned int siv_keylen;
     } gcm;
-- 
2.32.0