From jussi.kivilinna at iki.fi Tue Nov 2 20:44:15 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 2 Nov 2021 21:44:15 +0200 Subject: [PATCH] Add intel-pclmul accelerated POLYVAL for GCM-SIV Message-ID: <20211102194415.993948-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-intel-pclmul.c (gfmul_pclmul_aggr4) (gfmul_pclmul_aggr8): Move assembly to new GFMUL_AGGRx_ASM* macros. (GFMUL_AGGR4_ASM_1, GFMUL_AGGR4_ASM_2, gfmul_pclmul_aggr4_le) (GFMUL_AGGR8_ASM, gfmul_pclmul_aggr8_le) (_gcry_polyval_intel_pclmul): New. * cipher/cipher-gcm-siv.c (do_polyval_buf): Use polyval function if available. * cipher/cipher-gcm.c (_gcry_polyval_intel_pclmul): New. (setupM): Setup 'c->u_mode.gcm.polyval_fn' with accelerated polyval function if available. * cipher/cipher-internal.h (gcry_cipher_handle): Add member 'u_mode.gcm.polyval_fn'. -- Benchmark on AMD Ryzen 7 5800X: Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV enc | 0.150 ns/B 6337 MiB/s 0.730 c/B 4849 GCM-SIV dec | 0.163 ns/B 5862 MiB/s 0.789 c/B 4850 GCM-SIV auth | 0.119 ns/B 8022 MiB/s 0.577 c/B 4850 After (enc/dec ~26% faster, auth ~43% faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV enc | 0.117 ns/B 8138 MiB/s 0.568 c/B 4850 GCM-SIV dec | 0.128 ns/B 7429 MiB/s 0.623 c/B 4850 GCM-SIV auth | 0.083 ns/B 11507 MiB/s 0.402 c/B 4851 Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-intel-pclmul.c | 642 ++++++++++++++++++++----------- cipher/cipher-gcm-siv.c | 35 +- cipher/cipher-gcm.c | 7 + cipher/cipher-internal.h | 3 + 4 files changed, 459 insertions(+), 228 deletions(-) diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 334c89cd..daf807d0 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -114,6 +114,91 @@ static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void) reduction(); } +#define GFMUL_AGGR4_ASM_1(be_to_le) \ + /* perform clmul and merge results... 
*/ \ + "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ \ + "movdqu 0*16(%[buf]), %%xmm5\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */ \ + "pxor %%xmm5, %%xmm1\n\t" \ + \ + "pshufd $78, %%xmm2, %%xmm5\n\t" \ + "pshufd $78, %%xmm1, %%xmm4\n\t" \ + "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ \ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ \ + "movdqa %%xmm2, %%xmm3\n\t" \ + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ \ + "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ \ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ \ + "movdqu 1*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm5, %%xmm0\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "movdqa %%xmm5, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 2*16(%[buf]), %%xmm5\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ \ + \ + "pshufd $78, %%xmm2, %%xmm0\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ \ + "movdqa %%xmm2, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ \ + "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ \ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 3*16(%[buf]), 
%%xmm2\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ + +#define GFMUL_AGGR4_ASM_2() \ + "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ \ + \ + "pshufd $78, %%xmm5, %%xmm0\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ \ + "movdqa %%xmm5, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ \ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ \ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ \ + \ + /* aggregated reduction... */ \ + "movdqa %%xmm3, %%xmm5\n\t" \ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ \ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ + "movdqa %%xmm4, %%xmm5\n\t" \ + "psrldq $8, %%xmm4\n\t" \ + "pslldq $8, %%xmm5\n\t" \ + "pxor %%xmm5, %%xmm3\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the \ + carry-less multiplication of xmm0 \ + by xmm1 */ + +#define be_to_le(...) __VA_ARGS__ +#define le_to_le(...) /*_*/ + static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, const unsigned char *be_mask) @@ -123,90 +208,36 @@ gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, Output: Hash: XMM1 */ - asm volatile (/* perform clmul and merge results... 
*/ - "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ - "pxor %%xmm5, %%xmm1\n\t" - - "pshufd $78, %%xmm2, %%xmm5\n\t" - "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ - "movdqa %%xmm2, %%xmm3\n\t" - "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ - - "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm5, %%xmm0\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm5, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - - "movdqu 2*16(%[buf]), %%xmm5\n\t" - "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ - - "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ - - "pshufd $78, %%xmm2, %%xmm0\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ - "movdqa %%xmm2, %%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ - - "movdqu 3*16(%[buf]), %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + asm volatile (GFMUL_AGGR4_ASM_1(be_to_le) : : [buf] "r" (buf), 
[h_table] "r" (h_table), [be_mask] "m" (*be_mask) : "memory" ); - asm volatile ("pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ - - "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ + asm volatile (GFMUL_AGGR4_ASM_2() + : + : [h_1] "m" (*(const unsigned char *)h_1) + : "memory" ); - "pshufd $78, %%xmm5, %%xmm0\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ - "movdqa %%xmm5, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ - "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ + reduction(); +} - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ +static ASM_FUNC_ATTR_INLINE void +gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table) +{ + /* Input: + Hash: XMM1 + Output: + Hash: XMM1 + */ + asm volatile (GFMUL_AGGR4_ASM_1(le_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); - /* aggregated reduction... */ - "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ + asm volatile (GFMUL_AGGR4_ASM_2() : : [h_1] "m" (*(const unsigned char *)h_1) : "memory" ); @@ -215,6 +246,154 @@ gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, } #ifdef __x86_64__ + +#define GFMUL_AGGR8_ASM(be_to_le) \ + /* Load H6, H7, H8. 
*/ \ + "movdqu 6*16(%[h_table]), %%xmm10\n\t" \ + "movdqu 5*16(%[h_table]), %%xmm9\n\t" \ + "movdqu 4*16(%[h_table]), %%xmm8\n\t" \ + \ + /* perform clmul and merge results... */ \ + "movdqu 0*16(%[buf]), %%xmm5\n\t" \ + "movdqu 1*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + "pxor %%xmm5, %%xmm1\n\t" \ + \ + "pshufd $78, %%xmm10, %%xmm5\n\t" \ + "pshufd $78, %%xmm1, %%xmm4\n\t" \ + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ \ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ \ + "movdqa %%xmm10, %%xmm3\n\t" \ + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ \ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ \ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm9, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ \ + "movdqa %%xmm9, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ \ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 2*16(%[buf]), %%xmm5\n\t" \ + "movdqu 3*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm8, %%xmm11\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ \ + "movdqa %%xmm8, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ \ + 
"pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ \ + \ + /* Load H3, H4, H5. */ \ + "movdqu 3*16(%[h_table]), %%xmm10\n\t" \ + "movdqu 2*16(%[h_table]), %%xmm9\n\t" \ + "movdqu 1*16(%[h_table]), %%xmm8\n\t" \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm10, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ \ + "movdqa %%xmm10, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ \ + "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 4*16(%[buf]), %%xmm5\n\t" \ + "movdqu 5*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm9, %%xmm11\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \ + "movdqa %%xmm9, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \ + "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm8, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 
3:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "movdqa %%xmm8, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 6*16(%[buf]), %%xmm5\n\t" \ + "movdqu 7*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm8, %%xmm11\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \ + "movdqa %%xmm8, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm0, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "movdqa %%xmm0, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, 
%%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\ + \ + /* aggregated reduction... */ \ + "movdqa %%xmm3, %%xmm5\n\t" \ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ \ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ + "movdqa %%xmm4, %%xmm5\n\t" \ + "psrldq $8, %%xmm4\n\t" \ + "pslldq $8, %%xmm5\n\t" \ + "pxor %%xmm5, %%xmm3\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the \ + carry-less multiplication of xmm0 \ + by xmm1 */ + static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr8(const void *buf, const void *h_table) { @@ -226,151 +405,26 @@ gfmul_pclmul_aggr8(const void *buf, const void *h_table) Hash: XMM1 Inputs XMM0 and XMM15 stays unmodified. */ - asm volatile (/* Load H6, H7, H8. */ - "movdqu 6*16(%[h_table]), %%xmm10\n\t" - "movdqu 5*16(%[h_table]), %%xmm9\n\t" - "movdqu 4*16(%[h_table]), %%xmm8\n\t" - - /* perform clmul and merge results... */ - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - "pxor %%xmm5, %%xmm1\n\t" - - "pshufd $78, %%xmm10, %%xmm5\n\t" - "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ - "movdqa %%xmm10, %%xmm3\n\t" - "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm9, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ - "movdqa %%xmm9, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ - 
"pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ - - "movdqu 2*16(%[buf]), %%xmm5\n\t" - "movdqu 3*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm8, %%xmm11\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ - - /* Load H3, H4, H5. */ - "movdqu 3*16(%[h_table]), %%xmm10\n\t" - "movdqu 2*16(%[h_table]), %%xmm9\n\t" - "movdqu 1*16(%[h_table]), %%xmm8\n\t" - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm10, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ - "movdqa %%xmm10, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ - - "movdqu 4*16(%[buf]), %%xmm5\n\t" - "movdqu 5*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm9, %%xmm11\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ - "movdqa %%xmm9, 
%%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm8, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - - "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ - - "movdqu 6*16(%[buf]), %%xmm5\n\t" - "movdqu 7*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm8, %%xmm11\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm0, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ - "pxor %%xmm2, 
%%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ - - /* aggregated reduction... */ - "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ + asm volatile (GFMUL_AGGR8_ASM(be_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); + + reduction(); +} + +static ASM_FUNC_ATTR_INLINE void +gfmul_pclmul_aggr8_le(const void *buf, const void *h_table) +{ + /* Input: + H?: XMM0 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs XMM0 and XMM15 stays unmodified. + */ + asm volatile (GFMUL_AGGR8_ASM(le_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table) @@ -705,6 +759,154 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, return 0; } +unsigned int ASM_FUNC_ATTR +_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; +#if defined(__x86_64__) && defined(__WIN64__) + char win64tmp[10 * 16]; +#endif + + if (nblocks == 0) + return 0; + +#if defined(__x86_64__) && defined(__WIN64__) + /* XMM6-XMM15 need to be restored after use. 
*/ + asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" + "movdqu %%xmm7, 1*16(%0)\n\t" + "movdqu %%xmm8, 2*16(%0)\n\t" + "movdqu %%xmm9, 3*16(%0)\n\t" + "movdqu %%xmm10, 4*16(%0)\n\t" + "movdqu %%xmm11, 5*16(%0)\n\t" + "movdqu %%xmm12, 6*16(%0)\n\t" + "movdqu %%xmm13, 7*16(%0)\n\t" + "movdqu %%xmm14, 8*16(%0)\n\t" + "movdqu %%xmm15, 9*16(%0)\n\t" + : + : "r" (win64tmp) + : "memory" ); +#endif + + /* Preload hash. */ + asm volatile ("pxor %%xmm7, %%xmm7\n\t" + "movdqu %[hash], %%xmm1\n\t" + "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + : + : [hash] "m" (*result), + [be_mask] "m" (*be_mask) + : "memory" ); + +#ifdef __x86_64__ + if (nblocks >= 8) + { + /* Preload H1. */ + asm volatile ("pxor %%xmm15, %%xmm15\n\t" + "movdqa %[h_1], %%xmm0\n\t" + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); + + while (nblocks >= 8) + { + gfmul_pclmul_aggr8_le (buf, c->u_mode.gcm.gcm_table); + + buf += 8 * blocksize; + nblocks -= 8; + } +#ifndef __WIN64__ + /* Clear used x86-64/XMM registers. */ + asm volatile( "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + ::: "memory" ); +#endif + } +#endif + + while (nblocks >= 4) + { + gfmul_pclmul_aggr4_le (buf, c->u_mode.gcm.u_ghash_key.key, + c->u_mode.gcm.gcm_table); + + buf += 4 * blocksize; + nblocks -= 4; + } + + if (nblocks) + { + /* Preload H1. */ + asm volatile ("movdqa %[h_1], %%xmm0\n\t" + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); + + while (nblocks) + { + asm volatile ("movdqu %[buf], %%xmm2\n\t" + "pxor %%xmm2, %%xmm1\n\t" + : + : [buf] "m" (*buf) + : "memory" ); + + gfmul_pclmul (); + + buf += blocksize; + nblocks--; + } + } + + /* Store hash. 
*/ + asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + "movdqu %%xmm1, %[hash]\n\t" + : [hash] "=m" (*result) + : [be_mask] "m" (*be_mask) + : "memory" ); + +#if defined(__x86_64__) && defined(__WIN64__) + /* Clear/restore used registers. */ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "movdqu 0*16(%0), %%xmm6\n\t" + "movdqu 1*16(%0), %%xmm7\n\t" + "movdqu 2*16(%0), %%xmm8\n\t" + "movdqu 3*16(%0), %%xmm9\n\t" + "movdqu 4*16(%0), %%xmm10\n\t" + "movdqu 5*16(%0), %%xmm11\n\t" + "movdqu 6*16(%0), %%xmm12\n\t" + "movdqu 7*16(%0), %%xmm13\n\t" + "movdqu 8*16(%0), %%xmm14\n\t" + "movdqu 9*16(%0), %%xmm15\n\t" + : + : "r" (win64tmp) + : "memory" ); +#else + /* Clear used registers. */ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + ::: "memory" ); +#endif + + return 0; +} + #if __clang__ # pragma clang attribute pop #endif diff --git a/cipher/cipher-gcm-siv.c b/cipher/cipher-gcm-siv.c index 813cf579..9ebc0036 100644 --- a/cipher/cipher-gcm-siv.c +++ b/cipher/cipher-gcm-siv.c @@ -96,6 +96,7 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, unsigned int blocksize = GCRY_SIV_BLOCK_LEN; unsigned int unused = c->u_mode.gcm.mac_unused; ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn; + ghash_fn_t polyval_fn = c->u_mode.gcm.polyval_fn; byte tmp_blocks[16][GCRY_SIV_BLOCK_LEN]; size_t nblocks, n; unsigned int burn = 0, nburn; @@ -137,9 +138,17 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, gcry_assert (unused == blocksize); /* Process one block from macbuf. 
*/ - cipher_block_bswap (c->u_mode.gcm.macbuf, c->u_mode.gcm.macbuf, - blocksize); - nburn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); + if (polyval_fn) + { + nburn = polyval_fn (c, hash, c->u_mode.gcm.macbuf, 1); + } + else + { + cipher_block_bswap (c->u_mode.gcm.macbuf, c->u_mode.gcm.macbuf, + blocksize); + nburn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); + } + burn = nburn > burn ? nburn : burn; unused = 0; } @@ -148,12 +157,22 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, while (nblocks) { - for (n = 0; n < (nblocks > 16 ? 16 : nblocks); n++) - cipher_block_bswap (tmp_blocks[n], buf + n * blocksize, blocksize); - - num_blks_used = n > num_blks_used ? n : num_blks_used; + if (polyval_fn) + { + n = nblocks; + nburn = polyval_fn (c, hash, buf, n); + } + else + { + for (n = 0; n < (nblocks > 16 ? 16 : nblocks); n++) + cipher_block_bswap (tmp_blocks[n], buf + n * blocksize, + blocksize); + + num_blks_used = n > num_blks_used ? n : num_blks_used; + + nburn = ghash_fn (c, hash, tmp_blocks[0], n); + } - nburn = ghash_fn (c, hash, tmp_blocks[0], n); burn = nburn > burn ? 
nburn : burn; buf += n * blocksize; buflen -= n * blocksize; diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 6b14cce7..d3ed9cf6 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -43,6 +43,11 @@ extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c); extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); + +extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, + byte *result, + const byte *buf, + size_t nblocks); #endif #ifdef GCM_USE_ARM_PMULL @@ -570,6 +575,7 @@ setupM (gcry_cipher_hd_t c) #endif c->u_mode.gcm.ghash_fn = NULL; + c->u_mode.gcm.polyval_fn = NULL; if (0) { } @@ -577,6 +583,7 @@ setupM (gcry_cipher_hd_t c) else if (features & HWF_INTEL_PCLMUL) { c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; + c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul; _gcry_ghash_setup_intel_pclmul (c); } #endif diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 0bc85b1a..edb29628 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -350,6 +350,9 @@ struct gcry_cipher_handle /* GHASH implementation in use. */ ghash_fn_t ghash_fn; + /* POLYVAL implementation in use (GCM-SIV). */ + ghash_fn_t polyval_fn; + /* Key length used for GCM-SIV key generating key. */ unsigned int siv_keylen; } gcm; -- 2.32.0 From guidovranken at gmail.com Tue Nov 30 14:21:43 2021 From: guidovranken at gmail.com (Guido Vranken) Date: Tue, 30 Nov 2021 14:21:43 +0100 Subject: gcry_mpi_sub_ui result is positive when it should be negative Message-ID: In the program below, the result of the computation -5 - 2 should be -7, but it is 7. Compare with the behavior of gcry_mpi_add_ui and gcry_mpi_mul_ui (commented), which do produce a negative result, as they should. This was tested on the latest repository checkout. 
#include <gcrypt.h> #define CF_CHECK_EQ(expr, res) if ( (expr) != (res) ) { goto end; } int main(void) { gcry_mpi_t A; gcry_mpi_t res; gcry_error_t err; char *buf; CF_CHECK_EQ(err = gcry_mpi_scan(&A, GCRYMPI_FMT_HEX, "-5", 0, NULL), 0); CF_CHECK_EQ(err = gcry_mpi_scan(&res, GCRYMPI_FMT_HEX, "0", 0, NULL), 0); gcry_mpi_sub_ui(res, A, 2); //gcry_mpi_add_ui(res, A, 2); //gcry_mpi_mul_ui(res, A, 2); CF_CHECK_EQ(err = gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char**)&buf, NULL, res), 0); printf("%s\n", buf); end: return 0; } -------------- next part -------------- An HTML attachment was scrubbed... URL: From jussi.kivilinna at iki.fi Tue Nov 30 21:23:45 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Nov 2021 22:23:45 +0200 Subject: [PATCH] gcry_mpi_sub_ui: fix subtracting from negative value Message-ID: <20211130202345.877377-1-jussi.kivilinna@iki.fi> * mpi/mpi-add.c (_gcry_mpi_sub_ui): Set output sign bit when 'u' is negative. * tests/mpitests.c (test_add): Additional tests for mpi_add_ui; Check test output and fail if output does not match expected. (test_sub): Additional tests for mpi_sub_ui; Check test output and fail if output does not match expected. (test_mul): Additional tests for mpi_mul_ui; Check test output and fail if output does not match expected. -- Signed-off-by: Jussi Kivilinna --- mpi/mpi-add.c | 1 + tests/mpitests.c | 119 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 113 insertions(+), 7 deletions(-) diff --git a/mpi/mpi-add.c b/mpi/mpi-add.c index 53f476e0..38dd352f 100644 --- a/mpi/mpi-add.c +++ b/mpi/mpi-add.c @@ -191,6 +191,7 @@ _gcry_mpi_sub_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned long v ) cy = _gcry_mpih_add_1(wp, up, usize, v); wp[usize] = cy; wsize = usize + cy; + wsign = 1; } else { /* The signs are different. Need exact comparison to determine * which operand to subtract from which. 
*/ diff --git a/tests/mpitests.c b/tests/mpitests.c index 96e01551..48ea18b2 100644 --- a/tests/mpitests.c +++ b/tests/mpitests.c @@ -378,7 +378,8 @@ test_add (void) gcry_mpi_t two; gcry_mpi_t ff; gcry_mpi_t result; - unsigned char* pc; + gcry_mpi_t minusfive; + char *pc; gcry_mpi_scan(&one, GCRYMPI_FMT_USG, ones, sizeof(ones), NULL); gcry_mpi_scan(&two, GCRYMPI_FMT_USG, twos, sizeof(twos), NULL); @@ -386,21 +387,47 @@ test_add (void) result = gcry_mpi_new(0); gcry_mpi_add(result, one, two); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of one plus two:\n%s\n", pc); + if (strcmp (pc, "030303030303030303030303030303030303030303030303" + "030303030303030303030303030303030303030303030303") != 0) + fail ("mpi_add failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_add(result, ff, one); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of ff plus one:\n%s\n", pc); + if (strcmp (pc, "010101010101010101010101010101010101010101010101" + "01010101010101010101010101010101010101010101010100") != 0) + fail ("mpi_add failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_scan(&minusfive, GCRYMPI_FMT_HEX, "-5", 0, NULL); + gcry_mpi_add_ui (result, minusfive, 2); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus five plus two:\n%s\n", pc); + if (strcmp (pc, "-03") != 0) + fail ("mpi_add_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_add_ui (result, result, 3); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus three plus three:\n%s\n", pc); + if (strcmp (pc, "00") != 0) + fail ("mpi_add_ui failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_release(one); gcry_mpi_release(two); 
gcry_mpi_release(ff); gcry_mpi_release(result); + gcry_mpi_release(minusfive); return 1; } @@ -408,24 +435,76 @@ test_add (void) static int test_sub (void) { + gcry_mpi_t zero; gcry_mpi_t one; gcry_mpi_t two; + gcry_mpi_t five; gcry_mpi_t result; - unsigned char* pc; + gcry_mpi_t minusfive; + char *pc; gcry_mpi_scan(&one, GCRYMPI_FMT_USG, ones, sizeof(ones), NULL); gcry_mpi_scan(&two, GCRYMPI_FMT_USG, twos, sizeof(twos), NULL); result = gcry_mpi_new(0); gcry_mpi_sub(result, two, one); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of two minus one:\n%s\n", pc); + if (strcmp (pc, "010101010101010101010101010101010101010101010101" + "010101010101010101010101010101010101010101010101") != 0) + fail ("mpi_sub failed at line %d", __LINE__); + gcry_free(pc); + + zero = gcry_mpi_new(0); + five = gcry_mpi_new(0); + minusfive = gcry_mpi_new(0); + gcry_mpi_set_ui (zero, 0); + gcry_mpi_set_ui (one, 1); + gcry_mpi_set_ui (two, 2); + gcry_mpi_set_ui (five, 5); + gcry_mpi_sub (minusfive, zero, five); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, minusfive); + if (debug) + gcry_log_debug ("Result of zero minus five:\n%s\n", pc); + if (strcmp (pc, "-05") != 0) + fail ("mpi_sub failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_sub_ui (result, five, 2); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of five minus two:\n%s\n", pc); + if (strcmp (pc, "03") != 0) + fail ("mpi_sub_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_sub_ui (result, one, 10); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of one minus ten:\n%s\n", pc); + if (strcmp (pc, "-09") != 0) + fail ("mpi_sub_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_sub_ui (result, minusfive, 2); + + 
gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus five minus two:\n%s\n", pc); + if (strcmp (pc, "-07") != 0) + fail ("mpi_sub_ui failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_release(one); gcry_mpi_release(two); gcry_mpi_release(result); + gcry_mpi_release(zero); + gcry_mpi_release(five); + gcry_mpi_release(minusfive); return 1; } @@ -436,21 +515,47 @@ test_mul (void) gcry_mpi_t two; gcry_mpi_t three; gcry_mpi_t result; - unsigned char* pc; + gcry_mpi_t minusfive; + char *pc; gcry_mpi_scan(&two, GCRYMPI_FMT_USG, twos, sizeof(twos), NULL); gcry_mpi_scan(&three, GCRYMPI_FMT_USG, threes, sizeof(threes), NULL); result = gcry_mpi_new(0); gcry_mpi_mul(result, two, three); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of two mul three:\n%s\n", pc); + if (strcmp (pc, "060C12181E242A30363C42484E545A60666C72787E848A90" + "969CA2A8AEB4BAC0C6CCD2D8DEE4EAF0F6FD03090F151B21" + "1B150F0902FCF6F0EAE4DED8D2CCC6C0BAB4AEA8A29C9690" + "8A847E78726C66605A544E48423C36302A241E18120C06") != 0) + fail ("mpi_mul failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_scan(&minusfive, GCRYMPI_FMT_HEX, "-5", 0, NULL); + gcry_mpi_mul_ui (result, minusfive, 3); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus five mul three:\n%s\n", pc); + if (strcmp (pc, "-0F") != 0) + fail ("mpi_mul_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_mul_ui (result, result, 0); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus fifteen mul zero:\n%s\n", pc); + if (strcmp (pc, "00") != 0) + fail ("mpi_mul_ui failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_release(two); gcry_mpi_release(three); gcry_mpi_release(result); + gcry_mpi_release(minusfive); 
return 1; } -- 2.32.0 From jussi.kivilinna at iki.fi Tue Nov 30 21:34:02 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Nov 2021 22:34:02 +0200 Subject: gcry_mpi_sub_ui result is positive when it should be negative In-Reply-To: References: Message-ID: <2e4bbf20-16ab-759a-5e3d-0d732efddf5c@iki.fi> Hello, On 30.11.2021 15.21, Guido Vranken via Gcrypt-devel wrote: > In the program below, the result of the computation -5 - 2 should be -7, but it is 7. > > Compare with the behavior of gcry_mpi_add_ui and gcry_mpi_mul_ui (commented), which do produce a negative result, as they should. > > This was tested on the latest repository checkout. Thanks for report. This looks to go all the way back to code from 1997, commit "initial checkin" (4b5e71ca4e84e61e595dec19e1c7cab0c0a73f24). -Jussi > > #include > > #define CF_CHECK_EQ(expr, res) if ( (expr) != (res) ) { goto end; } > > int main(void) > { > ? ? gcry_mpi_t A; > ? ? gcry_mpi_t res; > ? ? gcry_error_t err; > ? ? char *buf; > > ? ? CF_CHECK_EQ(err = gcry_mpi_scan(&A, GCRYMPI_FMT_HEX, "-5", 0, NULL), 0); > ? ? CF_CHECK_EQ(err = gcry_mpi_scan(&res, GCRYMPI_FMT_HEX, "0", 0, NULL), 0); > ? ? gcry_mpi_sub_ui(res, A, 2); > ? ? //gcry_mpi_add_ui(res, A, 2); > ? ? //gcry_mpi_mul_ui(res, A, 2); > ? ? CF_CHECK_EQ(err = gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char**)&buf, NULL, res), 0); > ? ? printf("%s\n", buf); > end: > > ? ? return 0; > } > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel >