[PATCH 1/3] ghash|polyval: add x86_64 VPCLMUL/AVX2 accelerated implementation

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sun Mar 6 18:19:08 CET 2022


* cipher/cipher-gcm-intel-pclmul.c (GCM_INTEL_USE_VPCLMUL_AVX2)
(GCM_INTEL_AGGR8_TABLE_INITIALIZED)
(GCM_INTEL_AGGR16_TABLE_INITIALIZED): New.
(gfmul_pclmul): Fixes to comments.
[GCM_USE_INTEL_VPCLMUL_AVX2] (GFMUL_AGGR16_ASM_VPCMUL_AVX2)
(gfmul_vpclmul_avx2_aggr16, gfmul_vpclmul_avx2_aggr16_le)
(gfmul_pclmul_avx2, gcm_lsh_avx2, load_h1h2_to_ymm1)
(ghash_setup_aggr8_avx2, ghash_setup_aggr16_avx2): New.
(_gcry_ghash_setup_intel_pclmul): Add 'hw_features' parameter; Set up
ghash and polyval function pointers for context; Add VPCLMUL/AVX2 code
path; Defer aggr8 and aggr16 table initialization until first use in
'_gcry_ghash_intel_pclmul' or '_gcry_polyval_intel_pclmul'.
[__x86_64__] (ghash_setup_aggr8): New.
(_gcry_ghash_intel_pclmul): Add VPCLMUL/AVX2 code path; Add call for
aggr8 table initialization.
(_gcry_polyval_intel_pclmul): Add VPCLMUL/AVX2 code path; Add call for
aggr8 table initialization.
* cipher/cipher-gcm.c [GCM_USE_INTEL_PCLMUL] (_gcry_ghash_intel_pclmul)
(_gcry_polyval_intel_pclmul): Remove.
[GCM_USE_INTEL_PCLMUL] (_gcry_ghash_setup_intel_pclmul): Add
'hw_features' parameter.
(setupM) [GCM_USE_INTEL_PCLMUL]: Pass HW features to
'_gcry_ghash_setup_intel_pclmul'; Let '_gcry_ghash_setup_intel_pclmul'
set up function pointers.
* cipher/cipher-internal.h (GCM_USE_INTEL_VPCLMUL_AVX2): New.
(gcry_cipher_handle): Add member 'gcm.hw_impl_flags'.
--

Patch adds a VPCLMUL/AVX2 accelerated implementation for GHASH (GCM) and
POLYVAL (GCM-SIV).
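
The new code keeps the aggregated approach of the existing PCLMUL path
but works on 16 blocks per iteration: the running digest is folded into
the first block, each block is multiplied by the matching power of H
from gcm_table, and a single reduction covers all sixteen products.
The unoptimized C model below is for illustration only and is not part
of the patch; 'ghash_aggr16', 'gf128_mul', 'block' and 'h_pow' are
made-up names, gf128_mul is the plain bit-wise GHASH multiplication
from NIST SP 800-38D, and h_pow[k] is assumed to hold H^(k+1):

#include <stdint.h>
#include <string.h>

/* Bit-wise GF(2^128) multiplication as defined for GHASH in NIST
   SP 800-38D; illustration only, the patch computes this with
   (V)PCLMULQDQ instead. */
static void
gf128_mul (uint8_t r[16], const uint8_t x[16], const uint8_t y[16])
{
  uint8_t z[16] = { 0 };
  uint8_t v[16];
  int i, j;

  memcpy (v, y, 16);

  for (i = 0; i < 128; i++)
    {
      int xbit = (x[i / 8] >> (7 - (i % 8))) & 1;
      int lsb = v[15] & 1;

      if (xbit)
        for (j = 0; j < 16; j++)
          z[j] ^= v[j];

      /* V <- V >> 1; if the dropped bit was set, xor in the reduction
         constant R = 0xe1 || 0^120. */
      for (j = 15; j > 0; j--)
        v[j] = (v[j] >> 1) | (v[j - 1] << 7);
      v[0] >>= 1;
      if (lsb)
        v[0] ^= 0xe1;
    }

  memcpy (r, z, 16);
}

/* One 16-block aggregated GHASH update, equivalent to sixteen
   sequential Y = (Y xor B_i) * H steps: the digest is folded into the
   first block, block i is multiplied by H^(16-i), and the partial
   products are xored together before a single reduction. */
static void
ghash_aggr16 (uint8_t y[16], const uint8_t block[16][16],
              const uint8_t h_pow[16][16])
{
  uint8_t acc[16] = { 0 };
  uint8_t x[16], p[16];
  int i, j;

  for (i = 0; i < 16; i++)
    {
      memcpy (x, block[i], 16);
      if (i == 0)
        for (j = 0; j < 16; j++)
          x[j] ^= y[j]; /* fold the running digest into the first block */

      gf128_mul (p, x, h_pow[15 - i]); /* block i times H^(16-i) */
      for (j = 0; j < 16; j++)
        acc[j] ^= p[j];
    }

  memcpy (y, acc, 16);
}

In the patch these products are computed with VPCLMULQDQ on 256-bit
registers holding two powers of H each, and the reduction is performed
once per 16-block aggregate.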

Benchmark on AMD Ryzen 5800X (zen3):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       GCM auth |     0.088 ns/B     10825 MiB/s     0.427 c/B      4850
   GCM-SIV auth |     0.083 ns/B     11472 MiB/s     0.403 c/B      4850

After: (~1.93x faster)
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       GCM auth |     0.045 ns/B     21098 MiB/s     0.219 c/B      4850
   GCM-SIV auth |     0.043 ns/B     22181 MiB/s     0.209 c/B      4850

AES128-GCM / AES128-GCM-SIV encryption:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        GCM enc |     0.079 ns/B     12073 MiB/s     0.383 c/B      4850
    GCM-SIV enc |     0.076 ns/B     12500 MiB/s     0.370 c/B      4850

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       GCM auth |     0.080 ns/B     11919 MiB/s     0.327 c/B      4090
   GCM-SIV auth |     0.075 ns/B     12643 MiB/s     0.309 c/B      4090

After: (~1.28x faster)
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
       GCM auth |     0.062 ns/B     15348 MiB/s     0.254 c/B      4090
   GCM-SIV auth |     0.058 ns/B     16381 MiB/s     0.238 c/B      4090

AES128-GCM / AES128-GCM-SIV encryption:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        GCM enc |     0.101 ns/B      9441 MiB/s     0.413 c/B      4090
    GCM-SIV enc |     0.098 ns/B      9692 MiB/s     0.402 c/B      4089

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/cipher-gcm-intel-pclmul.c | 809 +++++++++++++++++++++++++++----
 cipher/cipher-gcm.c              |  15 +-
 cipher/cipher-internal.h         |  11 +
 3 files changed, 724 insertions(+), 111 deletions(-)

diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index daf807d0..b7324e8f 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -1,6 +1,6 @@
 /* cipher-gcm-intel-pclmul.c  -  Intel PCLMUL accelerated Galois Counter Mode
  *                               implementation
- * Copyright (C) 2013-2014,2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2014,2019,2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -49,12 +49,18 @@
 #define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
 
 
+#define GCM_INTEL_USE_VPCLMUL_AVX2         (1 << 0)
+#define GCM_INTEL_AGGR8_TABLE_INITIALIZED  (1 << 1)
+#define GCM_INTEL_AGGR16_TABLE_INITIALIZED (1 << 2)
+
+
 /*
  Intel PCLMUL ghash based on white paper:
   "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
    GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
  */
-static ASM_FUNC_ATTR_INLINE void reduction(void)
+static ASM_FUNC_ATTR_INLINE
+void reduction(void)
 {
   /* input: <xmm1:xmm3> */
 
@@ -83,7 +89,8 @@ static ASM_FUNC_ATTR_INLINE void reduction(void)
                 ::: "memory" );
 }
 
-static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void)
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul(void)
 {
   /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
      Input must be converted to little-endian.
@@ -358,12 +365,12 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
                                                                                \
     "pshufd $78, %%xmm8, %%xmm11\n\t"                                          \
     "pshufd $78, %%xmm5, %%xmm7\n\t"                                           \
-    "pxor %%xmm8, %%xmm11\n\t"  /* xmm11 holds 4:a0+a1 */                      \
-    "pxor %%xmm5, %%xmm7\n\t"   /* xmm7 holds 4:b0+b1 */                       \
+    "pxor %%xmm8, %%xmm11\n\t"  /* xmm11 holds 2:a0+a1 */                      \
+    "pxor %%xmm5, %%xmm7\n\t"   /* xmm7 holds 2:b0+b1 */                       \
     "movdqa %%xmm8, %%xmm6\n\t"                                                \
-    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"   /* xmm6 holds 4:a0*b0 */              \
-    "pclmulqdq $17, %%xmm8, %%xmm5\n\t"  /* xmm5 holds 4:a1*b1 */              \
-    "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 4:(a0+a1)*(b0+b1) */    \
+    "pclmulqdq $0, %%xmm5, %%xmm6\n\t"   /* xmm6 holds 2:a0*b0 */              \
+    "pclmulqdq $17, %%xmm8, %%xmm5\n\t"  /* xmm5 holds 2:a1*b1 */              \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t"  /* xmm7 holds 2:(a0+a1)*(b0+b1) */    \
                                                                                \
     "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */             \
     "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */             \
@@ -371,16 +378,16 @@ gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table)
                                                                                \
     "pshufd $78, %%xmm0, %%xmm11\n\t"                                          \
     "pshufd $78, %%xmm2, %%xmm7\n\t"                                           \
-    "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */                       \
-    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 3:b0+b1 */                        \
+    "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */                       \
+    "pxor %%xmm2, %%xmm7\n\t"  /* xmm7 holds 1:b0+b1 */                        \
     "movdqa %%xmm0, %%xmm6\n\t"                                                \
-    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 3:a0*b0 */               \
-    "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */               \
-    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */     \
+    "pclmulqdq $0, %%xmm2, %%xmm6\n\t"  /* xmm6 holds 1:a0*b0 */               \
+    "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */               \
+    "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */     \
                                                                                \
-    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */         \
-    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */         \
-    "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\
+    "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4+5+6+7+8:a0*b0 */           \
+    "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4+5+6+7+8:a1*b1 */           \
+    "pxor %%xmm7, %%xmm4\n\t"/* xmm4 holds 1+2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */  \
                                                                                \
     /* aggregated reduction... */                                              \
     "movdqa %%xmm3, %%xmm5\n\t"                                                \
@@ -432,14 +439,409 @@ gfmul_pclmul_aggr8_le(const void *buf, const void *h_table)
 
   reduction();
 }
-#endif
 
-static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX2
+
+#define GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le)                                          \
+    /* perform clmul and merge results... */                                            \
+    "vmovdqu 0*16(%[buf]), %%ymm5\n\t"                                                  \
+    "vmovdqu 2*16(%[buf]), %%ymm2\n\t"                                                  \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t"                                                  \
+                                                                                        \
+    "vpshufd $78, %%ymm0, %%ymm5\n\t"                                                   \
+    "vpshufd $78, %%ymm1, %%ymm4\n\t"                                                   \
+    "vpxor %%ymm0, %%ymm5, %%ymm5\n\t" /* ymm5 holds 15|16:a0+a1 */                     \
+    "vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds 15|16:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t"  /* ymm3 holds 15|16:a0*b0 */           \
+    "vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm1 holds 15|16:a1*b1 */           \
+    "vpclmulqdq $0, %%ymm5, %%ymm4, %%ymm4\n\t"  /* ymm4 holds 15|16:(a0+a1)*(b0+b1) */ \
+                                                                                        \
+    "vmovdqu %[h1_h2], %%ymm0\n\t"                                                      \
+                                                                                        \
+    "vpshufd $78, %%ymm13, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm13, %%ymm14, %%ymm14\n\t" /* ymm14 holds 13|14:a0+a1 */                 \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 13|14:b0+b1 */                  \
+    "vpclmulqdq $0, %%ymm2, %%ymm13, %%ymm6\n\t"  /* ymm6 holds 13|14:a0*b0 */          \
+    "vpclmulqdq $17, %%ymm13, %%ymm2, %%ymm2\n\t" /* ymm2 holds 13|14:a1*b1 */          \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t"  /* ymm7 holds 13|14:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 13+15|14+16:a0*b0 */               \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 13+15|14+16:a1*b1 */               \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 13+15|14+16:(a0+a1)*(b0+b1) */     \
+                                                                                        \
+    "vmovdqu 4*16(%[buf]), %%ymm5\n\t"                                                  \
+    "vmovdqu 6*16(%[buf]), %%ymm2\n\t"                                                  \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+                                                                                        \
+    "vpshufd $78, %%ymm12, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm5, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm12, %%ymm14, %%ymm14\n\t" /* ymm14 holds 11|12:a0+a1 */                 \
+    "vpxor %%ymm5, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 11|12:b0+b1 */                  \
+    "vpclmulqdq $0, %%ymm5, %%ymm12, %%ymm6\n\t"  /* ymm6 holds 11|12:a0*b0 */          \
+    "vpclmulqdq $17, %%ymm12, %%ymm5, %%ymm5\n\t" /* ymm5 holds 11|12:a1*b1 */          \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t"  /* ymm7 holds 11|12:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 11+13+15|12+14+16:a0*b0 */         \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 11+13+15|12+14+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 11+13+15|12+14+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpshufd $78, %%ymm11, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm11, %%ymm14, %%ymm14\n\t" /* ymm14 holds 9|10:a0+a1 */                  \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 9|10:b0+b1 */                   \
+    "vpclmulqdq $0, %%ymm2, %%ymm11, %%ymm6\n\t"  /* ymm6 holds 9|10:a0*b0 */           \
+    "vpclmulqdq $17, %%ymm11, %%ymm2, %%ymm2\n\t" /* ymm2 holds 9|10:a1*b1 */           \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 9|10:(a0+a1)*(b0+b1) */  \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 9+11+…+15|10+12+…+16:a0*b0 */      \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 9+11+…+15|10+12+…+16:a1*b1 */      \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 9+11+…+15|10+12+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vmovdqu 8*16(%[buf]), %%ymm5\n\t"                                                  \
+    "vmovdqu 10*16(%[buf]), %%ymm2\n\t"                                                 \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+                                                                                        \
+    "vpshufd $78, %%ymm10, %%ymm14\n\t"                                                 \
+    "vpshufd $78, %%ymm5, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm10, %%ymm14, %%ymm14\n\t" /* ymm14 holds 7|8:a0+a1 */                   \
+    "vpxor %%ymm5, %%ymm7, %%ymm7\n\t"    /* ymm7 holds 7|8:b0+b1 */                    \
+    "vpclmulqdq $0, %%ymm5, %%ymm10, %%ymm6\n\t"  /* ymm6 holds 7|8:a0*b0 */            \
+    "vpclmulqdq $17, %%ymm10, %%ymm5, %%ymm5\n\t" /* ymm5 holds 7|8:a1*b1 */            \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 7|8:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 7+9+…+15|8+10+…+16:a0*b0 */        \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 7+9+…+15|8+10+…+16:a1*b1 */        \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 7+9+…+15|8+10+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpshufd $78, %%ymm9, %%ymm14\n\t"                                                  \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm9, %%ymm14, %%ymm14\n\t" /* ymm14 holds 5|6:a0+a1 */                    \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"   /* ymm7 holds 5|6:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm2, %%ymm9, %%ymm6\n\t"  /* ymm6 holds 5|6:a0*b0 */             \
+    "vpclmulqdq $17, %%ymm9, %%ymm2, %%ymm2\n\t" /* ymm2 holds 5|6:a1*b1 */             \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 5|6:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 5+7+…+15|6+8+…+16:a0*b0 */         \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 5+7+…+15|6+8+…+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 5+7+…+15|6+8+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vmovdqu 12*16(%[buf]), %%ymm5\n\t"                                                 \
+    "vmovdqu 14*16(%[buf]), %%ymm2\n\t"                                                 \
+    be_to_le("vpshufb %%ymm15, %%ymm5, %%ymm5\n\t") /* be => le */                      \
+    be_to_le("vpshufb %%ymm15, %%ymm2, %%ymm2\n\t") /* be => le */                      \
+                                                                                        \
+    "vpshufd $78, %%ymm8, %%ymm14\n\t"                                                  \
+    "vpshufd $78, %%ymm5, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm8, %%ymm14, %%ymm14\n\t" /* ymm14 holds 3|4:a0+a1 */                    \
+    "vpxor %%ymm5, %%ymm7, %%ymm7\n\t"   /* ymm7 holds 3|4:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm5, %%ymm8, %%ymm6\n\t"  /* ymm6 holds 3|4:a0*b0 */             \
+    "vpclmulqdq $17, %%ymm8, %%ymm5, %%ymm5\n\t" /* ymm5 holds 3|4:a1*b1 */             \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 3|4:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 3+5+…+15|4+6+…+16:a0*b0 */         \
+    "vpxor %%ymm5, %%ymm1, %%ymm1\n\t" /* ymm1 holds 3+5+…+15|4+6+…+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 3+5+…+15|4+6+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    "vpshufd $78, %%ymm0, %%ymm14\n\t"                                                  \
+    "vpshufd $78, %%ymm2, %%ymm7\n\t"                                                   \
+    "vpxor %%ymm0, %%ymm14, %%ymm14\n\t" /* ymm14 holds 1|2:a0+a1 */                    \
+    "vpxor %%ymm2, %%ymm7, %%ymm7\n\t"   /* ymm7 holds 1|2:b0+b1 */                     \
+    "vpclmulqdq $0, %%ymm2, %%ymm0, %%ymm6\n\t"  /* ymm6 holds 1|2:a0*b0 */             \
+    "vpclmulqdq $17, %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds 1|2:a1*b1 */             \
+    "vpclmulqdq $0, %%ymm14, %%ymm7, %%ymm7\n\t" /* ymm7 holds 1|2:(a0+a1)*(b0+b1) */   \
+                                                                                        \
+    "vmovdqu %[h15_h16], %%ymm0\n\t"                                                    \
+                                                                                        \
+    "vpxor %%ymm6, %%ymm3, %%ymm3\n\t" /* ymm3 holds 1+3+…+15|2+4+…+16:a0*b0 */         \
+    "vpxor %%ymm2, %%ymm1, %%ymm1\n\t" /* ymm1 holds 1+3+…+15|2+4+…+16:a1*b1 */         \
+    "vpxor %%ymm7, %%ymm4, %%ymm4\n\t" /* ymm4 holds 1+3+…+15|2+4+…+16:(a0+a1)*(b0+b1) */\
+                                                                                        \
+    /* aggregated reduction... */                                                       \
+    "vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */                     \
+    "vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */     \
+    "vpslldq $8, %%ymm4, %%ymm5\n\t"                                                    \
+    "vpsrldq $8, %%ymm4, %%ymm4\n\t"                                                    \
+    "vpxor %%ymm5, %%ymm3, %%ymm3\n\t"                                                  \
+    "vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:xmm3> holds the result of the           \
+                                          carry-less multiplication of ymm0             \
+                                          by ymm1 */                                    \
+                                                                                        \
+    /* first phase of the reduction */                                                  \
+    "vpsllq $1, %%ymm3, %%ymm6\n\t"  /* packed right shifting << 63 */                  \
+    "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"                                                  \
+    "vpsllq $57, %%ymm3, %%ymm5\n\t"  /* packed right shifting << 57 */                 \
+    "vpsllq $62, %%ymm6, %%ymm6\n\t"  /* packed right shifting << 62 */                 \
+    "vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */                   \
+    "vpshufd $0x6a, %%ymm6, %%ymm5\n\t"                                                 \
+    "vpshufd $0xae, %%ymm6, %%ymm6\n\t"                                                 \
+    "vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */      \
+                                                                                        \
+    /* second phase of the reduction */                                                 \
+    "vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */                   \
+    "vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed left shifting >> 1 */                  \
+    "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"                                                  \
+    "vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed left shifting >> 2 */                  \
+    "vpxor %%ymm3, %%ymm1, %%ymm1\n\t"                                                  \
+    "vpsrlq $5, %%ymm3, %%ymm3\n\t"    /* packed left shifting >> 7 */                  \
+    "vpxor %%ymm3, %%ymm6, %%ymm6\n\t"                                                  \
+    "vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */                      \
+                                                                                        \
+    /* merge 128-bit halves */                                                          \
+    "vextracti128 $1, %%ymm1, %%xmm2\n\t"                                               \
+    "vpxor %%xmm2, %%xmm1, %%xmm1\n\t"
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx2_aggr16(const void *buf, const void *h_table,
+			  const u64 *h1_h2_h15_h16)
+{
+  /* Input:
+      Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13
+      bemask: YMM15
+      Hash: XMM1
+    Output:
+      Hash: XMM1
+    Inputs YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay
+    unmodified.
+  */
+  asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(be_to_le)
+		:
+		: [buf] "r" (buf),
+		  [h_table] "r" (h_table),
+		  [h1_h2] "m" (h1_h2_h15_h16[0]),
+		  [h15_h16] "m" (h1_h2_h15_h16[4])
+		: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gfmul_vpclmul_avx2_aggr16_le(const void *buf, const void *h_table,
+			     const u64 *h1_h2_h15_h16)
+{
+  /* Input:
+      Hx: YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13
+      bemask: YMM15
+      Hash: XMM1
+    Output:
+      Hash: XMM1
+    Inputs YMM0, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13 and YMM15 stay
+    unmodified.
+  */
+  asm volatile (GFMUL_AGGR16_ASM_VPCMUL_AVX2(le_to_le)
+		:
+		: [buf] "r" (buf),
+		  [h_table] "r" (h_table),
+		  [h1_h2] "m" (h1_h2_h15_h16[0]),
+		  [h15_h16] "m" (h1_h2_h15_h16[4])
+		: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE
+void gfmul_pclmul_avx2(void)
+{
+  /* Input: YMM0 and YMM1, Output: YMM1. Input YMM0 stays unmodified.
+     Input must be converted to little-endian.
+   */
+  asm volatile (/* gfmul, ymm0 has operator a and ymm1 has operator b. */
+		"vpshufd $78, %%ymm0, %%ymm2\n\t"
+		"vpshufd $78, %%ymm1, %%ymm4\n\t"
+		"vpxor %%ymm0, %%ymm2, %%ymm2\n\t" /* ymm2 holds a0+a1 */
+		"vpxor %%ymm1, %%ymm4, %%ymm4\n\t" /* ymm4 holds b0+b1 */
+
+		"vpclmulqdq $0, %%ymm1, %%ymm0, %%ymm3\n\t"  /* ymm3 holds a0*b0 */
+		"vpclmulqdq $17, %%ymm0, %%ymm1, %%ymm1\n\t" /* ymm6 holds a1*b1 */
+		"vpclmulqdq $0, %%ymm2, %%ymm4, %%ymm4\n\t"  /* ymm4 holds (a0+a1)*(b0+b1) */
+
+		"vpxor %%ymm1, %%ymm3, %%ymm5\n\t" /* ymm5 holds a0*b0+a1*b1 */
+		"vpxor %%ymm5, %%ymm4, %%ymm4\n\t" /* ymm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+		"vpslldq $8, %%ymm4, %%ymm5\n\t"
+		"vpsrldq $8, %%ymm4, %%ymm4\n\t"
+		"vpxor %%ymm5, %%ymm3, %%ymm3\n\t"
+		"vpxor %%ymm4, %%ymm1, %%ymm1\n\t" /* <ymm1:ymm3> holds the result of the
+						      carry-less multiplication of ymm0
+						      by ymm1 */
+
+		/* first phase of the reduction */
+		"vpsllq $1, %%ymm3, %%ymm6\n\t"  /* packed right shifting << 63 */
+		"vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+		"vpsllq $57, %%ymm3, %%ymm5\n\t"  /* packed right shifting << 57 */
+		"vpsllq $62, %%ymm6, %%ymm6\n\t"  /* packed right shifting << 62 */
+		"vpxor %%ymm5, %%ymm6, %%ymm6\n\t" /* xor the shifted versions */
+		"vpshufd $0x6a, %%ymm6, %%ymm5\n\t"
+		"vpshufd $0xae, %%ymm6, %%ymm6\n\t"
+		"vpxor %%ymm5, %%ymm3, %%ymm3\n\t" /* first phase of the reduction complete */
+
+		/* second phase of the reduction */
+		"vpxor %%ymm3, %%ymm1, %%ymm1\n\t" /* xor the shifted versions */
+		"vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed left shifting >> 1 */
+		"vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+		"vpsrlq $1, %%ymm3, %%ymm3\n\t"    /* packed left shifting >> 2 */
+		"vpxor %%ymm3, %%ymm1, %%ymm1\n\t"
+		"vpsrlq $5, %%ymm3, %%ymm3\n\t"    /* packed left shifting >> 7 */
+		"vpxor %%ymm3, %%ymm6, %%ymm6\n\t"
+		"vpxor %%ymm6, %%ymm1, %%ymm1\n\t" /* the result is in ymm1 */
+                ::: "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh_avx2(void *h, unsigned int hoffs)
+{
+  static const u64 pconst[4] __attribute__ ((aligned (32))) =
+    {
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000),
+      U64_C(0x0000000000000001), U64_C(0xc200000000000000)
+    };
+
+  asm volatile ("vmovdqu %[h], %%ymm2\n\t"
+                "vpshufd $0xff, %%ymm2, %%ymm3\n\t"
+                "vpsrad $31, %%ymm3, %%ymm3\n\t"
+                "vpslldq $8, %%ymm2, %%ymm4\n\t"
+                "vpand %[pconst], %%ymm3, %%ymm3\n\t"
+                "vpaddq %%ymm2, %%ymm2, %%ymm2\n\t"
+                "vpsrlq $63, %%ymm4, %%ymm4\n\t"
+                "vpxor %%ymm3, %%ymm2, %%ymm2\n\t"
+                "vpxor %%ymm4, %%ymm2, %%ymm2\n\t"
+                "vmovdqu %%ymm2, %[h]\n\t"
+                : [h] "+m" (*((byte *)h + hoffs))
+                : [pconst] "m" (*pconst)
+                : "memory" );
+}
+
+static ASM_FUNC_ATTR_INLINE void
+load_h1h2_to_ymm1(gcry_cipher_hd_t c)
+{
+  unsigned int key_pos =
+    offsetof(struct gcry_cipher_handle, u_mode.gcm.u_ghash_key.key);
+  unsigned int table_pos =
+    offsetof(struct gcry_cipher_handle, u_mode.gcm.gcm_table);
+
+  if (key_pos + 16 == table_pos)
+    {
+      /* Optimization: Table follows immediately after key. */
+      asm volatile ("vmovdqu %[key], %%ymm1\n\t"
+		    :
+		    : [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+		    : "memory");
+    }
+  else
+    {
+      asm volatile ("vmovdqa %[key], %%xmm1\n\t"
+		    "vinserti128 $1, 0*16(%[h_table]), %%ymm1, %%ymm1\n\t"
+		    :
+		    : [h_table] "r" (c->u_mode.gcm.gcm_table),
+		      [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+		    : "memory");
+    }
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8_avx2(gcry_cipher_hd_t c)
+{
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+  asm volatile (/* load H⁴ */
+		"vbroadcasti128 3*16(%[h_table]), %%ymm0\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+  /* load H <<< 1, H² <<< 1 */
+  load_h1h2_to_ymm1 (c);
+
+  gfmul_pclmul_avx2 (); /* H<<<1•H⁴ => H⁵, H²<<<1•H⁴ => H⁶ */
+
+  asm volatile ("vmovdqu %%ymm1, 3*16(%[h_table])\n\t"
+		/* load H³ <<< 1, H⁴ <<< 1 */
+		"vmovdqu 1*16(%[h_table]), %%ymm1\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul_avx2 (); /* H³<<<1•H⁴ => H⁷, H⁴<<<1•H⁴ => H⁸ */
+
+  asm volatile ("vmovdqu %%ymm1, 6*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+		"vmovdqu %%ymm1, 5*16(%[h_table])\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1, H⁶ <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1, H⁸ <<< 1 */
+}
+
+static ASM_FUNC_ATTR void
+ghash_setup_aggr16_avx2(gcry_cipher_hd_t c)
+{
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR16_TABLE_INITIALIZED;
+
+  asm volatile (/* load H⁸ */
+		"vbroadcasti128 7*16(%[h_table]), %%ymm0\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+  /* load H <<< 1, H² <<< 1 */
+  load_h1h2_to_ymm1 (c);
+
+  gfmul_pclmul_avx2 (); /* H<<<1•H⁸ => H⁹, H²<<<1•H⁸ => H¹⁰ */
+
+  asm volatile ("vmovdqu %%ymm1, 7*16(%[h_table])\n\t"
+		/* load H³ <<< 1, H⁴ <<< 1 */
+		"vmovdqu 1*16(%[h_table]), %%ymm1\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul_avx2 (); /* H³<<<1•H⁸ => H¹¹, H⁴<<<1•H⁸ => H¹² */
+
+  asm volatile ("vmovdqu %%ymm1, 9*16(%[h_table])\n\t"
+		/* load H⁵ <<< 1, H⁶ <<< 1 */
+		"vmovdqu 3*16(%[h_table]), %%ymm1\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul_avx2 (); /* H⁵<<<1•H⁸ => H¹³, H⁶<<<1•H⁸ => H¹⁴ */
+
+  asm volatile ("vmovdqu %%ymm1, 11*16(%[h_table])\n\t"
+		/* load H⁷ <<< 1, H⁸ <<< 1 */
+		"vmovdqu 5*16(%[h_table]), %%ymm1\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul_avx2 (); /* H⁷<<<1•H⁸ => H¹⁵, H⁸<<<1•H⁸ => H¹⁶ */
+
+  asm volatile ("vmovdqu %%ymm1, 13*16(%[h_table])\n\t"
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 7 * 16); /* H⁹ <<< 1, H¹⁰ <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 9 * 16); /* H¹¹ <<< 1, H¹² <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 11 * 16); /* H¹³ <<< 1, H¹⁴ <<< 1 */
+  gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 13 * 16); /* H¹⁵ <<< 1, H¹⁶ <<< 1 */
+}
+
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+#endif /* __x86_64__ */
+
+static unsigned int ASM_FUNC_ATTR
+_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+			  size_t nblocks);
+
+static unsigned int ASM_FUNC_ATTR
+_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
+			    size_t nblocks);
+
+static ASM_FUNC_ATTR_INLINE void
+gcm_lsh(void *h, unsigned int hoffs)
 {
   static const u64 pconst[2] __attribute__ ((aligned (16))) =
     { U64_C(0x0000000000000001), U64_C(0xc200000000000000) };
 
-  asm volatile ("movdqu (%[h]), %%xmm2\n\t"
+  asm volatile ("movdqu %[h], %%xmm2\n\t"
                 "pshufd $0xff, %%xmm2, %%xmm3\n\t"
                 "movdqa %%xmm2, %%xmm4\n\t"
                 "psrad $31, %%xmm3\n\t"
@@ -449,15 +851,14 @@ static ASM_FUNC_ATTR_INLINE void gcm_lsh(void *h, unsigned int hoffs)
                 "psrlq $63, %%xmm4\n\t"
                 "pxor %%xmm3, %%xmm2\n\t"
                 "pxor %%xmm4, %%xmm2\n\t"
-                "movdqu %%xmm2, (%[h])\n\t"
-                :
-                : [pconst] "m" (*pconst),
-                  [h] "r" ((byte *)h + hoffs)
+                "movdqu %%xmm2, %[h]\n\t"
+                : [h] "+m" (*((byte *)h + hoffs))
+                : [pconst] "m" (*pconst)
                 : "memory" );
 }
 
 void ASM_FUNC_ATTR
-_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
+_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, unsigned int hw_features)
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -480,6 +881,12 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                 : "memory" );
 #endif
 
+  (void)hw_features;
+
+  c->u_mode.gcm.hw_impl_flags = 0;
+  c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+  c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
+
   /* Swap endianness of hsub. */
   asm volatile ("movdqu (%[key]), %%xmm0\n\t"
                 "pshufb %[be_mask], %%xmm0\n\t"
@@ -489,7 +896,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                   [be_mask] "m" (*be_mask)
                 : "memory");
 
-  gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
+  gcm_lsh (c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
 
   asm volatile ("movdqa %%xmm0, %%xmm1\n\t"
                 "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */
@@ -500,80 +907,81 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
   gfmul_pclmul (); /* H<<<1•H => H² */
 
   asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t"
-                "movdqa %%xmm1, %%xmm7\n\t"
                 :
                 : [h_table] "r" (c->u_mode.gcm.gcm_table)
                 : "memory");
 
-  gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */
-  gfmul_pclmul (); /* H<<<1•H² => H³ */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 0 * 16); /* H² <<< 1 */
 
-  asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
-                "movdqu %%xmm1, 1*16(%[h_table])\n\t"
-                "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+  if (0)
+    { }
+#ifdef GCM_USE_INTEL_VPCLMUL_AVX2
+  else if ((hw_features & HWF_INTEL_VAES_VPCLMUL)
+           && (hw_features & HWF_INTEL_AVX2))
+    {
+      c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_USE_VPCLMUL_AVX2;
 
-  gfmul_pclmul (); /* H²<<<1•H² => H⁴ */
+      asm volatile (/* H² */
+		    "vinserti128 $1, %%xmm1, %%ymm1, %%ymm1\n\t"
+		    /* load H <<< 1, H² <<< 1 */
+		    "vinserti128 $1, 0*16(%[h_table]), %%ymm0, %%ymm0\n\t"
+		    :
+		    : [h_table] "r" (c->u_mode.gcm.gcm_table)
+		    : "memory");
 
-  asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t"
-                "movdqa %%xmm1, %%xmm0\n\t"
-                "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table),
-                  [key] "r" (c->u_mode.gcm.u_ghash_key.key)
-                : "memory");
+      gfmul_pclmul_avx2 (); /* H<<<1•H² => H³, H²<<<1•H² => H⁴ */
 
-  gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */
+      asm volatile ("vmovdqu %%ymm1, 2*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */
+		    "vmovdqu %%ymm1, 1*16(%[h_table])\n\t"
+		    :
+		    : [h_table] "r" (c->u_mode.gcm.gcm_table)
+		    : "memory");
 
-#ifdef __x86_64__
-  gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */
+      gcm_lsh_avx2 (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1, H⁴ <<< 1 */
 
-  asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
-                "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
-
-  gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */
-
-  asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
-                "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("vzeroupper\n\t"
+		    ::: "memory" );
+    }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+  else
+    {
+      asm volatile ("movdqa %%xmm1, %%xmm7\n\t"
+		    ::: "memory");
 
-  gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */
+      gfmul_pclmul (); /* H<<<1•H² => H³ */
 
-  asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
-                "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("movdqa %%xmm7, %%xmm0\n\t"
+		    "movdqu %%xmm1, 1*16(%[h_table])\n\t"
+		    "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+		    :
+		    : [h_table] "r" (c->u_mode.gcm.gcm_table)
+		    : "memory");
 
-  gfmul_pclmul (); /* H³<<<1•H⁴ => H⁸ */
+      gfmul_pclmul (); /* H²<<<1•H² => H⁴ */
 
-  asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
-                :
-                : [h_table] "r" (c->u_mode.gcm.gcm_table)
-                : "memory");
+      asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" /* store H⁴ for aggr8 setup */
+		    "movdqu %%xmm1, 2*16(%[h_table])\n\t"
+		    :
+		    : [h_table] "r" (c->u_mode.gcm.gcm_table)
+		    : "memory");
 
-  gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */
-  gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */
+      gcm_lsh (c->u_mode.gcm.gcm_table, 1 * 16); /* H³ <<< 1 */
+      gcm_lsh (c->u_mode.gcm.gcm_table, 2 * 16); /* H⁴ <<< 1 */
+    }
 
-#ifdef __WIN64__
   /* Clear/restore used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "movdqu 0*16(%0), %%xmm6\n\t"
+  asm volatile ("pxor %%xmm0, %%xmm0\n\t"
+		"pxor %%xmm1, %%xmm1\n\t"
+		"pxor %%xmm2, %%xmm2\n\t"
+		"pxor %%xmm3, %%xmm3\n\t"
+		"pxor %%xmm4, %%xmm4\n\t"
+		"pxor %%xmm5, %%xmm5\n\t"
+		"pxor %%xmm6, %%xmm6\n\t"
+		"pxor %%xmm7, %%xmm7\n\t"
+		::: "memory" );
+#ifdef __x86_64__
+#ifdef __WIN64__
+  asm volatile ("movdqu 0*16(%0), %%xmm6\n\t"
                 "movdqu 1*16(%0), %%xmm7\n\t"
                 "movdqu 2*16(%0), %%xmm8\n\t"
                 "movdqu 3*16(%0), %%xmm9\n\t"
@@ -587,16 +995,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                 : "r" (win64tmp)
                 : "memory" );
 #else
-  /* Clear used registers. */
-  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
-                "pxor %%xmm1, %%xmm1\n\t"
-                "pxor %%xmm2, %%xmm2\n\t"
-                "pxor %%xmm3, %%xmm3\n\t"
-                "pxor %%xmm4, %%xmm4\n\t"
-                "pxor %%xmm5, %%xmm5\n\t"
-                "pxor %%xmm6, %%xmm6\n\t"
-                "pxor %%xmm7, %%xmm7\n\t"
-                "pxor %%xmm8, %%xmm8\n\t"
+  asm volatile ("pxor %%xmm8, %%xmm8\n\t"
                 "pxor %%xmm9, %%xmm9\n\t"
                 "pxor %%xmm10, %%xmm10\n\t"
                 "pxor %%xmm11, %%xmm11\n\t"
@@ -605,14 +1004,67 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
                 "pxor %%xmm14, %%xmm14\n\t"
                 "pxor %%xmm15, %%xmm15\n\t"
                 ::: "memory" );
-#endif
-#endif
+#endif /* __WIN64__ */
+#endif /* __x86_64__ */
 }
 
 
+#ifdef __x86_64__
+static ASM_FUNC_ATTR void
+ghash_setup_aggr8(gcry_cipher_hd_t c)
+{
+  c->u_mode.gcm.hw_impl_flags |= GCM_INTEL_AGGR8_TABLE_INITIALIZED;
+
+  asm volatile ("movdqa 3*16(%[h_table]), %%xmm0\n\t" /* load H⁴ */
+		"movdqu %[key], %%xmm1\n\t" /* load H <<< 1 */
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table),
+		  [key] "m" (*c->u_mode.gcm.u_ghash_key.key)
+		: "memory");
+
+  gfmul_pclmul (); /* H<<<1•H⁴ => H⁵ */
+
+  asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t"
+		"movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H² <<< 1 */
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul (); /* H²<<<1•H⁴ => H⁶ */
+
+  asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t"
+		"movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H³ <<< 1 */
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul (); /* H³<<<1•H⁴ => H⁷ */
+
+  asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t"
+		"movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H⁴ <<< 1 */
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gfmul_pclmul (); /* H⁴<<<1•H⁴ => H⁸ */
+
+  asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t"
+		"movdqu %%xmm1, 7*16(%[h_table])\n\t" /* store H⁸ for aggr16 setup */
+		:
+		: [h_table] "r" (c->u_mode.gcm.gcm_table)
+		: "memory");
+
+  gcm_lsh (c->u_mode.gcm.gcm_table, 3 * 16); /* H⁵ <<< 1 */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 4 * 16); /* H⁶ <<< 1 */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 5 * 16); /* H⁷ <<< 1 */
+  gcm_lsh (c->u_mode.gcm.gcm_table, 6 * 16); /* H⁸ <<< 1 */
+}
+#endif /* __x86_64__ */
+
+
 unsigned int ASM_FUNC_ATTR
 _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
-                          size_t nblocks)
+			  size_t nblocks)
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -650,12 +1102,93 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                   [be_mask] "m" (*be_mask)
                 : "memory" );
 
+#if defined(GCM_USE_INTEL_VPCLMUL_AVX2)
+  if (nblocks >= 16
+      && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2))
+    {
+      u64 h1_h2_h15_h16[4*2];
+
+      asm volatile ("vinserti128 $1, %%xmm7, %%ymm7, %%ymm15\n\t"
+		    "vmovdqa %%xmm1, %%xmm8\n\t"
+		    ::: "memory" );
+
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+	{
+	  ghash_setup_aggr8_avx2 (c);
+	}
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+	{
+	  ghash_setup_aggr16_avx2 (c);
+	}
+
+      /* Preload H1 to H16. */
+      asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t"
+		    "vmovdqu 0*16(%[h_table]), %%xmm7\n\t"
+		    "vpxor %%xmm8, %%xmm8, %%xmm8\n\t"
+		    "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t"  /* H15|H16 */
+		    "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */
+		    "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t"  /* H11|H12 */
+		    "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t"  /* H9|H10 */
+		    "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t"  /* H7|H8 */
+		    "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t"   /* H5|H6 */
+		    "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t"   /* H3|H4 */
+		    "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */
+		    "vmovdqu %%ymm0, %[h15_h16]\n\t"
+		    "vmovdqu %%ymm7, %[h1_h2]\n\t"
+		    : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+		      [h15_h16] "=m" (h1_h2_h15_h16[4])
+		    : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+		      [h_table] "r" (c->u_mode.gcm.gcm_table)
+		    : "memory" );
+
+      while (nblocks >= 16)
+	{
+	  gfmul_vpclmul_avx2_aggr16 (buf, c->u_mode.gcm.gcm_table,
+				     h1_h2_h15_h16);
+
+	  buf += 16 * blocksize;
+	  nblocks -= 16;
+	}
+
+      /* Clear used x86-64/XMM registers. */
+      asm volatile("vmovdqu %%ymm15, %[h15_h16]\n\t"
+		   "vmovdqu %%ymm15, %[h1_h2]\n\t"
+		   "vzeroupper\n\t"
+#ifndef __WIN64__
+		   "pxor %%xmm8, %%xmm8\n\t"
+		   "pxor %%xmm9, %%xmm9\n\t"
+		   "pxor %%xmm10, %%xmm10\n\t"
+		   "pxor %%xmm11, %%xmm11\n\t"
+		   "pxor %%xmm12, %%xmm12\n\t"
+		   "pxor %%xmm13, %%xmm13\n\t"
+		   "pxor %%xmm14, %%xmm14\n\t"
+		   "pxor %%xmm15, %%xmm15\n\t"
+#endif
+		   "movdqa %[be_mask], %%xmm7\n\t"
+		   : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+		     [h15_h16] "=m" (h1_h2_h15_h16[4])
+		   : [be_mask] "m" (*be_mask)
+		   : "memory" );
+    }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
-      /* Preload H1. */
       asm volatile ("movdqa %%xmm7, %%xmm15\n\t"
-                    "movdqa %[h_1], %%xmm0\n\t"
+		    ::: "memory" );
+
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+	{
+	  asm volatile ("movdqa %%xmm1, %%xmm8\n\t"
+			::: "memory" );
+	  ghash_setup_aggr8 (c);
+	  asm volatile ("movdqa %%xmm8, %%xmm1\n\t"
+			::: "memory" );
+	}
+
+      /* Preload H1. */
+      asm volatile ("movdqa %[h_1], %%xmm0\n\t"
                     :
                     : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key)
                     : "memory" );
@@ -667,6 +1200,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
           buf += 8 * blocksize;
           nblocks -= 8;
         }
+
 #ifndef __WIN64__
       /* Clear used x86-64/XMM registers. */
       asm volatile( "pxor %%xmm8, %%xmm8\n\t"
@@ -680,7 +1214,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                     ::: "memory" );
 #endif
     }
-#endif
+#endif /* __x86_64__ */
 
   while (nblocks >= 4)
     {
@@ -761,7 +1295,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
 
 unsigned int ASM_FUNC_ATTR
 _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
-                            size_t nblocks)
+			    size_t nblocks)
 {
   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
@@ -799,9 +1333,86 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                   [be_mask] "m" (*be_mask)
                 : "memory" );
 
+#if defined(GCM_USE_INTEL_VPCLMUL_AVX2)
+  if (nblocks >= 16
+      && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX2))
+    {
+      u64 h1_h2_h15_h16[4*2];
+
+      asm volatile ("vmovdqa %%xmm1, %%xmm8\n\t"
+		    ::: "memory" );
+
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+	{
+	  ghash_setup_aggr8_avx2 (c);
+	}
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR16_TABLE_INITIALIZED))
+	{
+	  ghash_setup_aggr16_avx2 (c);
+	}
+
+      /* Preload H1 to H16. */
+      asm volatile ("vmovdqa %%xmm8, %%xmm1\n\t"
+		    "vpxor %%xmm8, %%xmm8, %%xmm8\n\t"
+		    "vmovdqu 0*16(%[h_table]), %%xmm7\n\t"
+		    "vperm2i128 $0x23, 13*16(%[h_table]), %%ymm8, %%ymm0\n\t"  /* H15|H16 */
+		    "vperm2i128 $0x23, 11*16(%[h_table]), %%ymm8, %%ymm13\n\t" /* H13|H14 */
+		    "vperm2i128 $0x23, 9*16(%[h_table]), %%ymm8, %%ymm12\n\t"  /* H11|H12 */
+		    "vperm2i128 $0x23, 7*16(%[h_table]), %%ymm8, %%ymm11\n\t"  /* H9|H10 */
+		    "vperm2i128 $0x23, 5*16(%[h_table]), %%ymm8, %%ymm10\n\t"  /* H7|H8 */
+		    "vperm2i128 $0x23, 3*16(%[h_table]), %%ymm8, %%ymm9\n\t"   /* H5|H6 */
+		    "vperm2i128 $0x23, 1*16(%[h_table]), %%ymm8, %%ymm8\n\t"   /* H3|H4 */
+		    "vinserti128 $1, %[h_1], %%ymm7, %%ymm7\n\t" /* H1|H2 */
+		    "vmovdqu %%ymm0, %[h15_h16]\n\t"
+		    "vmovdqu %%ymm7, %[h1_h2]\n\t"
+		    : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+		      [h15_h16] "=m" (h1_h2_h15_h16[4])
+		    : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key),
+		      [h_table] "r" (c->u_mode.gcm.gcm_table)
+		    : "memory" );
+
+      while (nblocks >= 16)
+	{
+	  gfmul_vpclmul_avx2_aggr16_le (buf, c->u_mode.gcm.gcm_table,
+					h1_h2_h15_h16);
+
+	  buf += 16 * blocksize;
+	  nblocks -= 16;
+	}
+
+      /* Clear used x86-64/XMM registers. */
+      asm volatile("vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
+		   "vmovdqu %%ymm7, %[h15_h16]\n\t"
+		   "vmovdqu %%ymm7, %[h1_h2]\n\t"
+		   "vzeroupper\n\t"
+#ifndef __WIN64__
+		   "pxor %%xmm8, %%xmm8\n\t"
+		   "pxor %%xmm9, %%xmm9\n\t"
+		   "pxor %%xmm10, %%xmm10\n\t"
+		   "pxor %%xmm11, %%xmm11\n\t"
+		   "pxor %%xmm12, %%xmm12\n\t"
+		   "pxor %%xmm13, %%xmm13\n\t"
+		   "pxor %%xmm14, %%xmm14\n\t"
+#endif
+		   : [h1_h2] "=m" (h1_h2_h15_h16[0]),
+		     [h15_h16] "=m" (h1_h2_h15_h16[4])
+		   :
+		   : "memory" );
+    }
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
 #ifdef __x86_64__
   if (nblocks >= 8)
     {
+      if (!(c->u_mode.gcm.hw_impl_flags & GCM_INTEL_AGGR8_TABLE_INITIALIZED))
+	{
+	  asm volatile ("movdqa %%xmm1, %%xmm8\n\t"
+			::: "memory" );
+	  ghash_setup_aggr8 (c);
+	  asm volatile ("movdqa %%xmm8, %%xmm1\n\t"
+			::: "memory" );
+	}
+
       /* Preload H1. */
       asm volatile ("pxor %%xmm15, %%xmm15\n\t"
                     "movdqa %[h_1], %%xmm0\n\t"
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 69ff0de6..683f07b0 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -39,15 +39,8 @@
 
 
 #ifdef GCM_USE_INTEL_PCLMUL
-extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c);
-
-extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
-                                              const byte *buf, size_t nblocks);
-
-extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c,
-                                                byte *result,
-                                                const byte *buf,
-                                                size_t nblocks);
+extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c,
+					    unsigned int hw_features);
 #endif
 
 #ifdef GCM_USE_ARM_PMULL
@@ -594,9 +587,7 @@ setupM (gcry_cipher_hd_t c)
 #ifdef GCM_USE_INTEL_PCLMUL
   else if (features & HWF_INTEL_PCLMUL)
     {
-      c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
-      c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul;
-      _gcry_ghash_setup_intel_pclmul (c);
+      _gcry_ghash_setup_intel_pclmul (c, features);
     }
 #endif
 #ifdef GCM_USE_ARM_PMULL
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index c8a1097a..e31ac860 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -72,6 +72,14 @@
 # endif
 #endif /* GCM_USE_INTEL_PCLMUL */
 
+/* GCM_USE_INTEL_VPCLMUL_AVX2 indicates whether to compile GCM with Intel
+   VPCLMUL/AVX2 code.  */
+#undef GCM_USE_INTEL_VPCLMUL_AVX2
+#if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) && \
+    defined(ENABLE_AVX2_SUPPORT) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define GCM_USE_INTEL_VPCLMUL_AVX2 1
+#endif /* GCM_USE_INTEL_VPCLMUL_AVX2 */
+
 /* GCM_USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */
 #undef GCM_USE_ARM_PMULL
 #if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(GCM_USE_TABLES)
@@ -355,6 +363,9 @@ struct gcry_cipher_handle
 
       /* Key length used for GCM-SIV key generating key. */
       unsigned int siv_keylen;
+
+      /* Flags for accelerated implementations. */
+      unsigned int hw_impl_flags;
     } gcm;
 
     /* Mode specific storage for OCB mode. */
-- 
2.32.0