[PATCH 07/13] Add aggregated bulk processing for GCM on x86-64
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Nov 20 18:00:32 CET 2013
* cipher/cipher-gcm.c [__x86_64__] (gfmul_pclmul_aggr4): New.
(ghash) [GCM_USE_INTEL_PCLMUL]: Add aggregated bulk processing
for __x86_64__.
(setupM) [__x86_64__]: Add initialization for aggregated bulk
processing.
--
Intel Haswell (x86-64):
Old:
AES GCM enc | 0.990 ns/B 963.3 MiB/s 3.17 c/B
GCM dec | 0.982 ns/B 970.9 MiB/s 3.14 c/B
GCM auth | 0.711 ns/B 1340.8 MiB/s 2.28 c/B
New:
AES GCM enc | 0.535 ns/B 1783.8 MiB/s 1.71 c/B
GCM dec | 0.531 ns/B 1796.2 MiB/s 1.70 c/B
GCM auth | 0.255 ns/B 3736.4 MiB/s 0.817 c/B
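That is roughly a 1.85x speedup for GCM encryption/decryption and about a 2.8x speedup for GHASH-only authentication on this machine.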
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/cipher-gcm.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 219 insertions(+), 9 deletions(-)
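For reference, the aggregation relies on the standard GHASH identity sketched below; gfmul_pclmul_aggr4() additionally XORs the four 256-bit carry-less products together and performs only one modular reduction per four blocks, and setupM() precomputes H², H³ and H⁴ into c->u_mode.gcm.gcm_table. A rough C sketch of the reference computation (u128_t, gf128_mul and gf128_xor are hypothetical helpers used only for illustration, not libgcrypt API):

#include <stdint.h>

/* Hypothetical 128-bit block type and GF(2^128) helpers, for
 * illustration only.  gf128_mul() is assumed to multiply in
 * GF(2^128) with the GCM polynomial, reduction included. */
typedef struct { uint64_t hi, lo; } u128_t;
extern u128_t gf128_mul (u128_t a, u128_t b);
static u128_t gf128_xor (u128_t a, u128_t b)
{ a.hi ^= b.hi; a.lo ^= b.lo; return a; }

/* Serial GHASH applies Y = (Y ^ X_i) * H once per block.  Expanding
 * that recurrence over four blocks gives:
 *   Y' = (Y ^ X_1)*H^4  ^  X_2*H^3  ^  X_3*H^2  ^  X_4*H
 * which is what the aggregated path computes per iteration. */
static u128_t
ghash_aggr4_ref (u128_t Y, const u128_t X[4],
                 u128_t H1, u128_t H2, u128_t H3, u128_t H4)
{
  u128_t acc = gf128_mul (gf128_xor (Y, X[0]), H4);
  acc = gf128_xor (acc, gf128_mul (X[1], H3));
  acc = gf128_xor (acc, gf128_mul (X[2], H2));
  acc = gf128_xor (acc, gf128_mul (X[3], H1));
  return acc;                /* new Y after consuming four blocks */
}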
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index baedc04..fcfa357 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -421,6 +421,131 @@ static inline void gfmul_pclmul(void)
::: "cc" );
}
+#ifdef __x86_64__
+static inline void gfmul_pclmul_aggr4(void)
+{
+ /* Input:
+ H¹: XMM0 X_i : XMM6
+ H²: XMM8 X_(i-1) : XMM3
+ H³: XMM9 X_(i-2) : XMM2
+ H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1
+ Output:
+ Y_i: XMM1
+ Input XMM0 stays unmodified.
+ Inputs must be converted to little-endian.
+ */
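+
+ /* This computes
+ Y_i = (X_(i-3)⊕Y_(i-4))·H⁴ ⊕ X_(i-2)·H³ ⊕ X_(i-1)·H² ⊕ X_i·H¹
+ with a single reduction at the end; the prefixes 4:, 3:, 2:, 1: in
+ the comments below refer to these four products. */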
+ asm volatile (/* perform clmul and merge results... */
+ "pshufd $78, %%xmm10, %%xmm11\n\t"
+ "pshufd $78, %%xmm1, %%xmm12\n\t"
+ "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
+ "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
+
+ "pshufd $78, %%xmm9, %%xmm13\n\t"
+ "pshufd $78, %%xmm2, %%xmm14\n\t"
+ "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
+ "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
+
+ "pshufd $78, %%xmm8, %%xmm5\n\t"
+ "pshufd $78, %%xmm3, %%xmm15\n\t"
+ "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */
+ "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */
+
+ "movdqa %%xmm10, %%xmm4\n\t"
+ "movdqa %%xmm9, %%xmm7\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */
+ "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */
+ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
+ "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
+ "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
+
+ "pshufd $78, %%xmm0, %%xmm10\n\t"
+ "pshufd $78, %%xmm6, %%xmm11\n\t"
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
+ "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
+
+ "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */
+ "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
+
+ "movdqa %%xmm8, %%xmm13\n\t"
+ "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */
+ "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */
+ "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
+ "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */
+ "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
+
+ "movdqa %%xmm0, %%xmm3\n\t"
+ "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */
+ "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
+ "movdqa %%xmm11, %%xmm4\n\t"
+ "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */
+ "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */
+ "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
+
+ /* aggregated reduction... */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm4\n\t"
+ "pslldq $8, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+ carry-less multiplication of xmm0
+ by xmm1 */
+
+ /* shift the result by one bit position to the left to cope with
+ the fact that the bits are reversed */
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "movdqa %%xmm6, %%xmm5\n\t"
+ "pslld $1, %%xmm3\n\t"
+ "pslld $1, %%xmm6\n\t"
+ "psrld $31, %%xmm4\n\t"
+ "psrld $31, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm1\n\t"
+ "pslldq $4, %%xmm5\n\t"
+ "pslldq $4, %%xmm4\n\t"
+ "psrldq $12, %%xmm1\n\t"
+ "por %%xmm4, %%xmm3\n\t"
+ "por %%xmm5, %%xmm6\n\t"
+ "por %%xmm6, %%xmm1\n\t"
+
+ /* first phase of the reduction */
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */
+ "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */
+ "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
+ "pxor %%xmm5, %%xmm6\n\t"
+ "movdqa %%xmm6, %%xmm7\n\t"
+ "pslldq $12, %%xmm6\n\t"
+ "psrldq $4, %%xmm7\n\t"
+ "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
+ complete */
+
+ /* second phase of the reduction */
+ "movdqa %%xmm3, %%xmm2\n\t"
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
+ "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
+ "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm7, %%xmm2\n\t"
+ "pxor %%xmm2, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
+ :::"cc");
+}
+#endif
+
#endif /*GCM_USE_INTEL_PCLMUL*/
@@ -438,32 +563,74 @@ ghash (gcry_cipher_hd_t c, byte *result, const byte *buf,
#ifdef GCM_USE_INTEL_PCLMUL
else if (c->u_mode.gcm.use_intel_pclmul)
{
- /* TODO: Add faster bulk processing (parallel four blocks) for x86-64. */
-
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
- /* Preload hash. */
+ /* Preload hash and H1. */
asm volatile ("movdqu %[hash], %%xmm1\n\t"
+ "movdqa %[hsub], %%xmm0\n\t"
"pshufb %[be_mask], %%xmm1\n\t" /* be => le */
:
- : [hash] "m" (*result), [be_mask] "m" (*be_mask));
+ : [hash] "m" (*result), [be_mask] "m" (*be_mask),
+ [hsub] "m" (*c->u_iv.iv));
+
+#ifdef __x86_64__
+ if (nblocks >= 4)
+ {
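+ /* Hash four blocks per iteration: H¹..H⁴ are multiplied by the
+ four input blocks and the partial products are folded with a
+ single reduction in gfmul_pclmul_aggr4(). */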
+ do
+ {
+ asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
+ "movdqu 0*16(%[buf]), %%xmm5\n\t"
+ "movdqu 1*16(%[buf]), %%xmm2\n\t"
+ "movdqu 2*16(%[buf]), %%xmm3\n\t"
+ "movdqu 3*16(%[buf]), %%xmm6\n\t"
+ "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
+
+ /* Load H2, H3, H4. */
+ "movdqu 2*16(%[h_234]), %%xmm10\n\t"
+ "movdqu 1*16(%[h_234]), %%xmm9\n\t"
+ "movdqu 0*16(%[h_234]), %%xmm8\n\t"
+
+ "pxor %%xmm5, %%xmm1\n\t"
+ "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
+ "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
+ "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
+ :
+ : [buf] "r" (buf), [be_mask] "m" (*be_mask),
+ [h_234] "r" (c->u_mode.gcm.gcm_table));
+
+ gfmul_pclmul_aggr4 ();
+
+ buf += 4 * blocksize;
+ nblocks -= 4;
+ }
+ while (nblocks >= 4);
+
+ /* Clear used x86-64/XMM registers. */
+ asm volatile( "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
+ "pxor %%xmm12, %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm14\n\t"
+ "pxor %%xmm15, %%xmm15\n\t"
+ ::: "cc" );
+ }
+#endif
- do
+ while (nblocks--)
{
asm volatile ("movdqu %[buf], %%xmm2\n\t"
- "movdqa %[hsub], %%xmm0\n\t"
"pshufb %[be_mask], %%xmm2\n\t" /* be => le */
"pxor %%xmm2, %%xmm1\n\t"
:
- : [buf] "m" (*buf), [be_mask] "m" (*be_mask),
- [hsub] "m" (*c->u_iv.iv));
+ : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
gfmul_pclmul ();
buf += blocksize;
}
- while (--nblocks);
/* Store hash. */
asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
@@ -511,6 +678,49 @@ setupM (gcry_cipher_hd_t c, byte *h)
tmp[0] = buf_get_be64(c->u_iv.iv + 8);
tmp[1] = buf_get_be64(c->u_iv.iv + 0);
buf_cpy (c->u_iv.iv, tmp, GCRY_GCM_BLOCK_LEN);
+
+#ifdef __x86_64__
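+ /* Precompute H² (gcm_table+0*16), H³ (gcm_table+1*16) and
+ H⁴ (gcm_table+2*16) for the aggregated four-block GHASH path. */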
+ asm volatile ("movdqu %[h_1], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm1\n\t"
+ :
+ : [h_1] "m" (*tmp));
+
+ gfmul_pclmul (); /* H•H => H² */
+
+ asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
+ "movdqa %%xmm1, %%xmm8\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H•H² => H³ */
+
+ asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
+ "movdqu %%xmm1, 1*16(%[h_234])\n\t"
+ "movdqa %%xmm8, %%xmm1\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H²•H² => H⁴ */
+
+ asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ /* Clear used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm7\n\t"
+ "pxor %%xmm8, %%xmm8\n\t"
+ ::: "cc" );
+#endif
}
#endif
else
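For readers comparing the asm above with compiler intrinsics, the per-operand Karatsuba decomposition that gfmul_pclmul() and gfmul_pclmul_aggr4() perform with pclmulqdq looks roughly like the sketch below (illustration only, not part of the patch; the bit-reflection and the two-phase modular reduction that follow in the asm are omitted):

#include <immintrin.h>          /* SSE2 + PCLMUL; build with -mpclmul */

/* 128x128 -> 256 bit carry-less multiplication with three PCLMULQDQ
 * operations instead of four (Karatsuba):
 *   lo = a0*b0, hi = a1*b1, mid = (a0^a1)*(b0^b1) ^ lo ^ hi  */
static inline void
clmul_karatsuba (__m128i a, __m128i b, __m128i *lo, __m128i *hi)
{
  __m128i a_sw = _mm_shuffle_epi32 (a, 0x4e);        /* pshufd $78: swap qwords */
  __m128i b_sw = _mm_shuffle_epi32 (b, 0x4e);
  __m128i a01  = _mm_xor_si128 (a, a_sw);            /* a0 ^ a1 */
  __m128i b01  = _mm_xor_si128 (b, b_sw);            /* b0 ^ b1 */

  __m128i l = _mm_clmulepi64_si128 (a, b, 0x00);     /* a0 * b0 */
  __m128i h = _mm_clmulepi64_si128 (a, b, 0x11);     /* a1 * b1 */
  __m128i m = _mm_clmulepi64_si128 (a01, b01, 0x00); /* (a0^a1)*(b0^b1) */

  m   = _mm_xor_si128 (m, _mm_xor_si128 (l, h));     /* middle 128 bits */
  *lo = _mm_xor_si128 (l, _mm_slli_si128 (m, 8));    /* low  half of product */
  *hi = _mm_xor_si128 (h, _mm_srli_si128 (m, 8));    /* high half of product */
}

In the aggregated case the same three multiplications are done for each of the four (block, H-power) pairs and the partial l/h/m terms are XORed together before the final combine, which is why only one reduction is needed per four blocks.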