[PATCH 07/13] Add aggregated bulk processing for GCM on x86-64
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Nov 20 18:00:32 CET 2013
* cipher/cipher-gcm.c [__x86_64__] (gfmul_pclmul_aggr4): New.
(ghash) [GCM_USE_INTEL_PCLMUL]: Add aggregated bulk processing
for __x86_64__.
(setupM) [__x86_64__]: Add initialization for aggregated bulk
processing.
--
Intel Haswell (x86-64):
Old:
AES GCM enc | 0.990 ns/B 963.3 MiB/s 3.17 c/B
GCM dec | 0.982 ns/B 970.9 MiB/s 3.14 c/B
GCM auth | 0.711 ns/B 1340.8 MiB/s 2.28 c/B
New:
AES GCM enc | 0.535 ns/B 1783.8 MiB/s 1.71 c/B
GCM dec | 0.531 ns/B 1796.2 MiB/s 1.70 c/B
GCM auth | 0.255 ns/B 3736.4 MiB/s 0.817 c/B
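That is roughly a 1.85x speedup for GCM encryption/decryption and about a 2.8x speedup for GHASH-only authentication on this machine.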
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/cipher-gcm.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 219 insertions(+), 9 deletions(-)
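For reference, the aggregation relies on the standard GHASH identity sketched below; gfmul_pclmul_aggr4() additionally XORs the four 256-bit carry-less products together and performs only one modular reduction per four blocks, and setupM() precomputes H², H³ and H⁴ into c->u_mode.gcm.gcm_table. A rough C sketch of the reference computation (u128_t, gf128_mul and gf128_xor are hypothetical helpers used only for illustration, not libgcrypt API):

#include <stdint.h>

/* Hypothetical 128-bit block type and GF(2^128) helpers, for
 * illustration only.  gf128_mul() is assumed to multiply in
 * GF(2^128) with the GCM polynomial, reduction included. */
typedef struct { uint64_t hi, lo; } u128_t;
extern u128_t gf128_mul (u128_t a, u128_t b);
static u128_t gf128_xor (u128_t a, u128_t b)
{ a.hi ^= b.hi; a.lo ^= b.lo; return a; }

/* Serial GHASH applies Y = (Y ^ X_i) * H once per block.  Expanding
 * that recurrence over four blocks gives:
 *   Y' = (Y ^ X_1)*H^4  ^  X_2*H^3  ^  X_3*H^2  ^  X_4*H
 * which is what the aggregated path computes per iteration. */
static u128_t
ghash_aggr4_ref (u128_t Y, const u128_t X[4],
                 u128_t H1, u128_t H2, u128_t H3, u128_t H4)
{
  u128_t acc = gf128_mul (gf128_xor (Y, X[0]), H4);
  acc = gf128_xor (acc, gf128_mul (X[1], H3));
  acc = gf128_xor (acc, gf128_mul (X[2], H2));
  acc = gf128_xor (acc, gf128_mul (X[3], H1));
  return acc;                /* new Y after consuming four blocks */
}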
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index baedc04..fcfa357 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -421,6 +421,131 @@ static inline void gfmul_pclmul(void)
::: "cc" );
}
+#ifdef __x86_64__
+static inline void gfmul_pclmul_aggr4(void)
+{
+ /* Input:
+ H¹: XMM0 X_i : XMM6
+ H²: XMM8 X_(i-1) : XMM3
+ H³: XMM9 X_(i-2) : XMM2
+ H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1
+ Output:
+ Y_i: XMM1
+ Input XMM0 stays unmodified.
+ Inputs must be converted to little-endian.
+ */
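+
+ /* This computes
+ Y_i = (X_(i-3)⊕Y_(i-4))·H⁴ ⊕ X_(i-2)·H³ ⊕ X_(i-1)·H² ⊕ X_i·H¹
+ with a single reduction at the end; the prefixes 4:, 3:, 2:, 1: in
+ the comments below refer to these four products. */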
+ asm volatile (/* perform clmul and merge results... */
+ "pshufd $78, %%xmm10, %%xmm11\n\t"
+ "pshufd $78, %%xmm1, %%xmm12\n\t"
+ "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
+ "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
+
+ "pshufd $78, %%xmm9, %%xmm13\n\t"
+ "pshufd $78, %%xmm2, %%xmm14\n\t"
+ "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
+ "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
+
+ "pshufd $78, %%xmm8, %%xmm5\n\t"
+ "pshufd $78, %%xmm3, %%xmm15\n\t"
+ "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */
+ "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */
+
+ "movdqa %%xmm10, %%xmm4\n\t"
+ "movdqa %%xmm9, %%xmm7\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */
+ "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */
+ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
+ "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */
+ "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
+ "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
+
+ "pshufd $78, %%xmm0, %%xmm10\n\t"
+ "pshufd $78, %%xmm6, %%xmm11\n\t"
+ "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
+ "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
+
+ "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */
+ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */
+ "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
+
+ "movdqa %%xmm8, %%xmm13\n\t"
+ "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */
+ "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */
+ "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
+ "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */
+ "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
+
+ "movdqa %%xmm0, %%xmm3\n\t"
+ "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */
+ "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
+ "movdqa %%xmm11, %%xmm4\n\t"
+ "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
+
+ "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */
+ "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */
+ "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
+
+ /* aggregated reduction... */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm4\n\t"
+ "pslldq $8, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+ carry-less multiplication of xmm0
+ by xmm1 */
+
+ /* shift the result by one bit position to the left to cope with
+ the fact that the bits are reversed */
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "movdqa %%xmm6, %%xmm5\n\t"
+ "pslld $1, %%xmm3\n\t"
+ "pslld $1, %%xmm6\n\t"
+ "psrld $31, %%xmm4\n\t"
+ "psrld $31, %%xmm5\n\t"
+ "movdqa %%xmm4, %%xmm1\n\t"
+ "pslldq $4, %%xmm5\n\t"
+ "pslldq $4, %%xmm4\n\t"
+ "psrldq $12, %%xmm1\n\t"
+ "por %%xmm4, %%xmm3\n\t"
+ "por %%xmm5, %%xmm6\n\t"
+ "por %%xmm6, %%xmm1\n\t"
+
+ /* first phase of the reduction */
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */
+ "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */
+ "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
+ "pxor %%xmm5, %%xmm6\n\t"
+ "movdqa %%xmm6, %%xmm7\n\t"
+ "pslldq $12, %%xmm6\n\t"
+ "psrldq $4, %%xmm7\n\t"
+ "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
+ complete */
+
+ /* second phase of the reduction */
+ "movdqa %%xmm3, %%xmm2\n\t"
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
+ "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
+ "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm7, %%xmm2\n\t"
+ "pxor %%xmm2, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
+ :::"cc");
+}
+#endif
+
#endif /*GCM_USE_INTEL_PCLMUL*/
@@ -438,32 +563,74 @@ ghash (gcry_cipher_hd_t c, byte *result, const byte *buf,
#ifdef GCM_USE_INTEL_PCLMUL
else if (c->u_mode.gcm.use_intel_pclmul)
{
- /* TODO: Add faster bulk processing (parallel four blocks) for x86-64. */
-
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
- /* Preload hash. */
+ /* Preload hash and H1. */
asm volatile ("movdqu %[hash], %%xmm1\n\t"
+ "movdqa %[hsub], %%xmm0\n\t"
"pshufb %[be_mask], %%xmm1\n\t" /* be => le */
:
- : [hash] "m" (*result), [be_mask] "m" (*be_mask));
+ : [hash] "m" (*result), [be_mask] "m" (*be_mask),
+ [hsub] "m" (*c->u_iv.iv));
+
+#ifdef __x86_64__
+ if (nblocks >= 4)
+ {
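+ /* Hash four blocks per iteration: H¹..H⁴ are multiplied by the
+ four input blocks and the partial products are folded with a
+ single reduction in gfmul_pclmul_aggr4(). */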
+ do
+ {
+ asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
+ "movdqu 0*16(%[buf]), %%xmm5\n\t"
+ "movdqu 1*16(%[buf]), %%xmm2\n\t"
+ "movdqu 2*16(%[buf]), %%xmm3\n\t"
+ "movdqu 3*16(%[buf]), %%xmm6\n\t"
+ "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
+
+ /* Load H2, H3, H4. */
+ "movdqu 2*16(%[h_234]), %%xmm10\n\t"
+ "movdqu 1*16(%[h_234]), %%xmm9\n\t"
+ "movdqu 0*16(%[h_234]), %%xmm8\n\t"
+
+ "pxor %%xmm5, %%xmm1\n\t"
+ "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
+ "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
+ "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
+ :
+ : [buf] "r" (buf), [be_mask] "m" (*be_mask),
+ [h_234] "r" (c->u_mode.gcm.gcm_table));
+
+ gfmul_pclmul_aggr4 ();
+
+ buf += 4 * blocksize;
+ nblocks -= 4;
+ }
+ while (nblocks >= 4);
+
+ /* Clear used x86-64/XMM registers. */
+ asm volatile( "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
+ "pxor %%xmm12, %%xmm12\n\t"
+ "pxor %%xmm13, %%xmm13\n\t"
+ "pxor %%xmm14, %%xmm14\n\t"
+ "pxor %%xmm15, %%xmm15\n\t"
+ ::: "cc" );
+ }
+#endif
- do
+ while (nblocks--)
{
asm volatile ("movdqu %[buf], %%xmm2\n\t"
- "movdqa %[hsub], %%xmm0\n\t"
"pshufb %[be_mask], %%xmm2\n\t" /* be => le */
"pxor %%xmm2, %%xmm1\n\t"
:
- : [buf] "m" (*buf), [be_mask] "m" (*be_mask),
- [hsub] "m" (*c->u_iv.iv));
+ : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
gfmul_pclmul ();
buf += blocksize;
}
- while (--nblocks);
/* Store hash. */
asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
@@ -511,6 +678,49 @@ setupM (gcry_cipher_hd_t c, byte *h)
tmp[0] = buf_get_be64(c->u_iv.iv + 8);
tmp[1] = buf_get_be64(c->u_iv.iv + 0);
buf_cpy (c->u_iv.iv, tmp, GCRY_GCM_BLOCK_LEN);
+
+#ifdef __x86_64__
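+ /* Precompute H² (gcm_table+0*16), H³ (gcm_table+1*16) and
+ H⁴ (gcm_table+2*16) for the aggregated four-block GHASH path. */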
+ asm volatile ("movdqu %[h_1], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm1\n\t"
+ :
+ : [h_1] "m" (*tmp));
+
+ gfmul_pclmul (); /* H•H => H² */
+
+ asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
+ "movdqa %%xmm1, %%xmm8\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H•H² => H³ */
+
+ asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
+ "movdqu %%xmm1, 1*16(%[h_234])\n\t"
+ "movdqa %%xmm8, %%xmm1\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ gfmul_pclmul (); /* H²•H² => H⁴ */
+
+ asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table)
+ : "memory");
+
+ /* Clear used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm7\n\t"
+ "pxor %%xmm8, %%xmm8\n\t"
+ ::: "cc" );
+#endif
}
#endif
else
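For readers comparing the asm above with compiler intrinsics, the per-operand Karatsuba decomposition that gfmul_pclmul() and gfmul_pclmul_aggr4() perform with pclmulqdq looks roughly like the sketch below (illustration only, not part of the patch; the bit-reflection and the two-phase modular reduction that follow in the asm are omitted):

#include <immintrin.h>          /* SSE2 + PCLMUL; build with -mpclmul */

/* 128x128 -> 256 bit carry-less multiplication with three PCLMULQDQ
 * operations instead of four (Karatsuba):
 *   lo = a0*b0, hi = a1*b1, mid = (a0^a1)*(b0^b1) ^ lo ^ hi  */
static inline void
clmul_karatsuba (__m128i a, __m128i b, __m128i *lo, __m128i *hi)
{
  __m128i a_sw = _mm_shuffle_epi32 (a, 0x4e);        /* pshufd $78: swap qwords */
  __m128i b_sw = _mm_shuffle_epi32 (b, 0x4e);
  __m128i a01  = _mm_xor_si128 (a, a_sw);            /* a0 ^ a1 */
  __m128i b01  = _mm_xor_si128 (b, b_sw);            /* b0 ^ b1 */

  __m128i l = _mm_clmulepi64_si128 (a, b, 0x00);     /* a0 * b0 */
  __m128i h = _mm_clmulepi64_si128 (a, b, 0x11);     /* a1 * b1 */
  __m128i m = _mm_clmulepi64_si128 (a01, b01, 0x00); /* (a0^a1)*(b0^b1) */

  m   = _mm_xor_si128 (m, _mm_xor_si128 (l, h));     /* middle 128 bits */
  *lo = _mm_xor_si128 (l, _mm_slli_si128 (m, 8));    /* low  half of product */
  *hi = _mm_xor_si128 (h, _mm_srli_si128 (m, 8));    /* high half of product */
}

In the aggregated case the same three multiplications are done for each of the four (block, H-power) pairs and the partial l/h/m terms are XORed together before the final combine, which is why only one reduction is needed per four blocks.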