[PATCH 4/4] Optimizations for GCM Intel/PCLMUL implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Apr 26 18:33:46 CEST 2019
* cipher/cipher-gcm-intel-pclmul.c (reduction): New.
(gfmul_pclmul): Fold the left shift into the pclmul operations; Use
'reduction' helper function.
(gfmul_pclmul_aggr4): Reorder instructions and adjust register usage to
free up registers; Use 'reduction' helper function; Fold the left shift
into the pclmul operations.
(gcm_lsh): New.
(_gcry_ghash_setup_intel_pclmul): Left shift H values by one.
(_gcry_ghash_intel_pclmul) [__x86_64__]: Preload H values into unused
registers.
--
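The main win comes from pre-shifting the hash subkeys: carry-less
multiplication is linear over GF(2)[x], so multiplying by a key that has
already been shifted left by one yields a product that is already shifted,
and the old per-block "shift the 256-bit result left by one" sequence can
be dropped; only the Karatsuba merge and the reduction remain. A minimal,
hypothetical C demo of that identity (not part of the patch; clmul64() and
the operand values are made up for illustration):

#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

/* Portable 64x64 -> 128 bit carry-less multiply (schoolbook, GF(2)[x]). */
static u128 clmul64 (uint64_t a, uint64_t b)
{
  u128 r = { 0, 0 };
  int i;

  for (i = 0; i < 64; i++)
    if ((b >> i) & 1)
      {
        r.lo ^= a << i;
        r.hi ^= i ? a >> (64 - i) : 0;
      }
  return r;
}

int main (void)
{
  /* Arbitrary operands; 'a' must not carry out of 64 bits when shifted.
     For the real H the carried-out bit is handled by reducing modulo the
     GCM polynomial (see gcm_lsh in the patch). */
  uint64_t a = UINT64_C(0x0123456789abcdef);
  uint64_t b = UINT64_C(0xfedcba9876543210);

  u128 pre  = clmul64 (a << 1, b);   /* shift folded into the operand */
  u128 post = clmul64 (a, b);        /* shift applied after multiplying */
  u128 post_sh = { post.lo << 1, (post.hi << 1) | (post.lo >> 63) };

  assert (pre.lo == post_sh.lo && pre.hi == post_sh.hi);
  return 0;
}
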
Benchmark on Intel Haswell (amd64):
Before:
         |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
GMAC_AES |     0.206 ns/B      4624 MiB/s     0.825 c/B      3998

After (+12% faster):
         |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
GMAC_AES |     0.184 ns/B      5195 MiB/s     0.734 c/B      3998
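The shift itself is done once per subkey by the new gcm_lsh() helper, which
also folds the bit shifted out of the top back in with the reflected GCM
polynomial constant ('pconst'). A rough plain-C equivalent, assuming a
two-qword little-endian view of the key (the struct and function name are
illustrative only, not from the patch):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

/* H' = H <<< 1: 128-bit left shift by one, with the shifted-out top bit
   xored back in via the reflected polynomial constant.  Mirrors the SSE2
   sequence in gcm_lsh(): psrad/pand build the conditional mask,
   paddq/pslldq/psrlq perform the 128-bit shift. */
static u128 gcm_lsh_ref (u128 h)
{
  /* all-ones if the top bit of h is set, else zero */
  uint64_t mask = (uint64_t)((int64_t)h.hi >> 63);
  u128 r;

  r.hi = (h.hi << 1) | (h.lo >> 63);
  r.lo =  h.lo << 1;

  r.lo ^= mask & UINT64_C(0x0000000000000001);  /* pconst[0] */
  r.hi ^= mask & UINT64_C(0xc200000000000000);  /* pconst[1] */
  return r;
}
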
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
1 file changed
diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c
index 60ae7aa9a..da309aead 100644
--- a/cipher/cipher-gcm-intel-pclmul.c
+++ b/cipher/cipher-gcm-intel-pclmul.c
@@ -1,6 +1,6 @@
/* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode
* implementation
- * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2014,2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -47,6 +47,35 @@
"Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
*/
+static inline void reduction(void)
+{
+ /* input: <xmm1:xmm3> */
+
+ asm volatile (/* first phase of the reduction */
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */
+ "pxor %%xmm3, %%xmm6\n\t"
+ "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */
+ "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */
+ "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
+ "pshufd $0x6a, %%xmm6, %%xmm7\n\t"
+ "pshufd $0xae, %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction
+ complete */
+
+ /* second phase of the reduction */
+ "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */
+ "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */
+ "pxor %%xmm3, %%xmm6\n\t"
+ "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */
+ "pxor %%xmm3, %%xmm1\n\t"
+ "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */
+ ::: "memory" );
+}
+
static inline void gfmul_pclmul(void)
{
/* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
@@ -60,65 +89,22 @@ static inline void gfmul_pclmul(void)
"movdqa %%xmm0, %%xmm3\n\t"
"pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */
- "movdqa %%xmm0, %%xmm6\n\t"
- "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
+ "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */
"movdqa %%xmm3, %%xmm5\n\t"
"pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */
- "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
"pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
"movdqa %%xmm4, %%xmm5\n\t"
"psrldq $8, %%xmm4\n\t"
"pslldq $8, %%xmm5\n\t"
"pxor %%xmm5, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+ "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
carry-less multiplication of xmm0
by xmm1 */
+ ::: "memory" );
- /* shift the result by one bit position to the left cope for
- the fact that bits are reversed */
- "movdqa %%xmm3, %%xmm4\n\t"
- "movdqa %%xmm6, %%xmm5\n\t"
- "pslld $1, %%xmm3\n\t"
- "pslld $1, %%xmm6\n\t"
- "psrld $31, %%xmm4\n\t"
- "psrld $31, %%xmm5\n\t"
- "movdqa %%xmm4, %%xmm1\n\t"
- "pslldq $4, %%xmm5\n\t"
- "pslldq $4, %%xmm4\n\t"
- "psrldq $12, %%xmm1\n\t"
- "por %%xmm4, %%xmm3\n\t"
- "por %%xmm5, %%xmm6\n\t"
- "por %%xmm6, %%xmm1\n\t"
-
- /* first phase of the reduction */
- "movdqa %%xmm3, %%xmm6\n\t"
- "movdqa %%xmm3, %%xmm7\n\t"
- "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */
- "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */
- "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm6\n\t"
- "movdqa %%xmm6, %%xmm7\n\t"
- "pslldq $12, %%xmm6\n\t"
- "psrldq $4, %%xmm7\n\t"
- "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
- complete */
-
- /* second phase of the reduction */
- "movdqa %%xmm3, %%xmm2\n\t"
- "movdqa %%xmm3, %%xmm4\n\t"
- "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
- "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
- "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm2\n\t"
- "pxor %%xmm7, %%xmm2\n\t"
- "pxor %%xmm2, %%xmm3\n\t"
- "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
- ::: "cc" );
+ reduction();
}
@@ -136,117 +122,92 @@ static inline void gfmul_pclmul_aggr4(void)
Input must be converted to little-endian.
*/
asm volatile (/* perform clmul and merge results... */
- "pshufd $78, %%xmm10, %%xmm11\n\t"
+ "pshufd $78, %%xmm10, %%xmm5\n\t"
"pshufd $78, %%xmm1, %%xmm12\n\t"
- "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
+ "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */
"pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
+ "movdqa %%xmm10, %%xmm4\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */
+ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
+ "pclmulqdq $0, %%xmm5, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
"pshufd $78, %%xmm9, %%xmm13\n\t"
- "pshufd $78, %%xmm2, %%xmm14\n\t"
+ "pshufd $78, %%xmm2, %%xmm5\n\t"
"pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
- "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
-
- "pshufd $78, %%xmm8, %%xmm5\n\t"
- "pshufd $78, %%xmm3, %%xmm15\n\t"
- "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */
- "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */
-
- "movdqa %%xmm10, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 3:b0+b1 */
"movdqa %%xmm9, %%xmm7\n\t"
- "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */
"pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */
- "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
"pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */
- "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
- "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
-
- "pshufd $78, %%xmm0, %%xmm10\n\t"
- "pshufd $78, %%xmm6, %%xmm11\n\t"
- "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
- "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
+ "pclmulqdq $0, %%xmm13, %%xmm5\n\t" /* xmm5 holds 3:(a0+a1)*(b0+b1) */
"pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */
"pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */
- "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
+ "pxor %%xmm5, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
- "movdqa %%xmm8, %%xmm13\n\t"
- "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */
+ "pshufd $78, %%xmm8, %%xmm5\n\t"
+ "pshufd $78, %%xmm3, %%xmm2\n\t"
+ "pxor %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a0+a1 */
+ "pxor %%xmm3, %%xmm2\n\t" /* xmm2 holds 2:b0+b1 */
+ "movdqa %%xmm8, %%xmm4\n\t"
+ "pclmulqdq $0, %%xmm3, %%xmm4\n\t" /* xmm4 holds 2:a0*b0 */
"pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */
- "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
+ "pclmulqdq $0, %%xmm5, %%xmm2\n\t" /* xmm2 holds 2:(a0+a1)*(b0+b1) */
- "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
+ "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
"pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */
- "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
+ "pxor %%xmm12, %%xmm2\n\t" /* xmm2 holds 2+3+4:(a0+a1)*(b0+b1) */
+ "pshufd $78, %%xmm0, %%xmm11\n\t"
+ "pshufd $78, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */
+ "pxor %%xmm6, %%xmm4\n\t" /* xmm4 holds 1:b0+b1 */
"movdqa %%xmm0, %%xmm3\n\t"
"pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */
"pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
- "movdqa %%xmm11, %%xmm4\n\t"
- "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
+ "pclmulqdq $0, %%xmm11, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
"pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */
- "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */
- "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
+ "pxor %%xmm6, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */
+ "pxor %%xmm2, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
/* aggregated reduction... */
"movdqa %%xmm3, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
"pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
"movdqa %%xmm4, %%xmm5\n\t"
"psrldq $8, %%xmm4\n\t"
"pslldq $8, %%xmm5\n\t"
"pxor %%xmm5, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+ "pxor %%xmm4, %%xmm1\n\t" /* <xmm1:xmm3> holds the result of the
carry-less multiplication of xmm0
by xmm1 */
+ :::"memory");
- /* shift the result by one bit position to the left cope for
- the fact that bits are reversed */
- "movdqa %%xmm3, %%xmm4\n\t"
- "movdqa %%xmm6, %%xmm5\n\t"
- "pslld $1, %%xmm3\n\t"
- "pslld $1, %%xmm6\n\t"
- "psrld $31, %%xmm4\n\t"
- "psrld $31, %%xmm5\n\t"
- "movdqa %%xmm4, %%xmm1\n\t"
- "pslldq $4, %%xmm5\n\t"
- "pslldq $4, %%xmm4\n\t"
- "psrldq $12, %%xmm1\n\t"
- "por %%xmm4, %%xmm3\n\t"
- "por %%xmm5, %%xmm6\n\t"
- "por %%xmm6, %%xmm1\n\t"
-
- /* first phase of the reduction */
- "movdqa %%xmm3, %%xmm6\n\t"
- "movdqa %%xmm3, %%xmm7\n\t"
- "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */
- "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */
- "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm6\n\t"
- "movdqa %%xmm6, %%xmm7\n\t"
- "pslldq $12, %%xmm6\n\t"
- "psrldq $4, %%xmm7\n\t"
- "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
- complete */
-
- /* second phase of the reduction */
- "movdqa %%xmm3, %%xmm2\n\t"
- "movdqa %%xmm3, %%xmm4\n\t"
- "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
- "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
- "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm2\n\t"
- "pxor %%xmm7, %%xmm2\n\t"
- "pxor %%xmm2, %%xmm3\n\t"
- "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
- :::"cc");
+ reduction();
}
#endif
+static inline void gcm_lsh(void *h, unsigned int hoffs)
+{
+ static const u64 pconst[2] __attribute__ ((aligned (16))) =
+ { U64_C(0x0000000000000001), U64_C(0xc200000000000000) };
+
+ asm volatile ("movdqu (%[h]), %%xmm2\n\t"
+ "pshufd $0xff, %%xmm2, %%xmm3\n\t"
+ "movdqa %%xmm2, %%xmm4\n\t"
+ "psrad $31, %%xmm3\n\t"
+ "pslldq $8, %%xmm4\n\t"
+ "pand %[pconst], %%xmm3\n\t"
+ "paddq %%xmm2, %%xmm2\n\t"
+ "psrlq $63, %%xmm4\n\t"
+ "pxor %%xmm3, %%xmm2\n\t"
+ "pxor %%xmm4, %%xmm2\n\t"
+ "movdqu %%xmm2, (%[h])\n\t"
+ :
+ : [pconst] "m" (pconst),
+ [h] "r" ((byte *)h + hoffs)
+ : "memory" );
+}
void
_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
@@ -274,13 +235,16 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
[be_mask] "m" (*be_mask)
: "memory");
+ gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */
+
#ifdef __x86_64__
asm volatile ("movdqa %%xmm0, %%xmm1\n\t"
+ "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */
:
- :
+ : [key] "r" (c->u_mode.gcm.u_ghash_key.key)
: "memory");
- gfmul_pclmul (); /* H•H => H² */
+ gfmul_pclmul (); /* H<<<1•H => H² */
asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
"movdqa %%xmm1, %%xmm8\n\t"
@@ -288,22 +252,26 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
: [h_234] "r" (c->u_mode.gcm.gcm_table)
: "memory");
- gfmul_pclmul (); /* H•H² => H³ */
+ gcm_lsh(c->u_mode.gcm.gcm_table, 0); /* H² <<< 1 */
+ gfmul_pclmul (); /* H<<<1•H² => H³ */
asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
"movdqu %%xmm1, 1*16(%[h_234])\n\t"
- "movdqa %%xmm8, %%xmm1\n\t"
+ "movdqu 0*16(%[h_234]), %%xmm1\n\t" /* load H² <<< 1 */
:
: [h_234] "r" (c->u_mode.gcm.gcm_table)
: "memory");
- gfmul_pclmul (); /* H²•H² => H⁴ */
+ gfmul_pclmul (); /* H²<<<1•H² => H⁴ */
asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
:
: [h_234] "r" (c->u_mode.gcm.gcm_table)
: "memory");
+ gcm_lsh(c->u_mode.gcm.gcm_table, 16); /* H² <<< 1 */
+ gcm_lsh(c->u_mode.gcm.gcm_table, 32); /* H³ <<< 1 */
+
#ifdef __WIN64__
/* Clear/restore used registers. */
asm volatile( "pxor %%xmm0, %%xmm0\n\t"
@@ -329,7 +297,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
"pxor %%xmm6, %%xmm6\n\t"
"pxor %%xmm7, %%xmm7\n\t"
"pxor %%xmm8, %%xmm8\n\t"
- ::: "cc" );
+ ::: "memory" );
#endif
#endif
}
@@ -372,32 +340,36 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
"pshufb %[be_mask], %%xmm1\n\t" /* be => le */
:
: [hash] "m" (*result), [be_mask] "m" (*be_mask),
- [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key));
+ [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)
+ : "memory" );
#ifdef __x86_64__
if (nblocks >= 4)
{
+ asm volatile (/* Load H2, H3, H4, be_mask. */
+ "movdqu 2*16(%[h_234]), %%xmm10\n\t"
+ "movdqu 1*16(%[h_234]), %%xmm9\n\t"
+ "movdqu 0*16(%[h_234]), %%xmm8\n\t"
+ "movdqa %[be_mask], %%xmm14\n\t"
+ :
+ : [h_234] "r" (c->u_mode.gcm.gcm_table),
+ [be_mask] "m" (*be_mask)
+ : "memory" );
+
do
{
- asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
- "movdqu 0*16(%[buf]), %%xmm5\n\t"
+ asm volatile ("movdqu 0*16(%[buf]), %%xmm5\n\t"
"movdqu 1*16(%[buf]), %%xmm2\n\t"
"movdqu 2*16(%[buf]), %%xmm3\n\t"
"movdqu 3*16(%[buf]), %%xmm6\n\t"
- "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
-
- /* Load H2, H3, H4. */
- "movdqu 2*16(%[h_234]), %%xmm10\n\t"
- "movdqu 1*16(%[h_234]), %%xmm9\n\t"
- "movdqu 0*16(%[h_234]), %%xmm8\n\t"
-
+ "pshufb %%xmm14, %%xmm5\n\t" /* be => le */
+ "pshufb %%xmm14, %%xmm2\n\t" /* be => le */
+ "pshufb %%xmm14, %%xmm3\n\t" /* be => le */
"pxor %%xmm5, %%xmm1\n\t"
- "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
- "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
- "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
+ "pshufb %%xmm14, %%xmm6\n\t" /* be => le */
:
- : [buf] "r" (buf), [be_mask] "m" (*be_mask),
- [h_234] "r" (c->u_mode.gcm.gcm_table));
+ : [buf] "r" (buf)
+ : "memory" );
gfmul_pclmul_aggr4 ();
@@ -416,29 +388,32 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
"pxor %%xmm13, %%xmm13\n\t"
"pxor %%xmm14, %%xmm14\n\t"
"pxor %%xmm15, %%xmm15\n\t"
- ::: "cc" );
+ ::: "memory" );
#endif
}
#endif
- while (nblocks--)
+ while (nblocks)
{
asm volatile ("movdqu %[buf], %%xmm2\n\t"
"pshufb %[be_mask], %%xmm2\n\t" /* be => le */
"pxor %%xmm2, %%xmm1\n\t"
:
- : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
+ : [buf] "m" (*buf), [be_mask] "m" (*be_mask)
+ : "memory" );
gfmul_pclmul ();
buf += blocksize;
+ nblocks--;
}
/* Store hash. */
asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
"movdqu %%xmm1, %[hash]\n\t"
: [hash] "=m" (*result)
- : [be_mask] "m" (*be_mask));
+ : [be_mask] "m" (*be_mask)
+ : "memory" );
#ifdef __WIN64__
/* Clear/restore used registers. */
@@ -471,7 +446,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
"pxor %%xmm5, %%xmm5\n\t"
"pxor %%xmm6, %%xmm6\n\t"
"pxor %%xmm7, %%xmm7\n\t"
- ::: "cc" );
+ ::: "memory" );
#endif
return 0;