[PATCH] crc-intel-pclmul: add AVX2 and AVX512 code paths
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Thu Aug 7 14:45:47 CEST 2025
* cipher/crc-intel-pclmul.c (crc32_consts_s, crc32_consts)
(crc24rfc2440_consts): Add k_ymm and k_zmm.
(crc32_reflected_bulk, crc32_bulk): Add VPCLMUL+AVX2 and VAES_VPCLMUL+AVX512
code paths; Add 'hwfeatures' parameter.
(_gcry_crc32_intel_pclmul, _gcry_crc24rfc2440_intel_pclmul): Add 'hwfeatures'
parameter.
* cipher/crc.c (CRC_CONTEXT) [USE_INTEL_PCLMUL]: Add 'hwfeatures'.
(_gcry_crc32_intel_pclmul, _gcry_crc24rfc2440_intel_pclmul): Add 'hwfeatures'
parameter.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init) [USE_INTEL_PCLMUL]: Store
HW features in the context.
--
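For reference (not part of the patch): the new fold constants can be recomputed
from the formulas in the struct comments. The standalone sketch below assumes
the reflected-CRC32 entries use the same reverse_33bits() convention as the
existing k[] and my_p[] tables; xpow_mod() is only an illustrative helper name.
The exponents correspond to folding 8 respectively 16 128-bit lanes ahead
(8*128 +/- 32 = 32*{33,31}, 16*128 +/- 32 = 32*{65,63}). The non-reflected
CRC24 (RFC2440) entries instead store the plain residue shifted left by 32,
matching the existing entries of that table.

#include <stdint.h>
#include <stdio.h>

/* x^n mod P(x) over GF(2); 'poly' is the full polynomial including the
 * x^32 term (0x104c11db7 for CRC32). */
static uint64_t
xpow_mod (unsigned int n, uint64_t poly)
{
  uint64_t r = 1; /* x^0 */

  while (n--)
    {
      r <<= 1;                      /* multiply by x */
      if (r & ((uint64_t)1 << 32))  /* degree reached 32 => reduce */
        r ^= poly;
    }

  return r;
}

/* Bit-reflect over 33 bits (bit i <-> bit 32 - i), as used by the
 * reflected-CRC32 constant tables. */
static uint64_t
reverse_33bits (uint64_t v)
{
  uint64_t r = 0;
  int i;

  for (i = 0; i <= 32; i++)
    r |= ((v >> i) & 1) << (32 - i);

  return r;
}

int
main (void)
{
  const uint64_t crc32_poly = 0x104c11db7ULL;

  /* k_ymm: x^(32*33) and x^(32*31) mod P(x), i.e. 8*128 +/- 32. */
  printf ("crc32 k_ymm: 0x%09llx 0x%09llx\n",
          (unsigned long long)reverse_33bits (xpow_mod (32 * 33, crc32_poly)),
          (unsigned long long)reverse_33bits (xpow_mod (32 * 31, crc32_poly)));

  /* k_zmm: x^(32*65) and x^(32*63) mod P(x), i.e. 16*128 +/- 32. */
  printf ("crc32 k_zmm: 0x%09llx 0x%09llx\n",
          (unsigned long long)reverse_33bits (xpow_mod (32 * 65, crc32_poly)),
          (unsigned long long)reverse_33bits (xpow_mod (32 * 63, crc32_poly)));

  return 0;
}

If that convention assumption holds, the printed values should match the new
crc32_consts entries (0x1e88ef372/0x14a7fe880 and 0x11542778a/0x1322d1430).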
Benchmark on Zen4:
Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 CRC32          |     0.046 ns/B     20861 MiB/s     0.248 c/B      5421±1
 CRC32RFC1510   |     0.046 ns/B     20809 MiB/s     0.250 c/B     5463±14
 CRC24RFC2440   |     0.046 ns/B     20934 MiB/s     0.251 c/B      5504±2

After AVX2:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 CRC32          |     0.023 ns/B     42277 MiB/s     0.123 c/B      5440±6
 CRC32RFC1510   |     0.022 ns/B     42949 MiB/s     0.121 c/B     5454±16
 CRC24RFC2440   |     0.023 ns/B     41955 MiB/s     0.124 c/B     5439±13

After AVX512:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 CRC32          |     0.011 ns/B     85877 MiB/s     0.061 c/B        5500
 CRC32RFC1510   |     0.011 ns/B     83898 MiB/s     0.063 c/B        5500
 CRC24RFC2440   |     0.012 ns/B     80590 MiB/s     0.065 c/B        5500
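Structurally, the new code nests in front of the existing 128-bit
implementation. The sketch below is not part of the patch: the assembly is
elided and it reuses the u32/byte typedefs, struct crc32_consts_s and HWF_*
feature bits from the surrounding file, so it is not standalone-compilable.
It only illustrates the dispatch and length thresholds: the VPCLMUL+AVX2 path
needs at least 8*32 bytes and the AVX512 path at least 8*64 bytes; shorter
inputs keep using the original SSE code unchanged.

static void
crc32_reflected_bulk_sketch (u32 *pcrc, const byte *inbuf, size_t inlen,
                             const struct crc32_consts_s *consts,
                             u32 hwfeatures)
{
  if (inlen >= 8 * 16)
    {
      if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL)
          && (hwfeatures & HWF_INTEL_AVX2)
          && inlen >= 8 * 32)
        {
          if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL)
              && (hwfeatures & HWF_INTEL_AVX512)
              && inlen >= 8 * 64)
            {
              /* zmm setup; fold by 16 (4 x 64-byte loads per iteration,
               * k_zmm constants) while inlen >= 4 * 64, then fold the 16
               * lanes down to 8 with k_ymm and continue in ymm registers. */
            }
          else
            {
              /* ymm setup with k_ymm broadcast. */
            }
          /* Shared: fold by 8 (4 x 32-byte loads) while inlen >= 4 * 32,
           * then fold the 8 lanes down to 4 with k1k2, leaving the state
           * in xmm0..xmm3 and issuing vzeroupper. */
        }
      else
        {
          /* Original xmm setup with k1k2 (unchanged). */
        }
      /* Existing fold-by-4 loop and 4-to-1 reduction run unchanged. */
    }
  /* Existing handling of inputs below 8*16 bytes is unchanged. */
}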
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/crc-intel-pclmul.c | 500 ++++++++++++++++++++++++++++++++++----
 cipher/crc.c              |  13 +-
 2 files changed, 459 insertions(+), 54 deletions(-)
diff --git a/cipher/crc-intel-pclmul.c b/cipher/crc-intel-pclmul.c
index 825dee2a..8209fc34 100644
--- a/cipher/crc-intel-pclmul.c
+++ b/cipher/crc-intel-pclmul.c
@@ -68,6 +68,10 @@ struct crc32_consts_s
u64 k[6];
/* my_p: { floor(x^64 / P(x)), P(x) } */
u64 my_p[2];
+ /* k_ymm: { x^(32*33), x^(32*31) } mod P(x) */
+ u64 k_ymm[2];
+ /* k_zmm: { x^(32*65), x^(32*63) } mod P(x) */
+ u64 k_zmm[2];
};
@@ -81,6 +85,12 @@ static const struct crc32_consts_s crc32_consts ALIGNED_16 =
},
{ /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
U64_C(0x1f7011641), U64_C(0x1db710641)
+ },
+ { /* k_ymm[2] */
+ U64_C(0x1e88ef372), U64_C(0x14a7fe880) /* y = { 33, 31 } */,
+ },
+ { /* k_zmm[2] */
+ U64_C(0x11542778a), U64_C(0x1322d1430) /* y = { 65, 63 } */
}
};
@@ -94,6 +104,12 @@ static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
},
{ /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+ },
+ { /* k_ymm[2] */
+ U64_C(0xaee5d500) << 32, U64_C(0x1a43ea00) << 32 /* y = { 33, 31 } */
+ },
+ { /* k_zmm[2] */
+ U64_C(0x21342700) << 32, U64_C(0x5d2b6300) << 32 /* y = { 65, 63 } */
}
};
@@ -144,31 +160,216 @@ static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
/* PCLMUL functions for reflected CRC32. */
static ASM_FUNC_ATTR_INLINE void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
- const struct crc32_consts_s *consts)
+ const struct crc32_consts_s *consts, u32 hwfeatures)
{
if (inlen >= 8 * 16)
{
- asm volatile ("movd %[crc], %%xmm4\n\t"
- "movdqu %[inbuf_0], %%xmm0\n\t"
- "movdqu %[inbuf_1], %%xmm1\n\t"
- "movdqu %[inbuf_2], %%xmm2\n\t"
- "movdqu %[inbuf_3], %%xmm3\n\t"
- "pxor %%xmm4, %%xmm0\n\t"
- :
- : [inbuf_0] "m" (inbuf[0 * 16]),
- [inbuf_1] "m" (inbuf[1 * 16]),
- [inbuf_2] "m" (inbuf[2 * 16]),
- [inbuf_3] "m" (inbuf[3 * 16]),
- [crc] "m" (*pcrc)
- );
+ if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL)
+ && (hwfeatures & HWF_INTEL_AVX2)
+ && inlen >= 8 * 32)
+ {
+ if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL)
+ && (hwfeatures & HWF_INTEL_AVX512)
+ && inlen >= 8 * 64)
+ {
+ asm volatile("vmovd %[crc], %%xmm4\n\t"
+ "vpopcntb %%xmm4, %%xmm0\n\t" /* spec stop for old AVX512 CPUs */
+ "vmovdqu64 %[inbuf_0], %%zmm0\n\t"
+ "vmovdqu64 %[inbuf_1], %%zmm1\n\t"
+ "vmovdqu64 %[inbuf_2], %%zmm2\n\t"
+ "vmovdqu64 %[inbuf_3], %%zmm3\n\t"
+ "vpxorq %%zmm4, %%zmm0, %%zmm0\n\t"
+ :
+ : [crc] "m" (*pcrc),
+ [inbuf_0] "m" (inbuf[0 * 64]),
+ [inbuf_1] "m" (inbuf[1 * 64]),
+ [inbuf_2] "m" (inbuf[2 * 64]),
+ [inbuf_3] "m" (inbuf[3 * 64]),
+ [k_zmm] "m" (consts->k_zmm[0])
+ );
+
+ inbuf += 4 * 64;
+ inlen -= 4 * 64;
+
+ asm volatile("vbroadcasti32x4 %[k_zmm], %%zmm4\n\t"
+ :
+ : [k_zmm] "m" (consts->k_zmm[0])
+ );
+
+ /* Fold by 16. */
+ while (inlen >= 4 * 64)
+ {
+ asm volatile ("vmovdqu64 %[inbuf_0], %%zmm5\n\t"
+ "vmovdqa64 %%zmm0, %%zmm6\n\t"
+ "vpclmulqdq $0x00, %%zmm4, %%zmm0, %%zmm0\n\t"
+ "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm0\n\t"
+
+ "vmovdqu64 %[inbuf_1], %%zmm5\n\t"
+ "vmovdqa64 %%zmm1, %%zmm6\n\t"
+ "vpclmulqdq $0x00, %%zmm4, %%zmm1, %%zmm1\n\t"
+ "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm1\n\t"
+
+ "vmovdqu64 %[inbuf_2], %%zmm5\n\t"
+ "vmovdqa64 %%zmm2, %%zmm6\n\t"
+ "vpclmulqdq $0x00, %%zmm4, %%zmm2, %%zmm2\n\t"
+ "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm2\n\t"
+
+ "vmovdqu64 %[inbuf_3], %%zmm5\n\t"
+ "vmovdqa64 %%zmm3, %%zmm6\n\t"
+ "vpclmulqdq $0x00, %%zmm4, %%zmm3, %%zmm3\n\t"
+ "vpclmulqdq $0x11, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 64]),
+ [inbuf_1] "m" (inbuf[1 * 64]),
+ [inbuf_2] "m" (inbuf[2 * 64]),
+ [inbuf_3] "m" (inbuf[3 * 64])
+ );
+
+ inbuf += 4 * 64;
+ inlen -= 4 * 64;
+ }
+
+ /* Fold 16 to 8. */
+ asm volatile("vbroadcasti32x4 %[k_ymm], %%zmm4\n\t"
+ /* Fold zmm2 into zmm0. */
+ "vmovdqa64 %%zmm0, %%zmm5\n\t"
+ "vpclmulqdq $0x00, %%zmm4, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x11, %%zmm4, %%zmm0, %%zmm0\n\t"
+ "vpternlogq $0x96, %%zmm2, %%zmm5, %%zmm0\n\t"
+ /* Fold zmm3 into zmm1. */
+ "vmovdqa64 %%zmm1, %%zmm5\n\t"
+ "vpclmulqdq $0x00, %%zmm4, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x11, %%zmm4, %%zmm1, %%zmm1\n\t"
+ "vpternlogq $0x96, %%zmm3, %%zmm5, %%zmm1\n\t"
+ :
+ : [k_ymm] "m" (consts->k_ymm[0]));
+
+ asm volatile("vextracti64x4 $1, %%zmm1, %%ymm3\n\t"
+ "vmovdqa %%ymm1, %%ymm2\n\t"
+ "vextracti64x4 $1, %%zmm0, %%ymm1\n\t"
+ :
+ : );
+ }
+ else
+ {
+ asm volatile ("vmovd %[crc], %%xmm4\n\t"
+ "vmovdqu %[inbuf_0], %%ymm0\n\t"
+ "vmovdqu %[inbuf_1], %%ymm1\n\t"
+ "vmovdqu %[inbuf_2], %%ymm2\n\t"
+ "vmovdqu %[inbuf_3], %%ymm3\n\t"
+ "vpxor %%ymm4, %%ymm0, %%ymm0\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 32]),
+ [inbuf_1] "m" (inbuf[1 * 32]),
+ [inbuf_2] "m" (inbuf[2 * 32]),
+ [inbuf_3] "m" (inbuf[3 * 32]),
+ [crc] "m" (*pcrc)
+ );
- inbuf += 4 * 16;
- inlen -= 4 * 16;
+ inbuf += 4 * 32;
+ inlen -= 4 * 32;
- asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
- :
- : [k1k2] "m" (consts->k[1 - 1])
- );
+ asm volatile ("vbroadcasti128 %[k_ymm], %%ymm4\n\t"
+ :
+ : [k_ymm] "m" (consts->k_ymm[0])
+ );
+ }
+
+ /* Fold by 8. */
+ while (inlen >= 4 * 32)
+ {
+ asm volatile ("vmovdqu %[inbuf_0], %%ymm5\n\t"
+ "vmovdqa %%ymm0, %%ymm6\n\t"
+ "vpclmulqdq $0x00, %%ymm4, %%ymm0, %%ymm0\n\t"
+ "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm0, %%ymm0\n\t"
+ "vpxor %%ymm6, %%ymm0, %%ymm0\n\t"
+
+ "vmovdqu %[inbuf_1], %%ymm5\n\t"
+ "vmovdqa %%ymm1, %%ymm6\n\t"
+ "vpclmulqdq $0x00, %%ymm4, %%ymm1, %%ymm1\n\t"
+ "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t"
+ "vpxor %%ymm6, %%ymm1, %%ymm1\n\t"
+
+ "vmovdqu %[inbuf_2], %%ymm5\n\t"
+ "vmovdqa %%ymm2, %%ymm6\n\t"
+ "vpclmulqdq $0x00, %%ymm4, %%ymm2, %%ymm2\n\t"
+ "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
+ "vpxor %%ymm6, %%ymm2, %%ymm2\n\t"
+
+ "vmovdqu %[inbuf_3], %%ymm5\n\t"
+ "vmovdqa %%ymm3, %%ymm6\n\t"
+ "vpclmulqdq $0x00, %%ymm4, %%ymm3, %%ymm3\n\t"
+ "vpclmulqdq $0x11, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t"
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 32]),
+ [inbuf_1] "m" (inbuf[1 * 32]),
+ [inbuf_2] "m" (inbuf[2 * 32]),
+ [inbuf_3] "m" (inbuf[3 * 32])
+ );
+
+ inbuf += 4 * 32;
+ inlen -= 4 * 32;
+ }
+
+ /* Fold 8 to 4. */
+ asm volatile("vbroadcasti128 %[k1k2], %%ymm4\n\t"
+
+ /* Fold ymm2 into ymm0. */
+ "vmovdqa %%ymm0, %%ymm5\n\t"
+ "vpclmulqdq $0x00, %%ymm4, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x11, %%ymm4, %%ymm0, %%ymm0\n\t"
+ "vpxor %%ymm2, %%ymm5, %%ymm5\n\t"
+ "vpxor %%ymm5, %%ymm0, %%ymm0\n\t"
+
+ /* Fold ymm3 into ymm1. */
+ "vmovdqa %%ymm1, %%ymm5\n\t"
+ "vpclmulqdq $0x00, %%ymm4, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x11, %%ymm4, %%ymm1, %%ymm1\n\t"
+ "vpxor %%ymm3, %%ymm5, %%ymm5\n\t"
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t"
+
+ "vextracti128 $1, %%ymm1, %%xmm3\n\t"
+ "vmovdqa %%xmm1, %%xmm2\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm1\n\t"
+
+ "vzeroupper\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ );
+ }
+ else
+ {
+ asm volatile ("movd %[crc], %%xmm4\n\t"
+ "movdqu %[inbuf_0], %%xmm0\n\t"
+ "movdqu %[inbuf_1], %%xmm1\n\t"
+ "movdqu %[inbuf_2], %%xmm2\n\t"
+ "movdqu %[inbuf_3], %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16]),
+ [crc] "m" (*pcrc)
+ );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ );
+ }
/* Fold by 4. */
while (inlen >= 4 * 16)
@@ -219,7 +420,6 @@ crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
);
/* Fold 4 to 1. */
-
asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
"pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
@@ -489,7 +689,7 @@ crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
/* PCLMUL functions for non-reflected CRC32. */
static ASM_FUNC_ATTR_INLINE void
crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
- const struct crc32_consts_s *consts)
+ const struct crc32_consts_s *consts, u32 hwfeatures)
{
asm volatile ("movdqa %[bswap], %%xmm7\n\t"
:
@@ -498,31 +698,230 @@ crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
if (inlen >= 8 * 16)
{
- asm volatile ("movd %[crc], %%xmm4\n\t"
- "movdqu %[inbuf_0], %%xmm0\n\t"
- "movdqu %[inbuf_1], %%xmm1\n\t"
- "movdqu %[inbuf_2], %%xmm2\n\t"
- "pxor %%xmm4, %%xmm0\n\t"
- "movdqu %[inbuf_3], %%xmm3\n\t"
- "pshufb %%xmm7, %%xmm0\n\t"
- "pshufb %%xmm7, %%xmm1\n\t"
- "pshufb %%xmm7, %%xmm2\n\t"
- "pshufb %%xmm7, %%xmm3\n\t"
- :
- : [inbuf_0] "m" (inbuf[0 * 16]),
- [inbuf_1] "m" (inbuf[1 * 16]),
- [inbuf_2] "m" (inbuf[2 * 16]),
- [inbuf_3] "m" (inbuf[3 * 16]),
- [crc] "m" (*pcrc)
- );
+ if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL)
+ && (hwfeatures & HWF_INTEL_AVX2)
+ && inlen >= 8 * 32)
+ {
+ if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL)
+ && (hwfeatures & HWF_INTEL_AVX512)
+ && inlen >= 8 * 64)
+ {
+ asm volatile("vpopcntb %%xmm7, %%xmm0\n\t" /* spec stop for old AVX512 CPUs */
+ "vshufi32x4 $0x00, %%zmm7, %%zmm7, %%zmm7\n\t"
+ "vmovd %[crc], %%xmm4\n\t"
+ "vmovdqu64 %[inbuf_0], %%zmm0\n\t"
+ "vmovdqu64 %[inbuf_1], %%zmm1\n\t"
+ "vmovdqu64 %[inbuf_2], %%zmm2\n\t"
+ "vmovdqu64 %[inbuf_3], %%zmm3\n\t"
+ "vpxorq %%zmm4, %%zmm0, %%zmm0\n\t"
+ "vpshufb %%zmm7, %%zmm0, %%zmm0\n\t"
+ "vpshufb %%zmm7, %%zmm1, %%zmm1\n\t"
+ "vpshufb %%zmm7, %%zmm2, %%zmm2\n\t"
+ "vpshufb %%zmm7, %%zmm3, %%zmm3\n\t"
+ :
+ : [crc] "m" (*pcrc),
+ [inbuf_0] "m" (inbuf[0 * 64]),
+ [inbuf_1] "m" (inbuf[1 * 64]),
+ [inbuf_2] "m" (inbuf[2 * 64]),
+ [inbuf_3] "m" (inbuf[3 * 64])
+ );
+
+ inbuf += 4 * 64;
+ inlen -= 4 * 64;
+
+ asm volatile ("vbroadcasti32x4 %[k_zmm], %%zmm4\n\t"
+ :
+ : [k_zmm] "m" (consts->k_zmm[0])
+ );
- inbuf += 4 * 16;
- inlen -= 4 * 16;
+ /* Fold by 16. */
+ while (inlen >= 4 * 64)
+ {
+ asm volatile ("vmovdqu64 %[inbuf_0], %%zmm5\n\t"
+ "vmovdqa64 %%zmm0, %%zmm6\n\t"
+ "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x01, %%zmm4, %%zmm0, %%zmm0\n\t"
+ "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm0\n\t"
+
+ "vmovdqu64 %[inbuf_1], %%zmm5\n\t"
+ "vmovdqa64 %%zmm1, %%zmm6\n\t"
+ "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x01, %%zmm4, %%zmm1, %%zmm1\n\t"
+ "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm1\n\t"
+
+ "vmovdqu64 %[inbuf_2], %%zmm5\n\t"
+ "vmovdqa64 %%zmm2, %%zmm6\n\t"
+ "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x01, %%zmm4, %%zmm2, %%zmm2\n\t"
+ "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm2\n\t"
+
+ "vmovdqu64 %[inbuf_3], %%zmm5\n\t"
+ "vmovdqa64 %%zmm3, %%zmm6\n\t"
+ "vpshufb %%zmm7, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x01, %%zmm4, %%zmm3, %%zmm3\n\t"
+ "vpclmulqdq $0x10, %%zmm4, %%zmm6, %%zmm6\n\t"
+ "vpternlogq $0x96, %%zmm5, %%zmm6, %%zmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 64]),
+ [inbuf_1] "m" (inbuf[1 * 64]),
+ [inbuf_2] "m" (inbuf[2 * 64]),
+ [inbuf_3] "m" (inbuf[3 * 64])
+ );
+
+ inbuf += 4 * 64;
+ inlen -= 4 * 64;
+ }
+
+ asm volatile("vbroadcasti32x4 %[k_ymm], %%zmm4\n\t"
+ /* Fold zmm2 into zmm0. */
+ "vmovdqa64 %%zmm0, %%zmm5\n\t"
+ "vpclmulqdq $0x01, %%zmm4, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x10, %%zmm4, %%zmm0, %%zmm0\n\t"
+ "vpternlogq $0x96, %%zmm2, %%zmm5, %%zmm0\n\t"
+ /* Fold zmm3 into zmm1. */
+ "vmovdqa64 %%zmm1, %%zmm5\n\t"
+ "vpclmulqdq $0x01, %%zmm4, %%zmm5, %%zmm5\n\t"
+ "vpclmulqdq $0x10, %%zmm4, %%zmm1, %%zmm1\n\t"
+ "vpternlogq $0x96, %%zmm3, %%zmm5, %%zmm1\n\t"
+ :
+ : [k_ymm] "m" (consts->k_ymm[0])
+ );
+
+ asm volatile("vextracti64x4 $1, %%zmm1, %%ymm3\n\t"
+ "vmovdqa %%ymm1, %%ymm2\n\t"
+ "vextracti64x4 $1, %%zmm0, %%ymm1\n\t"
+ :
+ :
+ );
+ }
+ else
+ {
+ asm volatile("vinserti128 $1, %%xmm7, %%ymm7, %%ymm7\n\t"
+ "vmovd %[crc], %%xmm4\n\t"
+ "vmovdqu %[inbuf_0], %%ymm0\n\t"
+ "vmovdqu %[inbuf_1], %%ymm1\n\t"
+ "vmovdqu %[inbuf_2], %%ymm2\n\t"
+ "vmovdqu %[inbuf_3], %%ymm3\n\t"
+ "vpxor %%ymm4, %%ymm0, %%ymm0\n\t"
+ "vpshufb %%ymm7, %%ymm0, %%ymm0\n\t"
+ "vpshufb %%ymm7, %%ymm1, %%ymm1\n\t"
+ "vpshufb %%ymm7, %%ymm2, %%ymm2\n\t"
+ "vpshufb %%ymm7, %%ymm3, %%ymm3\n\t"
+ :
+ : [crc] "m" (*pcrc),
+ [inbuf_0] "m" (inbuf[0 * 32]),
+ [inbuf_1] "m" (inbuf[1 * 32]),
+ [inbuf_2] "m" (inbuf[2 * 32]),
+ [inbuf_3] "m" (inbuf[3 * 32])
+ );
+
+ inbuf += 4 * 32;
+ inlen -= 4 * 32;
+
+ asm volatile ("vbroadcasti128 %[k_ymm], %%ymm4\n\t"
+ : : [k_ymm] "m" (consts->k_ymm[0]));
+ }
- asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
- :
- : [k1k2] "m" (consts->k[1 - 1])
- );
+ /* Fold by 8. */
+ while (inlen >= 4 * 32)
+ {
+ asm volatile ("vmovdqu %[inbuf_0], %%ymm5\n\t"
+ "vmovdqa %%ymm0, %%ymm6\n\t"
+ "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x01, %%ymm4, %%ymm0, %%ymm0\n\t"
+ "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm0, %%ymm0\n\t"
+ "vpxor %%ymm6, %%ymm0, %%ymm0\n\t"
+
+ "vmovdqu %[inbuf_1], %%ymm5\n\t"
+ "vmovdqa %%ymm1, %%ymm6\n\t"
+ "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x01, %%ymm4, %%ymm1, %%ymm1\n\t"
+ "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t"
+ "vpxor %%ymm6, %%ymm1, %%ymm1\n\t"
+
+ "vmovdqu %[inbuf_2], %%ymm5\n\t"
+ "vmovdqa %%ymm2, %%ymm6\n\t"
+ "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x01, %%ymm4, %%ymm2, %%ymm2\n\t"
+ "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
+ "vpxor %%ymm6, %%ymm2, %%ymm2\n\t"
+
+ "vmovdqu %[inbuf_3], %%ymm5\n\t"
+ "vmovdqa %%ymm3, %%ymm6\n\t"
+ "vpshufb %%ymm7, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x01, %%ymm4, %%ymm3, %%ymm3\n\t"
+ "vpclmulqdq $0x10, %%ymm4, %%ymm6, %%ymm6\n\t"
+ "vpxor %%ymm5, %%ymm3, %%ymm3\n\t"
+ "vpxor %%ymm6, %%ymm3, %%ymm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 32]),
+ [inbuf_1] "m" (inbuf[1 * 32]),
+ [inbuf_2] "m" (inbuf[2 * 32]),
+ [inbuf_3] "m" (inbuf[3 * 32])
+ );
+
+ inbuf += 4 * 32;
+ inlen -= 4 * 32;
+ }
+
+ asm volatile("vbroadcasti128 %[k1k2], %%ymm4\n\t"
+
+ /* Fold ymm2 into ymm0. */
+ "vmovdqa %%ymm0, %%ymm5\n\t"
+ "vpclmulqdq $0x01, %%ymm4, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x10, %%ymm4, %%ymm0, %%ymm0\n\t"
+ "vpxor %%ymm2, %%ymm5, %%ymm5\n\t"
+ "vpxor %%ymm5, %%ymm0, %%ymm0\n\t"
+
+ /* Fold ymm3 into ymm1. */
+ "vmovdqa %%ymm1, %%ymm5\n\t"
+ "vpclmulqdq $0x01, %%ymm4, %%ymm5, %%ymm5\n\t"
+ "vpclmulqdq $0x10, %%ymm4, %%ymm1, %%ymm1\n\t"
+ "vpxor %%ymm3, %%ymm5, %%ymm5\n\t"
+ "vpxor %%ymm5, %%ymm1, %%ymm1\n\t"
+
+ "vextracti128 $1, %%ymm1, %%xmm3\n\t"
+ "vmovdqa %%xmm1, %%xmm2\n\t"
+ "vextracti128 $1, %%ymm0, %%xmm1\n\t"
+ "vzeroupper\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ );
+ }
+ else
+ {
+ asm volatile ("movd %[crc], %%xmm4\n\t"
+ "movdqu %[inbuf_0], %%xmm0\n\t"
+ "movdqu %[inbuf_1], %%xmm1\n\t"
+ "movdqu %[inbuf_2], %%xmm2\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf_3], %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16]),
+ [crc] "m" (*pcrc)
+ );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ );
+ }
/* Fold by 4. */
while (inlen >= 4 * 16)
@@ -577,7 +976,6 @@ crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
);
/* Fold 4 to 1. */
-
asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
"pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
"pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
@@ -865,7 +1263,8 @@ crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
}
void ASM_FUNC_ATTR
-_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
+_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen,
+ u32 hwfeatures)
{
const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
@@ -883,7 +1282,7 @@ _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
return;
if (inlen >= 16)
- crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
+ crc32_reflected_bulk(pcrc, inbuf, inlen, consts, hwfeatures);
else
crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);
@@ -898,7 +1297,8 @@ _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
}
void ASM_FUNC_ATTR
-_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
+_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen,
+ u32 hwfeatures)
{
const struct crc32_consts_s *consts = &crc24rfc2440_consts;
#if defined(__x86_64__) && defined(__WIN64__)
@@ -918,7 +1318,7 @@ _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
/* Note: *pcrc in input endian. */
if (inlen >= 16)
- crc32_bulk(pcrc, inbuf, inlen, consts);
+ crc32_bulk(pcrc, inbuf, inlen, consts, hwfeatures);
else
crc32_less_than_16(pcrc, inbuf, inlen, consts);
diff --git a/cipher/crc.c b/cipher/crc.c
index cdff0648..21ab8523 100644
--- a/cipher/crc.c
+++ b/cipher/crc.c
@@ -70,6 +70,7 @@ typedef struct
u32 CRC;
#ifdef USE_INTEL_PCLMUL
unsigned int use_pclmul:1; /* Intel PCLMUL shall be used. */
+ u32 hwfeatures;
#endif
#ifdef USE_ARM_PMULL
unsigned int use_pmull:1; /* ARMv8 PMULL shall be used. */
@@ -84,9 +85,10 @@ CRC_CONTEXT;
#ifdef USE_INTEL_PCLMUL
/*-- crc-intel-pclmul.c --*/
-void _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen,
+ u32 hwfeatures);
void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf,
- size_t inlen);
+ size_t inlen, u32 hwfeatures);
#endif
#ifdef USE_ARM_PMULL
@@ -407,6 +409,7 @@ crc32_init (void *context, unsigned int flags)
#ifdef USE_INTEL_PCLMUL
ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+ ctx->hwfeatures = hwf;
#endif
#ifdef USE_ARM_PMULL
ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
@@ -431,7 +434,7 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen)
#ifdef USE_INTEL_PCLMUL
if (ctx->use_pclmul)
{
- _gcry_crc32_intel_pclmul(&ctx->CRC, inbuf, inlen);
+ _gcry_crc32_intel_pclmul(&ctx->CRC, inbuf, inlen, ctx->hwfeatures);
return;
}
#endif
@@ -506,6 +509,7 @@ crc32rfc1510_init (void *context, unsigned int flags)
#ifdef USE_INTEL_PCLMUL
ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+ ctx->hwfeatures = hwf;
#endif
#ifdef USE_ARM_PMULL
ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
@@ -843,6 +847,7 @@ crc24rfc2440_init (void *context, unsigned int flags)
#ifdef USE_INTEL_PCLMUL
ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+ ctx->hwfeatures = hwf;
#endif
#ifdef USE_ARM_PMULL
ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
@@ -867,7 +872,7 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen)
#ifdef USE_INTEL_PCLMUL
if (ctx->use_pclmul)
{
- _gcry_crc24rfc2440_intel_pclmul(&ctx->CRC, inbuf, inlen);
+ _gcry_crc24rfc2440_intel_pclmul(&ctx->CRC, inbuf, inlen, ctx->hwfeatures);
return;
}
#endif
--
2.48.1