[PATCH 2/2] chacha20: add AVX512 implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Apr 3 17:10:43 CEST 2022
* cipher/Makefile.am: Add 'chacha20-amd64-avx512.S'.
* cipher/chacha20-amd64-avx512.S: New.
* cipher/chacha20.c (USE_AVX512): New.
(CHACHA20_context_s): Add 'use_avx512'.
[USE_AVX512] (_gcry_chacha20_amd64_avx512_blocks16): New.
(chacha20_do_setkey) [USE_AVX512]: Set up 'use_avx512' based on
HW features.
(do_chacha20_encrypt_stream_tail) [USE_AVX512]: Use the AVX512
implementation if supported.
(_gcry_chacha20_poly1305_encrypt) [USE_AVX512]: Disable the stitched
chacha20-poly1305 code path when the AVX512 implementation is used.
(_gcry_chacha20_poly1305_decrypt) [USE_AVX512]: Likewise.
* configure.ac: Add 'chacha20-amd64-avx512.lo'.
--
Benchmark on Intel Core i3-1115G4 (tigerlake):
Before:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
STREAM enc | 0.276 ns/B 3451 MiB/s 1.13 c/B 4090
STREAM dec | 0.284 ns/B 3359 MiB/s 1.16 c/B 4090
POLY1305 enc | 0.411 ns/B 2320 MiB/s 1.68 c/B 4098±3
POLY1305 dec | 0.408 ns/B 2338 MiB/s 1.67 c/B 4091±1
POLY1305 auth | 0.060 ns/B 15785 MiB/s 0.247 c/B 4090±1
After (stream 1.7x faster, poly1305-aead 1.8x faster):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
STREAM enc | 0.162 ns/B 5869 MiB/s 0.665 c/B 4092±1
STREAM dec | 0.162 ns/B 5884 MiB/s 0.664 c/B 4096±3
POLY1305 enc | 0.221 ns/B 4306 MiB/s 0.907 c/B 4097±3
POLY1305 dec | 0.220 ns/B 4342 MiB/s 0.900 c/B 4096±3
POLY1305 auth | 0.060 ns/B 15797 MiB/s 0.247 c/B 4085±2
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 2 +-
cipher/chacha20-amd64-avx512.S | 300 +++++++++++++++++++++++++++++++++
cipher/chacha20.c | 60 ++++++-
configure.ac | 1 +
4 files changed, 357 insertions(+), 6 deletions(-)
create mode 100644 cipher/chacha20-amd64-avx512.S
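
The integration keeps the existing largest-width-first structure of
do_chacha20_encrypt_stream_tail: the AVX512 kernel consumes 16-block
(1024-byte) chunks first, and whatever remains falls through to the
AVX2 and SSSE3 kernels already in place. A condensed sketch of the
resulting dispatch (illustrative only; burn-stack accounting, the
non-x86 back ends and the final partial-block handling are omitted):

    if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16)
      {
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;
        nblocks -= nblocks % 16;   /* whole 16-block chunks only */
        _gcry_chacha20_amd64_avx512_blocks16 (ctx->input, outbuf, inbuf,
                                              nblocks);
        outbuf += nblocks * CHACHA20_BLOCK_SIZE;
        inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
        length -= nblocks * CHACHA20_BLOCK_SIZE;
      }
    if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
      {
        /* ... unchanged 8-block AVX2 kernel ... */
      }
    /* ... SSSE3 and generic code handle the rest ... */
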
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b6319d35..ed6d7c35 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -81,7 +81,7 @@ EXTRA_libcipher_la_SOURCES = \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
- chacha20-armv7-neon.S chacha20-aarch64.S \
+ chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
chacha20-ppc.c chacha20-s390x.S \
cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
new file mode 100644
index 00000000..da24286e
--- /dev/null
+++ b/cipher/chacha20-amd64-avx512.S
@@ -0,0 +1,300 @@
+/* chacha20-amd64-avx512.S - AVX512 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define INPUT %rdi
+#define DST %rsi
+#define SRC %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* vector registers */
+#define X0 %zmm0
+#define X1 %zmm1
+#define X2 %zmm2
+#define X3 %zmm3
+#define X4 %zmm4
+#define X5 %zmm5
+#define X6 %zmm6
+#define X7 %zmm7
+#define X8 %zmm8
+#define X9 %zmm9
+#define X10 %zmm10
+#define X11 %zmm11
+#define X12 %zmm12
+#define X13 %zmm13
+#define X14 %zmm14
+#define X15 %zmm15
+
+#define TMP0 %zmm16
+#define TMP1 %zmm17
+
+#define COUNTER_ADD %zmm18
+
+#define X12_SAVE %zmm19
+#define X13_SAVE %zmm20
+
+#define S0 %zmm21
+#define S1 %zmm22
+#define S2 %zmm23
+#define S3 %zmm24
+#define S4 %zmm25
+#define S5 %zmm26
+#define S6 %zmm27
+#define S7 %zmm28
+#define S8 %zmm29
+#define S14 %zmm30
+#define S15 %zmm31
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* 4x4 128-bit matrix transpose */
+#define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \
+ vshufi32x4 $0xee, x1, x0, t2; \
+ vshufi32x4 $0x44, x1, x0, x0; \
+ \
+ vshufi32x4 $0x44, x3, x2, t1; \
+ vshufi32x4 $0xee, x3, x2, x2; \
+ \
+ vshufi32x4 $0xdd, t1, x0, x1; \
+ vshufi32x4 $0x88, t1, x0, x0; \
+ \
+ vshufi32x4 $0xdd, x2, t2, x3; \
+ vshufi32x4 $0x88, x2, t2, x2;
+
+#define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \
+ vpxord (offset + 0 * (add))(src), x0, x0; \
+ vpxord (offset + 1 * (add))(src), x4, x4; \
+ vpxord (offset + 2 * (add))(src), x8, x8; \
+ vpxord (offset + 3 * (add))(src), x12, x12; \
+ vmovdqu32 x0, (offset + 0 * (add))(dst); \
+ vmovdqu32 x4, (offset + 1 * (add))(dst); \
+ vmovdqu32 x8, (offset + 2 * (add))(dst); \
+ vmovdqu32 x12, (offset + 3 * (add))(dst);
+
+#define xor_src_dst(dst, src, offset, xreg) \
+ vpxord offset(src), xreg, xreg; \
+ vmovdqu32 xreg, offset(dst);
+
+#define clear_vec4(v0,v1,v2,v3) \
+ vpxord v0, v0, v0; \
+ vpxord v1, v1, v1; \
+ vpxord v2, v2, v2; \
+ vpxord v3, v3, v3;
+
+#define clear_zmm16_zmm31() \
+ clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
+ clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
+ clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
+ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31);
+
+/**********************************************************************
+ 16-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(v1,v2,c) \
+ vprold $(c), v1, v1; \
+ vprold $(c), v2, v2;
+
+#define XOR(ds,s) \
+ vpxord s, ds, ds;
+
+#define PLUS(ds,s) \
+ vpaddd s, ds, ds;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE2(d1, d2, 16); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE2(b1, b2, 12); \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE2(d1, d2, 8); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE2(b1, b2, 7);
+
+.align 64
+ELF(.type _gcry_chacha20_amd64_avx512_data,@object;)
+_gcry_chacha20_amd64_avx512_data:
+.Linc_counter:
+ .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lone:
+ .long 1,0,0,0
+ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data)
+
+.align 16
+.globl _gcry_chacha20_amd64_avx512_blocks16
+ELF(.type _gcry_chacha20_amd64_avx512_blocks16,@function;)
+_gcry_chacha20_amd64_avx512_blocks16:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 16)
+ */
+ CFI_STARTPROC();
+
+ vpxord %xmm16, %xmm16, %xmm16;
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ vpmovzxbd .Linc_counter rRIP, COUNTER_ADD;
+
+ /* Preload state */
+ vpbroadcastd (0 * 4)(INPUT), S0;
+ vpbroadcastd (1 * 4)(INPUT), S1;
+ vpbroadcastd (2 * 4)(INPUT), S2;
+ vpbroadcastd (3 * 4)(INPUT), S3;
+ vpbroadcastd (4 * 4)(INPUT), S4;
+ vpbroadcastd (5 * 4)(INPUT), S5;
+ vpbroadcastd (6 * 4)(INPUT), S6;
+ vpbroadcastd (7 * 4)(INPUT), S7;
+ vpbroadcastd (8 * 4)(INPUT), S8;
+ vpbroadcastd (14 * 4)(INPUT), S14;
+ vpbroadcastd (15 * 4)(INPUT), S15;
+
+.align 16
+.Loop16:
+ movl $20, ROUND;
+
+ /* Construct counter vectors X12 and X13 */
+ vpbroadcastd (12 * 4)(INPUT), X12;
+ vpbroadcastd (13 * 4)(INPUT), X13;
+ vpaddd COUNTER_ADD, X12, X12;
+ vpcmpud $6, X12, COUNTER_ADD, %k2;
+ vpaddd .Lone rRIP {1to16}, X13, X13{%k2};
+ vmovdqa32 X12, X12_SAVE;
+ vmovdqa32 X13, X13_SAVE;
+
+ /* Load vectors */
+ vmovdqa32 S0, X0;
+ vmovdqa32 S4, X4;
+ vmovdqa32 S8, X8;
+ vmovdqa32 S1, X1;
+ vmovdqa32 S5, X5;
+ vpbroadcastd (9 * 4)(INPUT), X9;
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13)
+ vmovdqa32 S2, X2;
+ vmovdqa32 S6, X6;
+ vpbroadcastd (10 * 4)(INPUT), X10;
+ vmovdqa32 S14, X14;
+ vmovdqa32 S3, X3;
+ vmovdqa32 S7, X7;
+ vpbroadcastd (11 * 4)(INPUT), X11;
+ vmovdqa32 S15, X15;
+
+ /* Update counter */
+ addq $16, (12 * 4)(INPUT);
+ jmp .Lround2_entry;
+
+.align 16
+.Lround2:
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13)
+.Lround2_entry:
+ subl $2, ROUND;
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12)
+ jnz .Lround2;
+
+.Lround2_end:
+ PLUS(X0, S0);
+ PLUS(X1, S1);
+ PLUS(X5, S5);
+ PLUS(X6, S6);
+ PLUS(X10, (10 * 4)(INPUT){1to16});
+ PLUS(X11, (11 * 4)(INPUT){1to16});
+ PLUS(X15, S15);
+ PLUS(X12, X12_SAVE);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14)
+
+ PLUS(X2, S2);
+ PLUS(X3, S3);
+ PLUS(X4, S4);
+ PLUS(X7, S7);
+ transpose_4x4(X0, X1, X2, X3, TMP0, TMP1);
+ transpose_4x4(X4, X5, X6, X7, TMP0, TMP1);
+ PLUS(X8, S8);
+ PLUS(X9, (9 * 4)(INPUT){1to16});
+ PLUS(X13, X13_SAVE);
+ PLUS(X14, S14);
+ transpose_4x4(X8, X9, X10, X11, TMP0, TMP1);
+ transpose_4x4(X12, X13, X14, X15, TMP0, TMP1);
+
+ transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1);
+ xor_src_dst_4x4(DST, SRC, (64 * 0), (64 * 4), X0, X4, X8, X12);
+ transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1);
+ xor_src_dst_4x4(DST, SRC, (64 * 1), (64 * 4), X1, X5, X9, X13);
+ transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1);
+ xor_src_dst_4x4(DST, SRC, (64 * 2), (64 * 4), X2, X6, X10, X14);
+ transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1);
+ xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15);
+
+ subq $16, NBLKS;
+ leaq (16 * 64)(SRC), SRC;
+ leaq (16 * 64)(DST), DST;
+ jnz .Loop16;
+
+ /* clear the used vector registers */
+ clear_zmm16_zmm31();
+ kmovd %eax, %k2;
+ vzeroall; /* clears ZMM0-ZMM15 */
+
+ /* eax zeroed by round loop. */
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_amd64_avx512_blocks16,
+ .-_gcry_chacha20_amd64_avx512_blocks16;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
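
For reference, here is the scalar shape of what each of the 16 lanes
above computes, following the chacha-regs.c code cited in the file
header. This is a sketch only; the function name and state layout are
illustrative rather than taken from the patch:

    #include <stdint.h>
    #include <string.h>

    #define ROTL32(v, c)  (((v) << (c)) | ((v) >> (32 - (c))))

    /* One ChaCha20 quarter round; the rotation amounts 16/12/8/7 match
       the ROTATE2 calls in QUARTERROUND2 above.  */
    #define QUARTERROUND(a, b, c, d) \
      a += b; d ^= a; d = ROTL32 (d, 16); \
      c += d; b ^= c; b = ROTL32 (b, 12); \
      a += b; d ^= a; d = ROTL32 (d,  8); \
      c += d; b ^= c; b = ROTL32 (b,  7);

    /* Scalar reference for one 64-byte block.  The AVX512 routine runs
       16 such block computations in parallel, one per 32-bit element
       of each ZMM register.  */
    static void
    chacha20_ref_block (uint32_t x[16], const uint32_t input[16])
    {
      int i;

      memcpy (x, input, 64);
      for (i = 20; i > 0; i -= 2)
        {
          /* Column round. */
          QUARTERROUND (x[0], x[4], x[ 8], x[12]);
          QUARTERROUND (x[1], x[5], x[ 9], x[13]);
          QUARTERROUND (x[2], x[6], x[10], x[14]);
          QUARTERROUND (x[3], x[7], x[11], x[15]);
          /* Diagonal round. */
          QUARTERROUND (x[0], x[5], x[10], x[15]);
          QUARTERROUND (x[1], x[6], x[11], x[12]);
          QUARTERROUND (x[2], x[7], x[ 8], x[13]);
          QUARTERROUND (x[3], x[4], x[ 9], x[14]);
        }
      for (i = 0; i < 16; i++)
        x[i] += input[i];   /* feed-forward; cf. the PLUS(Xn, Sn) block */
    }

Lane n of the vector code uses block counter input[12] + n (from
.Linc_counter); the vpcmpud/vpaddd {%k2} pair propagates the 32-bit
carry from state word 12 into word 13 for the lanes where that
addition wrapped.
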
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 870cfa18..8dec4317 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -64,6 +64,14 @@
# define USE_AVX2 1
#endif
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
#undef USE_ARMV7_NEON
#ifdef ENABLE_NEON_SUPPORT
@@ -123,6 +131,7 @@ typedef struct CHACHA20_context_s
unsigned int unused; /* bytes in the pad. */
unsigned int use_ssse3:1;
unsigned int use_avx2:1;
+ unsigned int use_avx512:1;
unsigned int use_neon:1;
unsigned int use_ppc:1;
unsigned int use_s390x:1;
@@ -161,6 +170,14 @@ unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
#endif /* USE_AVX2 */
+#ifdef USE_AVX512
+
+unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks) ASM_FUNC_ABI;
+
+#endif /* USE_AVX512 */
+
#ifdef USE_PPC_VEC
unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
@@ -464,6 +481,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
#ifdef USE_SSSE3
ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#endif
+#ifdef USE_AVX512
+ ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
+#endif
#ifdef USE_AVX2
ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif
@@ -510,6 +530,20 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
unsigned int nburn, burn = 0;
+#ifdef USE_AVX512
+ if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 16;
+ nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
#ifdef USE_AVX2
if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
{
@@ -703,6 +737,13 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
if (0)
{ }
+#ifdef USE_AVX512
+ else if (ctx->use_avx512)
+ {
+ /* Skip stitched chacha20-poly1305 for AVX512. */
+ authptr = NULL;
+ }
+#endif
#ifdef USE_AVX2
else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
{
@@ -1000,6 +1041,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
{
CHACHA20_context_t *ctx = (void *) &c->context.c;
unsigned int nburn, burn = 0;
+ int skip_stitched = 0;
if (!length)
return 0;
@@ -1035,8 +1077,16 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+#ifdef USE_AVX512
+ if (ctx->use_avx512)
+ {
+ /* Skip stitched chacha20-poly1305 for AVX512. */
+ skip_stitched = 1;
+ }
+#endif
+
#ifdef USE_AVX2
- if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
+ if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 8;
@@ -1053,7 +1103,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
#endif
#ifdef USE_SSSE3
- if (ctx->use_ssse3)
+ if (!skip_stitched && ctx->use_ssse3)
{
if (length >= 4 * CHACHA20_BLOCK_SIZE)
{
@@ -1087,7 +1137,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
#endif
#ifdef USE_AARCH64_SIMD
- if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
+ if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
@@ -1104,7 +1154,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
#endif
#ifdef USE_PPC_VEC_POLY1305
- if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
+ if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
nblocks -= nblocks % 4;
@@ -1121,7 +1171,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
#endif
#ifdef USE_S390X_VX_POLY1305
- if (ctx->use_s390x)
+ if (!skip_stitched && ctx->use_s390x)
{
if (length >= 8 * CHACHA20_BLOCK_SIZE)
{
diff --git a/configure.ac b/configure.ac
index 778dc633..582678e6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2759,6 +2759,7 @@ if test "$found" = "1" ; then
# Build with the assembly implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo"
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo"
;;
aarch64-*-*)
# Build with the assembly implementation
--
2.32.0
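
A quick way to exercise the new path from the public API: the 16-block
kernel is entered whenever a single encrypt call covers at least
16 * 64 = 1024 bytes and the CPU reports AVX512 support at runtime.
Minimal caller sketch (key, nonce and buffer contents are placeholders;
error checking omitted):

    #include <gcrypt.h>
    #include <string.h>

    int
    main (void)
    {
      gcry_cipher_hd_t hd;
      unsigned char key[32] = { 0 };    /* placeholder key */
      unsigned char nonce[12] = { 0 };  /* placeholder nonce */
      unsigned char buf[4096];          /* >= 1024 bytes per call */

      memset (buf, 0, sizeof buf);
      gcry_check_version (NULL);

      gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
      gcry_cipher_setkey (hd, key, sizeof key);
      gcry_cipher_setiv (hd, nonce, sizeof nonce);
      gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);  /* in-place */
      gcry_cipher_close (hd);
      return 0;
    }

The chacha20-poly1305 path is reached the same way with
GCRY_CIPHER_MODE_POLY1305 instead of GCRY_CIPHER_MODE_STREAM.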