[PATCH 2/2] chacha20: add AVX512 implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Apr 3 17:10:43 CEST 2022


* cipher/Makefile.am: Add 'chacha20-amd64-avx512.S'.
* cipher/chacha20-amd64-avx512.S: New.
* cipher/chacha20.c (USE_AVX512): New.
(CHACHA20_context_s): Add 'use_avx512'.
[USE_AVX512] (_gcry_chacha20_amd64_avx512_blocks16): New.
(chacha20_do_setkey) [USE_AVX512]: Set up 'use_avx512' based on
HW features.
(do_chacha20_encrypt_stream_tail) [USE_AVX512]: Use the AVX512
implementation if supported.
(_gcry_chacha20_poly1305_encrypt) [USE_AVX512]: Disable the stitched
chacha20-poly1305 implementations if the AVX512 implementation is used.
(_gcry_chacha20_poly1305_decrypt) [USE_AVX512]: Disable the stitched
chacha20-poly1305 implementations if the AVX512 implementation is used.
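
In outline, the stream-tail dispatch added below peels off whole multiples of
16 blocks for the new AVX512 routine and leaves any remainder to the existing
AVX2/SSSE3/generic paths.  The helper below is only an illustrative
condensation of that hunk, not part of the patch: 'consume_avx512_blocks' and
the local typedefs are invented for the sketch, and the ASM_FUNC_ABI attribute
and stack-burn tracking of the real code are omitted.

  #include <stddef.h>

  typedef unsigned char byte;
  typedef unsigned int u32;
  #define CHACHA20_BLOCK_SIZE 64

  /* Provided by chacha20-amd64-avx512.S; returns the stack burn depth. */
  extern unsigned int _gcry_chacha20_amd64_avx512_blocks16 (u32 *state,
                                                            byte *dst,
                                                            const byte *src,
                                                            size_t nblks);

  /* Process as many whole multiples of 16 blocks as possible; return the
   * number of bytes consumed so the caller can advance its pointers. */
  static size_t
  consume_avx512_blocks (u32 *state, byte *dst, const byte *src,
                         size_t length)
  {
    if (length >= CHACHA20_BLOCK_SIZE * 16)
      {
        size_t nblocks = length / CHACHA20_BLOCK_SIZE;

        nblocks -= nblocks % 16;  /* keep a multiple of 16 for the kernel */
        _gcry_chacha20_amd64_avx512_blocks16 (state, dst, src, nblocks);
        return nblocks * CHACHA20_BLOCK_SIZE;
      }
    return 0;  /* too short; the caller falls through to the other paths */
  }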
--

Benchmark on Intel Core i3-1115G4 (tigerlake):

 Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.276 ns/B      3451 MiB/s      1.13 c/B      4090
     STREAM dec |     0.284 ns/B      3359 MiB/s      1.16 c/B      4090
   POLY1305 enc |     0.411 ns/B      2320 MiB/s      1.68 c/B      4098±3
   POLY1305 dec |     0.408 ns/B      2338 MiB/s      1.67 c/B      4091±1
  POLY1305 auth |     0.060 ns/B     15785 MiB/s     0.247 c/B      4090±1

 After (stream 1.7x faster, poly1305-aead 1.8x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.162 ns/B      5869 MiB/s     0.665 c/B      4092±1
     STREAM dec |     0.162 ns/B      5884 MiB/s     0.664 c/B      4096±3
   POLY1305 enc |     0.221 ns/B      4306 MiB/s     0.907 c/B      4097±3
   POLY1305 dec |     0.220 ns/B      4342 MiB/s     0.900 c/B      4096±3
  POLY1305 auth |     0.060 ns/B     15797 MiB/s     0.247 c/B      4085±2
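
The speedup comes from keeping sixteen blocks in flight: each 32-bit word of
the ChaCha20 state is broadcast into its own ZMM register, so every
quarter-round instruction operates on the same word of all 16 blocks at once,
and the EVEX 'vprold' instruction performs each 32-bit rotate in a single
operation.  For reference, the scalar quarter round that the QUARTERROUND2
macro vectorizes is the standard one from RFC 8439; the C rendering below is
purely illustrative, with 'quarter_round' and 'ROTL32' being names picked for
the sketch.

  #include <stdint.h>

  /* Rotate a 32-bit word left by c bits (c in 1..31). */
  #define ROTL32(v, c) (((v) << (c)) | ((v) >> (32 - (c))))

  /* One ChaCha20 quarter round; QUARTERROUND2 in chacha20-amd64-avx512.S
   * applies this to two column/diagonal groups of 16 blocks at a time
   * using vpaddd/vpxord/vprold on ZMM registers. */
  static void
  quarter_round (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = ROTL32 (*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32 (*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32 (*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32 (*b, 7);
  }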

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am             |   2 +-
 cipher/chacha20-amd64-avx512.S | 300 +++++++++++++++++++++++++++++++++
 cipher/chacha20.c              |  60 ++++++-
 configure.ac                   |   1 +
 4 files changed, 357 insertions(+), 6 deletions(-)
 create mode 100644 cipher/chacha20-amd64-avx512.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b6319d35..ed6d7c35 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -81,7 +81,7 @@ EXTRA_libcipher_la_SOURCES = \
 	blowfish.c blowfish-amd64.S blowfish-arm.S \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
-	chacha20-armv7-neon.S chacha20-aarch64.S \
+	chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
 	chacha20-ppc.c chacha20-s390x.S \
 	cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
 	cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
new file mode 100644
index 00000000..da24286e
--- /dev/null
+++ b/cipher/chacha20-amd64-avx512.S
@@ -0,0 +1,300 @@
+/* chacha20-amd64-avx512.S  -  AVX512 implementation of ChaCha20 cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on D. J. Bernstein's reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+.text
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define INPUT %rdi
+#define DST   %rsi
+#define SRC   %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* vector registers */
+#define X0 %zmm0
+#define X1 %zmm1
+#define X2 %zmm2
+#define X3 %zmm3
+#define X4 %zmm4
+#define X5 %zmm5
+#define X6 %zmm6
+#define X7 %zmm7
+#define X8 %zmm8
+#define X9 %zmm9
+#define X10 %zmm10
+#define X11 %zmm11
+#define X12 %zmm12
+#define X13 %zmm13
+#define X14 %zmm14
+#define X15 %zmm15
+
+#define TMP0 %zmm16
+#define TMP1 %zmm17
+
+#define COUNTER_ADD %zmm18
+
+#define X12_SAVE %zmm19
+#define X13_SAVE %zmm20
+
+#define S0 %zmm21
+#define S1 %zmm22
+#define S2 %zmm23
+#define S3 %zmm24
+#define S4 %zmm25
+#define S5 %zmm26
+#define S6 %zmm27
+#define S7 %zmm28
+#define S8 %zmm29
+#define S14 %zmm30
+#define S15 %zmm31
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
+
+/* 4x4 128-bit matrix transpose */
+#define transpose_16byte_4x4(x0,x1,x2,x3,t1,t2) \
+	vshufi32x4 $0xee, x1, x0, t2; \
+	vshufi32x4 $0x44, x1, x0, x0; \
+	\
+	vshufi32x4 $0x44, x3, x2, t1; \
+	vshufi32x4 $0xee, x3, x2, x2; \
+	\
+	vshufi32x4 $0xdd, t1, x0, x1; \
+	vshufi32x4 $0x88, t1, x0, x0; \
+	\
+	vshufi32x4 $0xdd, x2, t2, x3; \
+	vshufi32x4 $0x88, x2, t2, x2;
+
+#define xor_src_dst_4x4(dst, src, offset, add, x0, x4, x8, x12) \
+	vpxord (offset + 0 * (add))(src), x0, x0; \
+	vpxord (offset + 1 * (add))(src), x4, x4; \
+	vpxord (offset + 2 * (add))(src), x8, x8; \
+	vpxord (offset + 3 * (add))(src), x12, x12; \
+	vmovdqu32 x0, (offset + 0 * (add))(dst); \
+	vmovdqu32 x4, (offset + 1 * (add))(dst); \
+	vmovdqu32 x8, (offset + 2 * (add))(dst); \
+	vmovdqu32 x12, (offset + 3 * (add))(dst);
+
+#define xor_src_dst(dst, src, offset, xreg) \
+	vpxord offset(src), xreg, xreg; \
+	vmovdqu32 xreg, offset(dst);
+
+#define clear_vec4(v0,v1,v2,v3) \
+	vpxord v0, v0, v0; \
+	vpxord v1, v1, v1; \
+	vpxord v2, v2, v2; \
+	vpxord v3, v3, v3;
+
+#define clear_zmm16_zmm31() \
+	clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
+	clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
+	clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
+	clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31);
+
+/**********************************************************************
+  16-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(v1,v2,c)	\
+	vprold $(c), v1, v1;	\
+	vprold $(c), v2, v2;
+
+#define XOR(ds,s) \
+	vpxord s, ds, ds;
+
+#define PLUS(ds,s) \
+	vpaddd s, ds, ds;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2)			\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE2(d1, d2, 16);				\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2, 12);				\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE2(d1, d2, 8);					\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2, 7);
+
+.align 64
+ELF(.type _gcry_chacha20_amd64_avx512_data,@object;)
+_gcry_chacha20_amd64_avx512_data:
+.Linc_counter:
+	.byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lone:
+	.long 1,0,0,0
+ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data)
+
+.align 16
+.globl _gcry_chacha20_amd64_avx512_blocks16
+ELF(.type _gcry_chacha20_amd64_avx512_blocks16,@function;)
+_gcry_chacha20_amd64_avx512_blocks16:
+	/* input:
+	 *	%rdi: input
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks (multiple of 16)
+	 */
+	CFI_STARTPROC();
+
+	vpxord %xmm16, %xmm16, %xmm16;
+	vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+	vpmovzxbd .Linc_counter rRIP, COUNTER_ADD;
+
+	/* Preload state */
+	vpbroadcastd (0 * 4)(INPUT), S0;
+	vpbroadcastd (1 * 4)(INPUT), S1;
+	vpbroadcastd (2 * 4)(INPUT), S2;
+	vpbroadcastd (3 * 4)(INPUT), S3;
+	vpbroadcastd (4 * 4)(INPUT), S4;
+	vpbroadcastd (5 * 4)(INPUT), S5;
+	vpbroadcastd (6 * 4)(INPUT), S6;
+	vpbroadcastd (7 * 4)(INPUT), S7;
+	vpbroadcastd (8 * 4)(INPUT), S8;
+	vpbroadcastd (14 * 4)(INPUT), S14;
+	vpbroadcastd (15 * 4)(INPUT), S15;
+
+.align 16
+.Loop16:
+	movl $20, ROUND;
+
+	/* Construct counter vectors X12 and X13 */
+	vpbroadcastd (12 * 4)(INPUT), X12;
+	vpbroadcastd (13 * 4)(INPUT), X13;
+	vpaddd COUNTER_ADD, X12, X12;
+	vpcmpud $6, X12, COUNTER_ADD, %k2;
+	vpaddd .Lone rRIP {1to16}, X13, X13{%k2};
+	vmovdqa32 X12, X12_SAVE;
+	vmovdqa32 X13, X13_SAVE;
+
+	/* Load vectors */
+	vmovdqa32 S0, X0;
+	vmovdqa32 S4, X4;
+	vmovdqa32 S8, X8;
+	vmovdqa32 S1, X1;
+	vmovdqa32 S5, X5;
+	vpbroadcastd (9 * 4)(INPUT), X9;
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13)
+	vmovdqa32 S2, X2;
+	vmovdqa32 S6, X6;
+	vpbroadcastd (10 * 4)(INPUT), X10;
+	vmovdqa32 S14, X14;
+	vmovdqa32 S3, X3;
+	vmovdqa32 S7, X7;
+	vpbroadcastd (11 * 4)(INPUT), X11;
+	vmovdqa32 S15, X15;
+
+	/* Update counter */
+	addq $16, (12 * 4)(INPUT);
+	jmp .Lround2_entry;
+
+.align 16
+.Lround2:
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14)
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13)
+.Lround2_entry:
+	subl $2, ROUND;
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15)
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12)
+	jnz .Lround2;
+
+.Lround2_end:
+	PLUS(X0, S0);
+	PLUS(X1, S1);
+	PLUS(X5, S5);
+	PLUS(X6, S6);
+	PLUS(X10, (10 * 4)(INPUT){1to16});
+	PLUS(X11, (11 * 4)(INPUT){1to16});
+	PLUS(X15, S15);
+	PLUS(X12, X12_SAVE);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14)
+
+	PLUS(X2, S2);
+	PLUS(X3, S3);
+	PLUS(X4, S4);
+	PLUS(X7, S7);
+	transpose_4x4(X0, X1, X2, X3, TMP0, TMP1);
+	transpose_4x4(X4, X5, X6, X7, TMP0, TMP1);
+	PLUS(X8, S8);
+	PLUS(X9, (9 * 4)(INPUT){1to16});
+	PLUS(X13, X13_SAVE);
+	PLUS(X14, S14);
+	transpose_4x4(X8, X9, X10, X11, TMP0, TMP1);
+	transpose_4x4(X12, X13, X14, X15, TMP0, TMP1);
+
+	transpose_16byte_4x4(X0, X4, X8, X12, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 0), (64 * 4), X0, X4, X8, X12);
+	transpose_16byte_4x4(X1, X5, X9, X13, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 1), (64 * 4), X1, X5, X9, X13);
+	transpose_16byte_4x4(X2, X6, X10, X14, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 2), (64 * 4), X2, X6, X10, X14);
+	transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1);
+	xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15);
+
+	subq $16, NBLKS;
+	leaq (16 * 64)(SRC), SRC;
+	leaq (16 * 64)(DST), DST;
+	jnz .Loop16;
+
+	/* clear the used vector registers */
+	clear_zmm16_zmm31();
+	kmovd %eax, %k2;
+	vzeroall; /* clears ZMM0-ZMM15 */
+
+	/* eax zeroed by round loop. */
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_chacha20_amd64_avx512_blocks16,
+	  .-_gcry_chacha20_amd64_avx512_blocks16;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 870cfa18..8dec4317 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -64,6 +64,14 @@
 # define USE_AVX2 1
 #endif
 
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
 /* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
 #undef USE_ARMV7_NEON
 #ifdef ENABLE_NEON_SUPPORT
@@ -123,6 +131,7 @@ typedef struct CHACHA20_context_s
   unsigned int unused; /* bytes in the pad.  */
   unsigned int use_ssse3:1;
   unsigned int use_avx2:1;
+  unsigned int use_avx512:1;
   unsigned int use_neon:1;
   unsigned int use_ppc:1;
   unsigned int use_s390x:1;
@@ -161,6 +170,14 @@ unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
 
 #endif /* USE_AVX2 */
 
+#ifdef USE_AVX512
+
+unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst,
+						  const byte *src,
+						  size_t nblks) ASM_FUNC_ABI;
+
+#endif /* USE_AVX512 */
+
 #ifdef USE_PPC_VEC
 
 unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
@@ -464,6 +481,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
 #ifdef USE_SSSE3
   ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
 #endif
+#ifdef USE_AVX512
+  ctx->use_avx512 = (features & HWF_INTEL_AVX512) != 0;
+#endif
 #ifdef USE_AVX2
   ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
 #endif
@@ -510,6 +530,20 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
   static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
   unsigned int nburn, burn = 0;
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16)
+    {
+      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+      nblocks -= nblocks % 16;
+      nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf,
+						   nblocks);
+      burn = nburn > burn ? nburn : burn;
+      length -= nblocks * CHACHA20_BLOCK_SIZE;
+      outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+      inbuf  += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+#endif
+
 #ifdef USE_AVX2
   if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
     {
@@ -703,6 +737,13 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
 
   if (0)
     { }
+#ifdef USE_AVX512
+  else if (ctx->use_avx512)
+    {
+      /* Skip stitched chacha20-poly1305 for AVX512. */
+      authptr = NULL;
+    }
+#endif
 #ifdef USE_AVX2
   else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
     {
@@ -1000,6 +1041,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 {
   CHACHA20_context_t *ctx = (void *) &c->context.c;
   unsigned int nburn, burn = 0;
+  int skip_stitched = 0;
 
   if (!length)
     return 0;
@@ -1035,8 +1077,16 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 
   gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
 
+#ifdef USE_AVX512
+  if (ctx->use_avx512)
+    {
+      /* Skip stitched chacha20-poly1305 for AVX512. */
+      skip_stitched = 1;
+    }
+#endif
+
 #ifdef USE_AVX2
-  if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
+  if (!skip_stitched && ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 8;
@@ -1053,7 +1103,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_SSSE3
-  if (ctx->use_ssse3)
+  if (!skip_stitched && ctx->use_ssse3)
     {
       if (length >= 4 * CHACHA20_BLOCK_SIZE)
 	{
@@ -1087,7 +1137,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_AARCH64_SIMD
-  if (ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
+  if (!skip_stitched && ctx->use_neon && length >= 4 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
@@ -1104,7 +1154,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_PPC_VEC_POLY1305
-  if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
+  if (!skip_stitched && ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
       nblocks -= nblocks % 4;
@@ -1121,7 +1171,7 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
 #endif
 
 #ifdef USE_S390X_VX_POLY1305
-  if (ctx->use_s390x)
+  if (!skip_stitched && ctx->use_s390x)
     {
       if (length >= 8 * CHACHA20_BLOCK_SIZE)
 	{
diff --git a/configure.ac b/configure.ac
index 778dc633..582678e6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2759,6 +2759,7 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-ssse3.lo"
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx2.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS chacha20-amd64-avx512.lo"
       ;;
       aarch64-*-*)
          # Build with the assembly implementation
-- 
2.32.0
