[PATCH 3/3] sm4-aesni-avx2: add generic 1 to 16 block bulk processing function

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sun Apr 24 20:47:03 CEST 2022


* cipher/sm4-aesni-avx2-amd64.S: Remove unnecessary vzeroupper at
function entries.
(_gcry_sm4_aesni_avx2_crypt_blk1_16): New.
* cipher/sm4.c (_gcry_sm4_aesni_avx2_crypt_blk1_16)
(sm4_aesni_avx2_crypt_blk1_16): New.
(sm4_get_crypt_blk1_16_fn) [USE_AESNI_AVX2]: Add
'sm4_aesni_avx2_crypt_blk1_16'.
--

Benchmark AMD Ryzen 5800X:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.48 ns/B     643.2 MiB/s      7.19 c/B      4850
        XTS dec |      1.48 ns/B     644.3 MiB/s      7.18 c/B      4850

After (1.37x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.07 ns/B     888.7 MiB/s      5.21 c/B      4850
        XTS dec |      1.07 ns/B     889.4 MiB/s      5.20 c/B      4850
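
The XTS numbers improve because the generic bulk helpers can now process
16 blocks per call with the AVX2 code instead of falling back to the
8-block AVX implementation.  The new entry point loads up to eight ymm
registers, two blocks per register, switching to a single 128-bit load
for an odd trailing block.  As a rough illustration only (not part of
the patch; load_input_sketch and regs[] are made-up names for this
sketch), the LOAD_INPUT(offset, yreg) selection corresponds to the
following C code using AVX2 intrinsics:

  #include <immintrin.h>

  /* Sketch of the per-register decision made by LOAD_INPUT(offset, yreg);
   * num_blks is 1..16 and each ymm register holds two 16-byte SM4 blocks. */
  static void
  load_input_sketch (__m256i regs[8], const unsigned char *in,
                     unsigned int num_blks)
  {
    unsigned int offset;

    for (offset = 0; offset < 8; offset++)
      {
        if (num_blks < 2 * offset + 1)
          break;                        /* nothing left for this register */
        else if (num_blks == 2 * offset + 1)
          {
            /* One trailing block: 128-bit load, upper half zeroed
             * (matches vmovdqu into the xmm half of the ymm register). */
            regs[offset] = _mm256_zextsi128_si256 (
                _mm_loadu_si128 ((const __m128i *) (in + offset * 32)));
            break;
          }
        else
          {
            /* Two full blocks: 256-bit load. */
            regs[offset] = _mm256_loadu_si256 ((const __m256i *)
                                               (in + offset * 32));
          }
      }
  }

STORE_OUTPUT makes the same comparison on the way out, so writes never
go past the requested number of blocks.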

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/sm4-aesni-avx2-amd64.S | 82 +++++++++++++++++++++++++++++------
 cipher/sm4.c                  | 26 +++++++++++
 2 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index effe590b..e09fed8f 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* sm4-avx2-amd64.S  -  AVX2 implementation of SM4 cipher
  *
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -45,11 +45,19 @@
 #define RA1          %ymm9
 #define RA2          %ymm10
 #define RA3          %ymm11
+#define RA0x         %xmm8
+#define RA1x         %xmm9
+#define RA2x         %xmm10
+#define RA3x         %xmm11
 
 #define RB0          %ymm12
 #define RB1          %ymm13
 #define RB2          %ymm14
 #define RB3          %ymm15
+#define RB0x         %xmm12
+#define RB1x         %xmm13
+#define RB2x         %xmm14
+#define RB3x         %xmm15
 
 #define RNOT         %ymm0
 #define RBSWAP       %ymm1
@@ -280,6 +288,66 @@ __sm4_crypt_blk16:
 	CFI_ENDPROC();
 ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)
 
+.align 8
+.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
+ELF(.type   _gcry_sm4_aesni_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_aesni_avx2_crypt_blk1_16:
+	/* input:
+	 *	%rdi: round key array, CTX
+	 *	%rsi: dst (1..16 blocks)
+	 *	%rdx: src (1..16 blocks)
+	 *	%rcx: num blocks (1..16)
+	 */
+	CFI_STARTPROC();
+
+#define LOAD_INPUT(offset, yreg) \
+	cmpq $(1 + 2 * (offset)), %rcx; \
+	jb .Lblk16_load_input_done; \
+	ja 1f; \
+	  vmovdqu (offset) * 32(%rdx), yreg##x; \
+	  jmp .Lblk16_load_input_done; \
+	1: \
+	  vmovdqu (offset) * 32(%rdx), yreg;
+
+	LOAD_INPUT(0, RA0);
+	LOAD_INPUT(1, RA1);
+	LOAD_INPUT(2, RA2);
+	LOAD_INPUT(3, RA3);
+	LOAD_INPUT(4, RB0);
+	LOAD_INPUT(5, RB1);
+	LOAD_INPUT(6, RB2);
+	LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+	call __sm4_crypt_blk16;
+
+#define STORE_OUTPUT(yreg, offset) \
+	cmpq $(1 + 2 * (offset)), %rcx; \
+	jb .Lblk16_store_output_done; \
+	ja 1f; \
+	  vmovdqu yreg##x, (offset) * 32(%rsi); \
+	  jmp .Lblk16_store_output_done; \
+	1: \
+	  vmovdqu yreg, (offset) * 32(%rsi);
+
+	STORE_OUTPUT(RA0, 0);
+	STORE_OUTPUT(RA1, 1);
+	STORE_OUTPUT(RA2, 2);
+	STORE_OUTPUT(RA3, 3);
+	STORE_OUTPUT(RB0, 4);
+	STORE_OUTPUT(RB1, 5);
+	STORE_OUTPUT(RB2, 6);
+	STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+	vzeroall;
+	xorl %eax, %eax;
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_16;)
+
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
 	vpsubq minus_one, x, x; \
@@ -301,8 +369,6 @@ _gcry_sm4_aesni_avx2_ctr_enc:
 	movq 8(%rcx), %rax;
 	bswapq %rax;
 
-	vzeroupper;
-
 	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
 	vpcmpeqd RNOT, RNOT, RNOT;
 	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
@@ -410,8 +476,6 @@ _gcry_sm4_aesni_avx2_cbc_dec:
 	 */
 	CFI_STARTPROC();
 
-	vzeroupper;
-
 	vmovdqu (0 * 32)(%rdx), RA0;
 	vmovdqu (1 * 32)(%rdx), RA1;
 	vmovdqu (2 * 32)(%rdx), RA2;
@@ -463,8 +527,6 @@ _gcry_sm4_aesni_avx2_cfb_dec:
 	 */
 	CFI_STARTPROC();
 
-	vzeroupper;
-
 	/* Load input */
 	vmovdqu (%rcx), RNOTx;
 	vinserti128 $1, (%rdx), RNOT, RA0;
@@ -521,8 +583,6 @@ _gcry_sm4_aesni_avx2_ocb_enc:
 	 */
 	CFI_STARTPROC();
 
-	vzeroupper;
-
 	subq $(4 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
@@ -635,8 +695,6 @@ _gcry_sm4_aesni_avx2_ocb_dec:
 	 */
 	CFI_STARTPROC();
 
-	vzeroupper;
-
 	subq $(4 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
@@ -758,8 +816,6 @@ _gcry_sm4_aesni_avx2_ocb_auth:
 	 */
 	CFI_STARTPROC();
 
-	vzeroupper;
-
 	subq $(4 * 8), %rsp;
 	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 9d00ee05..1f27f508 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -291,6 +291,24 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
 					  unsigned char *offset,
 					  unsigned char *checksum,
 					  const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_aesni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+                             unsigned int num_blks)
+{
+#ifdef USE_AESNI_AVX
+  /* Use 128-bit register implementation for short input. */
+  if (num_blks <= 8)
+    return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+#endif
+
+  return _gcry_sm4_aesni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
 #endif /* USE_AESNI_AVX2 */
 
 #ifdef USE_GFNI_AVX2
@@ -382,6 +400,7 @@ sm4_aarch64_crypt_blk1_16(const void *rk, byte *out, const byte *in,
   _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks);
   return 0;
 }
+
 #endif /* USE_AARCH64_SIMD */
 
 #ifdef USE_ARM_CE
@@ -427,6 +446,7 @@ sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in,
   _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks);
   return 0;
 }
+
 #endif /* USE_ARM_CE */
 
 static inline void prefetch_sbox_table(void)
@@ -758,6 +778,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
       return &sm4_gfni_avx2_crypt_blk1_16;
     }
 #endif
+#ifdef USE_AESNI_AVX2
+  else if (ctx->use_aesni_avx2)
+    {
+      return &sm4_aesni_avx2_crypt_blk1_16;
+    }
+#endif
 #ifdef USE_AESNI_AVX
   else if (ctx->use_aesni_avx)
     {
-- 
2.34.1