[PATCH 2/8] sm4: add CTR-mode byte addition for AVX/AVX2/AVX512 implementations

Jussi Kivilinna <jussi.kivilinna at iki.fi>
Wed Feb 22 20:29:18 CET 2023


* cipher/sm4-aesni-avx-amd64.S
(_gcry_sm4_aesni_avx_ctr_enc): Add byte addition fast-path.
* cipher/sm4-aesni-avx2-amd64.S
(_gcry_sm4_aesni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx2-amd64.S
(_gcry_sm4_gfni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx512-amd64.S
(_gcry_sm4_gfni_avx512_ctr_enc)
(_gcry_sm4_gfni_avx512_ctr_enc_blk32): Likewise.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/sm4-aesni-avx-amd64.S   |  68 +++++++++++++++++++++-
 cipher/sm4-aesni-avx2-amd64.S  |  65 ++++++++++++++++++++-
 cipher/sm4-gfni-avx2-amd64.S   |  65 ++++++++++++++++++++-
 cipher/sm4-gfni-avx512-amd64.S | 103 ++++++++++++++++++++++++++++++++-
 4 files changed, 295 insertions(+), 6 deletions(-)
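
For readers who prefer C, here is a rough model of what the new fast path
computes. The names used below (ctr_gen_counters, be128_add, NBLKS) are
illustrative only and do not exist in libgcrypt; the assembly does the same
work with vpaddb against the new .Lbige_addb_* tables, a single addb on the
stored counter in the common case, and the bswap/add/adc sequence only when
the low byte of the counter is about to wrap.

  #include <stdint.h>
  #include <string.h>

  #define NBLKS 8  /* 8 for AVX, 16 for AVX2/GFNI-AVX2, 32 for AVX512 blk32 */

  /* Big-endian 128-bit addition of a small value (models the generic
     carry handling). */
  static void
  be128_add (uint8_t v[16], unsigned int add)
  {
    int j;

    for (j = 15; j >= 0 && add; j--)
      {
        add += v[j];
        v[j] = add & 0xff;
        add >>= 8;
      }
  }

  /* Produce NBLKS per-block counters from *iv and advance *iv by NBLKS. */
  static void
  ctr_gen_counters (uint8_t counters[NBLKS][16], uint8_t iv[16])
  {
    int i;

    if (iv[15] <= 0x100 - NBLKS)  /* cmpb $(0x100 - N), 15(%rcx); jbe */
      {
        /* Fast path: adding 0..NBLKS-1 cannot overflow the last byte,
           so each block counter is a plain byte addition (vpaddb). */
        for (i = 0; i < NBLKS; i++)
          {
            memcpy (counters[i], iv, 16);
            counters[i][15] += i;
          }

        if (iv[15] == 0x100 - NBLKS)
          be128_add (iv, NBLKS);  /* boundary: .Lctr_byteadd_full_ctr_carry */
        else
          iv[15] += NBLKS;        /* common case: addb $N, 15(%rcx) */
      }
    else
      {
        /* Slow path: per-block big-endian increments, as the
           pre-existing code does with byte-swaps and 64-bit adds. */
        for (i = 0; i < NBLKS; i++)
          {
            memcpy (counters[i], iv, 16);
            be128_add (counters[i], i);
          }
        be128_add (iv, NBLKS);
      }
  }

The threshold 0x100 - N means the byte-addition path is skipped only for the
N - 1 highest values of the counter's low byte, so the common case avoids the
byte-swaps and 128-bit carry arithmetic entirely.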

diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S
index c09b205d..ca9be44a 100644
--- a/cipher/sm4-aesni-avx-amd64.S
+++ b/cipher/sm4-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
 /* sm4-avx-aesni-amd64.S  -  AES-NI/AVX implementation of SM4 cipher
  *
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2020,2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -150,6 +150,38 @@ _sm4_aesni_avx_consts:
 .Lbswap32_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+/* CTR byte addition constants */
+.Lbige_addb_1:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:
@@ -529,6 +561,9 @@ _gcry_sm4_aesni_avx_ctr_enc:
 	 */
 	CFI_STARTPROC();
 
+	cmpb $(0x100 - 8), 15(%rcx);
+	jbe .Lctr_byteadd;
+
 	/* load IV and byteswap */
 	vmovdqu (%rcx), RA0;
 
@@ -565,6 +600,8 @@ _gcry_sm4_aesni_avx_ctr_enc:
 	/* store new IV */
 	vmovdqu RTMP1, (%rcx);
 
+.align 8
+.Lload_ctr_done:
 	call __sm4_crypt_blk8;
 
 	vpxor (0 * 16)(%rdx), RA0, RA0;
@@ -588,6 +625,35 @@ _gcry_sm4_aesni_avx_ctr_enc:
 	vzeroall;
 
 	ret_spec_stop;
+	.align 8
+
+.Lctr_byteadd_full_ctr_carry:
+	movq 8(%rcx), %r11;
+	movq (%rcx), %r10;
+	bswapq %r11;
+	bswapq %r10;
+	addq $8, %r11;
+	adcq $0, %r10;
+	bswapq %r11;
+	bswapq %r10;
+	movq %r11, 8(%rcx);
+	movq %r10, (%rcx);
+	jmp .Lctr_byteadd_xmm;
+.align 8
+.Lctr_byteadd:
+	vmovdqu (%rcx), RA0;
+	je .Lctr_byteadd_full_ctr_carry;
+	addb $8, 15(%rcx);
+.Lctr_byteadd_xmm:
+	vpaddb .Lbige_addb_1 rRIP, RA0, RA1;
+	vpaddb .Lbige_addb_2 rRIP, RA0, RA2;
+	vpaddb .Lbige_addb_3 rRIP, RA0, RA3;
+	vpaddb .Lbige_addb_4 rRIP, RA0, RB0;
+	vpaddb .Lbige_addb_5 rRIP, RA0, RB1;
+	vpaddb .Lbige_addb_6 rRIP, RA0, RB2;
+	vpaddb .Lbige_addb_7 rRIP, RA0, RB3;
+
+	jmp .Lload_ctr_done;
 	CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;)
 
diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index acd37cff..03f979fa 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* sm4-avx2-amd64.S  -  AVX2 implementation of SM4 cipher
  *
- * Copyright (C) 2020, 2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -171,6 +171,33 @@ _sm4_aesni_avx2_consts:
 .Lbswap32_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+/* CTR byte addition constants */
+.align 32
+.Lbige_addb_0_1:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:
@@ -371,6 +398,9 @@ _gcry_sm4_aesni_avx2_ctr_enc:
 	 */
 	CFI_STARTPROC();
 
+	cmpb $(0x100 - 16), 15(%rcx);
+	jbe .Lctr_byteadd;
+
 	movq 8(%rcx), %rax;
 	bswapq %rax;
 
@@ -438,11 +468,12 @@ _gcry_sm4_aesni_avx2_ctr_enc:
 	vextracti128 $1, RTMP0, RTMP0x;
 	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
 
-.align 4
 .Lctr_carry_done:
 	/* store new IV */
 	vmovdqu RTMP0x, (%rcx);
 
+.align 8
+.Lload_ctr_done:
 	call __sm4_crypt_blk16;
 
 	vpxor (0 * 32)(%rdx), RA0, RA0;
@@ -466,6 +497,36 @@ _gcry_sm4_aesni_avx2_ctr_enc:
 	vzeroall;
 
 	ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+	movq 8(%rcx), %r11;
+	movq (%rcx), %r10;
+	bswapq %r11;
+	bswapq %r10;
+	addq $16, %r11;
+	adcq $0, %r10;
+	bswapq %r11;
+	bswapq %r10;
+	movq %r11, 8(%rcx);
+	movq %r10, (%rcx);
+	jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+	vbroadcasti128 (%rcx), RB3;
+	je .Lctr_byteadd_full_ctr_carry;
+	addb $16, 15(%rcx);
+.Lctr_byteadd_ymm:
+	vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+	vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+	vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+	vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+	vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+	vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+	vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+	vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+	jmp .Lload_ctr_done;
 	CFI_ENDPROC();
 ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;)
 
diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S
index 2fbaffd5..464da399 100644
--- a/cipher/sm4-gfni-avx2-amd64.S
+++ b/cipher/sm4-gfni-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* sm4-gfni-avx2-amd64.S  -  GFNI/AVX2 implementation of SM4 cipher
  *
- * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -136,6 +136,33 @@ _sm4_gfni_avx2_consts:
 .Lbswap32_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+/* CTR byte addition constants */
+.align 32
+.Lbige_addb_0_1:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
 .text
 
 .align 16
@@ -658,6 +685,9 @@ _gcry_sm4_gfni_avx2_ctr_enc:
 	 */
 	CFI_STARTPROC();
 
+	cmpb $(0x100 - 16), 15(%rcx);
+	jbe .Lctr_byteadd;
+
 	movq 8(%rcx), %rax;
 	bswapq %rax;
 
@@ -725,11 +755,12 @@ _gcry_sm4_gfni_avx2_ctr_enc:
 	vextracti128 $1, RTMP0, RTMP0x;
 	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
 
-.align 4
 .Lctr_carry_done:
 	/* store new IV */
 	vmovdqu RTMP0x, (%rcx);
 
+.align 8
+.Lload_ctr_done:
 	call __sm4_gfni_crypt_blk16;
 
 	vpxor (0 * 32)(%rdx), RA0, RA0;
@@ -753,6 +784,36 @@ _gcry_sm4_gfni_avx2_ctr_enc:
 	vzeroall;
 
 	ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+	movq 8(%rcx), %r11;
+	movq (%rcx), %r10;
+	bswapq %r11;
+	bswapq %r10;
+	addq $16, %r11;
+	adcq $0, %r10;
+	bswapq %r11;
+	bswapq %r10;
+	movq %r11, 8(%rcx);
+	movq %r10, (%rcx);
+	jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+	vbroadcasti128 (%rcx), RB3;
+	je .Lctr_byteadd_full_ctr_carry;
+	addb $16, 15(%rcx);
+.Lctr_byteadd_ymm:
+	vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+	vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+	vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+	vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+	vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+	vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+	vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+	vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+	jmp .Lload_ctr_done;
 	CFI_ENDPROC();
 ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;)
 
diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S
index b095f85d..91f6e80b 100644
--- a/cipher/sm4-gfni-avx512-amd64.S
+++ b/cipher/sm4-gfni-avx512-amd64.S
@@ -1,6 +1,6 @@
 /* sm4-gfni-avx512-amd64.S  -  GFNI/AVX512 implementation of SM4 cipher
  *
- * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -146,6 +146,35 @@ SECTION_RODATA
 	.quad 2, 0
 	.quad 3, 0
 
+/* CTR byte addition constants */
+.align 64
+.Lbige_addb_0_1:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16:
+	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
 .text
 
 .align 16
@@ -627,6 +656,9 @@ _gcry_sm4_gfni_avx512_ctr_enc:
 	CFI_STARTPROC();
 	spec_stop_avx512;
 
+	cmpb $(0x100 - 16), 15(%rcx);
+	jbe .Lctr_byteadd16;
+
 	vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
 	vmovdqa .Lcounter0123_lo rRIP, RTMP1;
 	vbroadcasti128 .Lcounter2222_lo rRIP, RTMP2;
@@ -695,6 +727,8 @@ _gcry_sm4_gfni_avx512_ctr_enc:
 	vpshufb RTMP0, RB2, RB2;
 	vpshufb RTMP0, RB3, RB3;
 
+.align 16
+.Lload_ctr_done16:
 	call __sm4_gfni_crypt_blk16;
 
 	vpxor (0 * 32)(%rdx), RA0, RA0;
@@ -719,6 +753,36 @@ _gcry_sm4_gfni_avx512_ctr_enc:
 	kxorq %k1, %k1, %k1;
 
 	ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry16:
+	movq 8(%rcx), %r11;
+	movq (%rcx), %r10;
+	bswapq %r11;
+	bswapq %r10;
+	addq $16, %r11;
+	adcq $0, %r10;
+	bswapq %r11;
+	bswapq %r10;
+	movq %r11, 8(%rcx);
+	movq %r10, (%rcx);
+	jmp .Lctr_byteadd_ymm16;
+.align 16
+.Lctr_byteadd16:
+	vbroadcasti128 (%rcx), RB3;
+	je .Lctr_byteadd_full_ctr_carry16;
+	addb $16, 15(%rcx);
+.Lctr_byteadd_ymm16:
+	vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+	vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+	vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+	vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+	vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+	vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+	vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+	vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+	jmp .Lload_ctr_done16;
 	CFI_ENDPROC();
 ELF(.size _gcry_sm4_gfni_avx512_ctr_enc,.-_gcry_sm4_gfni_avx512_ctr_enc;)
 
@@ -1304,6 +1368,9 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
 	CFI_STARTPROC();
 	spec_stop_avx512;
 
+	cmpb $(0x100 - 32), 15(%rcx);
+	jbe .Lctr_byteadd32;
+
 	vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z;
 	vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z;
 	vbroadcasti64x2 .Lcounter4444_lo rRIP, RTMP2z;
@@ -1372,6 +1439,8 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
 	vpshufb RTMP0z, RB2z, RB2z;
 	vpshufb RTMP0z, RB3z, RB3z;
 
+.align 16
+.Lload_ctr_done32:
 	call __sm4_gfni_crypt_blk32;
 
 	vpxord (0 * 64)(%rdx), RA0z, RA0z;
@@ -1396,6 +1465,38 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
 	kxorq %k1, %k1, %k1;
 
 	ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry32:
+	movq 8(%rcx), %r11;
+	movq (%rcx), %r10;
+	bswapq %r11;
+	bswapq %r10;
+	addq $32, %r11;
+	adcq $0, %r10;
+	bswapq %r11;
+	bswapq %r10;
+	movq %r11, 8(%rcx);
+	movq %r10, (%rcx);
+	jmp .Lctr_byteadd_zmm32;
+.align 16
+.Lctr_byteadd32:
+	vbroadcasti64x2 (%rcx), RA3z;
+	je .Lctr_byteadd_full_ctr_carry32;
+	addb $32, 15(%rcx);
+.Lctr_byteadd_zmm32:
+	vbroadcasti64x2 .Lbige_addb_16 rRIP, RB3z;
+	vpaddb RB3z, RA3z, RB3z;
+	vpaddb .Lbige_addb_0_1 rRIP, RA3z, RA0z;
+	vpaddb .Lbige_addb_4_5 rRIP, RA3z, RA1z;
+	vpaddb .Lbige_addb_8_9 rRIP, RA3z, RA2z;
+	vpaddb .Lbige_addb_12_13 rRIP, RA3z, RA3z;
+	vpaddb .Lbige_addb_0_1 rRIP, RB3z, RB0z;
+	vpaddb .Lbige_addb_4_5 rRIP, RB3z, RB1z;
+	vpaddb .Lbige_addb_8_9 rRIP, RB3z, RB2z;
+	vpaddb .Lbige_addb_12_13 rRIP, RB3z, RB3z;
+
+	jmp .Lload_ctr_done32;
 	CFI_ENDPROC();
 ELF(.size _gcry_sm4_gfni_avx512_ctr_enc_blk32,.-_gcry_sm4_gfni_avx512_ctr_enc_blk32;)
 
-- 
2.37.2