[PATCH 5/8] camellia-avx2: add fast path for full 32 block ECB input

Jussi Kivilinna jussi.kivilinna at iki.fi
Wed Feb 22 20:29:21 CET 2023


* cipher/camellia-aesni-avx2-amd64.h (enc_blk1_32, dec_blk1_32): Add
fast path for 32 block input.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-aesni-avx2-amd64.h | 41 ++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 7d451c09..92f0ce5f 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -2127,12 +2127,9 @@ FUNC_NAME(enc_blk1_32):
 
 	cmpl $31, %ecx;
 	vpxor %xmm0, %xmm0, %xmm0;
-	ja 1f;
+	ja .Lenc_blk32;
 	jb 2f;
 	  vmovdqu 15 * 32(%rdx), %xmm0;
-	  jmp 2f;
-	1:
-	  vmovdqu 15 * 32(%rdx), %ymm0;
 	2:
 	  vmovdqu %ymm0, (%rax);
 
@@ -2195,13 +2192,29 @@ FUNC_NAME(enc_blk1_32):
 	STORE_OUTPUT(ymm9, 14);
 	STORE_OUTPUT(ymm8, 15);
 
+.align 8
 2:
+.Lenc_blk32_done:
 	vzeroall;
 
 	leave;
 	CFI_LEAVE();
 	ret_spec_stop;
 	CFI_ENDPROC();
+
+.align 8
+.Lenc_blk32:
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX));
+
+	call FUNC_NAME(enc_blk32);
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+	jmp .Lenc_blk32_done;
+	CFI_ENDPROC();
 ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)
 
 .align 16
@@ -2235,12 +2248,9 @@ FUNC_NAME(dec_blk1_32):
 
 	cmpl $31, %ecx;
 	vpxor %xmm0, %xmm0, %xmm0;
-	ja 1f;
+	ja .Ldec_blk32;
 	jb 2f;
 	  vmovdqu 15 * 32(%rdx), %xmm0;
-	  jmp 2f;
-	1:
-	  vmovdqu 15 * 32(%rdx), %ymm0;
 	2:
 	  vmovdqu %ymm0, (%rax);
 
@@ -2284,12 +2294,27 @@ FUNC_NAME(dec_blk1_32):
 	STORE_OUTPUT(ymm9, 14);
 	STORE_OUTPUT(ymm8, 15);
 
+.align 8
 2:
+.Ldec_blk32_done:
 	vzeroall;
 
 	leave;
 	CFI_LEAVE();
 	ret_spec_stop;
+
+.align 8
+.Ldec_blk32:
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	call FUNC_NAME(dec_blk32);
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+	jmp .Ldec_blk32_done;
 	CFI_ENDPROC();
 ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)
 
-- 
2.37.2




More information about the Gcrypt-devel mailing list