[PATCH 2/4] rijndael-vaes-avx2-amd64: avoid extra load in CFB & CBC IV handling
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Jul 10 20:07:01 CEST 2023
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_cbc_dec_amd64, _gcry_vaes_avx2_cfb_dec_amd64): Avoid
duplicate memory load from source buffer.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
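Notes:

Both CBC and CFB decryption already load the first ciphertext block from
(0 * 16)(%rcx) with vmovdqu, but then read the same 16 bytes a second time
through the memory-operand form of vinserti128 when combining that block
with the IV kept in the low lane of %ymm15.  The hunks below keep a single
load and take the second use from the register instead.  As a rough sketch
of the CBC 16-block path (register numbers as in the actual code, other
lines elided):

  Before:
      vmovdqu (0 * 16)(%rcx), %ymm0;                  /* blocks 0..1 */
      ...
      vpxor %ymm8, %ymm0, %ymm0;                      /* %ymm0 clobbered */
      ...
      vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9;  /* block 0 re-read */

  After:
      vmovdqu (0 * 16)(%rcx), %ymm0;                  /* blocks 0..1 */
      vinserti128 $1, %xmm0, %ymm15, %ymm9;           /* reuse low lane */
      vpxor %ymm8, %ymm0, %ymm0;                      /* now safe to clobber */
      ...

The register form has to be issued before the vpxor with the first round
key, since that vpxor overwrites %ymm0.  In the CFB paths the plain vmovdqu
load is instead hoisted above the vinserti128 so that its destination
register (%ymm9 or %ymm10) can be reused the same way.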
cipher/rijndael-vaes-avx2-amd64.S | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index fd012982..51ccf932 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -119,6 +119,7 @@ _gcry_vaes_avx2_cbc_dec_amd64:
vmovdqu (10 * 16)(%rcx), %ymm5;
vmovdqu (12 * 16)(%rcx), %ymm6;
vmovdqu (14 * 16)(%rcx), %ymm7;
+ vinserti128 $1, %xmm0, %ymm15, %ymm9;
vpxor %ymm8, %ymm0, %ymm0;
vpxor %ymm8, %ymm1, %ymm1;
vpxor %ymm8, %ymm2, %ymm2;
@@ -128,7 +129,6 @@ _gcry_vaes_avx2_cbc_dec_amd64:
vpxor %ymm8, %ymm6, %ymm6;
vpxor %ymm8, %ymm7, %ymm7;
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
- vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm9;
vmovdqu (1 * 16)(%rcx), %ymm10;
vmovdqu (3 * 16)(%rcx), %ymm11;
vmovdqu (5 * 16)(%rcx), %ymm12;
@@ -212,12 +212,12 @@ _gcry_vaes_avx2_cbc_dec_amd64:
vmovdqu (2 * 16)(%rcx), %ymm1;
vmovdqu (4 * 16)(%rcx), %ymm2;
vmovdqu (6 * 16)(%rcx), %ymm3;
+ vinserti128 $1, %xmm0, %ymm15, %ymm10;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vpxor %ymm4, %ymm2, %ymm2;
vpxor %ymm4, %ymm3, %ymm3;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
- vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
vmovdqu (1 * 16)(%rcx), %ymm11;
vmovdqu (3 * 16)(%rcx), %ymm12;
vmovdqu (5 * 16)(%rcx), %ymm13;
@@ -283,10 +283,10 @@ _gcry_vaes_avx2_cbc_dec_amd64:
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
vmovdqu (0 * 16)(%rcx), %ymm0;
vmovdqu (2 * 16)(%rcx), %ymm1;
+ vinserti128 $1, %xmm0, %ymm15, %ymm10;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
- vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm10;
vmovdqu (1 * 16)(%rcx), %ymm11;
vmovdqu (3 * 16)(%rcx), %xmm15;
leaq (4 * 16)(%rcx), %rcx;
@@ -418,7 +418,8 @@ _gcry_vaes_avx2_cfb_dec_amd64:
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm8;
- vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+ vmovdqu (0 * 16)(%rcx), %ymm9;
+ vinserti128 $1, %xmm9, %ymm15, %ymm0;
vmovdqu (1 * 16)(%rcx), %ymm1;
vmovdqu (3 * 16)(%rcx), %ymm2;
vmovdqu (5 * 16)(%rcx), %ymm3;
@@ -436,7 +437,6 @@ _gcry_vaes_avx2_cfb_dec_amd64:
vpxor %ymm8, %ymm6, %ymm6;
vpxor %ymm8, %ymm7, %ymm7;
vbroadcasti128 (1 * 16)(%rdi), %ymm8;
- vmovdqu (0 * 16)(%rcx), %ymm9;
vmovdqu (2 * 16)(%rcx), %ymm10;
vmovdqu (4 * 16)(%rcx), %ymm11;
vmovdqu (6 * 16)(%rcx), %ymm12;
@@ -516,7 +516,8 @@ _gcry_vaes_avx2_cfb_dec_amd64:
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
- vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+ vmovdqu (0 * 16)(%rcx), %ymm10;
+ vinserti128 $1, %xmm10, %ymm15, %ymm0;
vmovdqu (1 * 16)(%rcx), %ymm1;
vmovdqu (3 * 16)(%rcx), %ymm2;
vmovdqu (5 * 16)(%rcx), %ymm3;
@@ -526,7 +527,6 @@ _gcry_vaes_avx2_cfb_dec_amd64:
vpxor %ymm4, %ymm2, %ymm2;
vpxor %ymm4, %ymm3, %ymm3;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
- vmovdqu (0 * 16)(%rcx), %ymm10;
vmovdqu (2 * 16)(%rcx), %ymm11;
vmovdqu (4 * 16)(%rcx), %ymm12;
vmovdqu (6 * 16)(%rcx), %ymm13;
@@ -590,13 +590,13 @@ _gcry_vaes_avx2_cfb_dec_amd64:
/* Load input and xor first key. Update IV. */
vbroadcasti128 (0 * 16)(%rdi), %ymm4;
- vinserti128 $1, (0 * 16)(%rcx), %ymm15, %ymm0;
+ vmovdqu (0 * 16)(%rcx), %ymm10;
+ vinserti128 $1, %xmm10, %ymm15, %ymm0;
vmovdqu (1 * 16)(%rcx), %ymm1;
vmovdqu (3 * 16)(%rcx), %xmm15;
vpxor %ymm4, %ymm0, %ymm0;
vpxor %ymm4, %ymm1, %ymm1;
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
- vmovdqu (0 * 16)(%rcx), %ymm10;
vmovdqu (2 * 16)(%rcx), %ymm11;
leaq (4 * 16)(%rcx), %rcx;
--
2.39.2