[PATCH 1/8] aes-vaes-avx2: improve case when only CTR needs carry handling

Jussi Kivilinna jussi.kivilinna at iki.fi
Wed Feb 22 20:29:17 CET 2023


* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ctr_enc_amd64): Add handling for the case when
only the main counter needs carry handling but the generated
vector counters do not.
--
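
The counter block is kept big-endian in memory at (%rsi), with a
little-endian shadow copy in %r11:%r10. Each blkN path first does
"addb $N, 15(%rsi)" on the counter's low byte and then branches on
the resulting flags: no carry means the cheap byte-wise vpaddb
increments are safe; carry with a zero low byte means only the
stored 128-bit counter needs carry propagation, while the N
generated block counters still fit within the low byte; carry with
a nonzero low byte means some generated counter wraps mid-sequence
and the slow full 128-bit path is still required. A rough C model
of that three-way dispatch (purely illustrative, not part of the
patch; the names below are made up):

  enum ctr_path { CTR_PATH_FAST, CTR_PATH_CTR_CARRY_ONLY,
                  CTR_PATH_FULL_CARRY };

  static enum ctr_path
  ctr_inc_path (unsigned char ctr_lo, unsigned int nblks)
  {
    unsigned int sum = ctr_lo + nblks;  /* addb $nblks, 15(%rsi) */

    if (sum <= 0xff)
      /* CF=0: no byte overflow anywhere; byte-wise vpaddb
       * increments are safe and the shadow counter is bumped
       * with leaq. */
      return CTR_PATH_FAST;

    if ((sum & 0xff) == 0)
      /* CF=1, ZF=1: ctr_lo was exactly 256 - nblks, so the
       * generated counters ctr+0 .. ctr+nblks-1 end at low byte
       * 0xff without wrapping; only the stored 128-bit counter
       * needs the carry (handle_ctr_128bit_add), after which the
       * cheap vpaddb path is reused.  This is the new fast path
       * added by this patch. */
      return CTR_PATH_CTR_CARRY_ONLY;

    /* CF=1, ZF=0: some generated counter crosses the low-byte
     * boundary; take the full inc_le128/add2_le128 path, as
     * before. */
    return CTR_PATH_FULL_CARRY;
  }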

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-vaes-avx2-amd64.S | 76 +++++++++++++++++--------------
 1 file changed, 41 insertions(+), 35 deletions(-)
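
Note for reviewers: the new handle_ctr_128bit_add(nblks) macro below
performs a 128-bit add on the big-endian counter via its
little-endian shadow in %r11:%r10 (addq/adcq, then bswapq and store
both halves back). A C equivalent, as a sketch only and not part of
the patch:

  #include <stdint.h>

  /* Models handle_ctr_128bit_add(nblks): ctr_be mirrors the
   * big-endian counter at (%rsi), lo/hi mirror %r10/%r11. */
  static void
  ctr128_be_add (unsigned char ctr_be[16], uint64_t *lo,
                 uint64_t *hi, uint64_t nblks)
  {
    uint64_t old_lo = *lo;
    int i;

    *lo += nblks;            /* addq $(nblks), %r10 */
    *hi += (*lo < old_lo);   /* adcq $0, %r11 */

    /* bswapq + movq: store both halves back big-endian. */
    for (i = 0; i < 8; i++)
      {
        ctr_be[15 - i] = (unsigned char)(*lo >> (8 * i));
        ctr_be[7 - i]  = (unsigned char)(*hi >> (8 * i));
      }
  }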

diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index aceccb96..10213bfb 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -738,6 +738,16 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 	vpslldq $8, tmp2, tmp2; \
 	vpsubq tmp2, x, x;
 
+#define handle_ctr_128bit_add(nblks) \
+	addq $(nblks), %r10; \
+	adcq $0, %r11; \
+	bswapq %r10; \
+	bswapq %r11; \
+	movq %r10, 8(%rsi); \
+	movq %r11, 0(%rsi); \
+	bswapq %r10; \
+	bswapq %r11;
+
 	/* Process 16 blocks per loop. */
 .align 8
 .Lctr_enc_blk16:
@@ -753,6 +763,9 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 	addb $16, 15(%rsi);
 	jc .Lctr_enc_blk16_handle_carry;
 
+	leaq 16(%r10), %r10;
+
+  .Lctr_enc_blk16_byte_bige_add:
 	/* Increment counters. */
 	vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0;
 	vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1;
@@ -762,7 +775,6 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 	vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5;
 	vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6;
 	vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7;
-	leaq 16(%r10), %r10;
 
   .Lctr_enc_blk16_rounds:
 	/* AES rounds */
@@ -829,22 +841,21 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
 	jmp .Lctr_enc_blk16;
 
+  .align 8
+  .Lctr_enc_blk16_handle_only_ctr_carry:
+	handle_ctr_128bit_add(16);
+	jmp .Lctr_enc_blk16_byte_bige_add;
+
   .align 8
   .Lctr_enc_blk16_handle_carry:
+	jz .Lctr_enc_blk16_handle_only_ctr_carry;
 	/* Increment counters (handle carry). */
 	vpshufb %xmm13, %xmm7, %xmm1; /* be => le */
 	vmovdqa %xmm1, %xmm0;
 	inc_le128(%xmm1, %xmm15, %xmm5);
 	vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */
 	vpshufb %ymm13, %ymm7, %ymm0;
-	addq $16, %r10;
-	adcq $0, %r11;
-	bswapq %r10;
-	bswapq %r11;
-	movq %r10, 8(%rsi);
-	movq %r11, 0(%rsi);
-	bswapq %r10;
-	bswapq %r11;
+	handle_ctr_128bit_add(16);
 	add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */
 	vpshufb %ymm13, %ymm7, %ymm1;
 	add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */
@@ -877,12 +888,14 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 	addb $8, 15(%rsi);
 	jc .Lctr_enc_blk8_handle_carry;
 
+	leaq 8(%r10), %r10;
+
+  .Lctr_enc_blk8_byte_bige_add:
 	/* Increment counters. */
 	vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
 	vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
 	vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2;
 	vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3;
-	leaq 8(%r10), %r10;
 
   .Lctr_enc_blk8_rounds:
 	/* AES rounds */
@@ -937,22 +950,21 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
 	jmp .Lctr_enc_blk4;
 
+  .align 8
+  .Lctr_enc_blk8_handle_only_ctr_carry:
+	handle_ctr_128bit_add(8);
+	jmp .Lctr_enc_blk8_byte_bige_add;
+
   .align 8
   .Lctr_enc_blk8_handle_carry:
+	jz .Lctr_enc_blk8_handle_only_ctr_carry;
 	/* Increment counters (handle carry). */
 	vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
 	vmovdqa %xmm1, %xmm0;
 	inc_le128(%xmm1, %xmm15, %xmm5);
 	vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
 	vpshufb %ymm13, %ymm3, %ymm0;
-	addq $8, %r10;
-	adcq $0, %r11;
-	bswapq %r10;
-	bswapq %r11;
-	movq %r10, 8(%rsi);
-	movq %r11, 0(%rsi);
-	bswapq %r10;
-	bswapq %r11;
+	handle_ctr_128bit_add(8);
 	add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
 	vpshufb %ymm13, %ymm3, %ymm1;
 	add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */
@@ -977,10 +989,12 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 	addb $4, 15(%rsi);
 	jc .Lctr_enc_blk4_handle_carry;
 
+	leaq 4(%r10), %r10;
+
+  .Lctr_enc_blk4_byte_bige_add:
 	/* Increment counters. */
 	vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
 	vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
-	leaq 4(%r10), %r10;
 
   .Lctr_enc_blk4_rounds:
 	/* AES rounds */
@@ -1029,22 +1043,21 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
 	jmp .Lctr_enc_blk1;
 
+  .align 8
+  .Lctr_enc_blk4_handle_only_ctr_carry:
+	handle_ctr_128bit_add(4);
+	jmp .Lctr_enc_blk4_byte_bige_add;
+
   .align 8
   .Lctr_enc_blk4_handle_carry:
+	jz .Lctr_enc_blk4_handle_only_ctr_carry;
 	/* Increment counters (handle carry). */
 	vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
 	vmovdqa %xmm1, %xmm0;
 	inc_le128(%xmm1, %xmm15, %xmm5);
 	vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
 	vpshufb %ymm13, %ymm3, %ymm0;
-	addq $4, %r10;
-	adcq $0, %r11;
-	bswapq %r10;
-	bswapq %r11;
-	movq %r10, 8(%rsi);
-	movq %r11, 0(%rsi);
-	bswapq %r10;
-	bswapq %r11;
+	handle_ctr_128bit_add(4);
 	add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
 	vpshufb %ymm13, %ymm3, %ymm1;
 
@@ -1060,14 +1073,7 @@ _gcry_vaes_avx2_ctr_enc_amd64:
 
 	/* Load and increment counter. */
 	vmovdqu (%rsi), %xmm0;
-	addq $1, %r10;
-	adcq $0, %r11;
-	bswapq %r10;
-	bswapq %r11;
-	movq %r10, 8(%rsi);
-	movq %r11, 0(%rsi);
-	bswapq %r10;
-	bswapq %r11;
+	handle_ctr_128bit_add(1);
 
 	/* AES rounds. */
 	vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
-- 
2.37.2