[PATCH] Reduce size of x86-64 stitched Chacha20-Poly1305 implementations

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu Sep 19 21:20:25 CEST 2019


* cipher/chacha20-amd64-avx2.S
(_gcry_chacha20_poly1305_amd64_avx2_blocks8): De-unroll round loop.
* cipher/chacha20-amd64-ssse3.S
(_gcry_chacha20_poly1305_amd64_ssse3_blocks4)
(_gcry_chacha20_poly1305_amd64_ssse3_blocks1): Ditto.
--

Object size before:
   text    data     bss     dec     hex filename
  13428       0       0   13428    3474 cipher/.libs/chacha20-amd64-avx2.o
  23175       0       0   23175    5a87 cipher/.libs/chacha20-amd64-ssse3.o

Object size after:
   text    data     bss     dec     hex filename
   4815       0       0    4815    12cf cipher/.libs/chacha20-amd64-avx2.o
   9284       0       0    9284    2444 cipher/.libs/chacha20-amd64-ssse3.o
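
The de-unrolled control flow is easier to see in C than in the YMM
assembly, so here is a minimal counting model of the AVX2 path. This is
a sketch, not libgcrypt code: the helper names are made up, the real
code spreads each Poly1305 block over the POLY1305_BLOCK_PART1..PART5
macros between individual quarter-rounds, and both loop counters live
in the extra stack slot allocated in the hunks below.

  #include <assert.h>
  #include <stdio.h>

  static int rounds, poly_blocks;

  /* Counting stand-ins for one QUARTERROUND2 x4 group (two ChaCha20
     rounds across all eight blocks) and one full Poly1305 block
     (PART1..PART5).  Purely illustrative. */
  static void chacha20_doubleround8(void) { rounds += 2; }
  static void poly1305_block(int src_offset) { (void)src_offset; poly_blocks++; }

  int main(void)
  {
    for (int outer = 20; outer > 0; outer -= 10)     /* two passes         */
      {
        for (int inner = 8; inner > 0; inner -= 2)   /* rounds 0-7 / 10-17 */
          {
            chacha20_doubleround8();
            for (int i = 0; i < 4; i++)              /* PART1(0..3 * 16)   */
              poly1305_block(i * 16);
            /* the lea (4 * 16)(POLY_RSRC), POLY_RSRC advance now sits
               here instead of once per 8-block batch */
          }
        chacha20_doubleround8();                     /* rounds 8-9 / 18-19 */
      }

    assert(rounds == 20);       /* full ChaCha20                  */
    assert(poly_blocks == 32);  /* 32 * 16 bytes == 8 * 64 bytes  */
    printf("%d rounds, %d poly1305 blocks per batch\n", rounds, poly_blocks);
    return 0;
  }

Compiling and running this prints "20 rounds, 32 poly1305 blocks per
batch", matching the fully unrolled code it replaces.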

Benchmark on AMD Ryzen 3700X (AVX2 impl.):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.267 ns/B      3575 MiB/s      1.15 c/B      4318
     STREAM dec |     0.266 ns/B      3586 MiB/s      1.15 c/B      4329
   POLY1305 enc |     0.315 ns/B      3024 MiB/s      1.36 c/B      4315±1
   POLY1305 dec |     0.296 ns/B      3220 MiB/s      1.28 c/B      4310
  POLY1305 auth |     0.223 ns/B      4270 MiB/s     0.968 c/B      4335

After:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.266 ns/B      3583 MiB/s      1.15 c/B      4327
     STREAM dec |     0.265 ns/B      3603 MiB/s      1.16 c/B      4371±1
   POLY1305 enc |     0.293 ns/B      3251 MiB/s      1.27 c/B      4315
   POLY1305 dec |     0.279 ns/B      3418 MiB/s      1.19 c/B      4282±3
  POLY1305 auth |     0.225 ns/B      4241 MiB/s     0.978 c/B      4351
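
The SSSE3 variants reuse the same loop shape; only the amount of work
per inner iteration differs (the 1-block tail loop steps its inner
counter by 4 because each iteration runs two double-rounds per single
Poly1305 block). The following cross-check of all four rolled loops is
again a sketch with invented names; the per-iteration numbers are read
off the counters and lea offsets in the hunks below.

  #include <assert.h>

  /* rounds_per_iter: ChaCha20 rounds per inner iteration
     poly_per_iter:   Poly1305 blocks per inner iteration
     chacha_blocks:   ChaCha20 blocks processed per batch */
  struct loop { int rounds_per_iter, poly_per_iter, chacha_blocks; };

  static void check(struct loop v)
  {
    int rounds = 0, poly = 0;
    for (int outer = 20; outer > 0; outer -= 10)
      {
        for (int inner = 8; inner > 0; inner -= v.rounds_per_iter)
          {
            rounds += v.rounds_per_iter;
            poly += v.poly_per_iter;
          }
        rounds += 2;                           /* trailing rounds, no Poly1305 */
      }
    assert(rounds == 20);                      /* full ChaCha20                */
    assert(poly * 16 == v.chacha_blocks * 64); /* Poly1305 keeps pace          */
  }

  int main(void)
  {
    check((struct loop){ 2, 4, 8 });  /* avx2 blocks8           */
    check((struct loop){ 2, 2, 4 });  /* ssse3 blocks4          */
    check((struct loop){ 2, 1, 2 });  /* ssse3 blocks1, 2-block */
    check((struct loop){ 4, 1, 1 });  /* ssse3 blocks1, 1-block */
    return 0;
  }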

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed

diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index de6263b69..053638d02 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -331,6 +331,8 @@ ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
   8-way stitched chacha20-poly1305
  **********************************************************************/
 
+#define _ /*_*/
+
 .align 8
 .globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
 ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8, at function;)
@@ -353,7 +355,7 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 
 	vzeroupper;
 
-	subq $(8 * 8) + STACK_MAX + 32, %rsp;
+	subq $(9 * 8) + STACK_MAX + 32, %rsp;
 	andq $~31, %rsp;
 
 	movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
@@ -406,33 +408,14 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 	vpbroadcastd (15 * 4)(INPUT), X15;
 	vmovdqa X15, (STACK_TMP)(%rsp);
 
-	# rounds 0,1
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART1(0 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(1 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(2 * 16),
-		      POLY1305_BLOCK_PART2())
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(3 * 16))
+	/* Process eight ChaCha20 blocks and 32 Poly1305 blocks. */
 
-	# rounds 2,3
+	movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp);
+.Lround8_with_poly1305_outer:
+	movl $8, (STACK_MAX + 8 * 8)(%rsp);
+.Lround8_with_poly1305_inner:
+	/* rounds 0-7 & 10-17 */
+		      POLY1305_BLOCK_PART1(0 * 16)
 	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
 		      POLY1305_BLOCK_PART2(),
 		      POLY1305_BLOCK_PART3(),
@@ -440,231 +423,59 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 		      POLY1305_BLOCK_PART5())
 	vmovdqa (STACK_TMP)(%rsp), X15;
 	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART1(4 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(5 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(6 * 16),
-		      POLY1305_BLOCK_PART2())
-
-	# rounds 4,5
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(7 * 16))
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
+		      POLY1305_BLOCK_PART1(1 * 16)
 	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
 		      POLY1305_BLOCK_PART2(),
 		      POLY1305_BLOCK_PART3(),
 		      POLY1305_BLOCK_PART4(),
 		      POLY1305_BLOCK_PART5())
+		      POLY1305_BLOCK_PART1(2 * 16)
 	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART1(8 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(9 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-
-	# rounds 6,7
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(10 * 16),
-		      POLY1305_BLOCK_PART2())
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(11 * 16))
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART1(12 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-
-	# rounds 8,9
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(13 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(14 * 16),
-		      POLY1305_BLOCK_PART2())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(15 * 16))
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
 		      POLY1305_BLOCK_PART2(),
 		      POLY1305_BLOCK_PART3(),
 		      POLY1305_BLOCK_PART4(),
 		      POLY1305_BLOCK_PART5())
-
-	# rounds 10,11
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART1(16 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(17 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(18 * 16),
-		      POLY1305_BLOCK_PART2())
 	vmovdqa (STACK_TMP)(%rsp), X8;
 	vmovdqa X15, (STACK_TMP)(%rsp);
+		      POLY1305_BLOCK_PART1(3 * 16)
+		      lea (4 * 16)(POLY_RSRC), POLY_RSRC;
 	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(19 * 16))
-
-	# rounds 12,13
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
 		      POLY1305_BLOCK_PART2(),
 		      POLY1305_BLOCK_PART3(),
 		      POLY1305_BLOCK_PART4(),
 		      POLY1305_BLOCK_PART5())
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART1(20 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(21 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(22 * 16),
-		      POLY1305_BLOCK_PART2())
 
-	# rounds 14,15
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(23 * 16))
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART1(24 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(25 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
+	subl $2, (STACK_MAX + 8 * 8)(%rsp);
+	jnz .Lround8_with_poly1305_inner;
 
-	# rounds 16,17
+	/* rounds 8-9 & 18-19 */
 	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(26 * 16),
-		      POLY1305_BLOCK_PART2())
+		      _,
+		      _,
+		      _,
+		      _)
 	vmovdqa (STACK_TMP)(%rsp), X15;
 	vmovdqa X8, (STACK_TMP)(%rsp);
 	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(27 * 16))
+		      _,
+		      _,
+		      _,
+		      _)
 	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
+		      _,
+		      _,
+		      _,
+		      _)
 	vmovdqa (STACK_TMP)(%rsp), X8;
 	vmovdqa X15, (STACK_TMP)(%rsp);
 	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART1(28 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
+		      _,
+		      _,
+		      _,
+		      _)
 
-	# rounds 18,19
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(29 * 16),
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X8, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(30 * 16),
-		      POLY1305_BLOCK_PART2())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(31 * 16))
-	vmovdqa (STACK_TMP)(%rsp), X8;
-	vmovdqa X15, (STACK_TMP)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
+	subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp);
+	jnz .Lround8_with_poly1305_outer;
 
 	movq (STACK_MAX + 5 * 8)(%rsp), SRC;
 	movq (STACK_MAX + 6 * 8)(%rsp), DST;
@@ -741,7 +552,6 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 
 	subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
 
-	lea (32 * 16)(POLY_RSRC), POLY_RSRC;
 	lea (8 * 64)(DST), DST;
 	lea (8 * 64)(SRC), SRC;
 	movq SRC, (STACK_MAX + 5 * 8)(%rsp);
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index 6bbf12fc1..77a27d349 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -511,6 +511,8 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
   4-way stitched chacha20-poly1305
  **********************************************************************/
 
+#define _ /*_*/
+
 .align 8
 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4, at function;)
@@ -531,7 +533,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	subq $(8 * 8) + STACK_MAX + 16, %rsp;
+	subq $(9 * 8) + STACK_MAX + 16, %rsp;
 	andq $~15, %rsp;
 
 	movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
@@ -586,51 +588,14 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 	movdqa X11, (STACK_TMP)(%rsp);
 	movdqa X15, (STACK_TMP1)(%rsp);
 
-	/* rounds 0,1 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART1(0 * 16),
-		      POLY1305_BLOCK_PART2())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(1 * 16))
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-
-	/* rounds 2,3 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART1(2 * 16),
-		      POLY1305_BLOCK_PART2())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(3 * 16))
+	/* Process four ChaCha20 blocks and sixteen Poly1305 blocks. */
 
-	/* rounds 4,5 */
+	movl $20, (STACK_MAX + 8 * 8 + 4)(%rsp);
+.Lround4_with_poly1305_outer:
+	movl $8, (STACK_MAX + 8 * 8)(%rsp);
+.Lround4_with_poly1305_inner:
+	/* rounds 0-7 & 10-17 */
+		      POLY1305_BLOCK_PART1(0 * 16)
 	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
 		      POLY1305_BLOCK_PART2(),
 		      POLY1305_BLOCK_PART3())
@@ -641,50 +606,8 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
 		      POLY1305_BLOCK_PART4(),
 		      POLY1305_BLOCK_PART5())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART1(4 * 16),
-		      POLY1305_BLOCK_PART2())
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-
-	/* rounds 6,7 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(5 * 16))
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART1(6 * 16),
-		      POLY1305_BLOCK_PART2())
-
-	/* rounds 8,9 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(7 * 16))
+		      POLY1305_BLOCK_PART1(1 * 16)
+		      lea (2 * 16)(POLY_RSRC), POLY_RSRC;
 	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
 		      POLY1305_BLOCK_PART2(),
 		      POLY1305_BLOCK_PART3())
@@ -696,115 +619,33 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 		      POLY1305_BLOCK_PART4(),
 		      POLY1305_BLOCK_PART5())
 
-	/* rounds 10,11 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART1(8 * 16),
-		      POLY1305_BLOCK_PART2())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(9 * 16))
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-
-	/* rounds 12,13 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART1(10 * 16),
-		      POLY1305_BLOCK_PART2())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(11 * 16))
-
-	/* rounds 14,15 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART1(12 * 16),
-		      POLY1305_BLOCK_PART2())
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
+	subl $2, (STACK_MAX + 8 * 8)(%rsp);
+	jnz .Lround4_with_poly1305_inner;
 
-	/* rounds 16,17 */
+	/* rounds 8-9 & 18-19 */
 	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(13 * 16))
+		      _,
+		      _)
 	movdqa (STACK_TMP)(%rsp), X11;
 	movdqa (STACK_TMP1)(%rsp), X15;
 	movdqa X8, (STACK_TMP)(%rsp);
 	movdqa X9, (STACK_TMP1)(%rsp);
 	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
+		      _,
+		      _)
 	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
+		      _,
+		      _)
 	movdqa (STACK_TMP)(%rsp), X8;
 	movdqa (STACK_TMP1)(%rsp), X9;
 	movdqa X11, (STACK_TMP)(%rsp);
 	movdqa X15, (STACK_TMP1)(%rsp);
 	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART1(14 * 16),
-		      POLY1305_BLOCK_PART2())
+		      _,
+		      _)
 
-	/* rounds 18,19 */
-	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART3(),
-		      POLY1305_BLOCK_PART4())
-	movdqa (STACK_TMP)(%rsp), X11;
-	movdqa (STACK_TMP1)(%rsp), X15;
-	movdqa X8, (STACK_TMP)(%rsp);
-	movdqa X9, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART5(),
-		      POLY1305_BLOCK_PART1(15 * 16))
-	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,
-		      POLY1305_BLOCK_PART2(),
-		      POLY1305_BLOCK_PART3())
-	movdqa (STACK_TMP)(%rsp), X8;
-	movdqa (STACK_TMP1)(%rsp), X9;
-	movdqa X11, (STACK_TMP)(%rsp);
-	movdqa X15, (STACK_TMP1)(%rsp);
-	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,
-		      POLY1305_BLOCK_PART4(),
-		      POLY1305_BLOCK_PART5())
+	subl $10, (STACK_MAX + 8 * 8 + 4)(%rsp);
+	jnz .Lround4_with_poly1305_outer;
 
 	/* tmp := X15 */
 	movdqa (STACK_TMP)(%rsp), X11;
@@ -877,7 +718,6 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 
 	subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
 
-	lea (16 * 16)(POLY_RSRC), POLY_RSRC;
 	lea (4 * 64)(DST), DST;
 	lea (4 * 64)(SRC), SRC;
 	movq SRC, (STACK_MAX + 5 * 8)(%rsp);
@@ -954,7 +794,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	subq $(8 * 8), %rsp;
+	subq $(9 * 8), %rsp;
 	movq %rbx, (0 * 8)(%rsp);
 	movq %r12, (1 * 8)(%rsp);
 	movq %r13, (2 * 8)(%rsp);
@@ -999,95 +839,31 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 
 	/* Process two ChaCha20 blocks and eight Poly1305 blocks. */
 
+	movl $20, (8 * 8 + 4)(%rsp);
+.Lround2_with_poly1305_outer:
+	movl $8, (8 * 8)(%rsp);
+.Lround2_with_poly1305_inner:
 	POLY1305_BLOCK_PART1(0 * 16);
 	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	lea (1 * 16)(POLY_RSRC), POLY_RSRC;
 	POLY1305_BLOCK_PART2();
 	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
 	POLY1305_BLOCK_PART3();
 	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
 	POLY1305_BLOCK_PART4();
 	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART1(1 * 16);
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART1(2 * 16);
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART1(3 * 16);
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART1(4 * 16);
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
-
 	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART1(5 * 16);
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
 
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART1(6 * 16);
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+	subl $2, (8 * 8)(%rsp);
+	jnz .Lround2_with_poly1305_inner;
 
-	POLY1305_BLOCK_PART3();
 	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART4();
 	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART5();
 	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART1(7 * 16);
 	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
 
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+	subl $10, (8 * 8 + 4)(%rsp);
+	jnz .Lround2_with_poly1305_outer;
 
 	movq (5 * 8)(%rsp), SRC;
 	movq (6 * 8)(%rsp), DST;
@@ -1123,7 +899,6 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	clear(X15);
 
 	subq $2, (7 * 8)(%rsp); # NBLKS
-	lea (2 * 64)(POLY_RSRC), POLY_RSRC;
 	lea (2 * 64)(SRC), SRC;
 	lea (2 * 64)(DST), DST;
 	movq SRC, (5 * 8)(%rsp);
@@ -1137,55 +912,31 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	movdqa X13, X3;
 
 	/* Process one ChaCha20 block and four Poly1305 blocks. */
+
+	movl $20, (8 * 8 + 4)(%rsp);
+.Lround1_with_poly1305_outer:
+	movl $8, (8 * 8)(%rsp);
+.Lround1_with_poly1305_inner:
 	POLY1305_BLOCK_PART1(0 * 16);
 	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
 	POLY1305_BLOCK_PART2();
 	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+	lea (1 * 16)(POLY_RSRC), POLY_RSRC;
 
 	POLY1305_BLOCK_PART3();
 	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
 	POLY1305_BLOCK_PART4();
 	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART1(1 * 16);
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
 	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART1(2 * 16);
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART2();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
 
-	POLY1305_BLOCK_PART3();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
-
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART1(3 * 16);
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+	subl $4, (8 * 8)(%rsp);
+	jnz .Lround1_with_poly1305_inner;
 
-	POLY1305_BLOCK_PART2();
 	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART3();
 	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
 
-	POLY1305_BLOCK_PART4();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
-	POLY1305_BLOCK_PART5();
-	  QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+	subl $10, (8 * 8 + 4)(%rsp);
+	jnz .Lround1_with_poly1305_outer;
 
 	movq (5 * 8)(%rsp), SRC;
 	movq (6 * 8)(%rsp), DST;
@@ -1204,7 +955,6 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	xor_src_dst(DST, SRC, 12 * 4, X3, X7);
 
 	subq $1, (7 * 8)(%rsp); # NBLKS
-	lea (64)(POLY_RSRC), POLY_RSRC;
 	lea (64)(SRC), SRC;
 	lea (64)(DST), DST;
 	movq SRC, (5 * 8)(%rsp);



