[PATCH] chacha20-amd64-avx2: optimize output xoring

Jussi Kivilinna jussi.kivilinna at iki.fi
Mon Jan 21 22:01:01 CET 2019


* cipher/chacha20-amd64-avx2.S (STACK_TMP2): Remove.
(transpose_16byte_2x2, xor_src_dst): New.
(BUF_XOR_256_TO_128): Remove.
(_gcry_chacha20_amd64_avx2_blocks8)
(_gcry_chacha20_poly1305_amd64_avx2_blocks8): Replace
BUF_XOR_256_TO_128 with transpose_16byte_2x2/xor_src_dst; Reduce stack
usage; Better interleave chacha20 state merging and output xoring.
--

Benchmark on Intel i7-4790K:

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.314 ns/B      3035 MiB/s      1.26 c/B      3998
     STREAM dec |     0.314 ns/B      3037 MiB/s      1.26 c/B      3998
   POLY1305 enc |     0.451 ns/B      2117 MiB/s      1.80 c/B      3998
   POLY1305 dec |     0.441 ns/B      2162 MiB/s      1.76 c/B      3998

After:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
     STREAM enc |     0.309 ns/B      3086 MiB/s      1.24 c/B      3998
     STREAM dec |     0.309 ns/B      3083 MiB/s      1.24 c/B      3998
   POLY1305 enc |     0.445 ns/B      2141 MiB/s      1.78 c/B      3998
   POLY1305 dec |     0.436 ns/B      2188 MiB/s      1.74 c/B      3998
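
For reference, the cycles/byte column is just nanosecs/byte scaled by the
reported 3998 MHz clock; a quick sanity check on the STREAM numbers:

  0.314 ns/B * 3.998 GHz ~= 1.26 c/B   (before)
  0.309 ns/B * 3.998 GHz ~= 1.24 c/B   (after)

i.e. roughly a 1.5% speed-up for plain STREAM and about 1% for the
POLY1305 modes.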

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed
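
As a quick illustration (not something the patch adds to the tree), the two
new assembly macros correspond to the following C/AVX2-intrinsics models;
the helper names are mine, only the semantics are taken from the macro
definitions in the diff below:

#include <stddef.h>
#include <immintrin.h>

/* transpose_16byte_2x2(x0, x1, t1):
 *   x0' = { low128(x0),  low128(x1)  }
 *   x1' = { high128(x0), high128(x1) }
 * i.e. a 2x2 transpose of the 128-bit lanes of two ymm registers. */
static inline void
transpose_16byte_2x2_model (__m256i *x0, __m256i *x1)
{
  __m256i lo = _mm256_permute2x128_si256 (*x0, *x1, 0x20); /* vperm2i128 $0x20 */
  __m256i hi = _mm256_permute2x128_si256 (*x0, *x1, 0x31); /* vperm2i128 $0x31 */
  *x0 = lo;
  *x1 = hi;
}

/* xor_src_dst(dst, src, offset, xreg):
 *   dst[offset .. offset+31] = xreg ^ src[offset .. offset+31]
 * using unaligned 32-byte accesses (vpxor with a memory operand plus a
 * vmovdqu store). */
static inline void
xor_src_dst_model (unsigned char *dst, const unsigned char *src,
                   size_t offset, __m256i xreg)
{
  __m256i s = _mm256_loadu_si256 ((const __m256i *) (src + offset));
  _mm256_storeu_si256 ((__m256i *) (dst + offset),
                       _mm256_xor_si256 (xreg, s));
}

If I read the new layout correctly, the 2x2 lane transpose combined with the
existing 4x4 dword transpose places a contiguous 32-byte half of each 64-byte
keystream block in a single ymm register, so it can be xored against SRC and
stored to DST in one unaligned 32-byte access, instead of extracting and
storing two 16-byte halves as the removed BUF_XOR_256_TO_128 did.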

diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index ef02c1733..94c8e8cf7 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -50,9 +50,8 @@
 #define STACK_VEC_X13 (32 + STACK_VEC_X12)
 #define STACK_TMP     (32 + STACK_VEC_X13)
 #define STACK_TMP1    (32 + STACK_TMP)
-#define STACK_TMP2    (32 + STACK_TMP1)
 
-#define STACK_MAX     (32 + STACK_TMP2)
+#define STACK_MAX     (32 + STACK_TMP1)
 
 /* vector registers */
 #define X0 %ymm0
@@ -101,11 +100,22 @@
 	vpunpckldq x3, x2, t1; \
 	vpunpckhdq x3, x2, x2; \
 	\
-	vpunpckhqdq t1,	x0, x1; \
-	vpunpcklqdq t1,	x0, x0; \
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
 	\
 	vpunpckhqdq x2, t2, x3; \
-	vpunpcklqdq x2,	t2, x2;
+	vpunpcklqdq x2, t2, x2;
+
+/* 2x2 128-bit matrix transpose */
+#define transpose_16byte_2x2(x0,x1,t1) \
+	vmovdqa    x0, t1; \
+	vperm2i128 $0x20, x1, x0, x0; \
+	vperm2i128 $0x31, x1, t1, x1;
+
+/* xor register with unaligned src and save to unaligned dst */
+#define xor_src_dst(dst, src, offset, xreg) \
+	vpxor offset(src), xreg, xreg; \
+	vmovdqu xreg, offset(dst);
 
 /**********************************************************************
   8-way chacha20
@@ -147,13 +157,6 @@
 	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
 	    ROTATE2(b1, b2,  7, tmp1);
 
-#define BUF_XOR_256_TO_128(dst, src, offset_lo, offset_hi, yreg, tmp1)	\
-	vextracti128 $1, yreg, tmp1##h;					\
-	vpxor offset_lo(src), yreg##h, yreg##h;				\
-	vpxor offset_hi(src), tmp1##h, tmp1##h;				\
-	vmovdqu yreg##h, offset_lo(dst);				\
-	vmovdqu tmp1##h, offset_hi(dst);
-
 .align 32
 chacha20_data:
 .Lshuf_rol16:
@@ -230,6 +233,8 @@ _gcry_chacha20_amd64_avx2_blocks8:
 	sub $2, ROUND;
 	jnz .Lround2;
 
+	vmovdqa X8, (STACK_TMP1)(%rsp);
+
 	/* tmp := X15 */
 	vpbroadcastd (0 * 4)(INPUT), X15;
 	PLUS(X0, X15);
@@ -247,53 +252,56 @@ _gcry_chacha20_amd64_avx2_blocks8:
 	PLUS(X6, X15);
 	vpbroadcastd (7 * 4)(INPUT), X15;
 	PLUS(X7, X15);
-	vpbroadcastd (8 * 4)(INPUT), X15;
-	PLUS(X8, X15);
-	vpbroadcastd (9 * 4)(INPUT), X15;
-	PLUS(X9, X15);
-	vpbroadcastd (10 * 4)(INPUT), X15;
-	PLUS(X10, X15);
-	vpbroadcastd (11 * 4)(INPUT), X15;
-	PLUS(X11, X15);
-	vmovdqa (STACK_VEC_X12)(%rsp), X15;
-	PLUS(X12, X15);
-	vmovdqa (STACK_VEC_X13)(%rsp), X15;
-	PLUS(X13, X15);
+	transpose_4x4(X0, X1, X2, X3, X8, X15);
+	transpose_4x4(X4, X5, X6, X7, X8, X15);
+	vmovdqa (STACK_TMP1)(%rsp), X8;
+	transpose_16byte_2x2(X0, X4, X15);
+	transpose_16byte_2x2(X1, X5, X15);
+	transpose_16byte_2x2(X2, X6, X15);
+	transpose_16byte_2x2(X3, X7, X15);
 	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X13, (STACK_TMP)(%rsp);
-	vpbroadcastd (14 * 4)(INPUT), X13;
-	PLUS(X14, X13);
-	vmovdqa X14, (STACK_TMP1)(%rsp);
-	vpbroadcastd (15 * 4)(INPUT), X13;
-	PLUS(X15, X13);
-	vmovdqa X15, (STACK_TMP2)(%rsp);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1);
+	vpbroadcastd (8 * 4)(INPUT), X0;
+	PLUS(X8, X0);
+	vpbroadcastd (9 * 4)(INPUT), X0;
+	PLUS(X9, X0);
+	vpbroadcastd (10 * 4)(INPUT), X0;
+	PLUS(X10, X0);
+	vpbroadcastd (11 * 4)(INPUT), X0;
+	PLUS(X11, X0);
+	vmovdqa (STACK_VEC_X12)(%rsp), X0;
+	PLUS(X12, X0);
+	vmovdqa (STACK_VEC_X13)(%rsp), X0;
+	PLUS(X13, X0);
+	vpbroadcastd (14 * 4)(INPUT), X0;
+	PLUS(X14, X0);
+	vpbroadcastd (15 * 4)(INPUT), X0;
+	PLUS(X15, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3);
 
 	/* Update counter */
 	addq $8, (12 * 4)(INPUT);
 
-	transpose_4x4(X0, X1, X2, X3, X13, X14);
-	transpose_4x4(X4, X5, X6, X7, X13, X14);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15);
-	vmovdqa (STACK_TMP)(%rsp), X13;
-	vmovdqa (STACK_TMP1)(%rsp), X14;
-	vmovdqa (STACK_TMP2)(%rsp), X15;
 	transpose_4x4(X8, X9, X10, X11, X0, X1);
 	transpose_4x4(X12, X13, X14, X15, X0, X1);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
+	xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4);
+	xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5);
+	transpose_16byte_2x2(X8, X12, X0);
+	transpose_16byte_2x2(X9, X13, X0);
+	transpose_16byte_2x2(X10, X14, X0);
+	transpose_16byte_2x2(X11, X15, X0);
+	xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6);
+	xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11);
+	xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12);
+	xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13);
+	xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14);
+	xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15);
 
 	sub $8, NBLKS;
 	lea (8 * 64)(DST), DST;
@@ -306,7 +314,6 @@ _gcry_chacha20_amd64_avx2_blocks8:
 	vmovdqa X0, (STACK_VEC_X13)(%rsp);
 	vmovdqa X0, (STACK_TMP)(%rsp);
 	vmovdqa X0, (STACK_TMP1)(%rsp);
-	vmovdqa X0, (STACK_TMP2)(%rsp);
 	vzeroall;
 
 	/* eax zeroed by round loop. */
@@ -646,6 +653,11 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 		      POLY1305_BLOCK_PART4(),
 		      POLY1305_BLOCK_PART5())
 
+	movq (STACK_MAX + 5 * 8)(%rsp), SRC;
+	movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+	vmovdqa X8, (STACK_TMP1)(%rsp);
+
 	/* tmp := X15 */
 	vpbroadcastd (0 * 4)(INPUT), X15;
 	PLUS(X0, X15);
@@ -663,56 +675,56 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 	PLUS(X6, X15);
 	vpbroadcastd (7 * 4)(INPUT), X15;
 	PLUS(X7, X15);
-	vpbroadcastd (8 * 4)(INPUT), X15;
-	PLUS(X8, X15);
-	vpbroadcastd (9 * 4)(INPUT), X15;
-	PLUS(X9, X15);
-	vpbroadcastd (10 * 4)(INPUT), X15;
-	PLUS(X10, X15);
-	vpbroadcastd (11 * 4)(INPUT), X15;
-	PLUS(X11, X15);
-	vmovdqa (STACK_VEC_X12)(%rsp), X15;
-	PLUS(X12, X15);
-	vmovdqa (STACK_VEC_X13)(%rsp), X15;
-	PLUS(X13, X15);
+	transpose_4x4(X0, X1, X2, X3, X8, X15);
+	transpose_4x4(X4, X5, X6, X7, X8, X15);
+	vmovdqa (STACK_TMP1)(%rsp), X8;
+	transpose_16byte_2x2(X0, X4, X15);
+	transpose_16byte_2x2(X1, X5, X15);
+	transpose_16byte_2x2(X2, X6, X15);
+	transpose_16byte_2x2(X3, X7, X15);
 	vmovdqa (STACK_TMP)(%rsp), X15;
-	vmovdqa X13, (STACK_TMP)(%rsp);
-	vpbroadcastd (14 * 4)(INPUT), X13;
-	PLUS(X14, X13);
-	vmovdqa X14, (STACK_TMP1)(%rsp);
-	vpbroadcastd (15 * 4)(INPUT), X13;
-	PLUS(X15, X13);
-	vmovdqa X15, (STACK_TMP2)(%rsp);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1);
+	vpbroadcastd (8 * 4)(INPUT), X0;
+	PLUS(X8, X0);
+	vpbroadcastd (9 * 4)(INPUT), X0;
+	PLUS(X9, X0);
+	vpbroadcastd (10 * 4)(INPUT), X0;
+	PLUS(X10, X0);
+	vpbroadcastd (11 * 4)(INPUT), X0;
+	PLUS(X11, X0);
+	vmovdqa (STACK_VEC_X12)(%rsp), X0;
+	PLUS(X12, X0);
+	vmovdqa (STACK_VEC_X13)(%rsp), X0;
+	PLUS(X13, X0);
+	vpbroadcastd (14 * 4)(INPUT), X0;
+	PLUS(X14, X0);
+	vpbroadcastd (15 * 4)(INPUT), X0;
+	PLUS(X15, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3);
 
 	/* Update counter */
 	addq $8, (12 * 4)(INPUT);
 
-	movq (STACK_MAX + 5 * 8)(%rsp), SRC;
-	movq (STACK_MAX + 6 * 8)(%rsp), DST;
-
-	transpose_4x4(X0, X1, X2, X3, X13, X14);
-	transpose_4x4(X4, X5, X6, X7, X13, X14);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15);
-	vmovdqa (STACK_TMP)(%rsp), X13;
-	vmovdqa (STACK_TMP1)(%rsp), X14;
-	vmovdqa (STACK_TMP2)(%rsp), X15;
 	transpose_4x4(X8, X9, X10, X11, X0, X1);
 	transpose_4x4(X12, X13, X14, X15, X0, X1);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
-	BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
+	xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4);
+	xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5);
+	transpose_16byte_2x2(X8, X12, X0);
+	transpose_16byte_2x2(X9, X13, X0);
+	transpose_16byte_2x2(X10, X14, X0);
+	transpose_16byte_2x2(X11, X15, X0);
+	xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6);
+	xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11);
+	xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12);
+	xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13);
+	xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14);
+	xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15);
 
 	subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
 
@@ -733,7 +745,6 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 	vmovdqa X0, (STACK_VEC_X13)(%rsp);
 	vmovdqa X0, (STACK_TMP)(%rsp);
 	vmovdqa X0, (STACK_TMP1)(%rsp);
-	vmovdqa X0, (STACK_TMP2)(%rsp);
 	vzeroall;
 
 	movq (STACK_MAX + 0 * 8)(%rsp), %rbx;



