[PATCH 3/4] Add SSSE3 optimized non-parallel ChaCha20 function

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Jan 18 23:35:47 CET 2019


* cipher/chacha20-amd64-ssse3.S (ROTATE_SHUF, ROTATE, WORD_SHUF)
(QUARTERROUND4, _gcry_chacha20_amd64_ssse3_blocks1): New.
* cipher/chacha20.c (_gcry_chacha20_amd64_ssse3_blocks1): New
prototype.
(chacha20_blocks): Rename to ...
(do_chacha20_blocks): ... this.
(chacha20_blocks): New.
(chacha20_encrypt_stream): Adjust for new chacha20_blocks function.
--

This patch provides an SSSE3 optimized version of the non-parallel
ChaCha20 core block function. On Intel Haswell, the generic C function
runs at 6.9 cycles/byte; the new function runs at 5.2 cycles/byte and
is thus ~32% faster.
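
For reference, the quarter round that the new 1-way path computes on
whole 128-bit rows is equivalent to the scalar C sketch below (not part
of this patch). In the assembly, the 16-bit and 8-bit rotations are done
with pshufb byte shuffles (ROTATE_SHUF), the 12-bit and 7-bit rotations
with two shifts plus an add (ROTATE), and the diagonal rounds are formed
by rotating the words within each row with pshufd (WORD_SHUF) rather
than by addressing diagonals directly.

  #include <stdint.h>

  /* Plain rotate-left; per 32-bit lane this is what ROTATE and
   * ROTATE_SHUF compute in the assembly. */
  static uint32_t rotl32(uint32_t v, unsigned int c)
  {
    return (v << c) | (v >> (32 - c));
  }

  /* One ChaCha20 quarter round on four state words. */
  static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c,
                           uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
  }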

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---

diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index f23722814..0e59ff981 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -163,6 +163,8 @@ chacha20_data:
 	.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
 .Lshuf_rol8:
 	.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.Lcounter1:
+	.long 1,0,0,0
 .Linc_counter:
 	.long 0,1,2,3
 .Lunsigned_cmp:
@@ -221,7 +223,7 @@ _gcry_chacha20_amd64_ssse3_blocks4:
 	movdqa X11, (STACK_TMP)(%rsp);
 	movdqa X15, (STACK_TMP1)(%rsp);
 
-.Lround2:
+.Lround2_4:
 	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15)
 	movdqa (STACK_TMP)(%rsp), X11;
 	movdqa (STACK_TMP1)(%rsp), X15;
@@ -235,7 +237,7 @@ _gcry_chacha20_amd64_ssse3_blocks4:
 	movdqa X15, (STACK_TMP1)(%rsp);
 	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15)
 	sub $2, ROUND;
-	jnz .Lround2;
+	jnz .Lround2_4;
 
 	/* tmp := X15 */
 	movdqa (STACK_TMP)(%rsp), X11;
@@ -337,5 +339,111 @@ _gcry_chacha20_amd64_ssse3_blocks4:
 ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
 	  .-_gcry_chacha20_amd64_ssse3_blocks4;)
 
+/**********************************************************************
+  1-way chacha20
+ **********************************************************************/
+
+#define ROTATE_SHUF(v1,shuf)		\
+	pshufb shuf, v1;
+
+#define ROTATE(v1,c,tmp1)		\
+	movdqa v1, tmp1; 		\
+	psrld $(32 - (c)), v1;		\
+	pslld $(c), tmp1;		\
+	paddb tmp1, v1;
+
+#define WORD_SHUF(v1,shuf)		\
+	pshufd $shuf, v1, v1;
+
+#define QUARTERROUND4(x0,x1,x2,x3,shuf_rol8,shuf_rol16,tmp1,shuf_x1,\
+		      shuf_x2,shuf_x3) \
+	PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol16); \
+	PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12, tmp1); \
+	PLUS(x0, x1); XOR(x3, x0); ROTATE_SHUF(x3, shuf_rol8); \
+	PLUS(x2, x3); \
+	  WORD_SHUF(x3, shuf_x3); \
+		      XOR(x1, x2); \
+	  WORD_SHUF(x2, shuf_x2); \
+				   ROTATE(x1, 7, tmp1); \
+	  WORD_SHUF(x1, shuf_x1);
+
+.align 8
+.globl _gcry_chacha20_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_amd64_ssse3_blocks1:
+	/* input:
+	 *	%rdi: input
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks
+	 */
+
+	/* Load constants */
+	movdqa .Lcounter1 RIP, X4;
+	movdqa .Lshuf_rol8 RIP, X5;
+	movdqa .Lshuf_rol16 RIP, X6;
+
+	/* Load state */
+	movdqu (0 * 4)(INPUT), X10;
+	movdqu (4 * 4)(INPUT), X11;
+	movdqu (8 * 4)(INPUT), X12;
+	movdqu (12 * 4)(INPUT), X13;
+
+.Loop1:
+	mov $20, ROUND;
+
+	movdqa X10, X0;
+	movdqa X11, X1;
+	movdqa X12, X2;
+	movdqa X13, X3;
+
+.Lround2_1:
+	QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+	QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+	sub $2, ROUND;
+	jnz .Lround2_1;
+
+	PLUS(X0, X10);
+	PLUS(X1, X11);
+	PLUS(X2, X12);
+	PLUS(X3, X13);
+
+	/* Update counter */
+	paddq X4, X13;
+
+	xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+	xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+	xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+	xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+	lea (64)(DST), DST;
+	lea (64)(SRC), SRC;
+
+	sub $1, NBLKS;
+	jnz .Loop1;
+
+	/* Store counter */
+	movdqu X13, (12 * 4)(INPUT);
+
+	/* clear the used vector registers */
+	clear(X0);
+	clear(X1);
+	clear(X2);
+	clear(X3);
+	clear(X4);
+	clear(X5);
+	clear(X6);
+	clear(X7);
+	clear(X10);
+	clear(X11);
+	clear(X12);
+	clear(X13);
+
+	/* eax zeroed by round loop. */
+	ret;
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
+	  .-_gcry_chacha20_amd64_ssse3_blocks1;)
+
 #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
 #endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 84a9b2b80..f1afd18e0 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -112,6 +112,10 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
 						const byte *src,
 						size_t nblks) ASM_FUNC_ABI;
 
+unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
+						const byte *src,
+						size_t nblks) ASM_FUNC_ABI;
+
 #endif /* USE_SSSE3 */
 
 #ifdef USE_AVX2
@@ -156,7 +160,7 @@ static const char *selftest (void);
   buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
 
 static unsigned int
-chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
+do_chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
 {
   u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
   unsigned int i;
@@ -239,6 +243,21 @@ chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
 }
 
 
+static unsigned int
+chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
+		 size_t nblks)
+{
+#ifdef USE_SSSE3
+  if (ctx->use_ssse3)
+    {
+      return _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, dst, src, nblks);
+    }
+#endif
+
+  return do_chacha20_blocks (ctx->input, dst, src, nblks);
+}
+
+
 static void
 chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
                    unsigned int keylen)
@@ -475,7 +494,7 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
   if (length >= CHACHA20_BLOCK_SIZE)
     {
       size_t nblocks = length / CHACHA20_BLOCK_SIZE;
-      nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks);
+      nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks);
       burn = nburn > burn ? nburn : burn;
       length -= nblocks * CHACHA20_BLOCK_SIZE;
       outbuf += nblocks * CHACHA20_BLOCK_SIZE;
@@ -484,7 +503,7 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
 
   if (length > 0)
     {
-      nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1);
+      nburn = chacha20_blocks(ctx, ctx->pad, zero_pad, 1);
       burn = nburn > burn ? nburn : burn;
 
       buf_xor (outbuf, inbuf, ctx->pad, length);



