[git] GCRYPT - branch, master, updated. libgcrypt-1.8.1-152-gd455068

by Jussi Kivilinna cvs at cvs.gnupg.org
Sun Feb 17 21:44:53 CET 2019


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  d455068988e5779b0200c51415ddab6b51e12dc4 (commit)
      from  afab94d222425ecb838eb56cb0723bdaf3e5de36 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit d455068988e5779b0200c51415ddab6b51e12dc4
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu Feb 7 20:50:02 2019 +0200

    Add 2-way path for SSSE3 version of ChaCha20
    
    * cipher/chacha20-amd64-ssse3.S (_gcry_chacha20_amd64_ssse3_blocks1)
    (_gcry_chacha20_poly1305_amd64_ssse3_blocks1): Add 2-way code paths.
    * cipher/chacha20.c (_gcry_chacha20_poly1305_encrypt): Add
    preprosessing of 2 blocks with SSSE3.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index d7faf64..1657f77 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -334,7 +334,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
 	  .-_gcry_chacha20_amd64_ssse3_blocks4;)
 
 /**********************************************************************
-  1-way chacha20
+  2-way && 1-way chacha20
  **********************************************************************/
 
 #define ROTATE_SHUF(v1,shuf)		\
@@ -384,6 +384,66 @@ _gcry_chacha20_amd64_ssse3_blocks1:
 	movdqu (8 * 4)(INPUT), X12;
 	movdqu (12 * 4)(INPUT), X13;
 
+	cmp $2, NBLKS;
+	jb .Loop1;
+
+	mov $20, ROUND;
+
+	movdqa X10, X0;
+	movdqa X11, X1;
+	movdqa X12, X2;
+	movdqa X13, X3;
+
+	movdqa X10, X8;
+	movdqa X11, X9;
+	movdqa X12, X14;
+	movdqa X13, X15;
+	paddq X4, X15;
+
+.Lround2_2:
+	QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+	sub $2, ROUND;
+	jnz .Lround2_2;
+
+	PLUS(X0, X10);
+	PLUS(X1, X11);
+	PLUS(X2, X12);
+	PLUS(X3, X13);
+
+	/* Update counter */
+	paddq X4, X13;
+
+	PLUS(X8, X10);
+	PLUS(X9, X11);
+	PLUS(X14, X12);
+	PLUS(X15, X13);
+
+	/* Update counter */
+	paddq X4, X13;
+
+	xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+	xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+	xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+	xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+	xor_src_dst(DST, SRC, 16 * 4, X8, X7);
+	xor_src_dst(DST, SRC, 20 * 4, X9, X7);
+	xor_src_dst(DST, SRC, 24 * 4, X14, X7);
+	xor_src_dst(DST, SRC, 28 * 4, X15, X7);
+
+	lea (2 * 64)(DST), DST;
+	lea (2 * 64)(SRC), SRC;
+
+	clear(X8);
+	clear(X9);
+	clear(X14);
+	clear(X15);
+
+	sub $2, NBLKS;
+	jz .Ldone1;
+
 .Loop1:
 	mov $20, ROUND;
 
@@ -417,6 +477,7 @@ _gcry_chacha20_amd64_ssse3_blocks1:
 	sub $1, NBLKS;
 	jnz .Loop1;
 
+.Ldone1:
 	/* Store counter */
 	movdqu X13, (12 * 4)(INPUT);
 
@@ -848,7 +909,7 @@ ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
 	  .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;)
 
 /**********************************************************************
-  1-way stitched chacha20-poly1305
+  2-way && 1-way stitched chacha20-poly1305
  **********************************************************************/
 
 .align 8
@@ -891,6 +952,153 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 
 	POLY1305_LOAD_STATE();
 
+	cmpq $2, (7 * 8)(%rsp); #NBLKS
+	jb .Loop_poly1;
+
+	movdqa X10, X0;
+	movdqa X11, X1;
+	movdqa X12, X2;
+	movdqa X13, X3;
+
+	movdqa X10, X8;
+	movdqa X11, X9;
+	movdqa X12, X14;
+	movdqa X13, X15;
+	paddq X4, X15;
+
+	/* Process two ChaCha20 blocks and eight Poly1305 blocks. */
+
+	POLY1305_BLOCK_PART1(0 * 16);
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART1(1 * 16);
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART1(2 * 16);
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART1(3 * 16);
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART1(4 * 16);
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART1(5 * 16);
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART1(6 * 16);
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART1(7 * 16);
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	POLY1305_BLOCK_PART2();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART3();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x39, 0x4e, 0x93);
+	POLY1305_BLOCK_PART4();
+	  QUARTERROUND4(X0, X1, X2,  X3,  X5, X6, X7, 0x93, 0x4e, 0x39);
+	POLY1305_BLOCK_PART5();
+	  QUARTERROUND4(X8, X9, X14, X15, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+	movq (5 * 8)(%rsp), SRC;
+	movq (6 * 8)(%rsp), DST;
+
+	PLUS(X0, X10);
+	PLUS(X1, X11);
+	PLUS(X2, X12);
+	PLUS(X3, X13);
+
+	/* Update counter */
+	paddq X4, X13;
+
+	PLUS(X8, X10);
+	PLUS(X9, X11);
+	PLUS(X14, X12);
+	PLUS(X15, X13);
+
+	/* Update counter */
+	paddq X4, X13;
+
+	xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+	xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+	xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+	xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+	xor_src_dst(DST, SRC, 16 * 4, X8, X7);
+	xor_src_dst(DST, SRC, 20 * 4, X9, X7);
+	xor_src_dst(DST, SRC, 24 * 4, X14, X7);
+	xor_src_dst(DST, SRC, 28 * 4, X15, X7);
+
+	clear(X8);
+	clear(X9);
+	clear(X14);
+	clear(X15);
+
+	subq $2, (7 * 8)(%rsp); # NBLKS
+	lea (2 * 64)(POLY_RSRC), POLY_RSRC;
+	lea (2 * 64)(SRC), SRC;
+	lea (2 * 64)(DST), DST;
+	movq SRC, (5 * 8)(%rsp);
+	movq DST, (6 * 8)(%rsp);
+	jz .Ldone_poly1;
+
 .Loop_poly1:
 	movdqa X10, X0;
 	movdqa X11, X1;
@@ -973,6 +1181,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 
 	jnz .Loop_poly1;
 
+.Ldone_poly1:
 	/* Store state */
 	POLY1305_STORE_STATE();
 
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 3e6327d..eae4979 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -611,6 +611,16 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
       outbuf += 4 * CHACHA20_BLOCK_SIZE;
       inbuf  += 4 * CHACHA20_BLOCK_SIZE;
     }
+  else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 2)
+    {
+      nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 2);
+      burn = nburn > burn ? nburn : burn;
+
+      authptr = outbuf;
+      length -= 2 * CHACHA20_BLOCK_SIZE;
+      outbuf += 2 * CHACHA20_BLOCK_SIZE;
+      inbuf  += 2 * CHACHA20_BLOCK_SIZE;
+    }
   else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
     {
       nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);

-----------------------------------------------------------------------

Summary of changes:
 cipher/chacha20-amd64-ssse3.S | 213 +++++++++++++++++++++++++++++++++++++++++-
 cipher/chacha20.c             |  10 ++
 2 files changed, 221 insertions(+), 2 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org




More information about the Gnupg-commits mailing list